\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images

\usepackage{multirow}
\usepackage{adjustbox}
\usepackage{tabularray}
\usepackage{amsmath}
\usepackage{booktabs}

% \usepackage{amsmath,graphicx}
\usepackage{xcolor} 
\newcommand{\xxx}[1]{\textcolor{red}{#1}}
\newcommand{\ppp}[1]{\textcolor{blue}{#1}}
\newcommand{\yyy}[1]{\textcolor{orange!70}{#1}}

\jmlrvolume{-- 131}
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\editors{Accepted for publication at MIDL 2025}

\title[LOTUS]{LOTUS: Latent Outpainting Diffusion Model for Three-Dimensional Ultrasound Stitching}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Xing Yao\nametag{$^{1}$}} \Email{xing.yao@vanderbilt.edu}\\
\Name{Runxuan Yu\nametag{$^{1}$}} \Email{runxuan.yu@vanderbilt.edu}\\
\Name{Nick DiSanto\nametag{$^{1}$}} \Email{nicolas.c.disanto@vanderbilt.edu}\\
\Name{Ehsan Khodapanah Aghdam\nametag{$^{1}$}} \Email{ehsan.khodapanah.aghdam@vanderbilt.edu}\\
\Name{Kanyifeechukwu Oguine\nametag{$^{1}$}} \Email{kanyifeechukwu.j.oguine@vanderbilt.edu}\\
% \Name{Jingyi Zhu\nametag{$^{1}$}} \Email{jingyi.zhu@vanderbilt.edu}\\
\Name{Daiwei Lu\nametag{$^{1}$}} \Email{daiwei.lu@vanderbilt.edu}\\
\Name{Ange Lou\nametag{$^{1}$}} \Email{ange.lou@vanderbilt.edu}\\
\Name{Jiacheng Wang\nametag{$^{1}$}} \Email{jiacheng.wang.1@vanderbilt.edu}\\
\Name{Dewei Hu\nametag{$^{2}$}} \Email{hu.dewei@mayo.edu}\\
\Name{Gabriel Arenas\nametag{$^{3}$}} \Email{gabriel.arenas@pennmedicine.upenn.edu}\\
\Name{Baris Oguz\nametag{$^{3}$}} \Email{barisumog@gmail.com}\\
% \Name{Wensi Wu\nametag{$^{3}$}} \Email{wuw4@chop.edu}\\
\Name{Alison Pouch\nametag{$^{3}$}} \Email{pouch@pennmedicine.upenn.edu}\\
\Name{Nadav Schwartz\nametag{$^{3}$}} \Email{nadav.schwartz@pennmedicine.upenn.edu}\\
\Name{Brett C Byram\nametag{$^{1}$}} \Email{brett.c.byram@vanderbilt.edu}\\
\Name{Ipek Oguz\nametag{$^{1}$}} \Email{ipek.oguz@vanderbilt.edu}\\
\addr $^{1}$ Vanderbilt University \\
\addr $^{2}$ Mayo Clinic \\
\addr $^{3}$ University of Pennsylvania \\
% \addr $^{3}$ Children's Hospital of Philadelphia \\
}

% \midlauthor{\Name{Xing Yao\nametag{$^{1}$}} \Email{xing.yao@vanderbilt.edu}\\
% \Name{Runxuan Yu\nametag{$^{1}$}} \Email{runxuan.yu@vanderbilt.edu}\\
% \Name{Nick DiSanto\nametag{$^{1}$}} \Email{nicolas.c.disanto@vanderbilt.edu}\\
% \Name{Ehsan Khodapanah Aghdam\nametag{$^{1}$}} \Email{ehsan.khodapanah.aghdam@vanderbilt.edu}\\
% \Name{Kanyifeechukwu Oguine\nametag{$^{1}$}} \Email{kanyifeechukwu.j.oguine@vanderbilt.edu}\\
% % \Name{Jingyi Zhu\nametag{$^{1}$}} \Email{jingyi.zhu@vanderbilt.edu}\\
% \Name{Daiwei Lu\nametag{$^{1}$}} \Email{daiwei.lu@vanderbilt.edu}\\
% \Name{Ange Lou\nametag{$^{1}$}} \Email{ange.lou@vanderbilt.edu}\\
% \Name{Jiacheng Wang\nametag{$^{1}$}} \Email{jiacheng.wang.1@vanderbilt.edu}\\
% \addr $^{1}$ Vanderbilt University \AND
% \Name{Dewei Hu\nametag{$^{2}$}} \Email{hu.dewei@mayo.edu}\\
% \addr $^{2}$ Mayo Clinic \AND
% \Name{Gabriel Arenas\nametag{$^{3}$}} \Email{gabriel.arenas@pennmedicine.upenn.edu}\\
% \addr $^{3}$ University of Pennsylvania \AND
% \Name{Baris Oguz\nametag{$^{3}$}} \Email{barisumog@gmail.com}\\
% % \Name{Wensi Wu\nametag{$^{3}$}} \Email{wuw4@chop.edu}\\
% \Name{Alison Pouch\nametag{$^{3}$}} \Email{pouch@pennmedicine.upenn.edu}\\
% \Name{Nadav Schwartz\nametag{$^{3}$}} \Email{nadav.schwartz@pennmedicine.upenn.edu}\\
% \Name{Brett C Byram\nametag{$^{1}$}} \Email{brett.c.byram@vanderbilt.edu}\\
% \Name{Ipek Oguz\nametag{$^{1}$}} \Email{ipek.oguz@vanderbilt.edu}\\
% % \addr $^{3}$ Children's Hospital of Philadelphia \\
% }

% % More complicate cases, e.g. with dual affiliations and joint authorship
% \midlauthor{\Name{Xing Yao\nametag{$^{1}$}}, %\Email{xing.yao@vanderbilt.edu}\\
% \Name{Runxuan Yu\nametag{$^{1}$}}, \Name{Nick DiSanto\nametag{$^{1}$}}, \Name{Ehsan K.\ Aghdam\nametag{$^{1}$}}, \Name{Kanyifeechukwu Oguine\nametag{$^{1}$}}, \Name{Daiwei Lu\nametag{$^{1}$}}, \Name{Ange Lou\nametag{$^{1}$}}, \Name{Jiacheng Wang\nametag{$^{1}$}}, \Name{Dewei Hu\nametag{$^{2}$}}, \Name{Gabriel Arenas\nametag{$^{3}$}}, \Name{Baris Oguz\nametag{$^{3}$}}, \Name{Alison Pouch\nametag{$^{3}$}}, \Name{Nadav Schwartz\nametag{$^{3}$}}, \Name{Brett C Byram\nametag{$^{1}$}}, \Name{Ipek Oguz\nametag{$^{1}$}}\\
% \addr $^{1}$Vanderbilt University, 
% \addr $^{2}$Mayo Clinic, 
% \addr $^{3}$University of Pennsylvania
% % \addr $^{3}$ Children's Hospital of Philadelphia \\
% }

\begin{document}

\maketitle

\begin{abstract}
% In this study, we address the challenging task of registering two partially overlapping 3D ultrasound (3DUS) volumes for stitching. Mainstream registration algorithms struggle with this registration task because the sector-shaped field of view (FOV) dominates the registration, and it is not always helpful to constrain the registration to the FOV since the images are only partially overlapping. To address these challenges, we propose
% outpainting the sector-shaped FOV of 3DUS into a rectangular FOV. We introduce LOTUS, a Latent Outpainting Diffusion Model (LDM) specifically designed for FOV expansion in 3DUS. LOTUS encodes 3DUS data into latent features and performs outpainting directly within the latent space at  test time, effectively transforming the sector-shaped FOV into a standard rectangular shape. Experimental results demonstrate that the outpainted 3DUS images benefit the subsequent registration tasks by mitigating the local minima issues inherent to the original sector-shaped FOV.

% 3D ultrasound (3DUS) stitching can enlarge the field-of-view (FOV) by registering partially overlapping 3DUS images collected from different probe positions. However, mainstream registration algorithms struggle with this registration task and sector-shaped FOV affection is regarded as a potential reason, which introduces a strong local minimum at the initial registration stage and hinders further optimization.
% To address these challenges, we propose
% outpainting the sector-shaped FOV of 3DUS into a square-shaped FOV. We introduce LOTUS, a Latent Outpainting Diffusion Model (LDM) specifically designed for FOV expansion in 3DUS. LOTUS encodes 3DUS data into latent features and performs outpainting directly within the latent space at test time, effectively transforming the sector-shaped FOV into a standard rectangular shape. Experimental results demonstrate that the outpainted 3DUS images significantly improve the registration performance by mitigating the local minima issues inherent to the original sector-shaped FOV.



3D ultrasound (3DUS) stitching can enlarge the field-of-view (FOV) by registering partially overlapping 3DUS images collected from different probe positions. However, standard registration algorithms frequently encounter difficulties with this task, primarily due to the sector-shaped FOV, which often leads to pronounced local minima, thereby obstructing optimization efforts.
To address these limitations, we propose LOTUS, a novel Latent Diffusion Model (LDM) specifically designed for 3DUS FOV outpainting. LOTUS innovatively encodes the 3DUS data into a compact latent space and performs outpainting at test time, effectively extending the sector-shaped FOV into a standard rectangular shape. This transformation facilitates a more robust registration by mitigating the issues of local minima associated with the original FOV shape. Experimental results show that LOTUS significantly improves the accuracy of the registration as well as the efficiency of the outpainting process compared to existing models. The code is available at \href{https://github.com/MedICL-VU/LOTUS}{github.com/MedICL-VU/LOTUS}.


\end{abstract}

\begin{keywords}
Latent Diffusion Model, Ultrasound, Outpainting, Registration
\end{keywords}

\section{Introduction}

% Ultrasound imaging (US) is widely used in diverse diagnostic applications \cite{bano2024image}. 
Ultrasound (US) image registration \cite{che2017ultrasound,entrekin2001real,wang2014towards} is a pivotal task for downstream US analysis. A particularly important application is image stitching \cite{banerjee2015fast, gomez2019image,wright2023fast,bano2024image}, which can compound multiple US images collected from different probe positions by aligning the overlapping image contents to extend the US field of view (FOV). This is important for complete visualization of larger anatomical structures, such as the fetus and the placenta during the second/third trimesters of pregnancy \cite{roy2017comparison,gomez2017fast}. However, the sector-shaped FOV inherent to US imaging poses considerable challenges for effective image stitching \cite{yao2024synstitch}. It introduces a strong trivial local minimum of the similarity metric at the initial stage, rendering registration optimization difficult. Limiting the metric computation to just the overlapping region can mitigate this problem, but it introduces a further complication: the similarity metric within the overlapping region can be trivially optimized by artificially reducing the overlap. In this study, we explore the potential of outpainting the 3D US (3DUS) volume to obtain a rectangular FOV for alleviating these optimization problems during registration. %Our overall goal is robust 3DUS image stitching.

Image outpainting has become a prominent topic in computer vision in recent years, with diffusion model (DM)-based methods achieving remarkable performance in natural image outpainting \cite{avrahami2023blended,lugmayr2022repaint,corneanu2024latentpaint,ju2024brushnet,xie2023smartbrush,zhuang2025task}. However, all these methods face challenges with artifacts between the original and synthetic regions, and they are primarily restricted to 2D domains, without the ability to maintain consistency across slices in a 3D volume. In the context of US outpainting, echoGAN \cite{gazda2024generative} introduced a conditional GAN to outpaint 2D cardiac US images from a smaller-angle FOV to a larger-angle FOV while maintaining the sector shape. SynStitch \cite{yao2024synstitch} proposed a ControlNet-based framework for 2D kidney US image outpainting. Nevertheless, both echoGAN and SynStitch focus on 2DUS and are unable to outpaint to a rectangular FOV.
Outpainting for US images, and particularly for 3DUS, thus remains underexplored.

% Outpainting a sector-shaped FOV in 3D ultrasound (3DUS) to a square-shaped FOV poses unique challenges, unlike similar tasks in other medical imaging modalities with rectangular FOVs \cite{liman2024diffusion,li2024multi}. These challenges arise due to the lack of comprehensive ground truth for training in a supervised manner. The reliance on sector-shaped FOV images for training constrains the model’s outpainting capabilities by embedding the sector shape as prior knowledge and complicates the task due to significant missing regions.

Unlike 3D outpainting tasks in other medical imaging modalities with rectangular FOVs \cite{liman2024diffusion,li2024multi}, outpainting a sector-shaped 3DUS FOV to a rectangular FOV presents unique challenges. These challenges stem from the absence of whole-FOV ground truth to train the outpainting network in a supervised manner. Training models on sector-shaped FOV images restricts outpainting capabilities, as the model tends to adopt the sector shape as prior knowledge. Furthermore, the sector-shaped FOV introduces substantial missing regions, further increasing the complexity of 3DUS outpainting.

 % the key difference between lotus and previous work: 1) don't need repaint, 2) then don't need to be added into multiple hidden layers of unet. 3) LMG enable it to eliminate the edge affect and achieve seamless outpainting

% To address above-mentioned challenges, in this work, we propose LOTUS: a Latent Diffusion Model (LDM) \cite{rombach2022high} specifically designed for 3DUS FOV outpainting. Inspired by the previous work\cite{lugmayr2022repaint,corneanu2024latentpaint}, we overcome the following challenges to extend the former methods from 2D natural image to 3DUS outpainting: 1) to eliminate the heavy computational burden and speed up the inference in 3D domain, LOTUS encodes 3D ultrasound data into latent features and performs outpainting directly within the latent space during the inference time. 2) We observe that removing edge-related latent space features is critical for achieving a seamless integration between the original area and the outpainted area in the image space. In this work, we propose a Latent Mask Generator (LMG) that preserves the majority of content-related features while effectively excluding edge-related features from the original FOV. 3) We proposed an effective augmentation strategy to extract square-shaped FOV patch as groundtruth to train the outpainting LDM, introduce square-shape FOV as a prior knowledge to the model. The key contributions of this work are summarized as follows: 

%To address above-mentioned challenges, in this work, we propose LOTUS: a Latent Diffusion Model (LDM) \cite{rombach2022high} specifically designed for 3DUS FOV outpainting. Inspired by the previous work \cite{lugmayr2022repaint,corneanu2024latentpaint}, we overcome the following challenges to extend the former methods from 2D natural image to 3DUS outpainting:\xxx{ill come back to this}1) to eliminate the heavy computational burden and speed up the inference, LOTUS executes 3D outpainting in the latent space instead of the image space during the inference time. 2) To address inconsistencies between the outpainted regions and the original image, we propose a Latent Mask Generator (LMG). The LMG preserves the majority of content-related features while effectively excluding edge-related features from the original FOV, resulting in more natural and seamless outpainting. 3) We proposed an effective augmentation strategy to extract rectangular FOV patches as ground truth to train the outpainting LDM, introducing rectangular FOVs as a prior knowledge to the model. The key contributions of this work are summarized as follows: 

 % For evaluation, we employ two state-of-the-art (SOTA) conventional registration methods, Greedy \cite{yushkevich2016ic} and ANTs \cite{avants2008symmetric}, as baselines. Both methods optimize registration using the mean square error (MSE) metric \cite{sokooti2019quantitative} and apply rigid transformations.

%\begin{itemize}
    % \item[$\bullet$] We proposedc LOTUS, a 3D latent outpainting that can expand the 3DUS FOV in the latent space at the test time. It can expand the FOV with realistic and detailed structure, while reduce computational burden and inference costing, making it highly effective for 3D medical image outpainting tasks.

%    \item[$\bullet$] We, for the first time, investigate the potential of improving 3DUS image registration performance using FOV-outpainted 3DUS images. Our results demonstrate that outpainted images with a rectangular FOV significantly enhance intensity-based registration performance for both ANTs and Greedy.
    
%    \item[$\bullet$] To the best of our knowledge, LOTUS is the first method to address the challenging task of outpainting the sector-shaped FOV of 3DUS into a square shape. LOTUS can achieve realistic outpainting results while dramatically improve the inference speed with lower computational burden compared to its image-space counterparts.
    
%    \item[$\bullet$] We propose a Latent Mask Generator (LMG) that preserves the majority of content-related features while effectively excluding FOV-edge-related features to achieve harmonization between the original and synthetic regions in the image-space.

    % \item[$\bullet$] Our results demonstrate that outpainted images with a square-shaped FOV significantly enhance intensity-based registration performance for both ANTs and Greedy.
    % \item[$\bullet$] We compare LOTUS with existing methods, including Diffusion Models (DM), ControlNet, Latent-ControlNet, and a modified 3D version of Repaint. In the downstream 3DUS registration task, Our results demonstrate that outpainted images with a rectangular FOV significantly enhance intensity-based registration performance for both ANTs and Greedy.
    % latent edge-excluding strategy
%\end{itemize}


To address these challenges, we propose LOTUS: a Latent Diffusion Model (LDM) \cite{rombach2022high} specifically designed for 3DUS FOV outpainting, inspired by previous work on 2D natural images \cite{lugmayr2022repaint,corneanu2024latentpaint}. %Our key contributions can be summarized as follows: 

\begin{itemize}
    
   \item To the best of our knowledge, LOTUS is the first method to address the challenging task of outpainting the sector-shaped FOV of 3DUS into a rectangular shape. LOTUS executes 3D outpainting in the latent space instead of the image space at inference time. It can achieve realistic results while dramatically improving the inference speed with lower computational burden compared to its image-space counterparts.
   
    \item We propose an effective strategy to extract rectangular FOV patches to train the outpainting LDM, introducing rectangular FOVs as prior knowledge to the model.
    
    \item To address inconsistencies between the outpainted regions and the original image, we propose a Latent Mask Generator (LMG) that preserves the majority of content-related features while effectively excluding FOV-edge-related features to achieve  seamless outpainting between the original and synthetic regions.

    \item We propose improving 3DUS image registration performance using FOV-outpainted 3DUS images, and we show this significantly enhances registration performance.

    % \item[$\bullet$] Our results demonstrate that outpainted images with a square-shaped FOV significantly enhance intensity-based registration performance for both ANTs and Greedy.
    % \item[$\bullet$] We compare LOTUS with existing methods, including Diffusion Models (DM), ControlNet, Latent-ControlNet, and a modified 3D version of Repaint. In the downstream 3DUS registration task, Our results demonstrate that outpainted images with a rectangular FOV significantly enhance intensity-based registration performance for both ANTs and Greedy.
    % latent edge-excluding strategy
\end{itemize}


\section{Methods}
% \label{sec:lotus_overview}

LOTUS (Fig.~\ref{lotus_pipline}(a)) performs outpainting during inference using a pretrained latent diffusion model (LDM), which combines an AutoEncoder (AE) with a Diffusion U-Net. %The following sections provide a detailed overview of each component within the LOTUS framework. \xxx{maybe add section refs later}

\begin{figure}
% \vspace{-2mm}
\centering
\includegraphics[width=\linewidth]{imgs/all_pipeline.pdf}
% \vspace{-3mm}
\caption{(a) Test-time latent space outpainting with LOTUS. (b) The training of AutoEncoderKL. $\mathcal{E}$, $\mathcal{D}$, and $D_{\psi}$ are encoder, decoder, and patch-based discriminator, respectively. (c) The training and inference of LDM. } 
\label{lotus_pipline}
% \vspace{-7mm}
\end{figure}

\subsection{Training of AutoEncoderKL}
\label{sec:autoenckl}

As the first step of LOTUS, we train an AutoEncoderKL \(\{\mathcal{E}, \mathcal{D}\}\), where the encoder \(\mathcal{E}\) compresses the 3DUS image \(I\) of size $N^3$ into a latent representation \(L\) of size $(\frac{N}{S})^3$, with a scaling factor \(S = 4\). The decoder \(\mathcal{D}\) reconstructs \(L\) back to the image space, producing \(\bar{I}\). As depicted in Fig.~\ref{lotus_pipline}(b), the spatial correspondence between \(I\) and \(L\) enables effective pixel-space outpainting through operations performed in the latent space. To enhance the AE's generalization, we apply random affine transformations and cropping to the input 3DUS images, generating variations in size and shape. Training follows the implementation of \cite{rombach2022high} and employs an adversarial framework, where a patch-based discriminator \(D_{\psi}\) learns to distinguish between the original \(I\) and the reconstructed \(\bar{I}\).



\subsection{Training and Inference of the Latent Diffusion Model}

% Once the pretrained AutoEncoder is obtained, capable of encoding 3D medical images into latent features, we proceed to train the LDM for unconditionally generating realistic 3D ultrasound (3DUS) images with a rectangular-shaped FOV. However, 3DUS images acquired using sector array probes inherently have sector-shaped FOVs. Directly training models on these complete sector-shaped FOV images limits outpainting capabilities, as the model tends to learn the sector shape as prior knowledge.

After obtaining the pretrained AutoEncoder, we train the LDM to unconditionally generate realistic 3DUS images with a rectangular FOV. However, 3DUS images acquired with sector array probes inherently exhibit sector-shaped FOVs. Training directly on these  images restricts outpainting capabilities, as the model learns the sector shape as prior knowledge.


% To address this issue, we apply augmentation and central patch extraction, as illustrated in Fig.\ref{lotus_pipline}(c). First, a random affine transformation \(\mathcal{A}\) is applied to the input 3D ultrasound (3DUS) image \(I\), resulting in a transformed image \(I' = I(\Phi(\mathcal{A}))\). Next, the central region of interest (ROI) \(P\), with dimensions \(N \times N \times N\), is extracted and used as input for the LDM training. During the LDM training phase, the pretrained encoder \(\mathcal{E}\) encodes the extracted rectangular FOV image \(P\) into its latent representation \(L\). Then in the forward diffusion process, a noisy latent feature \(L_t\) is generated by iteratively adding Gaussian noise to \(L\). This noisy latent feature at any given time step \(t\) is computed using the closed-form expression:
% $L_t = \sqrt{\bar{\alpha}_t} L + \sqrt{1 - \bar{\alpha}_t} \epsilon$,
% where \(\bar{\alpha}_t = \prod_{s=1}^t \alpha_s\) represents the cumulative product of noise scheduling coefficients \(\alpha_t\), and \(\epsilon \sim \mathcal{N}(0, L)\) is Gaussian noise. In the subsequent denoising stage, the model \(\epsilon_\theta\), processes the noisy latent feature \(L_t\) to predict and subtract the noise \(\epsilon\) added to \(L\) at each time step \(t\). The model optimizes this process by minimizing the loss function, defined as:


To address this issue, we employ augmentation and central patch extraction, as illustrated in Fig.~\ref{lotus_pipline}(c). The input images are upsampled by half, such that they contain $(2N)^3$ voxels. A random affine transformation \(\mathcal{A}\) is applied to the input  image \(I\), producing  \(I' = I(\Phi(\mathcal{A}))\). Next, a central patch \(P\) of size $N^3$ is extracted and used for LDM training. During training, the pretrained encoder \(\mathcal{E}\) encodes the extracted rectangular FOV patch \(P\) into its latent representation \(L\). In the forward diffusion process, a noisy latent feature \(L_t\) is generated by iteratively adding Gaussian noise to \(L\), computed using the closed-form expression:
$L_t = \sqrt{\bar{\alpha}_t} L + \sqrt{1 - \bar{\alpha}_t} \epsilon$,
where \(\bar{\alpha}_t = \prod_{s=1}^t \alpha_s\) is the cumulative product of noise scheduling coefficients \(\alpha_t\), and \(\epsilon \sim \mathcal{N}(0, \mathbf{I})\) is standard Gaussian noise (\(\mathbf{I}\) denotes the identity matrix). In the denoising stage, the model \(\epsilon_\theta\) learns to predict and remove the noise \(\epsilon\) from \(L_t\) at each step \(t\) by minimizing the loss:
\begin{equation}
\mathcal{L}_{\mathrm{LDM}} = \mathbb{E}_{L, \epsilon \sim \mathcal{N}(0, \mathbf{I}), t} \left[ \left\| \epsilon - \epsilon_\theta(L_t, t) \right\|^2 \right].
\label{eq:ldm_loss}
\end{equation}

% In Fig.\ref{lotus_pipline}(c), the gray dashed box illustrates the iterative denoising process applied to the noisy latent representation. At each time step $t$, the denoising process updates the latent representation as \(L_{t-1} = L_{t} - \epsilon_{\theta}(L_t, t)\),
% where the noise \(\epsilon_{\theta}(L_t, t)\) is predicted by the frozen Diffusion U-Net. This predicted noise is then subtracted from the noisy latent representation at each time step. Through this iterative process, the model progressively refines the latent representation until the clean latent representation, \(\bar{L}\), is obtained. Finally, \(\bar{L}\) is fed into the decoder \(\mathcal{D}\), which reconstructs the 3DUS image \(\bar{P}\) in the pixel space.

In Fig.~\ref{lotus_pipline}(c), the gray dashed box illustrates the iterative denoising process applied to the noisy latent representation. At each time step \(t\), the latent representation is updated as \(L_{t-1} = L_{t} - \epsilon_{\theta}(L_t, t)\),
where \(\epsilon_{\theta}(L_t, t)\) is the noise predicted by the frozen Diffusion U-Net. This noise is iteratively removed, progressively refining the latent representation until the clean latent state \(\bar{L}\) is obtained. Finally, \(\bar{L}\) is passed through the decoder \(\mathcal{D}\) to reconstruct the 3DUS image \(\bar{P}\) in pixel space.



\subsection{Test-Time Latent Space Outpainting with LOTUS}
\label{sec:lotus_outpaint}
As illustrated in Fig.~\ref{lotus_pipline}(a), once the pretrained LDM is obtained, LOTUS performs outpainting in the latent space without additional training. Given an input 3DUS image \(I\), its foreground mask \(M\) is extracted using a thresholding function. The image \(I\) is then encoded by the pretrained encoder \(\mathcal{E}\) into its latent representation \(L\), while the latent mask \(M_L\) is derived from the binary mask \(M\) using LMG.

\noindent \textbf{Latent Mask Generator (LMG):} The LMG generates a latent mask \(M_L\) that preserves content-related features while excluding those associated with the sector-shaped FOV edges. This process consists of two steps: (1) The binary mask \(M\) of size $N^3$ is downscaled to $(\frac{N}{S} - d)^3$, where \(d\) is a small integer satisfying \(d < \frac{N}{S}\). We use \(d = 2\) in this work. This slight shrinking ensures that \(M_L\) effectively filters out edge-related features while retaining essential content. (2) The downscaled mask is zero-padded to $(\frac{N}{S})^3$. 

\noindent \textbf{Latent Outpainting Iteration (LOI):} The LOI begins by extracting the latent condition \(L' = L \times M_L\), where the latent mask \(M_L\) ensures that \(L'\) retains only content-relevant regions while excluding sector-shaped edges. The objective of latent outpainting is to preserve the masked content in \(L'\) while generating new latent features outside the mask. We refer to the foreground of \(M_L\) as the ``Condition Region of Interest (CROI)'' and the background as the ``Outpainting Region of Interest (OROI)'' in the following discussion.


In the first iteration at time step \(T\), \(T-1\) steps of Gaussian noise are added to the latent condition \(L'\), yielding a noisy latent condition \(L'_{T-1}\). Simultaneously, a random Gaussian noise sample \(g_T \sim \mathcal{N}(0, \mathbf{I})\), where \(\mathbf{I}\) denotes the identity matrix, is processed by the pretrained Diffusion U-Net to produce a one-step denoised synthetic latent feature $g_{T-1} = g_{T} - \epsilon_{\theta}(g_{T}, T)$.
Next, the CROI of the noisy latent condition \(L'_{T-1}\) is merged with the OROI of the denoised synthetic latent feature \(g_{T-1}\), forming the one-step denoised latent feature \(\bar{L}_{T-1}\), computed as:
$\bar{L}_{T-1} = M_L \times L'_{T-1} + (1 - M_L) \times g_{T-1}$ (Fig.~\ref{lotus_pipline}(a)).


% In the subsequent iteration steps, the random Gaussian noise \(g_{T}\) is updated using the one-step denoised latent feature \(\bar{L}_{T-1}\) from the previous step. By integrating the information from the CROI of \(L'\) and the OROI of \(g_{T-1}\), \(\bar{L}_{T-1}\) introduces the guidance of the latent condition into the reverse diffusion process. This ensures that the latent condition \(L'\) is preserved during generation while guiding the outpainting to follow a distribution similar to the latent condition.

In subsequent iterations, \(g_T\) is updated using the one-step denoised latent feature \(\bar{L}_{T-1}\) from the previous step. By integrating information from the CROI of \(L'\) and the OROI of \(g_{T-1}\), \(\bar{L}_{T-1}\) guides the reverse diffusion process while preserving the latent condition \(L'\). This ensures that \(L'\) remains unchanged  while guiding  the outpainting within the OROI to follow a realistic distribution consistent with the latent condition. 


% After \(T\) iterations of latent reverse diffusion, the final outpainted latent feature \(L_{op}\) is generated. In this feature, the CROI maintains the latent condition \(L'\), while the OROI is outpainted with a realistic latent distribution. Finally, the outpainted image $ I_{op} = \mathcal{D}(L_{op})$ is reconstructed from the latent feature \(L_{op}\) using the pretrained decoder \(\mathcal{D}\).

After \(T\) iterations of latent reverse diffusion, the final outpainted latent feature $L_{op}=\bar{L}_0$ is obtained. Finally, the pretrained decoder reconstructs the outpainted image \(I_{op} = \mathcal{D}(L_{op})\).


%\subsection{Registration using LOTUS}
% For a given registration pair consisting of a target image \(I_{F}\) and a source image \(I_{M}\), we first outpaint their FOVs using LOTUS to obtain \(I^{op}_{F}\) and \(I^{op}_{M}\). Next, we calculate the rigid transformation matrix \(\mathcal{A}_{op} = \Theta(I^{op}_{F}, I^{op}_{M})\), where \(\Theta\) represents the registration algorithm. We then apply \(\mathcal{A}_{op}\) to the source image \(I_{M}\) to obtain the registered results \(I^{op}_{reg} = I_{M} \Phi(\mathcal{A}_{op})\), where \(\Phi\) denotes the rigid coordinate transformation.

%Given a registration pair consisting of a fixed image \(I_F\) and a moving image \(I_M\), we first apply LOTUS to outpaint their FOVs, obtaining \(I^{op}_F\) and \(I^{op}_M\). We then rigidly register \(I^{op}_F\) and \(I^{op}_M\), and apply the resulting transform to the original moving image \(I_M\). %The rigid transformation matrix is then computed as \(\mathcal{A}^{op} = \Theta(I^{op}_F, I^{op}_M)\), where \(\Theta\) denotes the registration algorithm. Finally, we apply \(\mathcal{A}^{op}\) to the source image \(I_M\), yielding the registered result \(I^{reg}_{M} = I_M (\Phi(\mathcal{A}^{op}))\), where \(\Phi\) represents the rigid coordinate transformation.


% overview
\subsection{Datasets and Implementation Details}
% We use 2 in-house datasets: \textbf{GenUS} and \textbf{RegUS} for LOTUS training and registration testing, respectively. Both datasets are acquired using a GE Voluson E8 machine. 

We use two in-house placenta 3DUS datasets: \textbf{GenUS} for LOTUS outpainting training and testing, and \textbf{RegUS} for registration testing. All data were acquired with a GE Voluson E8.


% \noindent \textbf{GenUS Dataset:} This dataset consists of 99 3DUS images of the placenta from subjects in the first trimester. During this early pregnancy stage, the placenta typically fits within a single 3DUS volume. All the volumes are resampled to a space of $0.5mm \times 0.5mm \times 0.5mm$ and central cropped to $256 \times 256 \times 256$. and normalized to [0, 1]. The 
% GenUS dataset is split to 89:10 for the LOTUS training and testing. The testing dataset is used to evaluate the outpainting performance. During the AutoEncoderKL training, the GenUS dataset are further resampled to a space of  $4mm \times 4mm \times 4mm$  central cropped to $64 \times 64 \times 64$ and the corresponding foreground masks are extracted at the same time. Then an extensive augmentation include random translations ([-12, 12]), random rotations ([$-\frac{\pi]}{2}$, $\frac{\pi]}{2}$]), random scaling ([0.8, 1.2]), and random patch cropping with the foregroundmasks are applied to the resampled image with a probability of 0.5. During the diffusion U-Net training, the similar random affine augmentation are applied to the GenUS, and then the cental patch in dimension of $64 \times 64 \times 64$ are extracted with square-shaped FOV.

\noindent \textbf{GenUS Dataset:} This dataset comprises 99 3DUS placenta images from first-trimester subjects, where the placenta typically fits within a single 3DUS volume. All volumes are resampled to a spatial resolution of $(0.5\,\mathrm{mm})^3$, centrally cropped to $(256)^3$ voxels, and intensity-normalized to \([0,1]\). The dataset is split 89:10 for LOTUS outpainting training and testing. During AutoEncoderKL training, the images are further resampled to $(4\,\mathrm{mm})^3$ and centrally cropped to $(64)^3$ voxels with zero-padding. We extract the foreground mask $M$, defined as the set of non-zero voxels. Extensive augmentation is applied, including random translations (\([-12,12]\)), rotations (\([-\frac{\pi}{2}, \frac{\pi}{2}]\)), scaling (\([0.8,1.2]\)), and random patch cropping (patches that do not overlap with $M$ are not allowed), each with a probability of 0.5. For Diffusion U-Net training, similar random affine augmentations are applied, followed by the extraction of a $(64)^3$ central patch with a rectangular FOV.


% \noindent \textbf{RegUS Dataset:} The RegUS Dataset is used for testing the registration performance. IT consists of 3DUS placenta images from 10 subjects in the first trimester. Each subject contains 2 3DUS volumes. All the volumes are resampled to a space of $2mm \times 2mm \times 2mm$ and central cropped to $64 \times 64 \times 64$, and then normalized to [0, 1]. The dataset after preprocessing covers rotation angles between [$40^{\circ}$,$90^{\circ}$] \xxx{dont you mean 0 to 40?} and translation between [12, 33] (unit: pixel) to \xxx{achieve content alignment}. 
% For registration, we consider registering the 3DUS volumes for each subject back-and-forth, resulting in $2\times10=20$ registration pairs. The rigid-transformation groundtruth are manually obtained by 2 experts and further validated visually by 3 experts. 

\noindent \textbf{RegUS Dataset:} The RegUS dataset is used to evaluate registration performance and consists of two 3DUS placenta volumes from each of 20 first-trimester subjects. All volumes are resampled to a spatial resolution of $(2\,\mathrm{mm})^3$, centrally cropped to $(64)^3$ voxels, and intensity-normalized to \([0,1]\). The `ground-truth' rigid transformations are manually annotated by two experts and further validated visually by three additional experts. This ground truth is used as an independent standard for evaluation. Across the manual registrations, the maximum rotation angles range from $30^{\circ}$ to $117^{\circ}$ and the maximum translations range from 25 to 83\,mm.
For registration, each subject's 3DUS volumes are registered bidirectionally (both A to B and B to A), yielding \(2 \times 20 = 40\) registration pairs.

The data is empirically split into two categories, typical cases and hard cases. Hard cases consist of 20 pairs where there is, for example, a very large (e.g., $>80^\circ$) rotation along one axis, a very large translation (e.g., $>60$mm), or image quality issues (e.g., shadow artifacts). These hard cases are a challenge for all compared methods (baseline and LOTUS). We hypothesize that a rough initial registration would help overcome registration optimization issues in these cases. We thus introduce a fixed  initial rotation along the primary axis of rotation for each subject, which we call the Compensation Rotation (CR). We compare all methods for CR settings of 10$^\circ$, 20$^\circ$, or 30$^\circ$ to test our hypothesis. %\xxx{my former words for the CR algorithm: For challenging cases where all algorithms failed, we designed an angle correction algorithm that applies a compensation rotation (CR) as an initial transformation. For each registration pair, we first identify the primary rotation axis and direction, then apply a fixed-angle correction along that axis. In this study, we tested CR values of 10, 20, and 30 degrees}. 
%The rest of data are assigned as typical cases.

\noindent \textbf{Outpainting Evaluation:} We compare LOTUS with RePaint~\cite{lugmayr2022repaint} and LOTUS*. RePaint is a test-time outpainting diffusion model originally designed for 2D natural images, which we extend to 3D for fair comparison. LOTUS* is a variant of LOTUS where the LMG module is replaced with a simple downsampling process~\cite{corneanu2024latentpaint}. Outpainting performance is assessed using image similarity metrics, namely, normalized cross-correlation (NCC), structural similarity index measure (SSIM), and mean squared error (MSE). We also compare inference time and model parameter size. 

\noindent \textbf{Registration Evaluation:} We register the pairs of images from the RegUS dataset using two of the most widely used conventional registration methods, Greedy~\cite{yushkevich2016ic} and ANTs~\cite{avants2008symmetric}. We compare the performance of these algorithms using either the original sector-shaped FOV images or the LOTUS outpainting results. We use the MSE similarity metric and rigid transformations for both methods. For LOTUS, given a fixed image \(I_F\) and a moving image \(I_M\), we first apply LOTUS to outpaint their FOVs, obtaining \(I^{op}_F\) and \(I^{op}_M\). We then rigidly register \(I^{op}_F\) and \(I^{op}_M\), and apply the resulting transform to the original moving image \(I_M\).
We report the same image similarity metrics, along with peak signal-to-noise ratio (PSNR), mean rotation error RE (L1 norm, degrees), and mean translation error TE (L2 norm, mm).


% \noindent \textbf{Implementation Details:} We implemented Repaint (3D), which includes a 3D diffusion model (DM), and the LOTUS framework, comprising a 3D AutoEncoderKL and 3D LDM, using MONAI \cite{pinaya2307generative}. The DM employs a 3-level U-Net with encoder channels of 256, 256, and 512, while both AutoEncoderKL and Diffusion U-Net use a 3-level U-Net with channels of 32, 64, and 64. Training spans 2000, 1500, and 200 epochs for the DM, AutoEncoderKL, and LDM, respectively, with a batch size of 1. All experiments run on an NVIDIA A6000 GPU. Inference uses DDPM \cite{ho2020denoising} with 1000 sampling steps.


\noindent \textbf{Implementation Details:} We implemented RePaint (3D) and  LOTUS  using MONAI~\cite{pinaya2307generative, Cardoso_MONAI_An_open-source_2022}. The 3D diffusion model (DM) in RePaint employs a 3-level U-Net with 256, 256, and 512 encoder channels. Both AutoEncoderKL and Diffusion U-Net in LOTUS use a 3-level U-Net with channels of 32, 64, and 64. Training is conducted for 2000, 1500, and 200 epochs for the DM, AutoEncoderKL, and LDM, respectively, with a batch size of 1. Inference is conducted using DDPM~\cite{ho2020denoising} with 1000 sampling steps. All experiments are performed on an NVIDIA A6000 GPU. 



\section{Results and Discussion}



\subsection{Outpainting Results} 

\input{fig2}


% Fig.~\ref{fov_op} (a) presents a qualitative comparison of FOV outpainting performance among LOTUS, LOTUS$^{*}$, and RePaint. For LOTUS and LOTUS*, the outpainted results in both the latent space and image space are shown, while for RePaint, only image space results are available. Across all four samples, LOTUS effectively removes the edge-related latent feature of the sector-shaped FOV and consistently produces high-quality outpainting results. The outpainted images preserve the detailed structure and contrast of the original image while realistically filling the blank regions with ultrasound-like textures. Notably, the results from the third and fourth samples further highlight LOTUS's ability to handle diverse shapes and sizes, generating realistic and detailed structures in the outpainted regions. LOTUS* is unable to effciently remove the FOV edge-related features due to the lack of LMG module, and result in failure of outpainting. The fourth row presents the results from RePaint, due to the challenges of 3D outpainting in the pixel space, RePaint fails in almost all samples.

Fig.~\ref{fov_op} presents a qualitative comparison of FOV outpainting performance. For LOTUS and LOTUS*, results are shown in both latent and image spaces, whereas for RePaint, only image space results are available. Across all samples, LOTUS consistently produces high-quality outpainting results with a rectangular FOV. The generated images preserve the original structure and contrast while realistically filling blank regions with ultrasound-like textures. The results further demonstrate LOTUS’s ability to handle diverse FOV orientations and sizes. In contrast, LOTUS*, lacking the LMG module, struggles to remove FOV edge-related features, leading to outpainting failures. RePaint has sub-optimal performance due to the challenges of 3D outpainting directly in pixel space, especially when the window size is small.


% Fig.~\ref{fov_op} (b) further evaluate the outpainting capability of LOTUS under varying scales of outpainting area. In this experiment, we utilize multiple cubic-shaped outpainting windows in different sizes to extract outpainting input from the original image. The original image has a size of \(64 \times 64 \times 64\), and ``OW'' denotes the side length of the outpainting window. The boundary of each outpainting condition is highlighted in purple on both the outpainting input (green tag) and the LOTUS results (blue tag). From top to bottom, the OW decreases from 56 to 32 in steps of 8, meaning the outpainting task becomes increasingly challenging. We observe that even as the OW decreases dramatically, LOTUS continues to generate outpainting results with realistic details and structures. However, the outpainted image when OWL = 32 appears slightly brighter than the outpainted image when OWL = 56. This is because the contrast of the outpainted area is guided by the contrast of the outpainting input. Table \ref{tab:quant_results} shows the quantitative comparison between RePaint and LOTUS in multi-scale outpainting task in the \textbf{GenUS} validation dataset. The result is consistant with the qualitative results in Fig. ~\ref{fov_op}. With OW increasing, both RePaint and LOTUS get better performance. However, LOTUS consistently and significantly outperforms RePaint across all metrics and scales, with a dramatic improvement in inference time and memory efficiency.

%Fig.~\ref{fov_op}(b) further evaluates LOTUS's outpainting capability across varying outpainting scales. In this experiment, multiple cubic-shaped outpainting windows of different sizes are used to extract the outpainting input from the original \(64 \times 64 \times 64\) image, where ``OW'' represents the side length of the outpainting window. The boundary of each outpainting condition is highlighted in purple on both the outpainting input (green tag) and the LOTUS results (blue tag).  From top to bottom, OW decreases from 56 to 32 in steps of 8, making the outpainting task progressively more challenging. Despite this, LOTUS consistently generates realistic structures and details. However, when OW = 32, the outpainted image appears slightly brighter than at OW = 56, as the contrast of the outpainted region is influenced by the contrast of the input. 

Table~\ref{tab:quant_results} presents the quantitative results on the \textbf{GenUS} test set. Performance improves for both methods as the outpainting window size (OW) increases. We observe that LOTUS significantly outperforms RePaint across all metrics and scales, with a substantial advantage in inference time and memory efficiency. Appendix Fig.~\ref{fig:supp_multi_scale_op} illustrates LOTUS performance qualitatively with respect to the outpainting window size. LOTUS consistently generates realistic structures and details even when conditioned on very limited patch sizes.


% It is important to note that there is a trade-off between computational efficiency and generated image quality. Smaller size of encoded latent features can improve the computational efficiency but also leads to more information loss. Therefore, it is crucial to suitable selection depending on the specific application scenario.

%It is important to note the trade-off between computational efficiency and generated image quality. Reducing the size of encoded latent features enhances computational efficiency but increases information loss. Therefore, selecting an appropriate size is crucial and should be tailored to the specific application scenario.



% \begin{table*}[!ht]
%     \centering
%     \caption{Quantitative comparison of outpainting methods in multi-scale outpainting task. \xxx{this table is SO hard to understand/read. move the two first metrics to the bottom of the table, so you can have vertical lines between the three OW settings starting line 2 and through NCC, SSIM and MSE. then time and params without the vertical lines.} \xxx{also: what do you want them to take away from this table? put it in the caption} \xxx{do ttest so you can put * or something for significant}}
%     \begin{adjustbox}{width=\textwidth}
%     \setlength{\tabcolsep}{1.5pt}
%     \renewcommand{\arraystretch}{1}
%     \footnotesize % 使用小字体
%     \begin{tabular}{|l|ccc|ccc|}
%     \hline
%         \multirow{2}{*}{Metric} & \multicolumn{3}{c|}{RePaint} & \multicolumn{3}{c|}{LOTUS} \\ 
%         \cline{2-7}
%         & OW=32 & OW=40 & OW=48 & OW=32 & OW=40 & OW=48 \\ \hline
%         % \cline{1-7}
%         NCC ↑ & 0.108$\pm$0.0346 & 0.197$\pm$0.031 & 0.329$\pm$0.039 & \textbf{0.713$\pm$0.095} & \textbf{0.810$\pm$0.108} & \textbf{0.920$\pm$0.037} \\ \hline
%         SSIM ↑ & 0.345$\pm$0.114 & 0.474$\pm$0.118 & 0.706$\pm$0.141 & \textbf{0.472$\pm$0.052} & \textbf{0.613$\pm$0.048} & \textbf{0.816$\pm$0.038} \\ \hline
%         MSE ↓ & 0.078$\pm$0.027 & 0.053$\pm$0.022 & 0.028$\pm$0.014 & \textbf{0.025$\pm$0.006} & \textbf{0.020$\pm$0.005} & \textbf{0.012$\pm$0.003} \\ \hline
%         \multicolumn{1}{|c|}{Parameter Size}& \multicolumn{3}{c|}{759.4MB(3D DM)} & \multicolumn{3}{c|}{9.2MB(AutoEncoderKL)+12.0MB(LDM)} \\ 
%         \cline{1-7}
%         \multicolumn{1}{|c|}{Inference Time}& \multicolumn{3}{c|}{40 minutes/batch (1000 steps, bs = 4)} & \multicolumn{3}{c|}{59 seconds/batch (1000 steps, bs = 4)} \\ 
%         \cline{1-7}
%     \end{tabular}
%     \end{adjustbox}
%     \label{tab:quant_results}
% \end{table*}

% \begin{table}[h]
%     \centering
%     \caption{mmm}
%     \begin{tabular}{c|c c|c}
%     \hline
%     m1 & m1 & m1 & m1 \\
%     \hline
%     m2 & m2 & m2 & m2 \\
%     m3 & \multicolumn{1}{c|}{m3} & m3 & m3 \\
%     \hline
%     \end{tabular}
% \end{table}

% \begin{table*}[!ht]
%     \centering
%     \caption{Quantitative comparison of outpainting methods in multi-scale outpainting task. \textbf{Bold}: best, *significant improvements (paires t-test, p\textless 0.05). The inference time is measured when the batch size is 4 and inference step is 1000. LOTUS significantly outperforms RePaint in all conditions with higher inference and computational efficiency.\xxx{this table is SO hard to understand/read. move the two first metrics to the bottom of the table, so you can have vertical lines between the three OW settings starting line 2 and through NCC, SSIM and MSE. then time and params without the vertical lines.} \xxx{also: what do you want them to take away from this table? put it in the caption} \xxx{do ttest so you can put * or something for significant}}
%     % \begin{adjustbox}{width=\textwidth}
%     \begin{adjustbox}{width=\textwidth}
%     \setlength{\tabcolsep}{1.2pt}
%     \renewcommand{\arraystretch}{1}
%     \footnotesize % 使用小字体
%     \begin{tabular}{|l|ccc|ccc|}
%     \hline
%         \multirow{2}{*}{Metric} & \multicolumn{3}{c|}{RePaint} & \multicolumn{3}{c|}{LOTUS} \\ 
%         \cline{2-7}
%         & \multicolumn{1}{c|}{OW=32} & \multicolumn{1}{c|}{OW=40} & \multicolumn{1}{c|}{OW=48} & \multicolumn{1}{c|}{OW=32} & \multicolumn{1}{c|}{OW=40} & \multicolumn{1}{c|}{OW=48} \\ \hline
%         % \cline{1-7}
%         NCC ↑ & \multicolumn{1}{c|}{0.108$\pm$0.035} & \multicolumn{1}{c|}{0.197$\pm$0.031} & \multicolumn{1}{c|}{0.329$\pm$0.039} & \multicolumn{1}{c|}{\textbf{0.713$\pm$0.095}} & \multicolumn{1}{c|}{\textbf{0.810$\pm$0.108}} & \multicolumn{1}{c|}{\textbf{0.920$\pm$0.037}} \\ \hline
%         SSIM ↑ & \multicolumn{1}{c|}{0.345$\pm$0.114} & \multicolumn{1}{c|}{0.474$\pm$0.118} & \multicolumn{1}{c|}{0.706$\pm$0.141} & \multicolumn{1}{c|}{\textbf{0.472$\pm$0.052}} & \multicolumn{1}{c|}{\textbf{0.613$\pm$0.048}} & \multicolumn{1}{c|}{\textbf{0.816$\pm$0.038}} \\ \hline
%         MSEx10 ↓ & \multicolumn{1}{c|}{0.078$\pm$0.027} & \multicolumn{1}{c|}{0.053$\pm$0.022} & \multicolumn{1}{c|}{0.028$\pm$0.014} & \multicolumn{1}{c|}{\textbf{0.025$\pm$0.006}} & \multicolumn{1}{c|}{\textbf{0.020$\pm$0.005}} & \multicolumn{1}{c|}{\textbf{0.012$\pm$0.003}} \\ \hline
%         \multicolumn{1}{|c|}{Parameter Size}& \multicolumn{3}{c|}{759.4MB(3D DM)} & \multicolumn{3}{c|}{\textbf{9.2MB(AE)+12.0MB(LDM)}} \\ 
%         \cline{1-7}
%         \multicolumn{1}{|c|}{Inference Time}& \multicolumn{3}{c|}{40 minutes/batch} & \multicolumn{3}{c|}{\textbf{59 seconds/batch}} \\ 
%         \cline{1-7}
%     \end{tabular}
%     \end{adjustbox}
%     \label{tab:quant_results}
% \end{table*}

% \begin{table*}[!ht]
%     \centering
%     \caption{Quantitative comparison of outpainting methods in multi-scale outpainting task. \textbf{Bold}: best, *significant improvements (paires t-test, p\textless 0.05). LOTUS significantly outperforms RePaint in all conditions with higher inference and computational efficiency.\xxx{this table is SO hard to understand/read. move the two first metrics to the bottom of the table, so you can have vertical lines between the three OW settings starting line 2 and through NCC, SSIM and MSE. then time and params without the vertical lines.} \xxx{also: what do you want them to take away from this table? put it in the caption} \xxx{do ttest so you can put * or something for significant}}
%     \begin{adjustbox}{width=\textwidth}
%     % \setlength{\tabcolsep}{1.5pt}
%     \setlength{\tabcolsep}{5pt}
%     \renewcommand{\arraystretch}{1}
%     \footnotesize % 使用小字体
%     \begin{tabular}{|l|ccc|ccc|}
%     \hline
%         \multirow{2}{*}{Metric} & \multicolumn{3}{c|}{RePaint} & \multicolumn{3}{c|}{LOTUS} \\ 
%         \cline{2-7}
%         & \multicolumn{1}{c|}{OW=32} & \multicolumn{1}{c|}{OW=40} & \multicolumn{1}{c|}{OW=48} & \multicolumn{1}{c|}{OW=32} & \multicolumn{1}{c|}{OW=40} & \multicolumn{1}{c|}{OW=48} \\ \hline
%         % \cline{1-7}
%         NCC ↑ & \multicolumn{1}{c|}{0.11$\pm$0.03} & \multicolumn{1}{c|}{0.20$\pm$0.03} & \multicolumn{1}{c|}{0.33$\pm$0.04} & \multicolumn{1}{c|}{\textbf{0.71$\pm$0.10*}} & \multicolumn{1}{c|}{\textbf{0.81$\pm$0.11*}} & \multicolumn{1}{c|}{\textbf{0.92$\pm$0.04*}} \\ \hline
%         SSIM ↑ & \multicolumn{1}{c|}{0.35$\pm$0.11} & \multicolumn{1}{c|}{0.47$\pm$0.12} & \multicolumn{1}{c|}{0.71$\pm$0.14} & \multicolumn{1}{c|}{\textbf{0.47$\pm$0.05*}} & \multicolumn{1}{c|}{\textbf{0.61$\pm$0.05*}} & \multicolumn{1}{c|}{\textbf{0.82$\pm$0.04*}} \\ \hline
%         MSE ↓ & \multicolumn{1}{c|}{0.08$\pm$0.03} & \multicolumn{1}{c|}{0.05$\pm$0.02} & \multicolumn{1}{c|}{0.03$\pm$0.01} & \multicolumn{1}{c|}{\textbf{0.03$\pm$0.01*}} & \multicolumn{1}{c|}{\textbf{0.02$\pm$0.01*}} & \multicolumn{1}{c|}{\textbf{0.01$\pm$0.00*}} \\ \hline
%         % Parameter Size& \multicolumn{3}{c|}{759.4MB(3D DM)} & \multicolumn{3}{c|}{9.2MB(AutoEncoderKL)+12.0MB(LDM)} \\ 
%         % \cline{1-7}
%         % Inference Time& \multicolumn{3}{c|}{40 minutes/batch (1000 steps, bs = 4)} & \multicolumn{3}{c|}{59 seconds/batch (1000 steps, bs = 4)} \\ 
%         Parameter Size ↓& \multicolumn{3}{c|}{759.4MB(3D DM)} & \multicolumn{3}{c|}{\textbf{9.2MB(AE)+12.0MB(LDM)}} \\ 
%         \cline{1-7}
%         Inference Time ↓& \multicolumn{3}{c|}{40 minutes/batch} & \multicolumn{3}{c|}{\textbf{59 seconds/batch}} \\ 
%         \cline{1-7}
%     \end{tabular}
%     \end{adjustbox}
%     \label{tab:quant_results}
% \end{table*}

% \begin{table*}[!ht]
%     \centering
%     \caption{Quantitative comparison of outpainting methods in multi-scale outpainting task. \textbf{Bold}: best, *significant improvements (paires t-test, p\textless 0.05). The inference time is measured when the batch size is 4 and inference step is 1000. LOTUS significantly outperforms RePaint in all conditions with higher inference and computational efficiency.\xxx{this table is SO hard to understand/read. move the two first metrics to the bottom of the table, so you can have vertical lines between the three OW settings starting line 2 and through NCC, SSIM and MSE. then time and params without the vertical lines.} \xxx{also: what do you want them to take away from this table? put it in the caption} \xxx{do ttest so you can put * or something for significant}}
%     % \begin{adjustbox}{width=\textwidth}
%     \begin{adjustbox}{width=\textwidth}
%     \setlength{\tabcolsep}{1.2pt}
%     \renewcommand{\arraystretch}{1}
%     \footnotesize % 使用小字体
%     % \begin{tabular}{|l|ccc|ccc|}
%     \begin{tabular}{|l|ccc|ccc|}
%     \hline
%         \multirow{2}{*}{Metric} & \multicolumn{3}{c|}{RePaint} & \multicolumn{3}{c|}{LOTUS} \\ 
%         \cline{2-7}
%         & \multicolumn{1}{c|}{OW=32} & \multicolumn{1}{c|}{OW=40} & \multicolumn{1}{c|}{OW=48} & \multicolumn{1}{c|}{OW=32} & \multicolumn{1}{c|}{OW=40} & \multicolumn{1}{c|}{OW=48} \\ \hline
%         % \cline{1-7}
%         NCC ↑ & \multicolumn{1}{c|}{0.108$\pm$0.035} & \multicolumn{1}{c|}{0.197$\pm$0.031} & \multicolumn{1}{c|}{0.329$\pm$0.039} & \multicolumn{1}{c|}{\textbf{0.713$\pm$0.095}} & \multicolumn{1}{c|}{\textbf{0.810$\pm$0.108}} & \multicolumn{1}{c|}{\textbf{0.920$\pm$0.037}} \\ \hline
%         SSIM ↑ & \multicolumn{1}{c|}{0.345$\pm$0.114} & \multicolumn{1}{c|}{0.474$\pm$0.118} & \multicolumn{1}{c|}{0.706$\pm$0.141} & \multicolumn{1}{c|}{\textbf{0.472$\pm$0.052}} & \multicolumn{1}{c|}{\textbf{0.613$\pm$0.048}} & \multicolumn{1}{c|}{\textbf{0.816$\pm$0.038}} \\ \hline
%         MSE$\times$10 ↓ & \multicolumn{1}{c|}{0.788$\pm$0.027} & \multicolumn{1}{c|}{0.533$\pm$0.022} & \multicolumn{1}{c|}{0.283$\pm$0.014} & \multicolumn{1}{c|}{\textbf{0.257$\pm$0.006}} & \multicolumn{1}{c|}{\textbf{0.209$\pm$0.005}} & \multicolumn{1}{c|}{\textbf{0.128$\pm$0.003}} \\ \hline
%         \multicolumn{1}{|c|}{Parameter Size}& \multicolumn{3}{c|}{759.4MB(3D DM)} & \multicolumn{3}{c|}{\textbf{9.2MB(AE)+12.0MB(LDM)}} \\ 
%         \cline{1-7}
%         \multicolumn{1}{|c|}{Inference Time}& \multicolumn{3}{c|}{40 minutes/batch} & \multicolumn{3}{c|}{\textbf{59 seconds/batch}} \\ 
%         \cline{1-7}
%     \end{tabular}
%     \end{adjustbox}
%     \label{tab:quant_results}
% \end{table*}

\input{table1}


\subsection{US Image Registration Results}


\input{fig3}





% Fig.~\ref{reg_results} (a) presents a qualitative comparison of 3DUS registration performance on images w and w/o FOV outpainting by LOTUS. The baseline results from Greedy and ANTs are highlighted with yellow tags, while the proposed LOTUS-based results are highlighted in blue. It is evident that directly applying Greedy and ANTs to sector-shaped FOV images leads to poor registration performance. 
% In contrast, both LOTUS-based methods demonstrate excellent and consistent registration performance, closely approximating the manual registration results. These findings indicate that FOV outpainting by LOTUS significantly improves 3DUS image registration.


\noindent \textbf{Evaluation on Typical Cases:} Fig.~\ref{reg_results} compares 3DUS registration performance with and without LOTUS outpainting. Applying Greedy and ANTs directly to sector-shaped FOV images results in poor registration performance, with both methods often simply returning the identity transform. In contrast, when given the LOTUS-outpainted images as input, both Greedy and ANTs consistently achieve accurate registration, closely aligning with the manual registration. These findings highlight the effectiveness of LOTUS's FOV outpainting in enhancing 3DUS image registration.


% \input{table2}
% \input{table3.tex}

% \input{table2_short}
% \input{table3_short}

\input{table2and3}

\input{fig4}

% In this work, we quantitatively evaluate the registration performance by comparing the the registration result to (a). its corresponding fixed image within their overlapping regions (Table \ref{tab:compare_fix_wmask_gt_final} ), and (b). the manural registration GT (Table. \ref{tab:reg_res_whole_gtfinal}). While directly compared to the fixed image helps eliminate the subjective bias in the manural GT, however, the results may not fully reflected the registraiton performance due to the difference of contrast, artifacts, and deformation between fixed and moving images. Therefore, we combine both experiments to comprehensively evaluate the registration performance. 

We quantitatively assess performance by comparing the registration results to (a) the fixed image within the overlapping region, and (b) the manual registration ground truth (GT). While comparison to the fixed image mitigates any error in the manual GT, it may not fully reflect registration accuracy due to differences in contrast, artifacts, and deformation between the images. We thus report both in Table~\ref{tab:combined_registration} to provide a comprehensive evaluation.


% In Table \ref{tab:compare_fix_wmask_gt_final}, we use the dice coefficient score (DSC) between the binary masks of overlapping FOVs as a quantitative indicator for the overlapping area. The moving image, while covering the largest overlapping area, consistently exhibits the poorest performance across all metrics and functions as the lowest-bound across all metrics. The GT results, in contrast, perform the best among all the metrics and serves as a highest bound of all the metrics. Both Greedy and ANTs show moderately better performance compared to the moving image, and their metric outcomes are closely matched, showing no significant differences. The large DSC for baselines indicate the failure of their registration due to the sector-shape FOV affect. The proposed LOTUS-based methods significantly outperforms their baseline counterparts. Specifically, LOTUS(ANTs) surpasses LOTUS(Greedy) in all considered metrics, except for PSNR, indicating a more robust alignment. Table. \ref{tab:reg_res_whole_gtfinal} compares the registration results to the manural GT. While the pixel-centric similarity metrics may be affected by the contrast and intensity variation, $\lvert ER\rvert$ and $\Vert ET\Vert$ are also reported to directly evaluate registration performance. Table. \ref{tab:reg_res_whole_gtfinal} results show the consistant trend as Table. \ref{tab:compare_fix_wmask_gt_final}, in which LOTUS-based methods dramatically outperform their baseline counterparts and significantly decrease the transformation error. 

% In Table~\ref{tab:compare_fix_wmask_gt_final}, we use the Dice coefficient score (DSC) between the binary masks of overlapping FOVs as a quantitative measure of the overlapping area. The moving image, despite covering the largest overlap, consistently exhibits the poorest performance across all metrics, serving as the lower bound.  Both Greedy and ANTs show moderate improvements over the moving image, with closely matched results and no significant differences. The high DSC values for these baselines and the moving image indicate registration failure due to the sector-shaped FOV effect. In contrast, the proposed LOTUS-based methods significantly outperform their baseline counterparts. Notably, LOTUS(ANTs) surpasses LOTUS(Greedy) across all metrics except PSNR, suggesting a more robust alignment.  Surprisingly, LOTUS(ANTs) even outperform the GT, which means the comprison to fix image may not fully reflect registration accuracy due to differences in contrast, artifacts, and deformation between fixed and moving images.

In Table~\ref{tab:combined_registration}-left, the moving image, as expected, exhibits the poorest similarity to the fixed image, serving as the lower bound.  Both Greedy and ANTs show moderate improvements. In contrast, the proposed LOTUS-based methods significantly outperform their baselines, with LOTUS(ANTs) surpassing LOTUS(Greedy),  suggesting a more robust alignment.  Interestingly, LOTUS(ANTs) even outperforms the manual GT.

% Table~\ref{tab:reg_res_whole_gtfinal} compares the registration results against the manual GT. While pixel-centric similarity metrics may be influenced by contrast and intensity variations, \(\lvert ER\rvert\) and \(\Vert ET\Vert\) are included to directly assess registration accuracy. The results in Table~\ref{tab:reg_res_whole_gtfinal} align with those in Table~\ref{tab:compare_fix_wmask_gt_final}, demonstrating that LOTUS-based methods dramatically outperform their baselines and significantly reduce transformation errors.
Table~\ref{tab:combined_registration}-right presents the registration results compared to the manual GT. We note that RE provides a direct measure of registration accuracy against the manual transformations, unlike the more indirect image similarity measures. The results are consistent with the left panel. Additional metrics (PSNR, SSIM, TE) are available in Appendix Tables~\ref{tab:appendix:compare_fix_wmask_gt_final} and~\ref{tab:appendix:reg_res_whole_gtfinal}.


\noindent \textbf{Evaluation on Hard Cases:} We evaluate whether an initial transform helps performance in hard cases. 
% Fig. \ref{reg_results}(d) shows the comparison of registration on a randomly selected hard case sample. The baselines are all failed due to the strong local minimum induced by the sector-shaped FOV, and even nullifies the advantage of the CR and reverse the source image after rotation compensation to the original position. In contrast, LOTUS-based methods can further improve the registration results based on the initial transformation and successfully achieve registration when $CR = 30$. Specifically, LOTUS(Greedy) achieves successful registration when $CR = 10$. These improvements in typical and hard cases highlight the effectiveness of LOTUS in enhancing 3DUS registration performance.
Fig.~\ref{reg_results_hard}-left shows a randomly selected hard case for CR values of 10$^\circ$, 20$^\circ$, and 30$^\circ$. Both baseline methods fail due to the strong local minima induced by the sector-shaped FOV. These methods even reverse the initial CR back to the identity transform. In contrast, LOTUS-based methods leverage the CR initialization to refine registration and successfully align the images at CR = $30^\circ$, with LOTUS(Greedy) achieving success even at CR = $10^\circ$. %These improvements in difficult cases demonstrate the effectiveness of LOTUS in enhancing 3DUS registration performance.


Fig.~\ref{reg_results_hard}-right shows the rotation error (RE) as a function of the initial CR across the 20 hard pairs. As CR increases, all methods improve. However, baseline methods only exhibit an RE reduction of about 10$^\circ$ for each CR step of 10$^\circ$, suggesting that the improvements merely echo the CR rather than reflecting effective optimization. Thus, even at CR = $30^\circ$, they have over $40^\circ$ residual error (Appendix Table~\ref{tab:appendix:quant_results}). In contrast, LOTUS-based methods outperform the baselines across all CR levels, and achieve consistently good registration at CR = $30^\circ$.

%Overall, the substantial quantitative and qualitative improvement from baseline methods confirms that FOV outpainting significantly enhances 3DUS registration.  The noticeable performance gap between LOTUS(ANTs) and LOTUS(Greedy) suggests that LOTUS performance also depends on the choice of the registration method.


% Table \ref{tab:compare_fix_wmask_gt_final} presents a quantitative comparison between the moving image, manual registration GT, and algorithmic registration results with respect to the fixed image within their overlapping FOV. We use the dice coefficient score (DSC) between the binary mask of overlapping FOV as a quantitative indicator for the overlapping area. The moving image, while covering the largest overlapping area, consistently exhibits the poorest performance across all metrics and functions as the lowest-bound across all metrics. The GT results, in contrast, perform the best among all the metrics and serves as a highest bound of all the metrics. Both Greedy and ANTs show moderately better performance compared to the moving image, and their metric outcomes are closely matched, showing no significant differences. The large DSC for baselines indicate the failure of their registration due to the sector-shape FOV affect. The proposed LOTUS-based methods significantly outperforms their baseline counterparts. Specifically, LOTUS(ANTs) surpasses LOTUS(Greedy) in all considered metrics, except for PSNR, indicating a more robust alignment. 



% Table. \ref{tab:reg_res_whole_gtfinal} presents a quantitative comparison among all algorithmic registration results with respect to the manural ground truth. The quantitative results show the consistant trend as Table. \ref{tab:compare_fix_wmask_gt_final}, in which LOTUS-based methods significantly outperform their baseline counterparts and dramatically improve the registration performance. The significant improvement from the baseline to their LOTUS-based counterparts in Both the quanlitative and quantitative results demonstrate that FOV outpainting can significantly enhance intensity-based 3DUS registration performance.


%\input{table4}

% \input{table5}


\subsection{Discussion and Conclusion}

% \xxx{remember i also want a figure showing one of the large rotations where it's failing}
% Conclusion: 1. Advantages, 2. Disadvantages: slow speed, influence of latent processing, hard to tell on large transform since all failed, small datasize. 3. Future, solve the above, the extension to learning based method, extension to other outpaintinv task

% In this work, we solve the sector-shaped FOV affection in 3DUS stitching by proposing LOTUS, a latent outpainting diffusion model that can outpaint the 3DUS sector-shaped FOVs to square shape, and significantly improves the subsequent registration performance. LOTUS not only generates realistic contents, but also dramatically improves the inferencing speed and decreases the computational costing compared to its counterparts working on the image-space. Both qualitative and quantitative results prove the FOV outpainting strategy can significantly improve the 3DUS stitching performance.
% However, this work also exists several limitations includes the trade-off between computational efficiency and generated image quality, and the dependency on registration methods selection. Due to the limitation number of open-source 3DUS registration dataset, this work only uses a small in house dataset for evaluation, and only applied on conventional registration methods. Future work will focus on validating LOTUS across additional learning-based registration and datasets, and extend the outpainting application to other medical imaging
% domains.

We proposed LOTUS, a latent outpainting diffusion model that expands sector-shaped FOVs into rectangular ones, significantly enhancing subsequent registration performance. LOTUS not only generates realistic content but also dramatically improves inference speed and reduces computational costs compared to image-space approaches. Our results demonstrate that FOV outpainting substantially enhances 3DUS registration performance.

%However, this work has several limitations, including the trade-off between computational efficiency and image quality, as well as its dependence on the choice of registration methods. Additionally, d
Our experiments only used conventional registration methods. Future work will focus on validating LOTUS with learning-based registration approaches, as well as extending its application to other medical imaging domains.


% However, this work also exists several limitations. Firstly, we notice the improvement on registration performance also depends on the choosen baseline-methods as well as the complexity of the registration task. In some cases with extensive transformation, both LOTUS-based methods and traditional methods failed. Besides, there is a trade-off between computational efficiency and generated image quality. Smaller size of encoded latent features can improve the computational efficiency but also leads to more information loss. Therefore, it is crucial to suitable selection depending on the specific application scenario. Thirdly, due to the limitation number of open-source 3DUS registration dataset, this work only uses a small in house dataset for evaluation, and only applied on conventional registration methods. Future work will focus on validating LOTUS across
% additional learning-based registration and datasets, and extend the outpainting application to other medical imaging
% domains.

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work is supported, in part, by NIH R01-HD109739, R01-HL156034, T32-EB021937, and the Vanderbilt Advanced Computing Center for Research and Education.  }

\bibliography{midl25_131}

\clearpage
\appendix

\setcounter{figure}{0}% Reset figure counter
  \let\oldthefigure\thefigure% Capture figure numbering scheme
  \renewcommand{\thefigure}{A\oldthefigure}% Prefix figure number with A

  \setcounter{table}{0}% Reset table counter
  \let\oldthetable\thetable% Capture table numbering scheme
  \renewcommand{\thetable}{A\oldthetable}% Prefix table number with A

\section{Outpainting results for varying patch sizes}

Fig.\ \ref{fig:supp_multi_scale_op} presents the outpainting results conditioned on different patch sizes. We note that even for small patches, LOTUS is able to effectively outpaint the FOV.

\begin{figure}[h]
% \vspace{-2mm}
\centering
\includegraphics[width=0.7\linewidth]{imgs/multi_scale_op.pdf}
% \vspace{-3mm}
\caption{ Outpainting performance of LOTUS under varying mask sizes. OW denotes the outpainting window size, which decreases from left to right, representing increasingly challenging outpainting tasks.  LOTUS effectively removes FOV edge-related artifacts and produces realistic outpainting across different condition sizes.} 
\label{fig:supp_multi_scale_op}

\end{figure}


\section{Additional metrics for registration performance in typical cases}



Tables~\ref{tab:appendix:compare_fix_wmask_gt_final} and~\ref{tab:appendix:reg_res_whole_gtfinal} show additional metrics for quantitative registration evaluation, complementary to Table~\ref{tab:combined_registration} in the main manuscript.  In Table~\ref{tab:appendix:compare_fix_wmask_gt_final}, the Dice coefficient score (DSC) between the foreground masks of the registration pair is used to quantify the overlapping FOV area. This is to control for trivial local minima that minimize FOV overlap instead of optimizing the anatomical similarity. The moving image, despite having the largest overlap, exhibits the poorest similarity, serving as the lower bound.  None of the algorithms presents an alarmingly low Dice score, which shows that such trivial local minima are avoided. The SSIM and PSNR metrics show similar patterns to the NCC and MSE metrics reported in Table~\ref{tab:combined_registration}. Similarly, translation error (TE) performance is consistent with the rotation error (RE) in Table~\ref{tab:combined_registration}.

\input{table2_rest}
\input{table3_rest}

% Table \ref{tab:combined_registration}

\section{Quantitative registration results in hard cases}

Table~\ref{tab:appendix:quant_results} presents the quantitative registration results for the hard cases. These correspond to the plots shown in the right panel of Fig.\ \ref{reg_results_hard} in the main manuscript for rotation errors (RE). We also present translation errors (TE) in this table for completeness.

\input{table4}


\section{Comparison to nearest neighbor outpainting}
Following a reviewer suggestion, we have padded the out-of-FOV regions using the nearest within-FOV pixel values, as shown in Fig.~\ref{pad}. However, this padding strategy introduces unrealistic, Voronoi-like content that does not accurately reflect the underlying anatomy and therefore impacts registration accuracy. We note that padding with a single constant value $c$ would not have changed the registration result compared to having the default value of $c=0$.
\begin{figure}[h]
% \vspace{-2mm}
\centering
\includegraphics[width=1\linewidth]{imgs/nearest_pad.pdf}
% \vspace{-3mm}
\caption{Padding the out-of-FOV regions of typical case 1 (from Fig.\ \ref{reg_results}) with the nearest within-FOV pixel value. Left: the original target and source images. Mid: padding results, Target' and Source'. Right: registration results obtained by registering Target' to Source' and Source' to Target'.}
\label{pad}
% \vspace{-7mm}
\end{figure}

\end{document}
