\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
%\usepackage{subfig}
\usepackage{booktabs} 
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- 318}
\editors{Accepted for publication at MIDL 2026}

\title[From Surface to Viscera]{From Surface to Viscera: 3D Estimation of Internal Anatomy from Body Surface Point Clouds}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Salih Furkan Atici\nametag{$^{1}$}} \orcid{0000-0002-7010-1179} \Email{salih.atici@uni-luebeck.edu}\\
\Name{Eytan Kats\nametag{$^{1}$}} \orcid{0009-0004-3878-3036} \Email{eytan.kats@uni-luebeck.edu}\\
\Name{Daniel Mensing\nametag{$^{2}$}} \orcid{0009-0009-8107-1313}\Email{daniel.mensing@mevis.fraunhofer.de}\\
\Name{Mattias P Heinrich\nametag{$^{1}$}} \orcid{0000-0002-7489-1972} \Email{mattias.heinrich@uni-luebeck.edu}\\
\addr $^{1}$ Institute of Medical Informatics, University of Luebeck, Luebeck, Germany \\
\addr $^{2}$ Fraunhofer Institute for Digital Medicine MEVIS, Bremen, Germany
}

\begin{document}

\maketitle

\begin{abstract}
Accurate pre-scan positioning in diagnostic imaging is essential for guiding acquisition and reducing manual calibration time, yet current automated approaches typically rely on dense volumetric representations that do not leverage the geometric properties or sparsity of surface representations. In this work, we introduce a sparse, point-cloud–based framework for estimating patient-specific 3D locations and shapes of multiple internal organs directly from the body surface. Our method leverages a new dual-encoder PointTransformer architecture: one encoder processes a mean-shape point cloud comprising 20 anatomical structures, while a second encoder extracts features from the patient’s body-surface point cloud. A shared decoder then predicts a deformed shape that estimates the hidden anatomy of the individual patient. This enables accurate organ localization without volumetric rasterization or autoencoder-style bottlenecks. Trained on the German National Cohort (NAKO) dataset, our model substantially outperforms volumetric convolutional autoencoder (CAE) baselines, achieving a mean Chamfer Distance of less than 5\,mm and markedly lower surface-distance errors. These results demonstrate that sparse geometric learning with deformable point-cloud priors offers an efficient and highly effective alternative that improves over dense convolutional deep learning methods for automated imaging workflow optimization.
%Diagnostic imaging relies on preliminary steps to guide image acquisition by pinpointing the scan area. While pre-scanning is crucial, manual calibration time is significant. Automated systems are mainly based on dense data representations which require high computational demands. A strong alternative is to use sparse point clouds in automated patient positioning. This work introduces a novel architecture using PointTransformer to estimate the 3D locations and shapes of multiple internal anatomical structures from the body surface’s point cloud. We propose a guide-template input structure where the external body surface guides an initial template mean shape to accurately determine the positions of the organs. The proposed model, trained on the German National Cohort (NAKO) study, demonstrated strong performance, achieving a mean Chamfer Distance of $5$ mm and a surface distance error of less than convolutional deep learning methods. 
\end{abstract}

\begin{keywords}
Point Cloud Processing, PointTransformer, Medical Image Segmentation, Patient Positioning
\end{keywords}

\section{Introduction}
\label{sec:intro}

% Introduction of preliminary steps in image acquisition.
% Should mention the details, and focus on localization
% Paraphares "there are works on automated patient positioning systems"
% Mention the importance of automated patient positioning, how it can be done and how using RGB-D camer suits the best.
In diagnostic imaging, the quality of the image is significantly influenced by the steps taken before the primary scan. These preliminary steps, also known as the pre-scan, encompass patient preparation and the planning of scan-specific settings. This process is vital to the workflow, as it helps standardize image acquisition and improve image quality by reducing artifacts and variability \cite{allen2023automated}. For primary image acquisition, the patient is manually positioned on the table, then scout imaging is performed to quickly plan the detailed geometric layout, but the process can be influenced by the operator’s skill and judgment \cite{van2020cinderellas, al2022influence}. In certain cases, scout scans must be repeated because the patient was not positioned correctly. These scout images are used to visually verify the patient’s correct position and to plan the geometry of the diagnostic sequences, including slice orientation, field-of-view, and scan range \cite{koken2009towards}. Fig.~\ref{fig:patientpos} captures the overall process of pre-scan steps. As the demand for radiology examinations rises, the efficiency of the preparatory steps becomes increasingly important to enhance patient throughput. Automated patient positioning systems are designed to streamline radiology workflows by identifying and localizing anatomical regions of interest and adjusting the patient table accordingly. Research supports the potential of these systems to reduce manual adjustments, minimize positioning errors, and enhance efficiency \cite{obuchowicz2024clinical, ghesu2022automatically}.

\begin{figure}[h!]
    \centering
    \includegraphics[width=0.85\textwidth]{figures/patientpos.png} 
    \caption{Diagnostic Imaging Preliminary Steps}
    \label{fig:patientpos}
\end{figure}

For automated patient positioning, learning-based frameworks have emerged as the preferred choice due to their scalability and ability to generalize well to unseen cases. A comparison between manual positioning and artificial intelligence-based automatic positioning in CT imaging, conducted with an RGB-D camera-based system, revealed that AI-based positioning significantly reduced the time required by 28\% \cite{Gang2020A}. The experiment was carried out by mounting RGB-D cameras above the scanner to detect key anatomical landmarks on the patient's body surface, enabling automatic table movement and centering. Similarly, Booij et al. created a 3D camera system that automatically identifies a patient’s body shape to adjust the CT table height and centering, greatly enhancing positioning accuracy over manual techniques \cite{booij2019accuracy, booij2021automated}. The RGB-D camera’s depth sensing capability generates a surface map of the patient’s body. By utilizing anatomical landmarks, the internal anatomical regions of interest can be inferred.

% Introduction of Point Clouds and its importance in medical imaging
% Should mention the constructions of Point Clouds, how it can be imported to this problem, why using Point Clouds is a good choice.
% Should also mentioned the problems with the Point Clouds
Medical image analysis has predominantly relied on dense voxel grid representations and traditional machine learning approaches specifically designed for data structured on a grid. While advancements in dense 3D image analysis have been made, the limitations of structured data persist, including high computational demands and the need for large labeled datasets. On the other hand, sparse data representations like unordered point clouds offer a lightweight, topology-independent spatial sampling set that maintains millimeter-level detail while circumventing the memory overhead associated with meshes or dense grids \cite{zhang2025survey, heinrich2024pointvoxelformer, bronstein2017geometric}. While the unordered nature of point clouds is encouraging because it offers lightweight solutions, it presents a challenge in incorporating local geometry and preserving permutation invariance.

\subsection{Previous Works}
\label{subsec:previous}
% 1.1 Previous Works
% Should include the previous work on automated patient positioning.
% Should include the previous work on the use of Point Clouds in medical imaging.
% Should include the use of grouping the permutation invariance nature of Point Clouds and the solution PointTransformer brings.
% Should include the previous work on the generative nature of Point Clouds.
In recent years, advancements in deep learning have enabled improvements in techniques for locating anatomical landmarks. Noothout et al. introduced a deep learning-based landmark localization technique utilizing fully convolutional neural networks (FCNNs) \cite{Noothout2020Deep}. Zhao et al. developed a multi-resolution region proposal and segmentation network for orientation detection and localization, outperforming previous methods in accuracy and robustness \cite{Zhao2022Deep}. Alansary et al. employed Deep Reinforcement learning (DQN) to enhance landmark localization capabilities \cite{Alansary2019Evaluating}. Kats et al. proposed a workflow that accurately predicts the coordinates of patients’ internal organs and bones using depth images. They assert that this approach provides a contactless, fast, and standardized method for patient localization \cite{kats2025internal}. 

With their advantages, point clouds are becoming a preferable choice in medical imaging, particularly when compared to volumetric 3D CNNs and vision transformers. Yu et al. investigated the potential of point clouds in the medical field, specifically in disease detection and treatment, by developing an attention-based point transformer model called 3DMedPt \cite{yu20213d}. Adams et al. used unordered point clouds to learn Statistical Shape Modeling, which is used in investigating and quantifying anatomical variations within populations of anatomies \cite{adams2023can}. Keller et al. train an implicit function, HIT, to infer the 3D location of three important anatomic tissues from a body shape model \cite{keller2024hit}. Although HIT can learn to reconstruct soft tissue from the body surface, its precision in pinpointing the exact location of tissues within the body remains limited. The HIT model currently predicts the volume percentage of tissues but fails to accurately determine their precise pixel-wise location, particularly for sparse structures such as intra-muscular and visceral adipose tissue (IMVAT). Therefore, achieving accurate localization of these tissues continues to be a significant challenge.

\subsection{Contribution and Outline}
\label{subsec:contribution}

% 1.2. Contributions and Outline
% Should mention what is the algorithm, how it solves the problems mentioned, how it can be implemented, in which data it is implemented.
% Details of the next chapters.
In this work, we propose a learning-based shape deformation model, DeformingPointTransformer, that utilizes body contours and a template shape to deform into a patient's internal organs and bones and determine their positions to eliminate the need for manual positioning. The model uses self-attention modules to construct two encoders and a decoder that employ the patient's body contour along with a population-based deformable template. The model can process these two inputs to accurately estimate the 3D shape of the internal structures.

To achieve strong performance across various anatomical variations and body types, we train our model using a comprehensive dataset of 6231 whole-body MRI scans from the German National Cohort (NAKO) study \cite{peters2022framework}. This dataset encompasses subjects of different ages, body sizes, and physiological traits, allowing the model to generalize well across diverse patient groups. Our experimental results show the effectiveness of our proposed method in precisely identifying 20 different internal anatomical structures, such as bones and soft tissue organs, utilizing only depth images.

The key contributions of this work are as follows:

\begin{enumerate}
    \item We show that a point cloud representation of body surface can be used to estimate the 3D positions of internal anatomical structures accurately and reliably.
    \item We propose a transformer-based architecture, namely DeformingPointTransformer, which generates the 3D coordinates of the internal organs and bones using the point clouds of body surface and the mean deformable shape.
    \item We demonstrate that in 3D generative tasks, point clouds are a strong candidate to improve the workflow and process.
\end{enumerate}
Our code is available at \url{https://github.com/multimodallearning/DeformingPointTransformer}.
\section{Materials and Methods}
\label{sec:methodology}

Our research utilizes 6231 complete MRI scans from the NAKO dataset \cite{peters2022framework}. This extensive collection of images enables us to represent anatomical details across a broad spectrum of anatomical variations within the population.

\subsection{Dataset Preparation}
\label{subsec:data}

The dataset is composed of $3$ main point cloud components: the body contour, the deformable template, and the target internal anatomical structures. The preparation of the data and generation of the point clouds comprise $3$ steps, as demonstrated in Fig.~\ref{fig:dataprep}. These steps are introduced in the subsequent sections.

\begin{figure}[h!]
\centering
\subfigure[Body Surface Extraction]{\includegraphics[width=0.32\textwidth]{figures/bodysurfaceextract.png}}
\subfigure[Boundary Extraction]{\includegraphics[width=0.32\textwidth]{figures/boundaryextract.png}}
\subfigure[Point Cloud Generation]{\includegraphics[width=0.32\textwidth]{figures/pcgenerate.png}}
\caption{Three-stage dataset generation pipeline: (a) extraction of the body surface by simulating a top-down RGB-depth sensor from MRI volumes, (b) boundary extraction of 20 consistently present internal organs from automated segmentation masks, and (c) point-cloud generation of body surface, organ boundaries, and the deformable mean template using Farthest Point Sampling (FPS).
Each training sample thus contains a body-surface point cloud and corresponding organ point clouds. In addition a deformable template is created by computing the mean mask across subjects and sampling its surface.}
\label{fig:dataprep}
\end{figure}

\subsubsection{Body Surface Extraction}
\label{subsubsec:bodysurface}
In patient positioning, an automation system can utilize RGB-D cameras to scan the body surface. In our dataset, the body surface is extracted using MRI volumes by simulating the depth sensor. An RGB-D camera positioned on top of the patient, looking downward, can determine the body surface locations using a top-down approach. Similarly, we employed the MRI scans to mimic this approach and extracted the body surface.

\subsubsection{Boundary of the Internal Structure Extraction}
\label{subsubsec:boundary}
As MRI scans are available, the 3D segmentation masks of anatomical reference labels are generated using TotalSegmentatorMRI \cite{d2024totalsegmentator}, MRSegmentator \cite{hantze2024mrsegmentator}, and TotalVibeSegmentator \cite{graf2024totalvibesegmentator}. The models are trained on large-scale MRI datasets and used to generate segmentation masks in our datasets. The combined masks cover 137 unique anatomical structures, comprising various organs, blood vessels, bones, and tissue types. We then applied the gradient operator to the one-hot representation of each considered segmentation label in all three dimensions, took the absolute value, and summed the results across all dimensions to extract the boundary voxels.
Although the resulting segmentation masks contain 137 anatomical structures, two criteria are set to determine the final selection of the labels. These criteria are the existence of the label in all masks and a sufficient number of non-zero boundary voxels across all subjects. Consequently, a final list of 20 internal structures is selected, as shown in Table~\ref{tab:labels}.

\begin{table}[h!] % h! = try to place here
    \centering
    \begin{tabular}{l r} % l = left, c = center, r = right
        \toprule
        Soft Tissue Organs & Bones \\
        \midrule
        Lung Right & Hip Left\\
        Lung Left & Hip Right \\
        Liver & Clavicula Left \\
        Kidney Right & Clavicula Right \\
        Kidney Left & Femur Left \\
        Pancreas & Femur Right \\
        Duodenum & Sacrum \\
        Aorta & Scapula Left \\
        Heart & Scapula Right \\
         & Vertebrae L4 \\
         & Vertebrae L5 \\  
        % \bottomrule
    \end{tabular}
    \caption{The curated set of 20 internal anatomical structures used in this study. Labels were obtained from automated MRI segmentations (TotalSegmentatorMRI, MRSegmentator, TotalVibeSegmentator) and selected based on consistent presence across subjects and sufficient boundary-voxel density.}
    \label{tab:labels}
\end{table}

\subsubsection{Point Cloud Generation}
\label{subsubsec:pointcloudgeneration}

We used the same approach to generate the point clouds of the body contours and internal anatomical structures. After the body surface is extracted in a 3D grid, we located the points and employed the Farthest Point Sampling (FPS) algorithm to generate a point cloud with the size of $(16384,3)$.

In generating a point cloud for each label, we created two versions: one with $1024$ points and the other with $4096$ points per organ. This step is crucial to assess the model’s ability to handle a limited number of points. Similar to the body surface extraction process, the boundary voxels are detected, and for each organ, FPS is applied to generate the point clouds with the size of $(1024/4096, 3)$. The resulting dataset comprises $6231$ point clouds representing the body surface and $20$ internal structures. Out of these pairs, $4780$ are utilized for training, while $1454$ are reserved for testing.
% TODO(review): 4780 + 1454 = 6234, which does not match the stated total of 6231 -- verify the split sizes.

Finally, a template shape is generated using the mean mask. We calculated the mean of the one-hot representations of all the segmentation masks. Following the same procedure as described above, the point cloud of the deformable mean shape is generated. A sample of point clouds (body surface, deformable template, target) used in our dataset is available in Fig.~\ref{fig:data}.

\begin{figure}[h!]
    \centering
    \includegraphics[width=0.85\textwidth]{figures/data.png} 
    \caption{Point Cloud samples from the dataset.}
    \label{fig:data}
\end{figure}

\subsection{Model Architecture}
\label{subsec:model}
When processing 3D point clouds, several challenges arise: the input data is unstructured and non-uniformly sampled, and hence lacks the regular grid that would otherwise enable convolutions in voxel domains. Consequently, models are required that can operate on unordered sets and remain permutation invariant, while still being able to learn local, spatially dependent features. Constructing a hierarchical feature representation that balances efficiency and accuracy additionally requires methods for local neighbourhood aggregation that can be applied during downsampling.

\begin{figure*}[t]
    \centering
    \includegraphics[width=0.95\textwidth]{figures/model_arch.drawio-2.png} 
    \caption{Overview of the proposed DeformingPointTransformer. A dual-encoder architecture processes the body-surface point cloud and the mean-organ template separately, fusing only their global features. The decoder applies topology-preserving skip connections solely from the mean-organ stream, enabling deformation of the template into patient-specific organ shapes conditioned on external body geometry.}
    \label{fig:model}
\end{figure*}
The PointTransformer architecture \cite{point_transformer} is well suited to address those challenges. It combines farthest-point sampling (FPS) with k-nearest neighbour (kNN) grouping to impose a meaningful local structure on the irregular input and efficiently deals with high-resolution point clouds. These steps together with grouped self-attention layers enable the network to learn localised geometric relationships to compute context-aware features while preserving permutation invariance.

In this work, we extend the PointTransformer encoder-decoder design to work on two related point clouds - one from the body surface and the other from the internal mean organ model - jointly. This offers an effective balance between global shape learning and local deformation to infer the non-observed organs. During upsampling, trilinear interpolation propagates features from coarse to fine resolutions and is guided by the topology of the known mean model. This enables smooth reconstructions of dense point sets despite non-uniform input sampling and accurate prediction of the deformed shapes. 


%In point cloud processing, two crucial factors are to account for permutation invariance and the ability to process unordered sets. Before downsampling, it’s essential to have a grouping operation in the hierarchical structure creation. PointTransformer \cite{point_transformer} effectively addresses these challenges. We believe the choice of PointTransformer encoder and decoder blocks is also appropriate for this task. We employed the same encoder and decoder design in this project. The encoder and decoder blocks are illustrated in Fig. \ref{fig:encoder_decoder}. The PointTransformer architecture cleverly combines kNN and FPS to tackle the aforementioned challenges. Additionally, the use of trilinear interpolation guides the model to upsample the point clouds.

\begin{figure}[h!]
    \centering
    \includegraphics[width=0.85\textwidth]{figures/encoder_decoder.png} 
    \caption{The details of Encoder and Decoder blocks. Each block follows the same architecture from PointTransformer.}
    \label{fig:encoder_decoder}
\end{figure}

%The proposed model, DeformingPointTransformer, comprises dual encoders and one decoder. The model independently generates encoded features from the body contour and the initial mean shape. These features are then concatenated and fed into the decoder, which uses them to predict the exact 3D coordinates of internal anatomy. 

The proposed DeformingPointTransformer architecture consists of two encoders and a single decoder, but the two encoded streams do not play symmetrical roles. The body-surface encoder extracts a global shape representation from the external point cloud, capturing coarse anthropometric variability. In parallel, the mean-organ encoder processes the template internal model and produces a hierarchical set of multi-scale features. Only the global feature vectors from both encoders are concatenated and passed to the decoder, ensuring that the predicted organ geometry is conditioned on the overall body shape.

Importantly, skip connections in the decoder operate exclusively on the multi-scale features of the mean-organ model stream. This design preserves the point ordering of the mean model throughout the decoding process and effectively turns the output into an ordered set aligned with the template topology. As a result, the network learns deformations of the internal anatomy rather than reconstructing an unordered point cloud from scratch. This structure enables the model to combine global shape cues from the body surface with local, topology-aware deformation fields derived from the mean internal model.

To process the mean shape and body contour point clouds, we employ five encoder blocks that utilize Transformer layers and down-sampling operations. Within each block, the number of points is reduced by a factor of four. At the latent code level, two point clouds with dimensions $\left[16,256\right]$ are concatenated to form a single point cloud with dimensions $\left[16,512\right]$. Subsequently, the decoder utilizes this latent code to generate the predicted organs, which have dimensions $\left[20480,3\right]$.

In a hierarchical structure, incorporating a skip connection between the encoder and decoder would provide valuable information. However, in this task, the points derived from the body contours are not used in residual learning. Instead, they serve as a guide for deforming the mean shape. Consequently, we opted to utilize the points generated by the mean shape encoder and connect them to the decoder to enhance its output, as they are the only branch where point ordering is stable. The model diagram is shown in Figure~\ref{fig:model}.




\section{Results}
\label{sec:results}

We trained our model using $4780$ randomly chosen point cloud pairs of body shape and their corresponding internal structures. For testing, we reserved a distinct set of $1454$ samples. This sizable and diverse collection allows for a comprehensive evaluation of the model’s performance and demonstrates its capacity to learn robust, generalizable features across a variety of anatomical variations.

% \subsection{Training and Evaluation Details}
% \label{subsec:training}

The model is trained separately with both versions of the dataset. We denote the model trained on the $1024$ points dataset DeformingPointTransformer1K and the other DeformingPointTransformer4K. Both models are trained with Adam optimizer with $10^{-3}$ initial learning rate. We decreased the learning rate in a multi-step fashion. Chamfer Distance (CD) cost function is used to calculate the loss value for the training. The CD is computed separately for each anatomical structure, to enable the model to optimally learn the localization and shape of each label.

To evaluate the model's performance, we used Chamfer Distance (CD), the 95th-percentile Hausdorff Distance (HD95), and Mean Absolute Detection Offset Error (DOE). CD validates the global structural correctness, and HD95 validates the local boundary accuracy. To calculate DOE, we derive the smallest bounding boxes that encapsulate the prediction and target point clouds and calculate the Detection Offset Error, defined as the absolute distance between the corresponding sides of the ground truth and predicted bounding boxes. To gain clearer insight into how the models behave across various spatial directions, the DOE is presented individually for each face of the bounding box (left, right, superior, inferior, posterior, and anterior). Breaking the results down this way lets us evaluate how well each approach places anatomical structures throughout the entire 3D space.

After training our own models, we compared them with the mean baseline model, and a Convolutional AutoEncoder adopted in the MONAI framework \cite{cardoso2022monai}. % PointTransformer\cite{point_transformer},

\textbf{Convolutional AutoEncoder}: As a volumetric baseline, we implemented a 3D convolutional autoencoder that operates on rasterized body-surface data. The input point cloud is first converted into a dense voxel grid using Gaussian splatting following \cite{heinrich2023chasing}, with a Gaussian kernel width of $\sigma$ = 1.7 on a 2 mm isotropic grid. The resulting volume is processed by a 3D convolutional encoder–decoder architecture which incorporates a fully connected bottleneck to compress the encoded feature map into a 256-dimensional latent vector.
The encoder uses seven 3D convolutional blocks with increasing channel widths and down-sampling strides, producing a compact bottleneck feature map that is flattened and projected to a 256-dimensional latent vector. The decoder expands this latent code back to the bottleneck shape via a fully connected layer and reconstructs the 21-channel volumetric output through a symmetric convolutional decoder. This baseline therefore evaluates a dense 3D rasterization and volumetric convolutional approach against our proposed sparse, point-based deformation model.

As a further lower baseline, the \textbf{mean model}, we simply evaluate the mean template shape (the same ``prediction'' for all patients) against the ground truth for comparison.  %Add info about UNet (Autoencoder)

% \subsection{Performance Comparison}
% \label{subsec:perf_comp}

In Fig.~\ref{fig:hd95_box}, we show the performance of the models on the 95th-percentile Hausdorff Distance (HD95) metric calculated over the test set. The metric is calculated for each test subject by first converting each point cloud to metric space (mm), then calculating the symmetric 95th-percentile Hausdorff Distance.

\begin{figure}[tbh]
    \centering
    \includegraphics[width=0.55\textwidth]{figures/hd95_box.png} 
    \caption{95th-percentile Hausdorff Distance (HD95). Results are computed over the test set.}
    \label{fig:hd95_box}
\end{figure}

\begin{table}[tbh]
\centering
\begin{tabular}{lcccc}
\toprule
\textbf{Metric} & \textbf{Mean Model} & \textbf{PointCAE} & \textbf{DPT-1K} & \textbf{DPT-4K} \\
\midrule
\textbf{CD} (mm) & $11.20 \pm 1.26$ & $8.07 \pm 1.01$ & $6.29 \pm 1.47$ & $\mathbf{4.69 \pm 1.19}$ \\
\textbf{HD95} (mm) & $31.07 \pm 4.24$ & $24.25 \pm 3.88$ & $15.03 \pm 3.02$ & $\mathbf{11.48 \pm 2.72}$ \\
\bottomrule
\end{tabular}
\caption{\textbf{Aggregate Performance Comparison.} Mean and Std. Dev. ($\pm$) calculated across all 20 organs for each model. Lower values indicate better performance.}
\label{tab:aggregate_results}
\end{table}

Similarly, the plot in Fig.~\ref{fig:cd_box} shows the performance of the models on the Average Symmetric Chamfer Distance metric calculated over the test set. The metric is calculated for each test subject in a similar manner: each predicted point cloud is first converted into metric space (mm), then the symmetric Chamfer Distance is calculated. Both HD95 and CD results suggest that the proposed structure can achieve outstanding performance in estimating the shape of individual structures and that it can outperform the convolution-based method. The plot in Fig.~\ref{fig:offset_box} demonstrates that the model achieves a significant improvement compared to the mean model.

% \begin{figure}[h!]
%     \centering
%     \includegraphics[width=0.95\textwidth]{figures/cd_box.png} 
%     \caption{Chamfer Distance. Results are computed over the test set.}
%     \label{fig:cd}
% \end{figure}


% \begin{figure}[h!]
%     \centering
%     \includegraphics[width=0.95\textwidth]{figures/dimension_box.png} 
%     \caption{Mean Absolute Detection Offset Error. Results are computed over the test set.}
%     \label{fig:offset}
% \end{figure}

\begin{figure}[tbh]
    \centering
    
        \subfigure[Chamfer Distance]{%
            \includegraphics[width=0.48\textwidth]{figures/cd_box.png}
            \label{fig:cd_box}
        }\hfill
        \vrule width 0.2pt
        \hfill
        \subfigure[Mean Absolute Detection Offset Error]{%
            \includegraphics[width=0.48\textwidth]{figures/dimension_box.png}
            \label{fig:offset_box}
        }
    
    \caption{Quantitative results computed over the test set. (a) Chamfer Distance. (b) Mean Absolute Detection Offset Error.}
    \label{fig:combined_results}
\end{figure}

Further details of each metric for every label can be found in Appendix~\ref{append:results}. The experimental results validate the robust performance of the DeformingPointTransformer in localizing internal anatomy. The attention-driven architecture, which explicitly models template deformation, proves significantly more effective than conventional ML baselines. Specifically, the attention mechanism captures the complex, non-linear spatial dependencies between external surface geometry and internal anatomical landmarks. Furthermore, the use of a deformable template imposes a strong geometric prior, ensuring that predicted organs maintain anatomical plausibility even when inferred from sparse surface data.

\section{Discussion \& Conclusion}
\label{sec:disc_conc}

In the pre-scan process, automated systems were developed to localize anatomy and adjust the patient table in order to mitigate the throughput bottlenecks caused by error-prone manual positioning and to reduce preparation time. Although there have been attempts to use learning-based methods to automate this process, they often require dense data representations such as depth images or voxel grids. Such representations incur high computational demands and significant memory overhead. Sparse point clouds are a strong alternative that is both lightweight and efficient.

In this work, we demonstrated for the first time that a sparse point cloud representation of the body surface can be used to accurately and reliably estimate the 3D positions of multiple internal anatomical structures. We proposed DeformingPointTransformer, a PointTransformer based model for estimating inner body structures from external depth sensors. By jointly providing the body-surface point cloud and the mean-organ template as inputs, the model learns how external body shape constrains the configuration of internal anatomy. The DeformingPointTransformer leverages this pairing by using the body surface to supply a global shape descriptor, while the mean-organ stream provides multi-scale, topology-preserving features. Through skip connections that operate only on the ordered mean-shape features, the decoder deforms the template into patient-specific organ geometries conditioned solely on the external surface.

%Through skip connections that operate only on the ordered mean-shape features, the decoder learns to deform the template into patient-specific organ geometries conditioned on the external surface. 

% that processes both body surface and a template mean shape to predict the internal anatomy. The guide-template input structure allows to learn the relation and connection between the external body surface and internal anatomical structures. The DeformingPointTransformer employs an encoder-decoder structure with a skip connection. It utilizes multi-scale features from the mean-shape, which are concatenated at each decoder level. The body surface is processed to encode the estimated shape as a vector at the lowest level.

%The proposed architecture outperformed convolutional machine learning approaches and demonstrated significant improvement in the mean model when trained and tested on the NAKO dataset. The results suggest that incorporating point clouds along with guide-template input structures holds great promise for 3D prediction tasks. We also believe that the guide-template input structure is highly suitable for internal anatomical landmark estimation because there are minimal variations in cases. However, it’s possible that the same remarkable result may not be achieved in a different task.
The proposed DeformingPointTransformer consistently outperformed volumetric convolutional baselines on the NAKO dataset, achieving more accurate alignment of patient-specific organ geometries to the mean template. These results highlight the advantages of using sparse point clouds combined with a dual-input template-guided approach for 3D organ prediction. While this architecture is particularly effective for estimating internal anatomical landmarks with relatively low inter-subject variability, further evaluation is needed to assess its performance on tasks with greater anatomical variation, and on real rather than simulated surface information obtained directly from depth sensors.

In conclusion, we proposed a model that accurately localizes internal organs using only body shape contours and an initial mean shape. In future work, implicit neural representations could be used to further refine the predicted point clouds for dense organ segmentation or distance map prediction without being limited to a fixed voxel resolution.

\section{Compliance with ethical standards}
\label{sec:ethics}

The German National Cohort (NAKO) study is performed with the approval of the relevant ethics committees, and is in accordance with national law and with the Declaration of Helsinki of 1975 (in the current, revised version).




% This is where the content of your paper goes.  Some random
% notes\footnote{Random footnote are discouraged}:
% \begin{itemize}
% \item You should use \LaTeX \cite{Lamport:Book:1989}.
% \item JMLR/PMLR uses natbib for references. For simplicity, here, \verb|\cite|  defaults to
%   parenthetical citations, i.e. \verb|\citep|. You can of course also
%   use \verb|\citet| for textual citations.
% \item Eprints such as arXiv papers can of course be cited \cite{Hinton:arXiv:2015:Distilling}. We recomend using a \verb|@misc| bibtex entry for these as shown in the sample bibliography.
% \item You should follow the guidelines provided by the conference.
% \item Read through the JMLR template documentation for specific \LaTeX
%   usage questions.
% \item Note that the JMLR template provides many handy functionalities
% such as \verb|\figureref| to refer to a figure,
% e.g. \figureref{fig:example},  \verb|\tableref| to refer to a table,
% e.g. \tableref{tab:example} and \verb|\equationref| to refer to an equation,
% e.g. \equationref{eq:example}.
% \end{itemize}

% \begin{table}[htbp]
%  % The first argument is the label.
%  % The caption goes in the second argument, and the table contents
%  % go in the third argument.
% \floatconts
%   {tab:example}%
%   {\caption{An Example Table}}%
%   {\begin{tabular}{ll}
%   \bfseries Dataset & \bfseries Result\\
%   Data1 & 0.12345\\
%   Data2 & 0.67890\\
%   Data3 & 0.54321\\
%   Data4 & 0.09876
%   \end{tabular}}
% \end{table}

% \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:example}
%   {\caption{Example Image}}
%   {\includegraphics[width=0.5\linewidth]{example-image}}
% \end{figure}

% \begin{algorithm2e}
% \caption{Computing Net Activation}
% \label{alg:net}
%  % older versions of algorithm2e have \dontprintsemicolon instead
%  % of the following:
%  %\DontPrintSemicolon
%  % older versions of algorithm2e have \linesnumbered instead of the
%  % following:
%  %\LinesNumbered
% \KwIn{$x_1, \ldots, x_n, w_1, \ldots, w_n$}
% \KwOut{$y$, the net activation}
% $y\leftarrow 0$\;
% \For{$i\leftarrow 1$ \KwTo $n$}{
%   $y \leftarrow y + w_i*x_i$\;
% }
% \end{algorithm2e}

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{We gratefully acknowledge the financial support of the German Research Foundation (DFG, HE 7364/10-1, project number 500498869).}


\bibliography{midl26_232}


\appendix

\section{Table of Results}
\label{append:results}

\begin{table}[h!]
\centering
\tiny
\begin{tabular}{lcccccccr}
\toprule
\textbf{Organ} & \textbf{CD} & \textbf{HD95} & \textbf{Right} & \textbf{Left} & \textbf{Superior} & \textbf{Inferior} & \textbf{Anterior} & \textbf{Posterior} \\

\midrule
aorta & 6.30 mm & 19.42 mm & 22.05 mm & 8.04 mm & 13.15 mm & 11.45 mm & 26.09 mm & 23.73 mm \\
clavicula\_left & 2.56 mm & 6.10 mm & 3.61 mm & 3.11 mm & 4.87 mm & 6.01 mm & 14.08 mm & 8.87 mm \\
clavicula\_right & 2.47 mm & 5.60 mm & 2.68 mm & 2.63 mm & 4.21 mm & 6.41 mm & 8.02 mm & 11.24 mm \\
duodenum & 4.61 mm & 12.29 mm & 7.86 mm & 6.88 mm & 7.90 mm & 7.21 mm & 12.23 mm & 17.50 mm \\
femur\_left & 4.57 mm & 11.22 mm & 2.07 mm & 4.38 mm & 4.96 mm & 5.91 mm & 10.24 mm & 9.03 mm \\
femur\_right & 4.57 mm & 10.39 mm & 1.71 mm & 5.09 mm & 4.78 mm & 5.34 mm & 8.14 mm & 11.63 mm \\
heart & 7.75 mm & 17.00 mm & 16.66 mm & 11.75 mm & 15.04 mm & 14.44 mm & 39.47 mm & 26.19 mm \\
hip\_left & 4.46 mm & 10.18 mm & 4.09 mm & 4.12 mm & 4.63 mm & 5.37 mm & 10.84 mm & 12.17 mm \\
hip\_right & 4.51 mm & 10.11 mm & 3.82 mm & 3.80 mm & 4.77 mm & 5.62 mm & 11.75 mm & 11.03 mm \\
kidney\_left & 4.60 mm & 13.03 mm & 7.99 mm & 9.78 mm & 9.05 mm & 7.61 mm & 15.76 mm & 16.08 mm \\
kidney\_right & 5.21 mm & 15.07 mm & 7.93 mm & 11.70 mm & 10.87 mm & 8.32 mm & 15.08 mm & 15.52 mm \\
liver & 6.74 mm & 16.60 mm & 9.75 mm & 7.16 mm & 5.55 mm & 6.47 mm & 35.50 mm & 10.64 mm \\
lung\_left & 5.97 mm & 14.64 mm & 9.86 mm & 3.63 mm & 5.02 mm & 5.03 mm & 10.96 mm & 18.98 mm \\
lung\_right & 6.14 mm & 14.68 mm & 12.90 mm & 3.45 mm & 5.31 mm & 5.28 mm & 17.41 mm & 12.46 mm \\
pancreas & 4.88 mm & 11.97 mm & 6.71 mm & 7.10 mm & 6.48 mm & 12.85 mm & 25.64 mm & 13.71 mm \\
sacrum & 5.01 mm & 13.61 mm & 8.21 mm & 12.96 mm & 18.60 mm & 12.87 mm & 18.28 mm & 17.60 mm \\
scapula\_left & 3.39 mm & 7.70 mm & 3.96 mm & 2.46 mm & 4.49 mm & 4.11 mm & 11.70 mm & 11.08 mm \\
scapula\_right & 3.33 mm & 7.76 mm & 6.28 mm & 2.79 mm & 4.20 mm & 3.81 mm & 12.11 mm & 9.02 mm \\
vertebrae\_L4 & 4.31 mm & 12.44 mm & 11.36 mm & 9.78 mm & 13.48 mm & 12.33 mm & 18.90 mm & 15.56 mm \\
vertebrae\_L5 & 4.13 mm & 11.39 mm & 9.39 mm & 7.61 mm & 13.58 mm & 12.29 mm & 16.20 mm & 15.84 mm \\
\bottomrule
\end{tabular}
\caption{Detailed per-organ performance of the DeformingPointTransformer4K model.}
\label{tab:model_results}
\end{table}

\section{Bar Plots of CD and HD95 Results}
\label{append:bar_plots}

\begin{figure}[h]
    \centering
    \includegraphics[width=0.85\textwidth]{figures/hd95_bar.png} 
    \caption{95th-percentile Hausdorff Distance (HD95)}
    \label{fig:hd95}
\end{figure}

\begin{figure}[h]
    \centering
    \includegraphics[width=0.85\textwidth]{figures/cd_bar.png} 
    \caption{Chamfer Distance}
    \label{fig:cd}
\end{figure}


\begin{figure}[h]
    \centering
    \includegraphics[width=0.85\textwidth]{figures/dimension_bar.png} 
    \caption{Mean Absolute Detection Offset Error}
    \label{fig:offset}
\end{figure}


\end{document}
