\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2024}

\title[Annotation-Efficient Strategy for Segmentation of 3D Body Composition]{Annotation-Efficient Strategy for Segmentation of 3D Body Composition}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
 \midlauthor{\Name{Lena Philipp\nametag{$^{1}$}} \Email{lena.philipp@radboudumc.nl}\\
 \Name{Maarten {de Rooij}\nametag{$^{1}$}} \Email{Maarten.deRooij@radboudumc.nl}\\
 \Name{John Hermans\nametag{$^{1}$}} \Email{John.Hermans@radboudumc.nl}\\
\Name{Matthieu Rutten\nametag{$^{1}$}} \Email{Matthieu.Rutten@radboudumc.nl}\\
\Name{Horst Hahn\nametag{$^{2}$}} \Email{horst.hahn@mevis.fraunhofer.de}\\
\Name{Bram {van Ginneken}\nametag{$^{1}$}} \Email{Bram.vanGinneken@radboudumc.nl}\\
\Name{Alessa Hering\nametag{$^{1,2}$}} \Email{Alessa.Hering@radboudumc.nl}\\
 \addr $^{1}$  Department of Medical Imaging, Radboudumc, Nijmegen, The Netherlands \\
 \addr $^{2}$ Fraunhofer MEVIS, Bremen, Germany}

\begin{document}

\maketitle

\begin{abstract}
Body composition as a diagnostic and prognostic biomarker is gaining importance in various medical fields such as oncology. Therefore, accurate quantification methods, such as the analysis of CT images, are necessary. While several studies introduced deep learning approaches to automatically segment a single slice, quantifying body composition in 3D remains understudied due to the high required annotation effort. This study proposes an annotation-efficient strategy using an iterative self-learning approach with sparse annotations to develop a segmentation model for the abdomen and pelvis, significantly reducing manual annotation needs. The developed model demonstrates outstanding performance with Dice scores for skeletal muscle (SM): $0.97\pm0.01$, inter-/intra-muscular adipose tissue (IMAT): $0.83\pm0.07$, visceral adipose tissue (VAT): $0.94\pm0.04$, and subcutaneous adipose tissue (SAT): $0.98\pm0.02$. A reader study supported these findings, indicating that most cases required negligible to no correction for accurate segmentation for SM, VAT and SAT. The variability in reader evaluations for IMAT underscores the challenge of achieving consensus on its quantification and signals a gap in our understanding of the precision required for accurately assessing this tissue through CT imaging. Moreover, the findings from this study offer advancements in annotation efficiency and present a robust tool for body composition analysis, with potential applications in enhancing diagnostic and prognostic assessments in clinical settings.
\end{abstract}

\begin{keywords}
Body composition, 3D, CT, Noisy Annotations, Medical Image Segmentation.
\end{keywords}

\section{Introduction}
A body mass index (BMI) greater than 30\,kg/m$^2$ is commonly seen as a health risk factor for various cardiovascular diseases \cite{doi:10.1161/CIR.0000000000000973} and cancer types \cite{doi:10.1056/NEJMsr1606602}. However, the BMI is limited in its inability to differentiate between adipose tissue/muscles and to account for the heterogeneity in fat distribution, which can lead to imprecise or misleading results \cite{PICHE2018103}. The importance of body composition is increasingly recognized in studying survival in cancer patients \cite{cespedes2018obesity}, \cite{shachar2016prognostic}, implications for care \cite{prado2018implications} and metabolic diseases \cite{pi2019changes} and highlights the need for accurate methods of measurement. The widespread use of CT imaging in clinical settings, coupled with its ability to distinguish between adipose tissue and muscle and provide detailed insights into fat distribution, positions it as a superior method for improving body composition analysis accuracy.\\
In routine clinical practice, measuring body composition remains difficult due to the expert knowledge required for the time-consuming annotation process. This is why only an axial slice at the level of the lumbar spine level 3 (L3) is often used due to the high correlation to the entire volume \cite{mourtzakis2008practical}. Convolutional neural networks have enabled the automatic segmentation of skeletal muscle (SM), inter-/intra-muscular adipose tissue (IMAT), visceral adipose tissue (VAT), and subcutaneous adipose tissue (SAT) using a slice at the height of L3 (\cite{weston2019automated}, \cite{paris2020automated}, \cite{shen2023deep}), L3/L4 (\cite{nowak2020fully}, \cite{park2020development}), pelvis \cite{hemke2020deep}, and multiple heights \cite{ahmad2023automatic}. Additionally, end-to-end solutions have been developed to automate slice selection as well (\cite{nowak2021end}, \cite{dabiri2020deep}, \cite{zhang2022autonomous}, \cite{bridge2022fully}, \cite{xu2022extending}). The trend has recently evolved towards volumetric body composition analysis for more comprehensive measurements across larger body regions using multi-atlas segmentation (\cite{hu2018automated}, \cite{decazes2019anthropometer3d}) or deep learning (DL) approaches (\cite{koitka2021fully}, \cite{pu2021automated}, \cite{liu2020abcnet}, \cite{dai2024ga}, \cite{fu2020automatic}, \cite{borrelli2021artificial},\cite{lee2021deep}). Despite the advancements, the annotation of data required for training these 3D DL models presents significant challenges, attributed to the labor-intensive and meticulous nature of the task. This often results in training and test sets that are either small in size or only partially annotated. \cite{lee2021deep} and \cite{pu2021automated} utilized larger training datasets to enhance their models. \cite{lee2021deep} employed manual annotations for their training and testing sets, leading to a model that achieved high Dice scores. 
However, it did not distinguish IMAT with a separate segmentation mask. Addressing the complexity of the volumetric annotation of body composition, \cite{pu2021automated} tackled this challenge by employing semi-supervised self-training in the training process, albeit with suboptimal performance outcomes with Dice scores of 0.82 for VAT and 0.59 for IMAT. \\
Semi-supervised learning is particularly relevant in contexts where extensive data exists, but only a fraction is labeled. It incorporates not only fully annotated data but also utilizes noisy, weak annotations or pseudo labels to enrich the training process. Pseudo labels are annotations created by applying a model to unlabeled data and using the output as new additional annotations. This self-training step is repeated to extend the training data iteratively or improve the label's quality. Self-training can be seen as a form of weak supervision and builds on the potential to outperform the teacher (\cite{guan2018said}, \cite{khoreva2017simple}, \cite{zhang2018self}). While many 3D semi-supervised segmentation methods require an initial dataset of 3D annotations, utilizing sparse annotations can significantly enhance efficiency by annotating just a few slices within a 3D volume, taking advantage of the strong correlation among slices to preserve precise object boundaries with minimal annotations. Approaches using sparse annotations have been shown to outperform traditional weakly supervised techniques that utilize scribbles (\cite{lin2016scribblesup}, \cite{liu2022weakly}) and bounding boxes (\cite{oh2021background}, \cite{dai2015boxsup}) in both efficiency and accuracy. Most of these methods involve the use of registration modules to generate pseudo labels (\cite{li2022pln}, \cite{cai2023orthogonal}, \cite{bitarafan20203d}). \cite{cai20233d} propose a cross-teaching method that enforces consistency between the predictions of 3D and 2D networks, thereby increasing the view difference of networks.\\
In this work, we aimed to develop a robust 3D segmentation model to quantify SM, IMAT, VAT and SAT. Facing challenges like limited annotated datasets, time-consuming manual labeling, and the need for diverse scans for robustness, we introduce a novel self-training strategy with sparse annotations. This strategy transitions from a 2D to a 3D model, substantially reducing the manual annotation workload by requiring only individual slices to be labeled manually. The goal of this method is to decrease annotation efforts without compromising on segmentation quality. To validate the efficacy of our approach, we conducted an evaluation on an internal test set, focusing on the segmentation performance through the Dice score. Additionally, an expert reader study using a larger external dataset was conducted to evaluate the effort needed to correct generated segmentation masks, offering a detailed assessment of our model's clinical utility and accuracy.

\section{Materials and Methods}
The objective is to create a DL model that accurately measures body composition throughout the abdominal and pelvic regions. This involves training the model to produce segmentation masks for four categories: SM, IMAT, VAT, and SAT.

\subsection{Data}
Our \textbf{training dataset} comprises 116 scans, amounting to 38,002 slices, from 100 patients (47\% female), gathered between 2008 and 2021. These scans were selected for their comprehensive field of view spanning the entire abdomen and pelvis.
A subset of 417 slices (extracted from 24 scans) from this dataset was annotated by a trained researcher and student using the open-source tool 3D Slicer \cite{3dslicer}, \cite{fedorov20123d}. For quality control, an experienced radiologist was consulted. The annotation process followed a standardized annotation protocol (Alberta Protocol; \cite{alberta}). The proposed segmentation strategy was applied to the remaining 92 scans and slices. The \textbf{internal test set}, aimed at assessing segmentation performance measured as Dice score, includes 20 scans (10 male, 10 female) chosen for their field of view. Given the labor-intensive nature of annotating each of these scans (each with an average of 322 slices), this set was initially automatically annotated using an intermediate baseline 3D U-Net trained on 12 CT scans. These automatically derived segmentation masks were subsequently manually verified and corrected. For the \textbf{external test set for visual assessment}, we utilized 100 cases from the KiTS21 dataset \cite{heller2023kits21}, ensuring a balance in BMI categories (normal, overweight, obese) and sex, using the KiC data \cite{ibm_knight}. These cases were adjusted to focus on the L1 to L5 region and resampled to a 3\,mm slice thickness to streamline the review process. Further data details are provided in Appendix~\ref{app:DataDetails}.

\subsection{Efficient Annotation and Training Strategy}
The proposed annotation strategy employs a self-training methodology, utilizing both 2D and 3D neural networks. \\ 
 \begin{figure}[h]
\centering
\includegraphics[width=0.65\textwidth]{./method_overviewv5.png}
\caption{Efficient Annotation and Training Workflow. 2D slices are extracted (A) and annotated as training data (B). A 2D U-Net model is trained on these slices (C) to generate masks for all slices of the scan (D). The 2D model's output is assembled into 3D masks. These masks are used to train a 3D model (E), which generates refined segmentation masks (F). These 3D masks guide the training of a second 3D model for enhanced accuracy (G). Iterative retraining with additional targeted scans refines the model further, masks and scans are either used directly as a training example or they are used to retrain the 2D model (H).}
\label{fig:method_overview}
\end{figure}
\textbf{2D training:} The process began with the selection of a subset of scans, totaling 10. For each scan, 15-20 slices were chosen, spanning from the T9 vertebra to the end of the pelvis, and were semi-automatically annotated using a threshold brush to create training data. A 2D U-Net model was then specifically trained to segment these slices accurately. To keep the model's training focused, we avoided using regularization techniques like data augmentation or dropout. After training, this 2D model was used to produce segmentation masks for all slices in the scans, which were then assembled into complete 3D masks. \\
\textbf{3D training:} 
The initial, imperfect masks served as the training foundation for a 3D nnU-Net \cite{nnunet}. Leveraging the added contextual patterns, this 3D model was adept at generating refined, smoother masks for the scans. \\
\textbf{Refinement Process:} After the initial prediction phase, a second nnU-Net was trained using the improved masks from the first 3D U-Net to enhance segmentation accuracy further. This second model is applied to segment new scans. If this model encounters difficulties or inaccurately segments a scan, as determined by human visual assessment, an iterative refinement process is initiated. Starting with step one, slices for the failure case are extracted and used for retraining the 2D U-Net to improve the 3D model's accuracy.\\
This process was repeated three times. Each time the pool of training slices was extended with slices from a focused scan group (certain kernel types, low dose CT scans, different age groups).
With this strategy, expert annotations were only performed for a small subset of 2D masks (417 out of 38,002, 1\%) instead of correcting 3D masks, which significantly lowers the overall annotation workload.

\subsection{Evaluation}
Our evaluation comprises two phases to ensure a thorough assessment of the model: \\
\textbf{Quantitative Assessment:} We utilize a dataset of 20 cases to measure the model's Dice score performance, focusing on the area from the thoracic vertebra T9 to the end of the pelvis which we divided into three specific segments: T9-T12, L1-L5, and S1 to the end of the pelvis. \textbf{Visual Assessment:} Acknowledging the limitations presented by the small size of our internal test set, we incorporated an additional visual evaluation step to provide a broader perspective on the model’s performance.
Three experienced radiologists visually evaluate 100 cases within the L1-L5 region, rating the effort to correct the segmentation masks. With this evaluation, conducted on the grand-challenge platform \cite{grand-challenge}, we gain insights into the model’s current performance and usability. The rating scale, adapted from \cite{berta2021automatic}, ranges from 1 (extensive effort) to 5 (no effort). Further details on this scale are available in the Appendix~\ref{app:ScaleReaderStudy}. 
\section{Results}
\begin{table}
\caption{Overview of the full model's performance on the internal test set. The rows present the Dice scores per class averaged across all scans in the corresponding sections.}
\makebox[\textwidth]{\begin{tabular}{|l|l|l|l|l|l|}
\hline
Section   & SM      & IMAT    & VAT  & SAT     & Mean          \\ \hline
\hline
Top  & $0.952\pm0.02$ & $0.833\pm0.05$  & $0.919\pm0.04$  & $0.974\pm0.03$ & $0.919\pm0.07$ \\
L1 - L5        & $0.969\pm0.01$ & $0.834\pm0.04$ & $0.948\pm0.04$ & $0.987\pm0.01$ & $0.935\pm0.07$ \\
Pelvis         & $0.98\pm0.005$ & $0.861\pm0.04$ & $0.932\pm0.03$ & $0.982\pm0.01$ & $0.939\pm0.06$ \\
\hline
\hline
Mean             & $0.973\pm0.007$          & $0.848\pm0.04$          & $0.944\pm0.04$          & $0.983\pm0.01$          & $0.937\pm0.06$          \\ \hline
\end{tabular}}\\
\label{fig:baselineresults}
\end{table}
\textbf{Dice Scores per Class and Segment.} The final 3D model was evaluated using the internal test set. To see the performance for different parts of the body, the evaluation was split into different segments. The model achieved high precision across all sections, reflected in the high Dice scores for SM, VAT, and SAT, with slightly more variability noted in the IMAT segmentations. The results are presented in Table~\ref{fig:baselineresults}. For additional details, refer to the boxplot in Appendix~\ref{app:boxplot} and the visual overview of a sample case in Appendix~\ref{app:overviewresults}. \\
 \begin{figure}
\centering
\includegraphics[width=\textwidth]{./reader_study.png}
\caption{Left: Aggregated answers of all readers for the 100 cases across classes. Right: An illustrative case from the reader study, displaying segmentation of IMAT where two readers scored a 5, and one reader gave a score of 1. Row 1: Automatically generated mask. Row 2: Manually corrected mask. Row 3: Original image.}
\label{fig:reader_study}
\end{figure}
\textbf{Reader Study.} The aggregated scores from the reader study (Figure~\ref{fig:reader_study}) indicate that most cases required minimal to no corrections for SM, VAT, and SAT, highlighting the model's high accuracy and reliability for these classes.
Specifically, for SM, all readers assigned a score of at least 4 in 81 cases, denoting negligible to no correction needed, and in 97 cases, the score was at least 3, suggesting at most limited correction effort. Similar results are obtained for SAT, for which 79 cases received a score of at least 4, while 94 cases scored at least 3. VAT segmentation also demonstrated promising results, with 90 cases rated with a score of 4 or 5, and 97 cases scoring at least 3. However, IMAT proved more challenging. Two readers found the vast majority (90 cases) to require minimal correction, scoring 4.18 and 4.87 on average, while one reader's average score was 1.32, mainly due to unannotated fatty streaks in muscles, suggesting significant correction time. Figure~\ref{fig:reader_study} presents an example of such a case. Correcting this case with a threshold brush and separating all voxels with HU values below $-29$ from the SM mask resulted in only minor changes. This points to an inherent limitation of CT imaging in differentiating small, closely aligned structures as IMAT (partial volume effects). For more challenges identified in the reader study, see Appendix~\ref{app:failurecases}. \\
\textbf{Annotation Method.}
The annotation method's effectiveness is shown through targeted evaluations, using three manually corrected training dataset examples to track progressive improvement in annotation quality via self-learning, measured by the Dice score.
Starting with a foundation of 10 training cases, translating to roughly 100 2D slices, the initial 2D model demonstrated strong performance, especially in segmenting SM and SAT. Training the 3D model with the initial masks yielded a notable improvement, increasing Dice scores for SM (0.978 to 0.99), IMAT (0.874 to 0.95), VAT (0.963 to 0.989), and SAT (0.989 to 0.99). This progression underscores the benefits of our phased training approach. The 3D context improved IMAT segmentation by refining inconsistencies across slices, leading to smoother masks. Subsequent application of the 3D model for new mask predictions in these cases resulted in a slight increase in the Dice scores (SM: 0.99, IMAT: 0.977, VAT: 0.994, SAT: 0.997).
Upon completion of the third and final iteration, which included 24 cases equivalent to 417 2D slices, the 2D model's Dice scores saw an increase, indicating that a greater variety of training examples were instrumental in boosting the model's performance across the three cases, even without adding more slices of these scans (SM: 0.978, IMAT: 0.892, VAT: 0.965, SAT: 0.99). Visually, the model's progress is evident from increasingly accurate segmentation outcomes on slices from different patients, detailed in the Appendix~\ref{app:annotationprocess}.

\section{Discussion and Conclusion}
\textbf{Annotation-Efficient Segmentation Approach}. This work introduces a novel annotation-efficient training strategy for 3D body composition segmentation in the abdominal and pelvic cavity, which drastically reduces the need for expert annotations. By self-learning and using 2D and 3D models, only approximately 1\% of the slices used in training required manual labeling. The evaluation shows that subsequent training steps effectively reduced the label noise in the 2D model's output. Additionally, incorporating a human-in-the-loop approach for expanding the training dataset further enhanced the final model's performance, demonstrating the strategic value of combining automated refinement with expert human oversight in the training process. The outcome is a robust model capable of producing high-quality segmentation masks, as evidenced by the evaluations. \\
However, this approach comes with its challenges. While self-training is particularly effective for small labeled datasets \cite{bai2017semi}, its advantages are limited for larger datasets. Moreover, adding weak labels introduces the risk of overfitting to noise and adopting incorrect patterns. In this approach, the reliability is roughly assessed by human oversight. Inadequate masks lead to selected scans being redirected into manual single slice annotation, incorporating an element of active learning. Future improvements could involve automating this process through techniques like uncertainty estimation to flag poor examples. It is also valuable to investigate strategies to counteract negative impacts on the learning process, such as modifying loss functions (\cite{ghosh2017robust}, \cite{zhang2018generalized}) or employing sample weighting strategies (\cite{ren2018learning}, \cite{mirikharaji2019learning}). Regarding the sparse annotation approach, further promising ideas to explore involve 2.5D approaches for multi-view fusion \cite{zhang2022bridging}. Leveraging additional views for sparse annotations introduces more variability and has demonstrated encouraging outcomes \cite{cai20233d}. Future research should compare semi- and weakly supervised methods evaluated on datasets with wide fields of view, such as body composition segmentation, since they present unique challenges. \\
\textbf{Robustness}. Our model is trained on a diverse dataset that includes different slice thicknesses, convolution kernel types, low-dose CT scans, and images from multiple manufacturers, enhancing its adaptability to various image qualities and noise levels, particularly in low slice thickness scenarios. \\
In evaluating the model, we noted its strong performance in segmenting SM, VAT, and SAT from early development stages, even for challenging tasks like distinguishing VAT as fat within the abdominal cavity from organ-encased and pericardial fat. However, difficulties arose with both small VAT volumes, complicating SM segmentation near organs, and high SAT or VAT distributions, leading to bigger segmentation gaps. This observation prompted a reevaluation of the training samples to underscore the significance of body type as a critical factor in sample selection. One challenge in creating a balanced dataset stems from the inability to determine fat and muscle distribution from standard scan data or BMI alone. To mitigate this, we propose adopting a sampling strategy that considers demographic characteristics, following the insights from \cite{magudia2021population} highlighting the variance that comes from age, ethnicity, and sex in automated body composition analysis. \\
\textbf{Inclusion of IMAT.} An interesting addition to our study is the inclusion of IMAT as a separate class, a feature often omitted in other studies due to the complexity and time-consuming nature of its annotation. The Dice score indicates a reasonably high level of precision for IMAT segmentation; however, this metric prompts several questions. The absence of exclusively expert-annotated references could be limiting the Dice score's reliability, a factor that is particularly critical for accurately quantifying the relatively small volume of this class. However, even with references like that available, the reader study underscores concerns about whether voxel-based annotations of IMAT alone on CT images are adequate, highlighted by one reader's assessment that most cases required extensive annotation efforts. Given IMAT's small volume, partial volume effects, image noise, and even minimal inaccuracies could have a disproportionate impact on its measurement. A potential approach to these challenges might involve the use of a unified mask for IMAT and SM, coupled with histogram analysis to evaluate the overall attenuation within the muscle. This could provide insights into muscle quality or the patient's overall health status, although the specific application of such an approach remains to be fully explored.\\
\textbf{Limitations.} Scans with severe beam hardening or strong truncation artifacts were excluded from the study. Future work, inspired by strategies such as those proposed by \cite{xu2023body} for managing truncation artifacts on individual slices, is essential. An overview of our model's performance in scenarios complicated by artifacts and other challenging variations is detailed in the Appendix~\ref{app:failurecases}, offering insights into its current robustness and areas for enhancement. \\
The size of our internal test set is acknowledged as limited, underscoring the necessity for broader validation. A reader study incorporating external datasets has begun to affirm our model's efficacy and robustness, yet it is clear that further validation is needed. \\
\textbf{Conclusion.} This work presents an efficient annotation strategy for 3D body composition segmentation that drastically reduces manual annotations while improving model accuracy via iterative improvements. By training with a diverse dataset and specifically identifying IMAT as a distinct class, our model aims to adapt to various imaging conditions and offers in-depth body composition analysis. The divergent assessments of IMAT by readers not only highlight the difficulties in standardizing its measurement but also point to an essential area for further investigation regarding the accuracy needed for CT imaging-based assessments. Overall, the ability to assess body composition in 3D enhances patient care by enabling precise monitoring of changes, providing deeper insights into patients' health status. Assessing these measurements holds potential for refining outcome predictions and diagnostics, especially when integrated with other health parameters.

\bibliography{midl24_105}


\appendix

\newpage
\section{Data Details}
\label{app:DataDetails}

\begin{table}[h]
\caption{Overview of Patient Characteristics - Training Data.}
\makebox[\textwidth]{\begin{tabular}{|l|l|l|l|}
\hline
   & Male (n = 52)      & Female (n = 46)    & All (n = 100)     \\ \hline
\hline
Mean age  & $59\pm15.1$ & $58.7\pm17.2$  & $58.87\pm16.02$ \\
20 - 40  & 7 & 7  & 14 \\
40 - 65  & 26 & 20  & 46 \\
65 - 80  & 16 & 17  & 33 \\
\textgreater 80  & 3 & 2  & 5 \\
NA  &  &   & 2 \\
\hline
\end{tabular}}\\
\label{fig:datadetailstraining}
\end{table}

\begin{table}[h]
\caption{Overview of Patient Characteristics - Internal Test Data.}
\makebox[\textwidth]{\begin{tabular}{|l|l|l|l|}
\hline
   & Male (n = 10)      & Female (n = 10)    & All (n = 20)     \\ \hline
\hline
Mean age  & $52.7\pm18.3$ & $53\pm15.6$  & $53.1\pm16.5$ \\
Age range  & 28 - 84 & 31 - 79  & 28 - 84 \\
\hline
\end{tabular}}\\
\label{fig:datadetailstest}
\end{table}

\begin{table}[h]
\caption{Overview of Acquisition Parameters - Training Data.}
\makebox[\textwidth]{\begin{tabular}{|l|l|}
\hline
Parameter & Value \\
\hline
\hline
Slice thickness range (mm) & 0.5 - 5.0 \\
\hspace{6pt} \textless 2 & 61 \\
\hspace{6pt} 2 - 3 & 40 \\
\hspace{6pt} \textgreater 3 & 15 \\
\hline
Manufacturer &  \\
\hspace{6pt} Toshiba & 46 \\
\hspace{6pt} Siemens & 44 \\
\hspace{6pt} Philips & 17 \\
\hspace{6pt} GE Medical Systems & 7 \\
\hspace{6pt} Canon Medical Systems & 2 \\
\hline
Contrast - enhanced & 76 \\
Non - contrast - enhanced & 40 \\
\hline
\end{tabular}}\\
\label{fig:acquisitiontraining}
\end{table}

\begin{table}[h]
\caption{Overview of Acquisition Parameters - Internal Test Data.}
\makebox[\textwidth]{\begin{tabular}{|l|l|}
\hline
Parameter & Value \\
\hline
\hline
Slice thickness range (mm) & 1.0 - 3.0 \\
\hspace{6pt} \textless 2 & 10 \\
\hspace{6pt} 2 - 3 & 10 \\
\hline
Manufacturer  &  \\
\hspace{6pt} Toshiba  & 17 \\
\hspace{6pt} Siemens  & 3 \\
\hline
Contrast - enhanced & 19 \\
Non - contrast - enhanced & 1 \\
\hline
\end{tabular}}\\
\label{fig:acquisitiontest}
\end{table}

\begin{table}[h]
\caption{Overview of Patient Characteristics - External Test Data.}
\makebox[\textwidth]{\begin{tabular}{|l|l|l|l|}
\hline
   & Male (n = 47)      & Female (n = 53)    & All (n = 100)     \\ \hline
\hline
Mean age  & $59\pm13$ & $60.6\pm14$  & $60.1\pm13.5$ \\
\hspace{6pt}20 - 40  & 2 & 6  & 8 \\
\hspace{6pt}40 - 65  & 32 & 25  & 57 \\
\hspace{6pt}65 - 80  & 9 & 19  & 28 \\
\hspace{6pt}\textgreater 80  & 4 & 3  & 7 \\
\hline
\hline
BMI &  &  & \\
\hspace{6pt} Normal (\textgreater 18.5 - 25) & 14 & 18 & 32 \\
\hspace{6pt} Overweight (25 - 30) & 16 & 16 & 32 \\
\hspace{6pt} Obese (\textgreater 30) & 17 & 19 & 36 \\
\hline
\end{tabular}}\\
\label{fig:datadetailskits}
\end{table}

\newpage
\section{Scale Reader Study}
\label{app:ScaleReaderStudy}
\begin{enumerate}
    \item Extensive effort: segmentation with extensive errors requiring the reader excessive effort to correct them                         
    \item Considerable effort: segmentation with errors that require sizeable and/or time-consuming corrections                          
    \item Limited effort: segmentation with inaccuracies that require limited and/or brief correction                                     
    \item Insignificant effort: segmentation with small imperfections negligible for the reader  
    \item No effort: segmentation corresponding to the ideal result for the reader
\end{enumerate}

\newpage
\section{Boxplot Internal Testset}
\label{app:boxplot}
 \begin{figure}[h]
\centering
\includegraphics[width=\textwidth]{./boxplot_fullmodel.png}
\caption{Visualization of the distribution of Dice scores for each class.}
\label{fig:boxplot1}
\end{figure}

Figure~\ref{fig:boxplot1} illustrates the distribution of Dice scores for each class. Notably, SAT and SM exhibited tight clustering near perfect scores, indicating consistent and accurate segmentation. IMAT and VAT, while also high, showed more variability, suggesting areas where the model may require refinement.

% Duplicate \label{app:boxplot} removed here; the label is defined once, directly after this appendix's \section.
 \begin{figure}
\centering
\includegraphics[width=0.8\textwidth]{./boxplot_genderv2.png}
\caption{Visualization of the distribution of Dice scores for each class split by gender.}
\label{fig:boxplot2}
\end{figure}

\newpage
\section{Visual Overview of Segmentations from Cases of the Internal Testset}
\label{app:overviewresults}
 \begin{figure}[h]
\centering
\includegraphics[width=0.85\textwidth]{./overview_results.png}
\caption{Representative case from the internal test set with segmentation masks generated by the final model. Top row: Segmentation of the first, middle, and last axial slices of the scan. Bottom row: Coronal and sagittal views of the generated masks.}
\end{figure}

\newpage
\section{Visualization Annotation Process}
\label{app:annotationprocess}
 \begin{figure}[h]
\centering
\includegraphics[width=\textwidth]{./overview_annotationprocess.png}
\caption{Segmentation output for two training cases from the 2D and 3D model. Column 1: Original image. Column 2: Generated masks by the 2D model. Column 3: Generated masks by the 3D model trained with noisy masks.}
\label{fig:annotationprocess}
\end{figure}

\newpage
\section{Failure Cases from Reader Study}
\label{app:failurecases}
 \begin{figure}[h]
\centering
\includegraphics[width=0.9\textwidth]{./failure_cases.png}
\caption{Axial CT slices and corresponding masks illustrating segmentation challenges. These cases received a score of 3 or less from at least one of the readers. Top row, column 1: Undersegmentation of muscle tissue is evident. Top row, column 2: A hernia complicates VAT/SAT differentiation and oversegmentation of IMAT can be seen. Top row, column 3: Misclassification of fat within the bowel as VAT. Bottom row, column 1: Water within the SAT area complicates its quantification. Bottom row, column 2: Scan artifacts lead to distorted predictions. Bottom row, column 3: Truncation of the scan affects SAT estimation accuracy.
}
\end{figure}


\end{document}
