% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{multirow}
\usepackage{graphicx}
\usepackage{xcolor}
\usepackage{booktabs}


\newlength\savewidth\newcommand\shline{\noalign{\global\savewidth\arrayrulewidth
  \global\arrayrulewidth 1pt}\hline\noalign{\global\arrayrulewidth\savewidth}}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
\usepackage[pagebackref=true,breaklinks=true,colorlinks,bookmarks=false]{hyperref}
%
\begin{document}
%
\title{3D Swin Transformer for Partial Medical Auto Segmentation}
%
%\titlerunning{Abbreviated paper title}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Aneesh Rangnekar~\orcidID{0000-0002-0079-9495}, Jue Jiang, Harini Veeraraghavan}
%
\authorrunning{Rangnekar et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{Memorial Sloan Kettering Cancer Center \\
\email{rangnea@mskcc.org}}
%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
Transformers are the highest accuracy segmentation frameworks in computer vision for natural imagery from the past few years. In contrast, medical imaging approaches, except a select few (for example, SwinUNETR and SMIT), are still dominated by the nnU-Net architecture family. In this paper, we investigate the application of a hierarchical vision transformer to the FLARE-23 challenge.

Specifically, we benchmark our results using a relatively lightweight architecture, Swin-X Seg. We use multi-model self-training, wherein we use nnU-Net for predicting pseudo labels on partially labeled cases and then optimize the transformer architecture for memory requirements. Our network achieved average DSC scores of 83.13\% and 35.19\% on the open validation set (50 cases) for organs and tumors, respectively, while staying under a maximum GPU memory utilization of 4~GB at evaluation runtime. Our results show that there is potential for the transformer architecture to perform on par with or better than conventional convolutional approaches, and we hope our findings encourage more research in the area.
\keywords{Auto Segmentation  \and Self-training \and Swin Transformer.}
\end{abstract}

\section{Introduction}
Accurate, fast, and automated volumetric segmentation of organs and tumors is essential for radiotherapy treatment planning. It often constitutes one of the time-consuming parts of radiation treatment planning workflows~\cite{Vandewinckele2020}. Abdominal organs are particularly time-consuming to segment owing to the presence of a large number of organs as well as due to the random and large variation in the appearance and shape of gastrointestinal organs and limited soft-tissue contrast on clinically used computed tomography (CT) images. Hence, deep learning methods to generate segmentation are under active development~\cite{landman2015miccai,antonelli2022medical}.

Deep learning methods have shown the capability to generate multi-organ segmentation for abdomen~\cite{nnUNet,jiang2022self,tang2022self,Amjad2022} and other disease sites. The availability of well-curated public challenge datasets~\cite{landman2015miccai,antonelli2022medical} has enabled the evaluation of various methods using the same reference benchmark with well-defined metrics. However, the fundamental prerequisite for training these networks, namely well-curated pixel-wise annotations or volumetric segmentations of the various organs, is expensive and time-consuming to generate on large datasets. One recent promising approach to alleviate the need for large, curated datasets is self-supervised pretraining followed by fine-tuning, which has demonstrated success in medical image analysis, mainly when using transformer-based architectures~\cite{tang2022self,jiang2022self}. Swin UNETR~\cite{tang2022self} and SMIT~\cite{jiang2022self} have shown that using self-supervised learning (SSL) improves the performance of transformer-based networks on semantic segmentation, as compared to training the networks from scratch. Our approach builds on these methods and utilizes a transformer architecture~\cite{liu2021swin} for segmentation with a pretraining step (self-supervised learning) using labeled and unlabeled examples followed by fine-tuning.

We also follow the FLARE-23 rules, whereby, unlike prior works~\cite{tang2022self,jiang2022self}, which used a large number of CT scans from various disease sites for pretraining, we used only the 4,000 example scans provided as part of the training set for self-supervised pretraining. Furthermore, in keeping with the requirements for using a relatively small architecture with limited memory requirements, we also constructed a lightweight transformer architecture.

Our learning framework uses multi-model self-training~\cite{xie2020self,yang2021st++,rangnekar2022semantic}, where the teacher is a fine-tuned nnU-Net~\cite{FLARE22-1st-Huang} that generates pseudo labels for the various categories. The student is a segmentation network with a Swin transformer backbone~\cite{liu2021swin} (hereafter referred to as Swin-X Seg) that accepts a combination of FLARE-23 and pseudo labeled examples for fine-tuning (Fig.~\ref{fig:summarypipeline}). Our initial studies show that naively using the partially labeled dataset, with a transformer backbone to obtain pseudo labels, results in poor performance across multiple categories~\cite{touvron2021training,cao2022training,weng2022semi}.
Hence, we resort to this combination of semi-supervised learning, wherein the teacher is an nnU-Net and the student is Swin-X Seg.

Our approach allows us to utilize the partially labeled training dataset to its fullest extent, while leveraging fundamental augmentation techniques shown to be effective in natural image analysis. This mitigates the need for complex approaches like CutMix~\cite{yun2019cutmix} or ClassMix~\cite{Olsson_2021_WACV}, wherein extensive registration would be required before mixing two 3D scans so that the networks do not lose understanding of organ placements, especially with architectures that rely heavily on positional information.

Our key contributions are (a) a lightweight 3D vision transformer applied to multi-organ and tumor segmentation, (b) an SSL approach that extends prior works by learning the downstream task using partial labels, and (c) the application of this approach to the open-source FLARE-23 dataset.

\begin{figure}[t]
\centering
\includegraphics[scale=0.33]{imgs/pipeline-crop.pdf}
\caption{Our three-stage pipeline: (a) self-supervised training of the backbone network~\cite{jiang2022}, (b) supervised fine-tuning using a combination of pseudo labels ($M_{Pseudo}$)~\cite{FLARE22-1st-Huang} and FLARE-23 provided annotations ($M_{GT}$) to obtain refined labels ($M_{Refined}$) for learning segmentation, and (c) inference on a new unseen volumetric scan.}
\label{fig:summarypipeline}
\end{figure}

\section{Method}
\subsection{Overview} 
We studied the performance of a hierarchical vision transformer-based U-Net architecture on the FLARE-23 challenge. Vision transformers require large amounts of data~\cite{touvron2021training,cao2022training,weng2022semi,kirillov2023segment} to achieve high generalization performance. Hence, FLARE-23, which consists of 4,000 training images, provides a nice test bed for evaluating vision transformer architectures. However, 1,800 CTs in FLARE-23 are unlabeled, with the remaining 2,200 CTs provided with partial labels, wherein some but not all of the 14 segmentation targets (13 organs and tumors) were segmented, which makes supervised training challenging. Therefore, we used a two-step training approach consisting of: (i) self-supervised pretraining performed on the entire dataset of 4,000 CTs without using any segmentations for supervised training, and (ii) supervised fine-tuning that combined fully labeled CTs together with CTs with pseudo labels created using a different model. We discuss each part of our approach in detail, and the specificities involved in our final implementation.

%###########################
\subsection{Preprocessing}

We used the following preprocessing steps in all our experiments:
\begin{itemize}
    \item Reorient the scans to the right-anterior-superior (RAS) view.
    \item Clip the intensities based on the Hounsfield units to $[-250, 250]$.
    \item Resample all scans to $x,y,z$ volumetric spacings of $1.0,1.0,1.0$ during training and inference.
    \item Randomly sample 4 patches of size 96 $\times$ 96 $\times$ 96 from each scan as training examples, representing 2 positive and 2 negative samples for the network at every instance.
\end{itemize}

\subsection{Proposed Method}

\noindent\textbf{Choice of Transformer:} 

Hierarchical Vision
Transformers~\cite{liu2021swin,fan2021multiscale} are pyramid-shaped architectures that rely on gradual down-sampling, similar to convolutional neural networks, while maintaining a global look-out with their multi-scale designs. We use the Swin-Transformer backbone for our approach as it has been widely adopted for 3D medical auto segmentation~\cite{tang2022self,jiang2022self} and shown to be more accurate than the vanilla vision transformer~\cite{dosovitskiy2020image}.

Swin UNETR~\cite{tang2022self} and SMIT~\cite{jiang2022self} have over 60 million (M) parameters. Whereas Swin UNETR processes data at 96 $\times$ 96 $\times$ 96, SMIT processes data at 128 $\times$ 128 $\times$ 128 resolution. Both methods use sliding windows for generating final inference. The FLARE-23 constraints require memory-efficient inference. A straightforward memory-efficient approach to reduce the total number of FLOPs used for inference would be to utilize CT scans reduced to 96 $\times$ 96 $\times$ 96 voxels, at the risk of decreasing the image resolution, which can impact accuracy for smaller organs. Hence, we instead reduced the number of parameters used in the network by decreasing the total number of blocks per depth to the final 2-2-2-2 configuration as well as reducing the total number of channels through the UNETR architecture using 1$\times$1 convolutions. This reduced the network size from 60M parameters to 31M parameters, a relatively lightweight architecture compared to current state-of-the-art methods. This is also crucial towards keeping the GPU requirements under 4~GB as stipulated under FLARE-23 rules.

\noindent\textbf{Self-supervised Learning:}
The SSL approach made use of the self-distillation-based pretext tasks used in SMIT~\cite{jiang2022self}, namely Masked Image Prediction (MIP), Masked Patch token self-Distillation (MPD), and global Image Token self-Distillation (ITD). SMIT performs self-distillation by concurrently maintaining an online teacher model (${NET}_T$) with the same network architecture as the student model (${NET}_S$)~\cite{antti2017meanteacher}. The loss functions used to optimize the network are briefly discussed here, and we refer interested readers to the original paper~\cite{jiang2022self} for more details.

Suppose $\{x_1,x_2\}$ are two augmented views of a 3D image $x$. $N$ image patches are extracted from the images to create a sequence of image tokens~\cite{dosovitskiy2020image}. The image tokens of the first view $x_1$ are then corrupted by randomly masking image tokens based on a binary vector, with a probability $p$, and then replacing the masked tokens with a mask token~\cite{bao2021beit}. The second augmented view $x_2$ is also corrupted, but using a different mask vector instance. The three losses then deal with the views in the following manner:

\begin{itemize}
    \item \textbf{Masked image prediction (MIP):} $\rightarrow$ $x_1$, ${NET}_S$, involves dense pixel regression of image intensities within masked patches using the context of unmasked patches~\cite{he2022masked}.
    \item \textbf{Masked patch token self-distillation (MPD):} $\rightarrow$ $x_1$, ${NET}_S$, ${NET}_T$, trains the student network to predict the tokens of the teacher network (distillation).
    \item \textbf{Global image token self-distillation (ITD):} $\rightarrow$ $x_1, x_2$, ${NET}_S$, ${NET}_T$, learns to match the global image embedding of the view-scan seen by the student network to the view-scan seen by the teacher network.  
\end{itemize}

SSL training is performed by optimizing the network using all three aforementioned losses. FLARE-23 rules dictate that no external data be used. Hence, following the rules, SSL used the same 4,000 CTs provided as part of the training set. No segmentations provided with the data were used for network optimization in this step.

\noindent\textbf{Supervised Fine-tuning}:

In order to fully utilize all available training data to improve accuracy, we used the best performing nnU-Net model, the winner of FLARE22~\cite{FLARE22-1st-Huang}, to provide pseudo labels for the partially labeled and unlabeled scans in the FLARE-23 training set. We only use 735 examples from the 2,200 images that contain a labeled instance of tumor, with the combination of FLARE-23 annotations and nnU-Net pseudo labels (Fig.~\ref{fig:summarypipeline}). We trained our network using a combination of Dice loss and cross-entropy loss following previous approaches~\cite{LossOdyssey,nnUNet,tang2022self,jiang2022self}.

\subsection{Post-processing}

No data specific post processing was used following pixel-level classifications generated by the segmentation methods. Sliding window inference with 50\% overlap was used for generating segmentations for the whole 3D image volumes.

\section{Experiments}
\subsection{Dataset and evaluation measures}
The FLARE-23 challenge is an extension of the FLARE 2021--2022 challenges~\cite{MedIA-FLARE21,FLARE22}, aiming to promote the development of foundation models in abdominal disease analysis. The segmentation targets cover 13 organs and various abdominal lesions around the organs. The dataset comprises scans from more than 30 medical centers, including TCIA~\cite{TCIA}, LiTS~\cite{LiTS}, MSD~\cite{simpson2019MSD}, KiTS~\cite{KiTS,KiTSDataset}, autoPET~\cite{autoPET-Data,autoPET-MICCAI22}, TotalSegmentator~\cite{TotalSegmentator}, and AbdomenCT-1K~\cite{AbdomenCT-1K}, with appropriate licensing. The training set includes 4,000 abdomen CT scans: 2,200 CT scans with partial segmentation labels and 1,800 CT scans without any segmentation labels. The validation and testing sets include 100 and 400 CT scans, respectively, covering various abdominal cancer types, such as liver, kidney, pancreas, colon, and gastric, to name a few. The organ annotation process used ITK-SNAP~\cite{ITKSNAP}, nnU-Net~\cite{nnUNet}, and MedSAM~\cite{MedSAM}.

The evaluation metrics encompass two accuracy measures—Dice Similarity Coefficient (DSC) and Normalized Surface Dice (NSD)—alongside two efficiency measures—running time and instantaneous GPU maximum memory consumption.

\subsection{Implementation details}
\subsubsection{Environment settings}
The development environments and requirements are presented in Table~\ref{table:env}. We provide all the requirements in our released codebase on GitHub.

\begin{table}[t]
\caption{Development environments and requirements.}\label{table:env}
\centering
\def\arraystretch{1.25}
\begin{tabular}{ll}
\hline
System       & Ubuntu 18.04.5 LTS\\
\hline
CPU   & AMD EPYC 7543P 32-Core Processor @ 2.8 GHz \\
\hline
RAM & 128 GB\\
\hline
GPU (number and type)                         & NVIDIA A100 80 GB $\times$ 4 \\
\hline
CUDA version                  & 11.8\\                          \hline
Programming language                 & Python 3.8 \\ 
\hline
Deep learning framework & PyTorch 1.13 + CUDA 11.7~\cite{paszke2019pytorch} \\
\hline
Specific dependencies         & MONAI, SimpleITK, Nibabel  \\ \hline
Code & \url{https://github.com/The-Veeraraghavan-Lab/FLARE23} \\ \hline
\end{tabular}
\end{table}

\subsubsection{Training protocols}

The model training protocols are shown in Table~\ref{tab:training}. An image patch size of 96 $\times$ 96 $\times$ 96 with random 3D flips performed on the data to provide augmented samples was used for network training.

\begin{table*}[t]
\caption{Training protocols.}
\label{tab:training}
\def\arraystretch{1.25}
\begin{center}
%\resizebox{0.8\textwidth}{!}{
\begin{tabular}{ll}
\hline
Network initialization & SSL-FLARE-23~\cite{jiang2022self} \\ \hline
Batch size & 4 \\ \hline
Patch size & 96 $\times$ 96 $\times$ 96 \\ \hline 
Total epochs & 100 \\ \hline
Optimizer & AdamW~\cite{loshchilov2017decoupled} \\ \hline 
Initial learning rate (lr)  & 2$e$-4 \\ \hline 
Lr decay schedule & Linear Warmup with Cosine Annealing~\cite{loshchilov2016sgdr,goyal2017accurate} \\ \hline
Training time  & 33 hours \\ \hline 
Loss function & Cross-Entropy Loss with Dice Loss\\ \hline
 \end{tabular}
%}
\end{center}
\end{table*}

\section{Results and discussion}

\begin{table}[t]
\caption{Quantitative evaluation results. Segmentation accuracy results (DSC and NSD with mean and standard deviation) are reported on the publicly provided 50 validation cases made available by the FLARE-23 organizers. 
}\label{tab:final-results}
\def\arraystretch{1.25}
\centering
\small
%\resizebox{0.8\textwidth}{!}{%
\begin{tabular}{l|ll}
\hline
\multirow{2}{*}{Target} & \multicolumn{2}{l}{Public Validation} \\ \cline{2-3} 
 & DSC(\%) & NSD(\%) \\ \hline
Liver & 96.08 $\pm$ 4.230 & 93.58 $\pm$ 10.66 \\
Right Kidney & 87.00 $\pm$ 20.81 & 83.37 $\pm$ 21.81 \\
Spleen & 93.24 $\pm$ 9.730 & 90.92 $\pm$ 14.23 \\ 
Pancreas & 80.47 $\pm$ 7.860 & 89.99 $\pm$ 7.020 \\ 
Aorta & 90.55 $\pm$ 14.80 & 91.61 $\pm$ 16.30 \\
Inferior vena cava & 87.88 $\pm$ 6.800 & 86.97 $\pm$ 9.300 \\
Right adrenal gland & 77.35 $\pm$ 17.46 & 87.78 $\pm$ 19.00 \\
Left adrenal gland & 72.44 $\pm$ 15.83 & 82.03 $\pm$ 16.59 \\
Gallbladder & 75.61 $\pm$ 28.21 & 71.61 $\pm$ 30.06 \\
Esophagus & 74.81 $\pm$ 16.56 & 84.85 $\pm$ 15.99 \\
Stomach & 89.17 $\pm$ 9.110 & 87.60 $\pm$ 11.85 \\
Duodenum & 70.78 $\pm$ 10.77 & 84.21 $\pm$ 9.240 \\
Left kidney & 85.65 $\pm$ 21.81 & 82.33 $\pm$ 23.22 \\
Tumor & 35.19 $\pm$ 30.17 & 22.99 $\pm$ 22.10 \\ \hline
Average (Organ) & 83.13 $\pm$ 8.440 & 85.55 $\pm$ 12.58 \\
Average & 79.70 $\pm$ 11.43 & 81.08 $\pm$ 14.93 \\ \hline
\end{tabular}%
%}
\end{table}

\begin{table}[t]
\caption{Quantitative evaluation of segmentation efficiency of the reported cases using running time and maximum GPU memory consumption ($<$ 4096 MB). Evaluation GPU platform: A100 (80GB).
}\label{tab:gputime}
\def\arraystretch{1.25}
\centering
\begin{tabular}{cccc}
\hline
Case ID & Image Size      & Running Time (s) & Max GPU (MB) \\ \hline
0001    & (512, 512, 55)  & 28.01       & 3464   \\
0051    & (512, 512, 100) & 65.86       & 3850   \\
0017    & (512, 512, 150) & 73.94       & 3896   \\
0019    & (512, 512, 215) & 48.00       & 3616   \\
0099    & (512, 512, 334) & 69.28       & 3756   \\
0063    & (512, 512, 448) & 84.76       & 3776   \\
0048    & (512, 512, 499) & 74.73       & 3748   \\
0029    & (512, 512, 554) & 102.5      & 4032   \\ \hline
\end{tabular}
\end{table}

\subsection{Quantitative results on validation set}

Table~\ref{tab:final-results} shows our Swin-X Seg's performance on the 50 validation cases provided by the FLARE-23 organizers. The network was slightly less accurate ($<$ 80\% DSC) for organs such as the adrenal glands, gallbladder, esophagus, duodenum, as well as for tumors compared to larger organs like the liver, spleen, left and right kidneys, and the stomach. The tumor segmentation accuracy was low because of the larger variability in the types of tumors analyzed and the relatively few examples with complete labels. Overall, the network accuracy was lower for smaller organs like the adrenal glands and gallbladder when compared to larger organs like the liver. Poor accuracy for organs also resulted when they were adjacent to the tumors.

Table~\ref{tab:gputime} shows that inference requirements of under 4~GB GPU memory consumption were satisfied for all cases. However, all cases except two (0001, 0019) failed to satisfy the running time requirement of under 60 seconds owing to sliding window-based inference with 50\% overlap. A natural option is to use sliding window inference without any overlap (0\%). However, this results in a poor overall score (77\% DSC average on organ, 27\% DSC on tumor); hence, we did not pursue it. In addition, we optimized for test-time efficiency by performing foreground thresholding to use only the body regions for analysis by ignoring the surrounding air for inference. Our analysis showed that in cases with a larger field of view, wherein the body occupied a higher volume, the inference time increased (e.g., 0017 $>$ 0019, 0063 $>$ 0048).

\subsection{Qualitative results on validation set}

Figures~\ref{fig:goodseg} and~\ref{fig:badseg} show the segmentations generated by our network on representative examples taken from the validation set of FLARE-23. As shown in Fig.~\ref{fig:goodseg}, whereas the model tends to consistently segment the normal tissues with high accuracy, misclassifications occur within tumor regions, with tumor voxels classified as the kidney, despite achieving a relatively high DSC accuracy for the tumors. The higher DSC accuracy for tumors is not surprising given the larger tumor volumes. On the other hand, as shown in Fig.~\ref{fig:badseg}, for really large tumors such as \#0057 and \#0095, the algorithm generated highly inaccurate segmentations, misclassifying the tumors occurring on the left side of the anatomy as liver. \#0027 shows an example where the kidney tumor was correctly segmented together with the kidney adjacent to the tumor, although the esophagus occurring distally to the pancreatic head was misclassified as pancreas. Similarly, in \#0089, the pancreas is oversegmented by the model, whereas the kidney tumor encased within the kidney is undersegmented, highlighting the challenges, particularly when the tumor and the healthy tissues are adjacent to each other.


\begin{figure}[htbp]
\centering
\includegraphics[scale=0.4]{imgs/imgs_good-crop.pdf}
\caption{Example scans showing relatively good performance in terms of misclassifications by the trained Swin-X Seg model. DSC\_T refers to tumor DSC and DSC\_O refers to average multi-organs DSC.}
\label{fig:goodseg}
\end{figure}

\begin{figure}[htbp]
\centering
\includegraphics[scale=0.4]{imgs/imgs_bad-crop.pdf}
\caption{Example scans showing relatively poor performance in terms of misclassifications by the trained Swin-X Seg network. DSC\_T refers to tumor DSC and DSC\_O refers to average multi-organs DSC.}
\label{fig:badseg}
\end{figure}

\subsection{Segmentation efficiency results on validation set}
We optimized for segmentation inference efficiency by extracting the foreground or the body as a preprocessing step using standard image thresholding. No additional optimization was performed in terms of training or testing. Even this simple approach showed that it is possible to improve inference efficiency as seen in Table~\ref{tab:gputime}.

\subsection{Results on final testing set}
This is a placeholder. We will send you the testing results during MICCAI (2023.10.8). (This is to be left as is.)

\subsection{Limitation and future work}

Our goal was to evaluate the capability of a transformer-based approach for multi-organ and tumor segmentation. We used a relatively lightweight architecture (31M parameters) in order to satisfy the memory requirements of the competition as well as to study to what extent such methods are successful in comparison to convolution-based approaches such as the nnU-Net used in the previous iteration of the competition~\cite{FLARE22-1st-Huang,FLARE22-bestDSC-Wang}. Our approach of using nnU-Net-generated pseudo labels was motivated by prior results using Semiformer~\cite{weng2022semi}, which showed that the poor accuracy of vision transformers (ViTs) trained with few labeled samples can be improved when combined with pseudo labels produced by convolutional neural networks (CNNs). However, ViTs have generally been shown to be more accurate than CNN models. Hence, one alternative is to use a ViT instead of a CNN for providing pseudo labels. It is important to note that the approach combining pseudo labels with CNN and larger ViT models becomes impractical due to increasing memory needs. Another limitation of our approach is the poor segmentations we observed on the tumor and tissue interface, which we plan to address in the future.

\section{Conclusion}

We presented our approach, multi-model self-training, that used nnU-Net to generate pseudo labels and then Swin transformer to establish a foundation for research into auto segmentation with pseudo labels. In addition, we also identify limitations and discuss research approaches to mitigate them, including knowledge distillation and semi-supervised learning. We believe that our framework serves as a good foundation for further research into efficient
network designs and methodology for accurate medical image segmentation.

\subsubsection{Acknowledgements} The authors of this paper declare that the segmentation method they implemented for participation in the FLARE-23 challenge has not used any pre-trained models and additional datasets other than those provided by the organizers. The proposed solution is fully automatic without any manual intervention. We thank all the data owners for making the CT scans publicly available and CodaLab~\cite{codalab} for hosting the challenge platform. This research was partly funded through NCI grant R01CA258821-01A1 and the Memorial Sloan Kettering (MSK) Cancer Center Support Grant/Core Grant NCI P30 CA008748.


%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{ref,ref_cv}

\newpage
% Please add the following required packages to your document preamble:
% \usepackage[normalem]{ulem}
% \useunder{\uline}{\ul}{}
\begin{table}[!htbp]
\caption{Checklist Table. Please fill out this checklist table in the answer column.}
\centering
\begin{tabular}{ll}
\hline
Requirements                                                                                                                    & Answer        \\ \hline
A meaningful title & Yes                        \\ \hline
The number of authors ($\leq$6) & 3             \\ \hline
Author affiliations and ORCID & Yes             \\ \hline
Corresponding author email is presented & Yes   \\ \hline
Validation scores are presented in the abstract & Yes \\ \hline
\begin{tabular}[c]{@{}l@{}}Introduction includes at least three parts: \\ background, related work, and motivation\end{tabular} & Yes \\ \hline
A pipeline/network figure is provided & Fig. \ref{fig:summarypipeline} \\ \hline
Pre-processing & Pages 3, 4 \\ \hline
Strategies to use the partial label & Page 5 \\ \hline
Strategies to use the unlabeled images & Page 5 \\ \hline
Strategies to improve model inference & Page 4 \\ \hline
Post-processing & Pages 5, 6 \\ \hline
Dataset and evaluation metric section is presented & Page 5 \\ \hline
Environment setting table is provided & Table \ref{table:env} \\ \hline
Training protocol table is provided & Table \ref{tab:training} \\ \hline
Ablation study & N/A \\ \hline
Efficiency evaluation results are provided & Table \ref{tab:gputime} \\ \hline
Visualized segmentation example is provided & Figures \ref{fig:goodseg}, \ref{fig:badseg} \\ \hline
Limitation and future work are presented & Yes \\ \hline
Reference format is consistent.  & Yes \\ \hline

\end{tabular}
\end{table}

\end{document}
