\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{mathtools}
\usepackage{amsmath}
\usepackage{bbold}
\usepackage{algorithmic}

% Remove these packages after all the editing is done
\usepackage{comment}
\usepackage{color,soul}

% Header for extended abstracts
\jmlrproceedings{MIDL}{Medical Imaging with Deep Learning}
\jmlrpages{}
\jmlryear{2024}

% to be uncommented for submissions under review
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- 045}
\editors{Accepted for publication at MIDL 2024}

\title[Comparing Performance of Radiation Oncologists to Deep Learning Dose Predictor]{Comparing the Performance of Radiation Oncologists versus a Deep Learning Dose Predictor to Estimate Dosimetric Impact of Segmentation Variations for Radiotherapy}

\midlauthor{\Name{Amith Kamath\midljointauthortext{Contributed equally}\nametag{$^{1}$}} \Email{amith.kamath@unibe.ch} \\
\Name{Zahira Mercado\midlotherjointauthor\nametag{$^{1}$}} \Email{zahira.mercadoaufdermaur@unibe.ch}\\
\Name{Robert Poel \nametag{$^{2}$}} \Email{robert.poel@insel.ch} \\
\Name{Jonas Willmann \nametag{$^{3}$}} \Email{jonas.willmann@usz.ch}\\
\Name{Ekin Ermis \nametag{$^{2}$}} \Email{ekin.ermis@insel.ch}\\
\Name{Elena Riggenbach \nametag{$^{2}$}} \Email{elena.riggenbach@insel.ch}\\
\Name{Nicolaus Andratschke \nametag{$^{3}$}} \Email{nicolaus.andratschke@usz.ch}\\
\Name{Mauricio Reyes \nametag{$^{1, 2}$}} \Email{mauricio.reyes@unibe.ch}\\
\addr $^{1}$ ARTORG Center for Biomedical Engineering Research, University of Bern, \\
\addr $^{2}$ Department of Radiation Oncology, University Hospital Bern, University of Bern \\
\addr $^{3}$ Department of Radiation Oncology, University Hospital Zurich, University of Zurich
}

\begin{document}

\maketitle

\begin{abstract}
Current evaluation methods for quality control of manual/automated tumor and organs-at-risk segmentation for radiotherapy are driven mostly by geometric correctness. 
It is however known that geometry-driven segmentation quality metrics cannot characterize potentially detrimental dosimetric effects of sub-optimal tumor segmentation. 
In this work, we build on prior studies proposing deep learning-based dose prediction models to extend their use for the task of contour quality evaluation of brain tumor treatment planning. 
Using a test set of 54 contour variants and their corresponding dose plans, we show that our model can be used to dosimetrically assess the quality of contours and can outperform clinical expert radiation oncologists while estimating sub-optimal situations. 
We compare results against three such experts and demonstrate improved accuracy in addition to time savings. Our code is available at \href{https://github.com/ubern-mia/radonc-vs-dldp}{https://github.com/ubern-mia/radonc-vs-dldp}. 
\end{abstract}

\begin{keywords}
Image segmentation, Radiotherapy, Dosimetric clinical evaluation.
\end{keywords}

\section{Introduction}


Glioblastoma, accounting for about 45\% of brain tumors \cite{McFaline-Figueroa2018BrainTumors}, is an aggressive malignant tumor treated with surgery, radiotherapy (RT), and chemotherapy \cite{Stupp_Radiotherapy_2005}. RT aims to target the tumor while minimizing dose to healthy organs-at-risk (OAR). The planning involves a trade-off between tumor control and tissue toxicity \cite{Scaringi2018TechnicalTumors}. A critical step is the segmentation of structures, which is time-consuming when done manually and can take up to seven hours per patient in the head and neck anatomy \cite{Das2009AnalysisTherapy}. 

With advancements in deep learning-based auto-segmentation, the role of radiation oncologists is shifting from manually drawing to monitoring and correcting these automated segmentations \cite{Claessens2022QualityTherapy}. Quality checks are hence crucial since it has been reported that incorrect tumor volumes cause 25\% of non-compliant treatment plans, leading to untreated tumors or harmful radiotherapy doses \cite{Peters2010Critical02.02}. While geometric metrics like Dice score coefficient (DSC) and Hausdorff distance are currently the de-facto metrics to evaluate segmentation quality, it has been reported that they do not correlate with dosimetric effects of contouring errors \cite{Poel2021a,KoflerAreCoefficient}. In RT, it has been postulated that auto-segmentation methods must be evaluated using a diverse range of performance metrics, including impact on delivered dose \cite{Harrison2022MachinePlanning}, which ultimately impacts clinical outcome.

\noindent \textbf{Related work:} The clinical RT community has developed standards for target contouring \cite{Niyazi2023ESTRO-EANOGlioblastoma}, which mainly include geometrical and anatomical considerations. 
Dosimetry-based considerations require dose plan calculations, which are time-consuming and necessitate iterations between the radiation oncologist and dosimetrists or medical physicists. Hence, due to its time-consuming nature, dosimetric assessment of segmentation quality has not been conventionally employed in the clinics. 
Nonetheless, as recently pointed out by \cite{Claessens2022QualityTherapy}, dosimetry considerations are urgently needed in the quality control process of tumor and organs-at-risk segmentations.

Previously proposed approaches to evaluate the quality of automated segmentations include methods that predict segmentation metrics, such as DSC, \cite{Valindria2017ReverseTruth} or use Graph Neural Networks to identify segmentation errors in radiotherapy \cite{Henderson2022AutomaticLearning}. 
These approaches assume that these geometric metrics reflect the quality of dosimetry, which is not the case \cite{Poel2021a,KoflerAreCoefficient}. Furthermore, models predicting DSC perform poorly with low-quality segmentations (the main target of such a QC system) due to a lack of representative training data for this performance range.

%Uncertainty based
Other approaches have explored the use of uncertainty estimation in OAR segmentation in head and neck cancer \cite{Cubero2023ExploringSegmentation}, under the premise that high uncertainty is linked with potentially low-quality segmentations. 
However, geometric variability and uncertainty estimates may not imply dosimetric effects (e.g., high uncertainty of a contour located in a non-dosimetrically relevant area), and uncertainties based on imaging information alone may not sufficiently guide quality assurance.


%Dose predictors

In line with dosimetry-focused quality assurance, a deep learning-based dose prediction model is utilized in \cite{Roberfroid2024DIVE-ART:Therapy} to guide radiation oncologists on which volume slices require manual adjustments. This segmentation editing tool demonstrates potential for time efficiency while maintaining dosimetric equivalence with distribution maps produced without its assistance.
In \cite{Kamath_ASTRA_2023} we introduced a method that uses a deep learning-based dose predictor to assess the impact of local segmentation changes on dosimetric outcomes. 
However, this work focused on organ-at-risk segmentation, and not on tumor segmentation, which is clinically more important due to the higher complexity of this task. 

\noindent \textbf{Hypothesis and Contributions:} Beyond the state of the art, we postulate in this study that a deep learning based dose predictor can be employed within a quality control framework to detect dosimetrically worse segmentations, with levels of performance superior to human experts. We substantiate this by comparing the performance of our deep learning-based quality assurance method with that of three expert radiation oncologists, using a test dataset comprising 54 segmentation variations from brain tumor patients, and reference dose plans produced by a clinically validated treatment planning system (Varian Medical Systems Inc., Palo Alto, USA).

To the best of our knowledge, this is the first study comparing the levels of dosimetric awareness on contour modifications between human experts and a deep-learning dose predictor model. 

\section{Methods}

\begin{figure}[htbp]
\floatconts
  {fig:one}
  {\caption{Is a deep learning dose prediction model able to ascertain dosimetric impact of tumor target volume contour changes when compared to radiation oncologists? An experimental study is run with 54 contour variations which are individually re-planned to generate three categories of results: ``Worse'', ``No Change'' and ``Better''.}}
  {\includegraphics[width=0.95\linewidth]{images/figure-one.png}}
\end{figure}

Our study design, depicted in Figure~\ref{fig:one}, involves a set of reference tumor segmentations (ground-truth) and corresponding expert-derived contour variations (three to four per reference segmentation). For each reference and contour variation (n=54 pairs), dose plans are computed. Each contour is classified as \textit{``Worse''}, \textit{``No change''}, or \textit{``Better''} based on dosimetric variations relative to the reference segmentation. This classification scheme is used to evaluate the ability of three experienced radiation oncologists and the proposed deep-learning dose predictor model to accurately classify each contour variation.

We report classification metrics and time taken to perform the task.

\subsection{Data}
Our data set includes imaging and segmentation data from 100 patients diagnosed with glioblastoma.
This includes CT and Magnetic Resonance Image (MRI) T1 contrast-enhanced images, and binary segmentation masks of $13$ OARs as well as the tumor target volume. 
The OARs include the brainstem, optic chiasm, cochlea (left and right), eye (left and right), hippocampus (left and right), lacrimal gland (left and right), optic nerve (left and right), and the pituitary gland.
Each of these subjects also has a reference dose plan, calculated using a standardized clinical protocol with Eclipse (Varian Medical Systems Inc., Palo Alto, USA).
This reference is a double arc co-planar volumetric modulated arc therapy (VMAT) plan with $6$ megavolt flattening filter free beams, optimized (Varian photon optimizer version $15.6.05$) to deliver $30$ times $2$ Gray while maximally sparing the OARs. 
The dose is calculated with the AAA algorithm \cite{VanEsch2006TestingCalculation}, normalized so that $100\%$ of the prescribed dose covers $50\%$ of the target volume.
All the volumes are resampled to an isotropic $2 \times 2 \times 2$ millimeter grid of size $128^3$ voxels using PyRaDiSe \cite{Rufenacht2023PyRaDiSe:Conversion} and converted to NIfTI files to use for training and evaluation. 

\subsection{Experimental Setup}

We train a cascaded 3D U-Net dose prediction model \cite{Liu2021TechnicalRadiotherapy}, which has been previously evaluated to show a mean prediction error of 1.38 Gray \cite{Poel2023Deep-Learning-BasedContouring, Kamath2023HowSegmentation} on a subset of n=60 cases (from the original 100 cases). 
The inputs are the CT volume, the OAR and tumor binary segmentation masks (14 volumes), and the output is the dose prediction. 
Ten cases are used as validation, and $14$ are used as the test set for this study (Implementation details below). 
We save the remaining 16 cases for future evaluations.

\begin{figure}[htbp]
\floatconts
  {fig:two}
  {\caption{Example showing a tumor target volume contour change that is ``Worse'' - negatively impactful. The green overlay is the reference contour, red overlay indicates change, marked with a yellow arrow. OAR contours are shown for anatomic reference.}}
  {\includegraphics[width=1.0\linewidth]{images/figure-two.png}}
\end{figure}

\noindent \textbf{Contour modifications and replanning:} For $14$ test subjects, three to four variations to the tumor target volume are made independently by an expert using the same rationale as in \cite{Poel2021a}. The reference segmentation follows current delineation standards.
The dose is then re-planned with the same settings as earlier to construct a ground truth dose plan for each of the variations. 
This leads to 54 test scenarios, which are categorized into ``Worse'', ``No Change'' and ``Better'' from a dosimetry perspective. 
Figure~\ref{fig:two} shows how a variation (in red) looks compared to the reference (in green).

\noindent \textbf{Ground-truth dosimetric categorization of contour variations:} We define three categories of dosimetric impact based on a $10\%$ change on the maximum dose recorded within each of the 13 OARs with respect to each reference contour as computed using the commercial treatment planning system.  
A contour variation is considered as dosimetrically impactful if at least one OAR crosses this threshold. 
We define three categories: ``Worse'', ``No Change'' or ``Better'' - indicating if such a change to the tumor target volume leads to higher OAR dose, no change, or lower OAR dose as compared to the reference segmentation, respectively. 
On the set of 54 cases, this definition leads to 33 Worse, 17 No Change, and 4 Better scenarios based on the ground truth reference dose plans. 

%Automated part
\noindent \textbf{Deep learning based dosimetric categorization of contour variations:} To automate the categorization of contours, we use the trained dose prediction model on the reference contour and each analyzed contour variation, yielding dose maps $D_{ref}$ and $D_{cv}$, respectively. Each contour variation is then classified following Algorithm~\ref{Alg:1}, which follows the class definitions presented above for ground-truth generation.

\begin{algorithm}
\caption{Contour Variation Quality Classification - $Q(cv)$}
\begin{algorithmic}[1]
\FOR{each $OAR$}
\IF{$\max(D_{cv}) > \max(D_{ref})(1+\alpha T)$}
\STATE increment counter for Worse
\ELSE
\IF{$\max(D_{cv}) < \max(D_{ref})(1-\alpha T)$}
\STATE increment counter for Better
\ENDIF
\ENDIF
\ENDFOR
\IF{counter for Worse $\geq$ nOAR}
\STATE $Q(cv)$ = Worse
\ELSIF{counter for Better $\geq$ nOAR}
\STATE $Q(cv)$ = Better
\ELSE
\STATE $Q(cv)$ = No Change
\ENDIF
\end{algorithmic}
\label{Alg:1}
\end{algorithm}

Algorithm~\ref{Alg:1} examines each Organ-at-Risk (OAR) to identify those exceeding a specified percentage dose change, determined by the threshold $T$ scaled by $\alpha$. If the count of such OARs reaches the hyperparameter $nOAR$, the case is deemed ``Worse''. In the absence of dose violations, the algorithm investigates potential dose improvements (line 5). If none are found, the contour variation is labeled as ``No Change''. The hyperparameter $\alpha$ serves as a calibration parameter, akin to the decision threshold varied in ROC analysis of classification models. The hyperparameter $nOAR$ sets the model's overall sensitivity to dose violations.

\noindent \textbf{Implementation details:} Each of the two U-Nets in the cascade \cite{Liu2021TechnicalRadiotherapy} has a depth of five sets of convolution layers in the encoder and decoder with $16, 32, 64, 128, 256$ channels in the first level and twice this number in the second. 
The model input is a normalized CT volume and binary segmentation masks for each of the $13$ OARs and target volume, and predicts a continuous-valued dose volume (up-scaled from [$0$, $1$] to $0$ to $70$ Gray) of the same dimension as the input.
The loss is computed as a weighted sum of L1 losses between outputs of the first and second U-Nets versus the reference dose: $\mathit{Loss} = 0.5 \cdot \mathrm{L1}(\mathit{reference}, A) + \mathrm{L1}(\mathit{reference}, B)$,
where $A$ and $B$ are the outputs of the first and second U-Nets respectively, $\mathit{reference}$ is the reference dose and $\mathrm{L1}$ refers to the L1 loss.
The weights are randomly initialized using the `He' method. 
Training runs for $80000$ iterations and the model with the best validation dose score is saved. 
The training batch size is set to $2$ and the learning rate to $10^{-3}$ with a weight decay of $10^{-4}$. 
Data augmentation is done with random flipping and random rotation along the z-axis (in the axial plane). 
$T$ in Algorithm~\ref{Alg:1} is set to $0.1$. 
All experiments are run with PyTorch $1.12$ on an NVIDIA RTX A5000 GPU, and each training run takes approximately $24$ hours.

\noindent \textbf{Comparing against human expert baselines:} For each test case, three expert radiation oncologists evaluated contour variations and were asked to classify them using the defined dosimetric categorization.  
We use 3D Slicer (version 5.6.0) for this evaluation and show all three slice planes (see Figure~\ref{fig:appendix} in the Appendix).
We also include a 3D rendering of the geometric relationship between the OARs and the tumor target volume, highlighting where the contour change is made. We time their responses, and show all the variations for each subject simultaneously so that they can make visual comparisons against the reference for each contour variation. 

As classification metrics, we report precision and recall, and the confusion matrices. Average time to evaluate each variant is also presented to compare performance.

\section{Results}

Table~\ref{tab:one} shows the weighted average (across the three categories) precision and recall as well as the average time taken to evaluate each of the 54 variants by the three radiation oncologists (in rows marked R1, R2 and R3) as compared to the deep learning dose prediction model in the last row. 
The model achieves the highest precision of all raters and matches the best expert (R3) on recall, outperforming the other two. Notably, we underline the high inter-rater variability in performance among the three experts. Expert R3, being the most meticulous and expert rater, used significantly more time than other experts. While the proposed model tends to classify more ``No Change'' contours as ``Worse'', we view this as a beneficial trade-off. In practice, it would lead to additional checks, which is preferable to potentially overlooking increased toxicity to the patient.
The time taken by the model is dominated by two inference runs through the reference as well as the variant contours. 
Additionally, the time taken varies broadly between experts, from 19 to 138 seconds per variation, while the deep learning predictor always takes the same amount of time irrespective of the difficulty in geometry.

\begin{table}[htbp]
\centering
\begin{tabular}{c|c|c|c|l}
\cline{2-4}
\multicolumn{1}{l|}{}                     & \textbf{Precision} & \textbf{Recall} & \textbf{\begin{tabular}[c]{@{}c@{}}Time Taken\\ (per variant)\end{tabular}} &  \\ \cline{1-4}
\multicolumn{1}{|c|}{\textbf{Radiation Oncologist \#1}}         & 0.41               & 0.35            & 48 [19 - 64]s                                                                           &  \\ \cline{1-4}
\multicolumn{1}{|c|}{\textbf{Radiation Oncologist \#2}}         & 0.48               & 0.46            & 50 [28 - 100]s                                                                          &  \\ \cline{1-4}
\multicolumn{1}{|c|}{\textbf{Radiation Oncologist \#3}}         & 0.55               & \textbf{0.57}   & 71 [29 - 138]s                                                                         &  \\ \cline{1-4}
\multicolumn{1}{|c|}{\textbf{Deep Learning Dose Predictor}} & \textbf{0.57}      & \textbf{0.57}   & 30 s                                                                           &  \\ \cline{1-4}
\end{tabular}
{\caption{Precision and recall (weighted average) for each of the three expert radiation oncologists compared with model predictions. Average [min - max] time taken per variant evaluated is indicated in the last column in seconds.} \label{tab:one}}  
\end{table}

\begin{figure}[htbp]
\floatconts
  {fig:three}
  {\caption{Confusion matrices for the classifier using the dose predictor model versus the performance of three expert radiation oncologists. Sensitive predictions imply more entries in the upper triangular region, leading to further manual checks, while still saving clinician time for correctly classified variations (on the diagonal).}}
  {\includegraphics[width=0.9\linewidth]{images/figure-three.png}}
\end{figure}

Figure~\ref{fig:three} shows the confusion matrices (normalized by true category) for the three expert radiation oncologists (R1, R2 and R3) and the dose predictor model (right-most panel marked ``Prediction''). Darker green on the diagonal is better, while darker red on the off-diagonal is not. The model outperforms all three radiation oncologists in the ``Worse'' category. None of the experts mark any variant as ``Better''.

\begin{figure}[htbp]
\floatconts
  {fig:four}
  {\caption{Performance of dose predictor model on variation of $\alpha$ and number of OARs crossing the threshold based on precision and recall. We prefer models with reasonable precision and higher recall - as we want the classification to be more sensitive in catching ``Worse'' plans as opposed to missing out on those that may have ``No change''. Red circles indicate values chosen for comparing with experts.}}
  {\includegraphics[width=1.0\linewidth]{images/figure-four.png}}
\end{figure}

Figure~\ref{fig:four} shows the sensitivity of (weighted average) precision and recall to the hyperparameters $\alpha$ and $nOAR$ used to classify dosimetric impact. 
$\alpha$ (horizontal axis) ranges from $0.005$ to $0.15$ in each of the two heat maps. 
Specifically, $\alpha = 0.1$ means that the percentage change threshold for the model is $0.1$ times that used for the reference (in this case, $1\%$).
The vertical axis is $nOAR$, where smaller values make the model more sensitive and strict, and larger values increase model robustness while trading off sensitivity. 
Both the precision and recall metrics show a reasonably smooth variation, except that the precision values drop significantly for small $\alpha$ and $nOAR$. As a good trade-off, we chose $\alpha = 0.1$ and $nOAR = 3$ for the results presented in Figure~\ref{fig:three} and Table~\ref{tab:one}.

\section{Discussion and Conclusion}

Radiation oncologists indicate that their mental model emphasizes proximity to OARs (closer and in the line of sight between the tumor target and OARs are more impactful) and the size of the variation (larger causes higher residual dose to OARs). 

\begin{figure}[htbp]
\floatconts
  {fig:five}
  {\caption{Two exemplar situations. Each set is shown as a 3D render, reference dose plan (axial) and predicted dose (axial). Left half: all three experts mark ``No Change'' due to its posterior nature (yellow arrow) away from the OARs, while the model predicts correctly. Right half: experts mark correctly as ``No Change'' while the model incorrectly flags it as ``Worse''. The yellow arrow shows the beam artifacts in the reference dose plan which are not replicated by our model.}}
  {\includegraphics[width=1.0\linewidth]{images/figure-five.png}}
\end{figure}

Figure~\ref{fig:five} demonstrates two exemplar situations that we use to showcase the strengths as well as weaknesses of our proposed idea.
The left half shows one such condition where the model correctly classifies a condition that conventionally would be considered to be not impactful. 
Conversely, the right half shows a situation where the model overestimates the severity.
This can be attributed to the predicted dose being a smooth proxy to the actual beam structure.
Recent advances \cite{Teng2024Beam-wiseRadiotherapy} aim to account for this effect. 

We present a novel dosimetry-driven quality control framework in which our dose predictor model outperforms human experts, indicating a promising baseline on which to build improvements. This work establishes a human clinician baseline, upon which we plan to work on the next set of evaluations where radiation oncologists are shown assistive maps like \cite{Kamath_ASTRA_2023} to measure if their performance improves both in time and accuracy.

\midlacknowledgments{We acknowledge funding by Swiss Cancer Research (KFS-5127-08-2020). We report no financial relationship or conflicts of interest.}

\bibliography{midl24_045}

\appendix

\newpage 

\section{User Interface for Expert Evaluations}

\begin{figure}[htbp]
\floatconts
  {fig:appendix}
  {\caption{User interface for radiation oncologists to review and classify contour changes (selecting variants via left panel) with three slice plane views and 3D volume rendering. 3D Slicer version 5.6.0.}}
  {\includegraphics[width=0.85\linewidth]{images/figure-appendix.png}}
\end{figure}

\end{document}
