\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images

% add new packages here
\usepackage{todonotes}
\usepackage{hyperref}
\hypersetup{colorlinks,linkcolor={blue},citecolor={green},urlcolor={red}}  
\usepackage{multirow}
\usepackage{multicol}
\usepackage{booktabs}
\usepackage{makecell}
% \pagestyle{empty}% for cropping
% \usepackage[demo]{graphicx}

\newcommand{\HV}[1]{\textcolor{blue}{#1}}
\newcommand{\HVc}[1]{\textcolor{blue}{[Huy: #1]}}




\jmlrvolume{}
%\editors{Under Review for MIDL 2021}

\jmlryear{2021}
\jmlrworkshop{Full Paper -- MIDL 2021}


\title[Improving Weakly Supervised Lesion Segmentation Using Multi-Task Learning]{Improving Weakly Supervised Lesion Segmentation using Multi-Task Learning}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
%   \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
%   \Name{Author Name2} \Email{xyz@sample.edu}\\
%   \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


 %Authors with different addresses:
%Tianshu Chu, Xinmeng Li, Huy V. Vo, Ronald M. Summers, Elena Sizikova
\midlauthor{
 \\ \Name{Tianshu Chu*} \Email{tc2992@nyu.edu}\\
 \addr New York University
 \AND
 \Name{Xinmeng Li*} \Email{xl1575@nyu.edu}\\
 \addr New York University
 \AND
 \Name{Huy V. Vo} \Email{van-huy.vo@inria.fr}\\
 \addr Ecole Normale Superieure, INRIA and Valeo.ai
 \AND
 \Name{Ronald M. Summers} \Email{rsummers@cc.nih.gov}\\
 \addr National Institutes of Health Clinical Center 
 \AND
 \Name{Elena Sizikova} \Email{es5223@nyu.edu}\\
 \addr New York University \\
 * - contributed equally}

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
% \midlauthor{\Name{Author Name1\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \Email{abc@sample.edu}\\
% \addr $^{1}$ Address 1 \\
% \addr $^{2}$ Address 2 \AND
% \Name{Author Name2\midlotherjointauthor\nametag{$^{1}$}} \Email{xyz@sample.edu}\\
% \Name{Author Name3\nametag{$^{2}$}} \Email{alphabeta@example.edu}\\
% \Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{3}$}} \Email{uvw@foo.ac.uk}\\
% \addr $^{3}$ Address 3 \AND
% \Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
% \addr $^{4}$ Address 4
% }
 %\midlauthor{Anonymous}

\begin{document}

\maketitle

\begin{abstract}
We introduce the concept of multi-task learning to weakly-supervised lesion segmentation, one of the most critical and challenging tasks in medical imaging. Due to the lesions' heterogeneous nature, it is difficult for machine learning models to capture the corresponding variability. We propose to jointly train a lesion segmentation model and a lesion classifier in a multi-task learning fashion, where the supervision of the latter is obtained by clustering the RECIST measurements of the lesions. We evaluate our approach specifically on liver lesion segmentation and more generally on lesion segmentation in computed tomography (CT), as well as segmentation of skin lesions from dermatoscopic images. We show that the proposed joint training improves the quality of the lesion segmentation by 4\% percent according to the Dice coefficient and 6\% according to averaged Hausdorff distance (AVD), while reducing the training time required by up to 75\%. 
\end{abstract}

\begin{keywords}
Lesion Localization, Weakly Supervised Learning, Multi-Task Learning
\end{keywords}

% \vspace{-0.3cm}
\section{Introduction}
% goal + challenge
Assessing lesion and tumor growth is a central problem in medical imaging for oncology, required for assessing cancer burden and aiding radiologists in accurately labeling important findings. To analyze lesions, radiologists typically manually annotate computed tomography (CT) scans containing lesions with response evaluation criteria in solid tumors (RECIST) measurements, consisting of the major and minor axes of the best-fit ellipse that coarsely describes the lesion segmentation mask~\cite{eisenhauer2009new}. This process, however, does not provide accurate pixel-level segmentation that would be used to monitor lesion shape, and may prove crucial to identifying any abnormalities.  

% overview 
In this work, we propose a new automatic method for predicting lesion segmentations that leverages existing RECIST annotations~\cite{yan2018deeplesion} as training data in a weakly-supervised learning setting. In particular, we show that jointly segmenting and classifying lesions according to their shapes in a multi-task learning fashion achieves better segmentation accuracy than learning a segmentation network alone. This insight is motivated by the successful application of multi-task learning in other domains and   problems~\cite{zhang2017survey,standley2020tasks},  and the idea that clustering provides a strong supervision signal for training computational models~\cite{caron2018deep}. Unlike other recent weakly supervised lesion segmentation  methods~\cite{agarwal2020weakly, agarwal2020weakly1}, we do not use co-segmentation, and instead allow the model to learn appropriate lesion similarities via clustering, thereby improving segmentation quality and reducing training time. We generate class labels by clustering lesions using their RECIST measurements and learn a model to simultaneously classify and segment input CT slices with lesions. We compare our approach to the state-of-the art medical imaging weakly supervised baseline~\cite{agarwal2020weakly} on CT and skin lesion segmentation and show that our method generates more accurate segmentations while requires significantly less training time. 


% contributions
Our contributions can be summarized as follows. We propose a new joint classification and localization scheme for training weakly-supervised lesion segmentation models. We demonstrate how to modify existing segmentation architectures to incorporate this new algorithm. We conduct a systematic analysis of the utility of the proposed new methodology and show that it quantitatively and qualitatively outperforms prior baselines in this domain. 

% \vspace{-0.4cm}
\section{Related Work}
% \vspace{-0.1cm}
%\subsection*{Weakly-Supervised Segmentation in Medical Imaging}
Manually producing dense pixel-wise segmentations for medical images is a time-consuming task that requires domain expertise. Hence, large-scale datasets required for training automatic segmentation models are not available for many medical imaging tasks~\cite{guo2018review, tajbakhsh2020embracing}. On the other hand, weakly-supervised methods allow for the use of class labels~\cite{hu2020weakly} or coarse segmentation masks~\cite{agarwal2020weakly,xie2020mutual}. In particular, response evaluation criteria in solid tumors (RECIST) annotations~\cite{eisenhauer2009new} can be used in lieu of pixel-wise segmentation for training automatic segmentation models. RECIST measurements are often stored in hospital picture archiving and communication systems (PACS) and are annotated manually by radiologists during screening procedures~\cite{cai2018accurate}. Annotations in the form of clicks~\cite{roth2020going} or bounding boxes~\cite{rajchl2016deepcut} are also popular. Finally, weakly-supervised learning is often combined with interactive techniques to speed up image annotation during organ screening~\cite{roth2020going}.
%\textbf{add content and multi-task learning}

Machine learning models trained on a single task can ignore useful domain information that is contained in the training signals of other related, but different tasks, which hinders their generalization ability and performance. Multi-task learning (MTL) is an approach to alleviate this limitation~\cite{Caruana1998}. MTL jointly trains multiple relevant tasks, usually with a shared presentation, to simultaneously improve their performance. Recently, MTL has been proven effective in multiple machine learning applications: Natural language processing~\cite{liu2019multi}, speech recognition~\cite{DengICASSP2013}, stock prediction~\cite{zhou2020stock}, computer vision~\cite{zhou2020pattern} and medical image analysis~\cite{le2019multitask}. In particular, MTL can be used to improve the performance of a single task by jointly training it with other weaker tasks whose training supervision can be obtained from the supervision of the main task~\cite{GirshickICCV2015, LeeCVPR2019}. Our work belongs to this particular branch of MTL. In medical imaging, MTL has also been employed for lesion segmentation~\cite{yang2017novel} or analyzing COVID-19 CT images~\cite{amyar2020multi} but these work require exact class labels for all training images and dense segmentations for at least a part of them. On the other hand, our work use only coarse segmentation supervision (RECIST measurements), and extracts class labels in an unsupervised manner.

% \vspace{-0.3cm}
\section{Multi-task Learning Approach for Weakly-Supervised Segmentation}
We now describe our multi-task model for weakly supervised lesion segmentation. An overview of our proposed method can be seen in Figure~\ref{fig:overview}. 

\begin{figure}[t]
\centering
  \includegraphics[width=1.0\linewidth]{images/overview3.pdf}
  \caption{\footnotesize Overview of the proposed joint classification localization network. We first extract class labels by clustering the lesion RECIST shapes into $K$ classes (shown with $K=4$). We then train a joint segmentation and classification model using the extracted labels.}
%   \vspace{-0.8cm}
  \label{fig:overview}
\end{figure}

\noindent{\textbf{Weakly-Supervised Lesion Segmentation using RECIST.}} Let $R = (\{I_i,r_i\} | i=\{1,2,\ldots , N\} )$ be the set of images $I_i$ and their corresponding RECIST measures $r_i$ with $r_i = (x_i,y_i,w_i,h_i,a_i)$, where $(x_i,y_i)$ are the coordinates of the center, $w_i$ and $h_i$ are the width and height, and $a_i$ is the rotation angle of the RECIST ellipse\footnote{Typically, RECIST is encoded via user clicks $\hat{R}= (x_{11},y_{11}, x_{12},y_{12},x_{21},y_{21},x_{22},y_{22})$ where $(x_j,y_j)$ denote the endpoints of the ellipse axis. To better capture characteristic features,  we fit an ellipse to these measurements and use the transformed parameterization.}. We train a model, denoted by a function $f$, such that for a given image $I_i \in \mathbb{R}^{H\times W\times C}$ ($H$, $W$ and $C$ are the image's height, width and number of channels respectively), $f(I_i)$ is the lesion segmentation map of $I_i$, i.e., $f(I_i) \in \{0,1\}^{H\times W}$ where $f(I_i)[p,q] = 1$ if pixel $(p,q)$ is contained in the lesion. The model consists of an encoder $E$ which produces an embedded representation of the input, and a decoder $D$ which transforms the embedded representation into a binary map, i.e., $f(I_i) = D(E(I_i))$. In the presence of a ground-truth segmentation map $s_i^{gt}$, $f$ is trained to minimize $\mathcal{L}^{seg}(f(I_i),s_i^{gt})$, the difference between the decoder's output and $s_i^{gt}$. Since ground-truth segmentation maps are not available in our setting, we instead generate pseudo-ground truth segmentation maps $s_i$ from RECIST measures and minimize $\mathcal{L}^{seg}(f(I_i),s_i)$, the difference between the decoder's output and $s_i$. We define $\mathcal{L}^{seg}$ to be the binary cross entropy (BCE) loss:
\begin{equation}
\small
% \vspace{-0.1cm}
% \mathcal{L}^{seg}(f(I_i),s_i)=\mathcal{L}_{BCE}(f(I_i), s_i).
% \mathcal{L}^{seg}(f(I_i),s_i)=\mathcal{L}_{BCE}(f(I_i), s_i).
\mathcal{L}^{seg}(f(I_i),s_i) = -\frac{1}{HW} \sum_{p,q} s_i[p,q] \log f(I_i)[p,q] + (1-s_i[p,q])\log (1-f(I_i)[p,q])
% \vspace{-0.1cm}
\end{equation}
where $s_i[p,q]$ and $f(I_i)[p,q]$ are respectively the values of $s_i$ and $f(I_i)$ at pixel $(p,q)$. Intuitively, by projecting the input image into a small embedding space with the encoder $E$, we force the model to learn only relevant features that are useful in reconstructing the pseudo-ground truth segmentation map $s_i$. The information learned in the embedded representation is therefore critical for the model performance. We propose to regularize this representation by training a lesion classification model simultaneously with the segmentation loss.

\noindent{\textbf{Lesion Classification.}} Given the class $c_{I_i} \in \{1,...,K\}$ of image $I_i$, we train a classifier $g$ on the encoder's output with the cross-entropy loss:
\begin{equation}
\small
% \vspace{-0.2cm}
% \mathcal{L}^{cls}(g(E(I_i)),c_i) = -\sum_k \mathbf{1}_{k=c_i} \log g(E(I_i)).    
\mathcal{L}^{cls}(g_{I_i},c_{I_i}) = -\log \left( \frac{\exp(g_{I_i}[c_{I_i}])}{\sum_j \exp(g_{I_i}[j]) } \right),
% \vspace{-0.1cm}
\end{equation}
where $g_{I_i}= g(E(I_i))$ is the output of the classifier $g$ on image $I_i$, and $g_{I_i}[k]$ is the value of $g_{I_i}$ at index $k$. In our context, $c_{I_i}$ is generated from the RECIST measure as follows. The set of segmentation examples $S = \{s_1, s_2,\ldots , s_t\}$ is partitioned into $K\le |S|$ sets $Q=\{q_1,q_2,\ldots, q_K\}$ (i.e., $K$ clusters) via the K-means algorithm~\cite{macqueen1967some}. Subsequently, for every example $I_i$, where $i=1,2,\ldots N$, we use the index $c_{I_i}$ of its assigned cluster as the label for classification loss. 

\noindent{\textbf{Multi-task Learning.}} We train our model in a multi-task learning fashion to minimize the joint loss:
\begin{equation}
\small
\mathcal{L}(I_i,r_i) = \alpha_{seg}\mathcal{L}^{seg}(f(I_i),s_i) + \alpha_{cls} \mathcal{L}^{cls}(g_{I_i},c_i),
\end{equation}
where $\alpha_{seg}$ and $\alpha_{cls}$ are the segmentation and the classification loss weights, respectively. Since the segmentation and classification modules share the same encoder, useful information retained by the lesion classifier is transferred to the segmentation head for better performance. We now turn to experimental validation of the proposed learning framework. 

% \vspace{-0.3cm}
\section{Datasets, Metrics and Implementation Details}
% \vspace{-0.1cm}
% \subsusection{Datasets}
\noindent{\textbf{Computed Tomography (CT) Images.}} For evaluation on CT, we consider two publicly available datasets. The first dataset is the DeepLesion dataset~\cite{yan2018deeplesion} which consists of 32,735 computed tomography (CT) images with lung nodules, liver tumors, enlarged lymph nodes and other internal organ abnormalities from an anonymized set of 4,400 unique patients. Each lesion image is annotated with long and short diameter RECIST measurements. Since ground-truth pixel-level segmentations for this dataset are not available, we rely on approximate GrabCut-based segmentations as ground truth. We use the official data split for this dataset. The second dataset is LiTS~\cite{bilic2019liver} which consists of CT images containing liver tumors from  the Liver Tumor Segmentation Benchmark. This dataset has manually annotated dense segmentation masks for tumors in each CT slice but RECIST measure is not available. We therefore fit ellipses to these masks to obtain RECIST parameters. We use 11,522 images (48 volumes) for training, 3310 images (44 volumes) for validation and 1630 images (20 volumes) for testing.

\noindent{\textbf{Dermatoscopic Images.}}
We evaluate the generalization of our method on  dermatoscopic images of skin lesions. While RECIST is more commonly used for CT datasets, we use skin lesion as an additional test case due to the wider availability of publicly accessible data-sets and dense segmentation. We consider the HAM10K dataset~\cite{tschandl2018ham10000}, which consists of 10,015 dermatoscopic images and pixel-level segmentation masks of common pigmented skin lesions, such as carcinomas, keratoses and melanomas.  Lesions in this dataset often have blurry boundaries and represent a challenging test case for our method. We split the dataset into 70\%/20\%/10\% for training, validation, and testing subsets, respectively.

% \vspace{-0.3cm}
\noindent{\textbf{Evaluation Metrics.}}
We use several standard image processing and medical imaging evaluation metrics to evaluate segmentation performance: Intersection over union (IoU), Dice coefficient, volumetric similarity and averaged Hausdorff distance. We also evaluate whether the segmentation faithfully captures the shape of the lesion with the center error and the perimeter error. The center error measures the Euclidean distance between the predicted and ground truth segmentation center. The perimeter error is the difference in length between the perimeter of the ground-truth segmentation and the model output. We define all metrics in the appendix. In particular, IoU, Dice, and VS measure volumetric simiarity, while AVD, center error and perimeter error evaluate the surface similarity. 

% \vspace{-0.3cm}
\noindent{\textbf{Model Architecture and Training Details.}}
We use the DeepLabV3+ model~\cite{chen2018encoder} with a ResNet101~\cite{He2015} backbone as our encoder-decoder architecture, to which we refer as A1. This architecture relies on atrous spatial pyramid pooling (ASPP) to encode context at multiple scales and a decoder that refines segmentation boundaries.  In the decoder part, the encoder features are first bilinearly upsampled by 4 and then concatenated with the corresponding low-level features after applying the 1x1 convolution. After the concatenation, two 3x3 convolutions with 256 filters are used to refine the features followed by another simple bilinear up-sampling by the factor of 4. This architecture has been widely popular for semantic segmentation in both non-medical ~\cite{lateef2019survey} and medical ~\cite{anderson2021automated,khan2020evaluation} imaging semantic segmentation tasks. We further modify this architecture to add a classification head with $K$ class outputs at the bottleneck layer to test the effect of added class labels, to which we refer as A1+L. Code for our experiments is made publicly available~\footnote{Link to code repository: \url{https://github.com/esizikova/weaklesionmtl}}. 

Similar to prior work~\cite{agarwal2020weakly,agarwal2020weakly1}, we focus on the segmentation problem only, and assume that the lesion location has already been given by RECIST measurements. Our input data therefore consists of patches around each lesion. In DeepLesion, we use the center of the provided bounding box and crop a square patch of side length 120 pixels. For LiTS, we estimate the bounding box with paddings of 20 pixels. The HAM10K dataset is already provided in a patch form and required no pre-processing. To compare with prior results, we evaluate the co-segmentation method of \citet{agarwal2020weakly}, to which we refer as ACoseg. We implement this approach using the same encoder and decoder as A1, but add two branches and co-attention in order to perform co-segmentation. Please see \citet{agarwal2020weakly} for more details.  We train both models for 50 epochs with early stopping by segmentation loss on the validation set. We use Adam optimizer with learning rate 0.001 and batch size 10 for A1 and A1+L and learning rate $10^{-5}$ with batch size 8 for ACoseg. We use $\alpha_{seg}=50$ and $\alpha_{cls}=1.0$ in all of the multi-task experiments. Finally, we perform an ablation study (see Figure~\ref{fig:clusters_deep}) on the number of clusters, and chose $K_{opt}= 190$ for NIH DeepLesion, $K_{opt}= 45$ for HAM10K, and $K_{opt}= 40$ for LiTS. We report additional ablation studies and a comparison to UNet~\cite{ronneberger2015u} in supplementary material.   


\begin{figure}[h]
\centering
\includegraphics[width=0.20\linewidth,height=2cm]{images/clusters_DL.pdf}
\includegraphics[width=0.20\linewidth,height=2cm]{images/clusters_LiTS.pdf}
\includegraphics[width=0.20\linewidth,height=2cm]{images/clusters_HAM.pdf}

\caption{\footnotesize An illustration of the segmentation quality (Dice coefficient) as a function of the number of lesion clusters ($K$) for the DeepLesion (left), LiTS (middle) and HAM10K (right) datasets.}
\label{fig:clusters_deep}
\end{figure}

\section{Experimental Results}
We now quantitatively and qualitatively evaluate our multi-task learning scheme for weakly supervised lesion segmentation.

\noindent{\textbf{Visualization of Lesion Clusters.}} We visualize the groups of lesions obtained by K-means clustering on each dataset in Figure~\ref{fig:clustering}. While there exists significant variation of lesion attributes such as size, shape and intensity within all datasets, the clustering procedure groups lesions into classes of similar size, position, and relative rotation. Cluster assignment is therefore a reasonable supervision for lesion classification.

\begin{figure}[h]
\centering
\includegraphics[width=\textwidth]{images/lits.pdf}
\caption{\footnotesize Visualization of lesion groups obtained by clustering ellipse measurements. In a dataset, each row corresponds to a cluster. Clustering groups similar examples within each of the considered CT (DeepLesion, LiTS) and dermatoscopic (HAM10K) datasets.}
\label{fig:clustering}
\end{figure}


\noindent{\textbf{Quantitative Segmentation Performance.}} We investigate the effectiveness of our joint classification and segmentation training for lesion segmentation in Table~\ref{tab:ss-eval}.
The classification+segmentation multi-task training A1+L outperforms the single-task training A1 on all datasets and according to all metrics. In particular, the Dice coefficient is improved by about $1-3\%$ compared to the non-classification baseline A1 on all datasets, and by $1-4\%$ compared to the co-segmentation ACoseg baseline on the CT datasets. The proposed method also significantly improves the surface and center-based metrics: Our method generates $2\%-6\%$ better segmentations with respect to AVD, improves the center estimation by up to $3\%$, and perimeter estimation by up to $15\%$. On HAM10K and LiTS, ACoseg slightly outperforms or matches our A1+L model according to AVD. However, we observe that it exhibits more segmentation artifacts: In some cases, it cannot generate a mask and outputs a zero prediction. For evaluation purposes, in cases where a zero mask is predicted, the worst value of the same metric among other test cases is used. It is also noteworthy that ACoseg performs better than A1, validating the conclusion in \cite{agarwal2020weakly1}. Sample qualitative results of this experiment are reported in Figure~\ref{fig:qual_effect}. 


\begin{table}[]
    \centering
    \resizebox{\linewidth}{!}{
    \begin{tabular}{|c|c||c|c|c||c|c|c|}
        \hline
        Dataset & Model & IoU & Dice & VS & \makecell{Center Error} & \makecell{Perimeter Error} & AVD \\
        \hline
        \multirow{3}{*}{\makecell{NIH \\ DeepLesion}} & A1 & 63.02$\pm$1.47 & 75.36$\pm$1.25 & 80.65$\pm$1.60 & 3.19$\pm$0.06 & 9.21$\pm$0.64 & 7.37$\pm$0.24 \\ 
         & ACoseg & 62.75$\pm$0.42 (-0.43\%) & 74.38$\pm$0.77 (-1.30\%) & 80.58$\pm$0.49 (-0.09\%) & 5.56$\pm$1.19 (+74.29\%) & 8.43$\pm$0.3 (-8.47\%) & 11.84$\pm$2.24 (+60.65\%) \\
          & A1+L & \textbf{65.53$\pm$0.71 (+3.98\%)} & \textbf{77.43$\pm$0.57 (+2.75\%)} & \textbf{83.31$\pm$0.70 (+3.30\%)} & \textbf{3.09$\pm$0.02 (-3.13\%)} & \textbf{7.79$\pm$0.38 (-15.42\%)} & \textbf{6.92$\pm$0.19 (-6.11\%)}  \\ 
        \hline
        \multirow{3}{*}{LiTS} & A1 & 71.44$\pm$0.68 & 82.68$\pm$0.51 & 89.07$\pm$0.64 & 2.00$\pm$0.03 & 9.58$\pm$0.24 & 9.25$\pm$0.17 \\
         & ACoseg & 71.86$\pm$0.34 (+0.59\%) & 82.92$\pm$0.26 (+0.29\%) & 88.81$\pm$0.22 (-0.29\%) & \textbf{1.90$\pm$0.06 (-5.00\%)} & 8.99$\pm$0.16 (-6.16\%) & \textbf{8.90 $\pm$ 0.07 (-3.78\%)}\\
         & A1+L & \textbf{72.76$\pm$0.14 (+1.85\%)} & \textbf{83.62$\pm$0.09 (+1.14\%)} & \textbf{89.94$\pm$0.16 (+0.98\%)} & 1.96$\pm$0.09 (-2.00\%) & \textbf{8.27$\pm$0.05 (-13.67\%)} & 9.05$\pm$0.20 (-2.16\%)  \\
        \hline
        \multirow{3}{*}{HAM10K} & A1 & 82.65$\pm$0.12 & 90.01$\pm$0.09 & 91.84$\pm$0.03 & 2.57$\pm$0.09 & 9.20$\pm$0.26 & 10.23$\pm$0.17 \\
         & ACoseg & 83.19$\pm$0.86 (+0.65\%) & 90.42$\pm$0.51 (+0.46\%) & 92.10$\pm$0.77 (+0.28\%) & \textbf{2.42$\pm$0.03 (-5.84\%)} & 9.77$\pm$0.59 (+6.20\%) & \textbf{9.61$\pm$0.31 (-6.06\%)} \\
         & A1+L & \textbf{83.51$\pm$0.26 (+1.04\%)} & \textbf{90.54$\pm$0.22 (+0.59\%)} & \textbf{92.82$\pm$0.34 (+1.07\%)} & 2.52$\pm$0.09 (-1.95\%) & \textbf{8.87$\pm$0.4 (-3.59\%)} & 9.73$\pm$0.14 (-4.89\%)  \\
        \hline
    \end{tabular}
    }
    \caption{\footnotesize Evaluation of the effect of multi-task learning on segmentation quality using A1 and ACoseg architectures. Best result is shown in bold. For IoU, Dice, and VS metrics, higher number is better. For Center Error, Perimeter Error, and AVD, lower number is better.}
    % \vspace{-0.7cm}
    \label{tab:ss-eval}
\end{table}


\begin{figure}[htb]
\centering
\includegraphics[width=0.8\textwidth]{images/finalviz1.pdf}
\caption{\footnotesize Visualization of the effect of clustering (denotes ``+L'') on segmentation prediction across datasets. For DeepLesion dataset, we use GrabCut to obtain segmentations because the ground truth is not available.}
% \vspace{-0.6cm}
\label{fig:qual_effect}
\end{figure}

\noindent{\textbf{Training Time.}}\label{sef:training-time} Adding label classification using our proposed multi-task learning setup offers not only quantitative segmentation improvement, but also a significant savings in training time.
Table~\ref{tab:runtime_type} shows the GPU time required to train the proposed model A1+L and the ACoseg baseline on each dataset. A1+L saves 69\%-75\% of training time (29.2 hours less on DeepLesion) compared to ACoseg, while achieving substantially better quantitative performance (see Table~\ref{tab:ss-eval}). The improvement can be seen in both smaller (LiTS, HAM10K) and larger (NIH DeepLesion) datasets. Note that the cluster classification head contains very few parameters compared to the backbone, and therefore the A1 and A1+L models require approximately the same amount of training time.



\begin{table}[htb]
    \centering
    \resizebox{0.3\linewidth}{!}{
    \begin{tabular}{|c|c|c|}
        \hline
        Dataset & ACoseg & A1+L \\
        \hline
        NIH DeepLesion & 41.5 & 12.3 (-70\%) \\
        LiTS & 16.6 & 4.2 (-75\%) \\
        HAM10K & 12.5 & 3.9 (-69\%) \\
        \hline
    \end{tabular}
    }
    % \vspace{-0.2cm}
    \caption{\footnotesize Training time (GPU hours) until convergence of different models on the three datasets.}
    % \vspace{-0.9cm}
    \label{tab:runtime_type}
\end{table}

\section{Conclusion}
We have shown that generating labels from existing weak supervision and jointly train to segment and cluster lesions in a multi-task learning fashion allows more accurate segmentations of computed tomography (CT) and dermatoscopic imaging data while significantly reducing training time. Our approach is of course only one of many potential ways to incorporate unsupervised learning into segmentation problems in medical imaging tasks. As unsupervised and weakly-supervised learning is only starting to become more widely used in medical imaging, there are a number of future directions this approach could be extended. In particular, it would be useful to understand the performance of the proposed methodology on other types of medical images, such as chest radiographs (CXR), magnetic resonance imaging (MRI) and others. Also, it would be important to investigate how other types of data typically present in radiology reports can be used for a stronger supervision signal. We hope that our exploration into weakly-supervised MTL learning for lesion segmentation will encourage more research applications in this domain.

\section{Acknowledgements}
TC, XL, ES were supported by the Moore-Sloan Data Science Environment initiative (funded by the Alfred P. Sloan Foundation and the Gordon and Betty Moore Foundation) through the NYU Center for Data Science. HVV was supported in part by a Valeo/Prairie CIFRE PhD Fellowship. RMS was supported by the Intramural Research Program of the National Institutes of Health Clinical Center. 

{\tiny
\bibliography{chuli21.bib}
}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%



\clearpage
\section*{Appendix}
In this section, we report additional results and ablation studies using the studied models.

\subsection{Metric Definition}
Let $\mathcal{A}^{t}$ be the pixels in the ground truth segmentation area, and $\mathcal{A}^{p}$ is be the pixels in the predicted segmentation area. Then the IoU metric is defined by: $M_{iou}(\mathcal{A}^{p}, \mathcal{A}^{t}) = |\mathcal{A}^{t}\cap \mathcal{A}^{p}|/|\mathcal{A}^{t}\cup \mathcal{A}^{p}|$. The Dice coefficient is defined as: $M_{dice}(\mathcal{A}^{p}, \mathcal{A}^{t}) = 2\times|\mathcal{A}^{t}\cap \mathcal{A}^{p}|/(|\mathcal{A}^{t}|+|\mathcal{A}^{p}|)$. Following \citet{agarwal2020weakly}, we measure averaged Hausdorff distance (AVD) and volumetric similarity (VS) (please see \cite{taha2015metrics} for details). AVD is defined by: $M_{AVD}(A,B) = \max(d(A, B), d(B, A))$ where $d(\cdot)$ is the directed average Hausdorff distance: $d(A,B) = 1/|A| \sum_{a\in A} \min _{b\in B} \|a-b \|$. The volumetric similarity is defined as: $M_{VS} = 1 - |FN - FP|/(2TP + FP + FN)$, where $FN$, $FP$, $TP$ are the false negative, false positive, and true positive cardinalities, respectively. The center error is defined as $E_{ctr} = \sqrt{(x_t-x_p)^2+(y_t-y_p)^2}$ where $(x_t,y_t)$ are coordinates of the ground truth segmentation center and $(x_p,y_p)$ are coordinates of the predicted segmentation center (we calculate the leftmost, rightmost, top and bottom segmentation points and use them to calculate the center). The perimeter error is the difference in length between the perimeter of the ground-truth segmentation and the model output, mathematically defined as $E_{ptr} = |\mathcal{P}_t-\mathcal{P}_p|$ where $\mathcal{P}_t$ is the length (number of pixels) of the outermost edge of the ground truth segmentation, and $\mathcal{P}_p$ is the length of the predicted segmentation. 


\subsection{Effect of Supervision Type}
In Tables~\ref{tab:quant_comp_dl_supp}, \ref{tab:quant_comp_lits_supp}, and \ref{tab:quant_comp_ham10k_supp}, we report ablation studies comparing the effect of adding classification to the network when using ellipse (RECIST) and GrabCut supervision on all of the considered datasets. It can be seen that adding labels helps improve results according to IoU, Dice and Center Error with all supervisions. On NIH DeepLesion~\cite{yan2018deeplesion}, GrabCut-based method A1+L generates the best results according to IoU an Dice, but ellipse-based method A1+L generates a slightly more accurate lesion center. On LiTS~\cite{bilic2019liver}, the ellipse-based method A1+L significantly outperforms GrabCut, due to the difficulty of generating GrabCut for the lesions found in this dataset that have a dark background. On HAM10K~\cite{tschandl2018ham10000}, the ellipse-based methods generate best results according to all metrics, except circumference error.

\begin{table}[h]
\scriptsize
\centering
\begin{tabular}{|l|cccc|}
\hline
Method                                    &  IoU        & Dice         & Center Error      & Circumf. Error \\
\hline
A1 on GrabCut                & 65.55       & 76.82        & 3.84            & 8.06 \\
A1+L on GrabCut  & 69.78       & 80.52        & 3.09            & 6.81                 \\ \hline
A1 on Ellipse  & 61.17  & 73.77  & 3.24  & 9.75     \\
A1+L on Ellipse  & 66.49  & 78.18  & 3.06  & 7.34     \\ 
\hline
\end{tabular}
\caption{\small Evaluation of segmentation in the NIH DeepLesion~\cite{yan2018deeplesion} dataset using different input segmentations.}
\label{tab:quant_comp_dl_supp}
\end{table}


\begin{table}[h]
\centering
\scriptsize
\begin{tabular}{|l|cccc|}
\hline
Method                                    &  IoU        & Dice         & Center Error      & Circumf. Error \\
\hline
A1 on GrabCut        & 70.86  & 82.14 & 2.18 & 9.22 \\
A1+L on GrabCut  & 72.41 & 83.30 & 2.18 & 8.56 \\ \hline
A1 on Ellipse  & 70.89 & 82.27 & 2.00 & 9.64 \\
A1+L on Ellipse  & 72.84 & 83.69 & 1.97 & 8.21 \\ 
\hline
\end{tabular}
\caption{\small Evaluation of segmentation quality in the LiTS ~\cite{bilic2019liver} dataset using different input segmentation.}
\label{tab:quant_comp_lits_supp}
\end{table}


\begin{table}[h]
\centering
\scriptsize
\begin{tabular}{|l|cccc|}
\hline
Method                                    &  IoU        & Dice         & Center Error      & Circumf. Error \\
\hline
A1 on GrabCut        & 83.30  & 90.39  &  2.73 & 8.79 \\
A1+L on GrabCut  & 83.63 & 90.56  & 2.64 & 8.69 \\ \hline
A1 on Ellipse  & 82.52 & 89.92 & 2.63 & 8.86 \\
A1+L on Ellipse  & 83.87 & 90.85 & 2.50 & 9.38 \\ 
\hline
\end{tabular}
\caption{\small Evaluation of segmentation quality in the HAM10K ~\cite{tschandl2018ham10000} dataset using different input segmentation.}
\label{tab:quant_comp_ham10k_supp}
\end{table}

\subsection{Comparison with Fully Supervised Methods}
Next, we compare how results of the weakly-supervised methods compare with results of their fully-supervised counterparts. In Table~\ref{tab:supervision_type1}, we compare models trained with ellipse-based (weak) segmentation, GrabCut (weak) supervision and dense, pixel-wise full segmentation supervision. For NIH DeepLesion~\cite{yan2018deeplesion} dataset, we use GrabCut-based segmentation as ground truth since human-annotated dense segmentation is not publicly available. While fully-supervised models achieve more accurate (according to Dice) segmentation, weak supervision reduces performance by about $3\%$, demonstrating the utility of such methods on lesion segmentation tasks. On LiTS, the ellipse based MTL segmentation method slightly outperforms the fully-supervised segmentation baseline. This is likely due to the fact that the ellipse prior helps regularize the model and prevent it from generating incorrectly-shaped outliers.

\begin{table}[htb]
    \centering
    \resizebox{0.5\linewidth}{!}{
    \begin{tabular}{|c|c|c|c|}
        \hline
        \multirow{2}{*}{Dataset} & \multicolumn{3}{c|}{Model}  \\
        \cline{2-4}
                       & GrabCut & Ellipse & Fully-Supervised \\
        \hline
        NIH DeepLesion & 80.52$^*$       & 78.18     & 80.52$^*$ \\
        LiTS           & 83.30       & 83.62    & 82.87 \\
        HAM10K         & 90.56       & 90.85     & 93.53 \\
        \hline
    \end{tabular}
    }
    \caption{\small Evaluation of the effect of supervision type on segmentation quality. Results are reported according to the Dice metric. *-For DeepLesion we use GrabCut-based segmentation as ground truth as human-annotated dense segmentation is not publicly available.}
    \label{tab:supervision_type1}
\end{table}

\subsection{Effect of a Different Backbone}
In Table~\ref{tab:unet_compare} we compare the segmentation quality when using the DeepLabV3+~\cite{chen2018encoder} and the U-Net~\cite{ronneberger2015u} backbone networks. We find that DeepLabV3+ architecture slightly outperforms U-Net.

\begin{table}[htb]
    \centering
    \resizebox{0.4\linewidth}{!}{
    \begin{tabular}{|c|c|c|}
        \hline
        Dataset & U-Net & DeepLabV3+ \\
        \hline
        NIH DeepLesion & 76.76 & \textbf{76.83} \\
        LiTS & 83.28 & \textbf{83.40} \\
        HAM10K & 89.86 & \textbf{89.97} \\
        \hline
    \end{tabular}
    }
    \caption{\small Performance comparison (Dice) between U-Net and DeepLabV3+ architectures.}
    \label{tab:unet_compare}
\end{table}

\subsection{Hyperparameter Selection and Classification Performance}
In Figure~\ref{fig:tuning_ham}, we perform an ablation study of the segmentation loss weight $\alpha_{seg}$ using Dice performance on the HAM10K dataset~\cite{tschandl2018ham10000}. We find that the performance gradually increases up to $\alpha_{seg}=50$, after which it drops slightly. For low values of $\alpha_{seg}$, the classification loss likely significantly outweighs the segmentation loss, and performance drops. With $\alpha_{seg}=50$, optimal performance is achieved. With larger values, the segmentation loss outweighs classification, and no longer helps performance. In Table~\ref{tab:class_acc}, we additionally report the train and test classification a of the A1+L model across all datasets. 
\begin{figure}[htb]
\centering
\includegraphics[width=0.5\linewidth]{images/ham_alphas.pdf}
\caption{\small Ablation study of segmentation accuracy (Dice coefficient) as functions of segmentation loss weight $\alpha_{seg}$ for the HAM10K dataset~\cite{tschandl2018ham10000} and A1+L model.}
\label{fig:tuning_ham}
\end{figure}

\begin{table}[htb]
    \centering
    \resizebox{0.3\linewidth}{!}{
    \begin{tabular}{|c|c|c|}
        \hline
        Dataset & Train & Test \\
        \hline
        NIH DeepLesion & 16.9 & 12.1 \\
        LiTS & 48.5 & 23.9 \\
        HAM10K & 54.0 & 46.7 \\
        \hline
    \end{tabular}
    }
    \caption{\small Classification accuracy from A1+L model on different datasets.}
    \label{tab:class_acc}
\end{table}


\subsection{Number of Model Parameters}
We report the total number of parameters for each model. For A1 and ACoseg, the number of parameters is the same across all datasets: A1 requires 45,669,713 parameters and ACoseg requires 64,554,577 parameters. The number of parameters A1+L depends on $K$, the number of classes in the classification head: 46,059,023 for NIH DeepLesion, 45,751,673 for LiTS and 45,761,918 for HAM10K. 


\subsection{Additional Qualitative Results}
In Figure~\ref{fig:qual_effect3}, we report additional qualitative segmentation results. Our method appears to generate more accurate lesion boundaries with respect to the ground truth, as compared to both A1 and ACoseg baselines, introducing less border artifacts across all three considered datasets. 


\begin{figure}[htb]
\centering
\includegraphics[width=0.8\textwidth]{images/finalviz2.pdf}
\caption{\small Visualization of the effect of clustering (denoted ``+L'') on segmentation prediction across datasets.}
\label{fig:qual_effect3}
\end{figure}


\end{document}
