\documentclass{midl} % Include author names


\jmlrvolume{}
\jmlryear{2021}
\jmlrworkshop{Full Paper -- MIDL 2021}


\usepackage{booktabs} % table
\usepackage{siunitx}
\usepackage{bm}
\usepackage{caption}
\usepackage{placeins}
\usepackage{multirow}
\usepackage{makecell}
\usepackage{xcolor}
\usepackage [english]{babel}
\usepackage [autostyle, english = american]{csquotes}
\MakeOuterQuote{"}
\usepackage[font=small,skip=2pt]{caption}
\graphicspath{ {./} }
\title[Benefits of Linear Conditioning with Metadata for Image Segmentation]{Benefits of Linear Conditioning with Metadata for Image Segmentation}

\vspace{-2mm}
\midlauthor{\Name{Andreanne Lemay \nametag{$^{1,2 }$}} \Email{andreanne.lemay@polymtl.ca}\\
\addr $^{1}$ NeuroPoly Lab, Institute of Biomedical Engineering, Polytechnique Montreal, Canada \\
\addr $^{2}$ Mila, Quebec AI Institute, Canada \\
\Name{Charley Gros}\nametag{$^{1, 2}$} \Email{charley.gros@gmail.com}\\
\Name{Olivier Vincent\nametag{$^{1, 2}$}} \Email{ovincent.poly@gmail.com}\\
\Name{Yaou Liu}\nametag{$^{3}$} \Email{yaouliu80@163.com} \\
  \addr $^{3}$ Beijing Tiantan Hospital, Capital Medical University, China \\
\Name{Joseph Paul Cohen\midljointauthortext{Contributed equally}\nametag{$^{2, 4}$}} \Email{joseph@josephpcohen.com}\\
\addr $^{4}$ Stanford University Center for Artificial Intelligence in Medicine \& Imaging  \\
\Name{Julien Cohen-Adad\midlotherjointauthor\nametag{$^{1, 2, 5}$}} \Email{jcohen@polymtl.ca}\\
\addr $^{5}$ Functional Neuroimaging Unit, CRIUGM, University of Montreal, Montreal, Canada}

\begin{document}
% TLDR: This work adapts a linear conditioning method for image segmentation models enabling integration of metadata and multi-class training with few or missing labels.

\maketitle

\vspace{-2mm}
\begin{abstract}
Medical images are often accompanied by metadata describing the image (vendor, acquisition parameters) and the patient (disease type or severity, demographics, genomics). This metadata is usually disregarded by image segmentation methods. In this work, we adapt a linear conditioning method called FiLM (\textbf{F}eature-w\textbf{i}se \textbf{L}inear \textbf{M}odulation) for image segmentation tasks. This FiLM adaptation enables integrating metadata into segmentation models for better performance. We observed an average Dice score increase of 5.1\% on spinal cord tumor segmentation when incorporating the tumor type with FiLM. The metadata modulates the segmentation process through low-cost affine transformations applied on feature maps which can be included in any neural network's architecture. Additionally, we assess the relevance of segmentation FiLM layers for tackling common challenges in medical imaging: multi-class training with missing segmentations, model adaptation to multiple tasks, and training with a limited or unbalanced number of annotated data. Our results demonstrated the following benefits of FiLM for segmentation: FiLMed U-Net was robust to missing labels and reached higher Dice scores with few labels (up to 16.7\%) compared to single-task U-Net. The code is open-source and available at \url{www.ivadomed.org}.

\end{abstract}

\begin{keywords}
Deep learning, linear conditioning, segmentation, metadata, task adaptation.
\end{keywords}

\vspace{-2mm}
\section{Introduction}
Segmentation tasks in the medical domain are often associated with metadata: medical condition of the patients, demographic specifications, acquisition center, acquisition parameters, etc. Depending on which structure is segmented, these metadata can help deep learning models improve their performance; however, metadata is usually overlooked. In this work, we improve segmentation models using recent advances in visual question answering called FiLM \cite{perez2018film, devries2017modulating} (\textbf{F}eature-w\textbf{i}se \textbf{L}inear \textbf{M}odulation). Using FiLM to condition a segmentation model enables the integration of prior metadata into neural networks through linear modulation layers. For instance, knowledge of the tumor type could provide useful information to the model. \cite{rebsamen2019divide} demonstrated that by stratifying the learning by brain tumor type, high-grade glioma, or low-grade glioma, segmentation could be improved. With FiLM, the tumor type information can be included without requiring multiple models as done in \cite{rebsamen2019divide}. The input metadata generates feature-specific affine coefficients learned during training, enabling the model to modulate the segmentation output to improve its performance.

The metadata could also be exploited for task adaptation. When training a multi-class segmentation model, each class needs to be annotated on every image, as missing labels will hamper the learning \cite{zhou2019prior}. 
Label availability often represents a bottleneck in deep learning \cite{minaee2020image}. Segmentation is costly in terms of time, money, and logistics \cite{bhalgat2018annotation}. For instance, chest CT scans contain hundreds of 2D scans (up to 861 axial slices in the dataset used for this work) depending on the resolution. As a reference, Google sets the price of image segmentation to 870 USD for 1000 images\footnote{\url{https://cloud.google.com/ai-platform/data-labeling/pricing}}, which totals 435 USD for a single subject with 500 axial slices. For medical segmentation requiring expert knowledge (e.g., tumor segmentation), this price could be higher considering the hourly wage of a radiologist. As for the time, \cite{ciga2021learning} reports that it takes between 15 minutes and two hours depending on the size and resolution to segment a single image of lymph nodes for breast cancer. An approach dealing with missing modalities and requiring fewer labels can reduce the monetary and time-related costs. 

We hypothesize that conditioning the model based on the organ to be segmented (e.g., "kidney", "liver") will make it robust to missing segmentations. A multi-class model could then be trained on data from multiple datasets with a single class annotated in each. Since the different tasks share weights, fewer labels are required for a given class as the model can learn from the other tasks. This makes it easy to adapt a single segmentation model to several tasks, requiring only a small number of annotations for novel tasks.

\vspace{-2mm}
\subsection{Prior work}
Conditional linear modulation was introduced in many deep learning fields: visual reasoning \cite{perez2018film, devries2017modulating}, style transfer \cite{dumoulin2017learned}, speech recognition \cite{kim2017dynamic}, domain adaptation \cite{li2018adaptive}, few-shot learning \cite{oreshkin2018tadam}, to name a few. In the medical image field, FiLM was leveraged for learning when limited or no annotation is available for one modality \cite{chartsias2020multimodal}. Image reconstruction was performed with FiLM to enable self-supervised learning of the anatomical and modality factors of an image. Modality factors were passed through FiLM to modulate anatomical factors generating a reconstructed image of a given modality. While in \cite{chartsias2020multimodal} information extracted from the image is used for modulation, in this work, we want to assess the impact of integrating metadata that is not directly encoded in the image.

The adaptation of FiLM (i.e., linear conditioning) for segmentation was experimented on cardiovascular magnetic resonance modulated by the distribution of class labels \cite{jacenkow2019conditioning}, on ACDC with modulation on spatio-temporal information \cite{jacenkow2020inside} and on multiple sclerosis lesions with a FiLMed U-Net conditioned on the modality (T2-weighted or T2star-weighted) \cite{vincent2020automatic}. \cite{jacenkow2019conditioning} had consistent improvement by including the prior information on an encoder-decoder architecture but mixed results on the U-Net architecture. Results from \cite{vincent2020automatic} were inconclusive regarding the performance of FiLM compared to a regular U-Net. A possible explanation for this lack of improvement is that the modality-related features might already be encoded in the regular U-Net, therefore the metadata added to FiLM is not informative enough and thus does not translate to an increase in segmentation performance. In light of these results, in the present work, we generalized the modified-FiLM implementation to be able to modulate a model by inputting any type of discrete metadata.

\subsection{Contribution}
The key contributions of this work are: \textbf{(i)} We introduce an adaptation of linear conditioning \cite{perez2018film} based on metadata for segmentation tasks using the U-Net architecture. \textbf{(ii)} We demonstrate that including metadata can contribute to the model's performance. As a proof of concept, we input the spinal cord tumor type (astrocytoma, ependymoma, hemangioblastoma), which is often associated with its size, composition, and anatomical location. The tumor type knowledge led to an average Dice score improvement of 5.1\%. \textbf{(iii)} We show that robust learning with missing annotations can be achieved with FiLM. Moreover, we illustrate that linear modulation enables task adaptation with fewer labeled data when jointly trained on multiple tasks. A Dice score improvement of up to 16.7\% was observed when using our approach with a limited number of annotations compared to a single class U-Net. 


\section{Methods}
\subsection{Architecture and Implementation}

\begin{figure}[htbp]
\vspace{-6mm}
\floatconts
  {fig:Architecture}
  {\caption{FiLMed U-Net architecture of depth 3. Depth describes the number of maximum pooling or up convolutions in the U-Net. $\gamma$ and $\beta$ values are generated using a multi-layer perceptron with shared weights across FiLM layers. $\gamma$ and $\beta$ have the same shape as the input. An element-wise multiplication is applied between the input and $\gamma$ while the $\beta$ is added.}}
  {\includegraphics[width=\textwidth]{architecture.png}}
\end{figure}
The core architecture is based on the 2D U-Net \cite{ronneberger2015unet} (Figure \ref{fig:Architecture}). The model has two inputs: the image and the one-hot encoded metadata (i.e., prior knowledge). FiLM layers and generator are responsible for conditioning the neural network with the given metadata. Two parameters, $\gamma_{(i)}$ and $\beta_{(i)}$, are required to linearly modulate the inputs of the $i^{\mathrm{th}}$ FiLM layer. The metadata is passed through a multi-layer perceptron (i.e., FiLM generator) with two hidden layers (64 and 16 neurons). The FiLM generator outputs one value of $\gamma$ and $\beta$ for each filter (i.e., feature extractor) which are respectively multiplied and added by the FiLM layers to each convolutional feature map. The computational cost of FiLM is low and independent of the image resolution. The weights from the generator are shared for more efficient learning \cite{perez2018film}. Since the input of the FiLM generator is the same, the same features should be extracted from the metadata. The values are constrained between 0 and 1 due to the sigmoid activation. Preliminary experiments favored sigmoid over ReLU or tanh activation function for the FiLM parameters. $\gamma_{(i)}$ values near 0 silence some features, while $\gamma_{(i)}$ values near 1 output the key features. Since the linear modulation is computationally inexpensive, FiLM layers were placed after each convolutional unit to ensure the metadata is properly used by the network. The code is open-source and available in the ivadomed toolbox \cite{Gros2021}.

\vspace{-2mm}
\subsection{Experiment 1: Segmentation using relevant metadata}
This experiment assessed the relevance of including metadata during the training. 

\vspace{-2mm}
\subsubsection{Dataset: Spinal cord tumor}
We used a spinal cord tumor segmentation dataset \cite{lemay2021multiclass}. The dataset included 343 MRI scans, where each image was associated with one of the following tumor types: astrocytoma (101), ependymoma (122), or hemangioblastoma (120). The tumor type can be informative for the model since each type has particular characteristics, e.g., size, location, contrast intensity patterns, tissue constitution \cite{kim2014differentiation, baleriaux1999spinal}. Two modalities, Gadolinium-enhanced T1-weighted (Gd-e T1w) and T2-weighted (T2w), are required to properly segment each component of the tumor: tumor core, edema, and liquid-filled cavity. Here, for simplicity, only the tumor core labels were used.
\vspace{-2mm}
\subsubsection{Training scheme}
The first scenario used the FiLM architecture without any input metadata, while the second scenario included the tumor type as metadata. To simulate the absence of metadata, the same input vector was passed through FiLM, hence no informative data is seen by the model. The same architecture was used in both scenarios in order to isolate the specific effect of the input metadata. Preliminary experiments gave similar results when using a regular U-Net architecture without the FiLM layers or a FiLMed U-Net that always receives the same input. A $320 \times 256$ sagittal image of resolution 1\,mm $\times$ 1\,mm associated with the tumor type constituted one training sample. The dataset was split per patient with the following proportions: 60\% training, 20\% validation, 20\% testing. To compare the overall segmentation performance, 10 models were trained with different random splits.
\vspace{-2mm}
\subsection{Experiment 2: FiLM for multiple tasks}
Here, the ability of FiLM to modulate the network to adapt to different segmentation tasks was assessed. The FiLMed model was presented with labels from three classes that are all included in the scan, but only one segmentation was given at a time. The class of the presented segmentation was input into the network to teach the model to properly segment each class. A similar experiment was performed with few segmentations and unbalanced datasets.

\vspace{-2mm}
\subsubsection{Dataset: Spleen, kidneys, and liver}
The organs selected for this task were the spleen, kidneys, and liver. The datasets were collected from two different sources: Medical Segmentation Decathlon \cite{simpson2019large} for spleen and liver scans, and KiTS19 \cite{heller2019kits19} for kidney scans. Liver and kidney scans had tumor labeling which was ignored for the current experiments: organ and tumor annotations were merged as a single segmentation. Due to the large size of the kidney and liver datasets, subdatasets were extracted. Since the spleen dataset contained 41 scans with associated ground truths, only the first 41 kidney and liver scans were retained. 

\subsubsection{Training scheme}
First, the FiLMed U-Net was trained on the spleen, kidney, and liver images with the whole dataset (41 images for each). A training example was a 2D axial slice of 512x512 pixels paired with the available label (kidney, spleen, or liver).  The dataset was split per patient with the following proportions: 60\% training, 20\% validation, 20\% testing.

Second, the performance on small and unbalanced datasets was assessed with an independent sub-experiment: FiLMed U-Net was trained on subdatasets of the spleen and kidney datasets. For simplicity, only two classes were used. The experimental design of this sub-experiment is presented in Appendix~\ref{appendix:design}.  The subdatasets were randomly chosen with a size of 2, 4, 6, 8, and 12 for one class and 12 subjects of the other class (i.e., a total of 10 models: \{2, 4, 6, 8, 12\} spleens with 12 kidneys each and \{2, 4, 6, 8, 12\} kidneys with 12 spleens each). The size of the dataset included all the subjects for training and validation. The models were tested on 25 subjects of the class with the fewest subjects. For a model trained on 2 kidney subjects and 12 spleen subjects, the model would be tested on 25 kidney subjects not included in the training or validation set. During the training process, the data was sampled to expose each class evenly to the model even when the number of subjects is unbalanced. All the trainings were repeated 10 times with varying random splits (100 trainings).


For comparison, regular 2D U-Nets were each trained on only one class at a time (spleen, kidney, or liver), following the same training, validation, and test splits.
 \vspace{-2mm}
\subsection{Training parameters}
The tumor types or organ labels were evenly separated into three groups, training, validation, and testing groups, and the data were sampled with a batch size of 8. The FiLMed U-Nets of depth 4 for the spinal cord tumor and 5 for the chest CT were trained with a Dice loss function until the validation loss plateaued for 50 epochs (early stopping with $\epsilon = 0.001$). The depth was chosen according to the size of the input images. The initial learning rate was 0.001 and was modulated according to a cosine annealing learning rate. 

\vspace{-2mm}
\subsection{Evaluation}
The Dice score was selected to compare the performance of each approach. All FiLMed approaches were compared with the conventional approach: training without informative metadata for spinal cord tumors and on a regular U-Net for the multi-organ segmentation tasks. To assess the statistical differences between groups, a one-sided Wilcoxon signed-rank test was used, and a p-value $<5\%$ was considered to indicate a significant difference. 

\vspace{-2mm}
\section{Results}
\subsection{Experiment 1: Segmentation using relevant metadata}

Prior knowledge of the tumor type led to a significant Dice score improvement between the regular U-Net and the FiLMed U-Net: 10.5\% for the hemangioblastomas (p-value=0.006),  4.5\% for the astrocytomas (p-value=0.003), and  5.1\% for all tumors combined (p-value=0.003) (Table \ref{tab:sc_tumor}). Astrocytomas and hemangioblastomas showed the highest Dice score gain when the model was informed with the tumor type. Astrocytomas are typically large, have ill-defined boundaries, and present heterogeneous, moderate, or partial enhancement in the Gd-e T1w contrast \cite{baleriaux1999spinal}. Conversely, hemangioblastomas are usually associated with a small tumor core  \cite{baleriaux1999spinal} intensely enhanced on Gd-e T1w \cite{baker2000mr}. These distinctive characteristics can be learned by the model to perform a more informed segmentation (see appendix \ref{appendix:tumor} to visualize segmentation differences). 



% \FloatBarrier
% \setlength{\abovecaptionskip}{0pt}
% 
\begin{table}[!htbp]
\centering
\floatconts
  {tab:sc_tumor}%
  {\caption{Spinal cord tumor core segmentation performance for regular and FiLMed U-Net (mean $ \pm $ STD \% for 10 random splits). The FiLMed U-Net was trained with the tumor type as input. ** p-value $<$ 0.05 for one-sided Wilcoxon signed-rank test.}}
{\centering

\begin{tabular}{lcc}
\cmidrule[\heavyrulewidth]{2-3}

\multicolumn{1}{l}{} & \multicolumn{2}{c}{\textbf{Dice score [\%]}} \\
\toprule
\textbf{Tumor type} & \textbf{No prior info.}  & \textbf{Prior info.}\\
\midrule
Astrocytoma & $53.3 \pm 4.8$  & \bm{$57.8 \pm 4.9$} ** \\
Ependymoma & $57.2 \pm 3.2$  & \bm{$57.7 \pm 2.4$}  \textcolor{white}{ **}  \\
Hemangioblastoma & $51.2 \pm 4.0$  & \bm{$61.7 \pm 3.7$ } ** \\

\midrule
\textbf{All}   &  $54.0 \pm 2.2$  & \bm{$59.1 \pm 2.3$} ** \\
\bottomrule 
\end{tabular}}
\end{table}
% \FloatBarrier
\vspace{-3mm}
\subsection{Experiment 2: FiLM for multiple tasks}
Table \ref{tab:organs} shows that the FiLMed multi-class model trained with missing labels (i.e., only one organ labeled per scan) was able to reach equivalent performance to single-class U-Nets (i.e., one model per class) trained without missing annotations. As a reference, a multi-class 2D U-Net without FiLM was trained with the same dataset containing missing labels. Poor performance was reached with an average Dice score of 41.7 $\pm$ 16.0 for all classes combined: only partial segmentation of each organ was performed by the model. This result illustrates the hindered learning caused by the missing annotations. Inputting the class label through FiLM layers allowed the model to properly train with missing segmentations enabling the option to have a single model adapted to multiple tasks even when all annotations are not available. For comparison, the Dice scores reached by other studies on the whole challenge datasets, 61 spleens, 300 kidneys, and 201 livers, with 2D U-Nets were included. While being trained on less data (41 images per dataset), our 2D FiLMed U-Net reached Dice scores comparable with these published studies (see Table \ref{tab:organs}).


% \FloatBarrier
\setlength{\abovecaptionskip}{0pt}
\begin{table}
\centering
\floatconts
  {tab:organs}%
  {\caption{Multiple-organ segmentation Dice score with multi-class, single-class and FiLMed U-Nets (mean $ \pm $ STD \%). The FiLMed U-Net was trained on spleen, kidney, and liver while regular U-Nets were trained on each class independently. A one-sided Wilcoxon signed-rank test was performed on columns 2 (2D U-Net) and 3 (FiLMed U-Net): no statistical difference was observed.}}%
{\centering

\begin{tabular}{lccc|c}
\toprule
& \multicolumn{3}{c}{Our experiments} & Literature \\
\midrule
 \textbf{Task}  & \textbf{ \thead{Multi-class \\ 2D U-Net} }  &\textbf{ \thead{Single-class \\ 2D U-Net} }  & \textbf{ \thead{Multi-class \\ FiLMed U-Net }} & \textbf{\thead{2D U-Net \\ (On whole challenge dataset)}}\\
\midrule
Liver & $50.3 \pm 18.3$ &  $95.1 \pm 1.4$  & $94.1 \pm 1.6$ & \thead{ $94.37 \pm N/A$ \cite{isensee2018nnu}} \\
Spleen & $35.6 \pm 14.2$ & $91.7 \pm 6.3$  & $92.2 \pm 5.3$ & \thead{$94.2 \pm N/A$  \cite{isensee2019automated}} \\
Kidney & $39.2 \pm 13.1$ & $90.4 \pm 9.3$  & $90.7 \pm 8.1$  & \thead {$93.0 \pm 1.2$  \cite{ahmed2020medical}}  \\

\bottomrule 
\end{tabular}}
\end{table}
% \FloatBarrier


\begin{figure}[!htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:spleen_kidney}
  {\caption{Spleen and kidney segmentation Dice scores for small and unbalanced datasets. The number of subjects combines training and validation subjects. Dice scores for all experiments on the test set (25 subjects) were averaged across the number of subjects and aggregated according to the approach, FiLMed (red) or regular U-Net (blue). The error bars show the standard deviation. $\Delta$ indicates the difference of mean Dice scores between the two approaches. The data totals 10 models trained on different random splits. ** p-value $<5\%$ with one-sided Wilcoxon signed-rank test.}}  {\includegraphics[width=0.9\textwidth]{spleen_kidney.png}}
\end{figure}

Figure \ref{fig:spleen_kidney} demonstrates the ability of FiLMed U-Net to be trained on small or unbalanced datasets. With the same amount of labels for a given class, FiLMed models reached superior Dice scores for datasets of size 2, 4, 6, and 8 compared with the regular U-Nets trained on a single class, 11.5\%, 16.7\%, 5.5\%, and 4.7\%, respectively. This suggests that the FiLMed models were able to learn from the images associated with the other task. The more subjects are included in the dataset, the more similar FiLM performances become to regular U-Nets, as seen in Table~\ref{tab:organs}. However, FiLMed models have the advantage of being robust to missing classes.
\vspace{-2mm}
\section{Discussion}
FiLM provides a flexible, low computational cost option to integrate prior knowledge. In this paper, the type of spinal cord tumor was exploited as a proof of concept, but the possibilities of metadata that can improve the performance of a model are vast. The prior metadata could include domain information (e.g., acquisition center, scanner vendor), anatomical data (e.g., location in the body, pose estimation, disease type or severity), or rater specification (e.g., rater's experience, rater's id). To elaborate on an example, inter-expert variability is an important aspect in medical segmentation \cite{renard2020variability}. Integrating this information in the model would enable one to make predictions according to the rater with the most experience or to create a model that can replicate inter-expert predictions (i.e., generating one prediction per expert learned in training).

FiLM is capable of dealing with missing labels by indicating which annotations are presented to the model. Many new medical imaging datasets are available; however, most have limited scopes and annotations. FiLM makes it possible to use data from different sources with only one class annotated to create a multi-class model instead of single-class models trained on each dataset. Without the need for more labels, combining datasets increases the number of examples seen by the model. Since weights are shared between tasks, the model learns from the data of the other tasks as seen in Figure \ref{fig:spleen_kidney}. The transfer learning between tasks and the robustness with respect to missing segmentations reduce the number of annotations required. 



Since the metadata is one-hot encoded before being introduced into the FiLM generator, discrete prior information is needed. The approach presented works with continuous data (e.g., age, size, MRI acquisition parameters), but it must be discretized into a binned range. Future work should explore methods to best encode different data types. This enhancement would allow the integration of MRI acquisition parameters (e.g., echo-time, flip angle) that might make the model agnostic to the different acquisition sequences.

\vspace{-2mm}
\section{Conclusion}
The integration of linear conditioning through FiLM for segmentation models enables a flexible option to integrate metadata to enhance the predictions. FiLM also facilitates the training of multi-class models by being robust to missing labels. Future work could focus on the impact of integrating other types of data than the tumor type, increasing the number of metadata used to modulate the network, or evaluating the impact of including prior information on the model's uncertainty.  

% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{We thank the contributors of the ivadomed project, Lucas Rouhier, Ainsleigh Hill, Valentine Louis-Lucas, and Christian Perone for fruitful discussions.

Funded by the Canada Research Chair in Quantitative Magnetic Resonance Imaging [950-230815], the Canadian Institute of Health Research [CIHR FDN-143263], the Canada Foundation for Innovation [32454, 34824], the Fonds de Recherche du Québec - Santé [28826], the Fonds de Recherche du Québec - Nature et Technologies [2015-PR-182754], the Natural Sciences and Engineering Research Council of Canada [RGPIN-2019-07244], the Canada First Research Excellence Fund (IVADO and TransMedTech), the Courtois NeuroMod project and the Quebec BioImaging Network. This research is based on work partially supported by the CIFAR AI and COVID-19 Catalyst Grants. A.L. has a fellowship from NSERC, FRQNT, and UNIQUE, C.G. has a fellowship from IVADO [EX-2018-4], O.V. has a fellowship from NSERC, FRQNT, and UNIQUE.}


\bibliography{lemay21}

\clearpage
\appendix
\section{Experimental design of organ segmentation with limited annotations}
\label{appendix:design}
\begin{figure}[!htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:experimental_design}
  {\caption{Experimental design of organ segmentation with limited annotations. The images associated with each model represent the training and validation set. This experimental design was used to generate Figure \ref{fig:spleen_kidney}.}}
  {\includegraphics[height=0.5\textheight,angle=90,origin=c]{few_shot_experimental_design.png}}
\end{figure}

\clearpage
\section{Spinal cord tumor segmentation}
\label{appendix:tumor}
\begin{figure}[!htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:Tumor}
  {\caption{Tumor segmentation prediction by FiLMed U-Net informed by the tumor type, "With prior", or not informed, "No prior". A1 and A2 present two subjects with astrocytomas. H1 and H2 present two subjects with hemangioblastomas. GT: Ground truth.}}
  {\includegraphics[width=\textwidth]{film_tumorseg.png}}
\end{figure}

 Astrocytomas are typically large, have ill-defined boundaries, and present heterogeneous, moderate, or partial enhancement in the Gd-e T1w contrast \cite{baleriaux1999spinal}. Astrocytomas are usually extensive, expanding from 2 to 19 vertebral bodies in size \cite{baleriaux1999spinal}. In both A1 and A2 predictions from the model without prior information, the segmented tumor size was one vertebral body or less and corresponded to the most enhanced tumor signal on the Gd-e T1w (ignoring the rest of the lesion). 
 
In contrast, hemangioblastomas are usually associated with a small tumor core  \cite{baleriaux1999spinal} intensely enhanced on Gd-e T1w \cite{baker2000mr}. Figure \ref{fig:Tumor} H1 presents a hemangioblastoma barely apparent in T2w and hidden by the cavity (hyperintense signal). The small hyperintense signal on the Gd-e T1w contrast was overlooked by the regular approach. On H2, the model oversegmented the tumor and identified a second tumor on a hypointense signal. The false positive tumor identification does not present an intense Gd-e T1w enhancement which is usually the case for hemangioblastomas. This false positive is not present for the model informed by the tumor type.

\label{appendix:tumor_style}
\begin{figure}[!htbp]
\floatconts
  {fig:Tumor_style}
  {\caption{Impact of inputting different tumor types with FiLMed U-Net on the model's segmentation.  True label represents the tumor type while input label is the tumor type input into the model through FiLM. Astr.: Astrocytoma, Epen.: Ependymoma, Hema.: Hemangioblastoma.}}
  {\includegraphics[width=\textwidth]{tumor_style.png}}
\end{figure}


\begin{table}[!htbp]
\centering
\floatconts
  {tab:tumor_style}%
  {\caption{Spinal cord tumor core segmentation Dice scores for FiLMed U-Net with the different tumor types as input (mean $ \pm $ STD \% for 10 random splits). True label represents the tumor type while input label is the tumor type input into the model through FiLM. ** p-value $<$ 0.05 for one-sided Wilcoxon signed-rank test compared to the highest value in each row.}}
{\centering

\begin{tabular}{lccc}
\cmidrule[\heavyrulewidth]{2-4}

\multicolumn{1}{l}{} & \multicolumn{3}{c}{\textbf{Input label}} \\
\toprule
\textbf{True label} & \textbf{Astrocytoma}  & \textbf{Ependymoma} & \textbf{Hemangioblastoma}\\
\midrule
\textbf{Astrocytoma} & \bm{$57.9 \pm 4.9$}  & $57.3 \pm 4.9$  & $32.2 \pm 5.1$ ** \\
\textbf{Ependymoma} & $57.6 \pm 2.6$  & \bm{$57.7 \pm 2.4$}  & $35.9 \pm 4.7$ **  \\
\textbf{Hemangioblastoma} & $41.5 \pm 4.7$ ** & $41.8 \pm 6.4$ ** & \bm{$61.7 \pm 3.7$} \\
\bottomrule 
\end{tabular}}
\end{table}


To assess the impact of inputting the tumor type, each prediction was modulated by the different tumor types. Table \ref{tab:tumor_style} presents the quantitative results for each condition while Figure \ref{fig:Tumor_style} qualitatively illustrates the impact of changing the tumor type. The highest Dice scores are reached when the input label corresponds to the true label. The modulation with FiLM successfully encoded knowledge about the tumor types and the predictions are in agreement with known characteristics of the different types. Astrocytoma and ependymoma yield similar predictions. Both tumor types have overlapping characteristics \cite{kim2014differentiation}: high intensity signals on T2w, comparable enhancement patterns, similar size (astrocytoma: 2-19 vertebral bodies, ependymoma: 2-13 vertebral bodies \cite{baleriaux1999spinal}), etc. Predictions with hemangioblastoma as input diverge from the other tumor types. Hemangioblastoma predictions reflect their characteristics: small tumor cores intensely enhanced in Gd-e T1w, as seen in Figure \ref{fig:Tumor}. When inputting the hemangioblastoma label for the astrocytoma (first row of Figure \ref{fig:Tumor_style}) no prediction is given since the Gd-e T1w modality has moderate enhancement. Similarly, for the ependymoma, only the most Gd-enhanced portion of the tumor is predicted when assigning the hemangioblastoma label with FiLM (second row of Figure \ref{fig:Tumor_style}). The results from Table \ref{tab:tumor_style} and Figures~\ref{fig:Tumor} and~\ref{fig:Tumor_style} confirm that FiLM layers are able to learn characteristics from the metadata that are relevant for the segmentation.
\end{document}