\documentclass{midl} % Include author names

% Header for extended abstracts
\jmlrproceedings{MIDL}{Medical Imaging with Deep Learning}
\jmlrpages{}
\jmlryear{2021}

% to be uncommented for submissions under review
\jmlrworkshop{Short Paper -- MIDL 2021}
\jmlrvolume{}
\editors{Under Review for MIDL 2021}

\title[Image Segmentation on Edge TPUs]{Efficient biomedical image segmentation on Edge TPUs}

% More complicated cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Andreas M. Kist\nametag{$^{1,2}$}} \Email{andreas.kist@fau.de}\\
\addr $^{1}$ Department of Artificial Intelligence in Biomedical Engineering, Friedrich-Alexander-University Erlangen-Nürnberg, Germany \\
\addr $^{2}$ Division of Phoniatrics and Pediatric Audiology, Department of Otorhinolaryngology, Head- and Neck surgery, University Hospital Erlangen, Friedrich-Alexander-University Erlangen-Nürnberg, Germany \AND
\Name{Michael Döllinger\nametag{$^{2}$}} \Email{michael.doellinger@uk-erlangen.de}
}

\begin{document}

\maketitle

\begin{abstract}
Biomedical semantic segmentation is typically performed on dedicated, costly hardware. In a recent study, we suggested an optimized, tiny-weight U-Net for an inexpensive hardware accelerator, the Google Edge TPU. Using an open biomedical dataset for high-speed laryngeal videoendoscopy, we exemplarily show that we can dramatically reduce the parameter space and computations while keeping a high segmentation quality. Using a custom upsampling routine, we fully deployed optimized architectures to the Edge TPU. Combining the optimized architecture and the Edge TPU, we gain a total speed-up of \textgreater79$\times$ compared to our initial baseline while keeping a high accuracy. This combination allows immediate results to be provided at the point of care, especially in constrained computational environments.
\end{abstract}

\begin{keywords}
Edge TPU, image segmentation
\end{keywords}

\section{Introduction}

Semantic segmentation is an important tool in biomedical data analysis. Many quantitative measures rely on the extracted information, such as cancer size, fetal development or voice quality. The latter relies on the semantic segmentation of the glottal area (\figureref{fig:unet}(a)), which has been an active area of research for years \cite{andrade2020laryngeal}; only recently have deep convolutional neural networks (DNNs) been applied \cite{gomez2020bagls}. However, the focus has been mainly on the segmentation quality and not particularly on the segmentation speed. In a recent study that we briefly present here, we showcase efficient DNNs that are both fast and highly accurate \cite{kist2020efficient}, especially in constrained, CPU-only environments. We not only utilize computational tweaks similar to previous studies \cite{tan2019efficientnet}, but also make use of a novel, inexpensive hardware accelerator termed Edge TPU \cite{cass2019taking}. The use of Edge TPUs in biomedical tasks has barely been explored; therefore, we briefly present our advances on this topic.

\section{Methods}

\begin{figure}[tb]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:unet}
  {\caption{Biomedical semantic segmentation on Edge TPUs. (a) Glottis segmentation task with optimization strategies and deployment scheme. The aim is to map all operations to the Edge TPU. (b) Network performance across different U-Net depths and base filters. (c) Custom UpSampling2D operation. (d) Inference speed for na\"ive Keras on a conventional consumer CPU (blue), partly mapped with the default UpSampling2D operation to the Edge TPU (red) and using our custom UpSampling2D routine (green) across network size configurations. }}
  {\includegraphics[width=\linewidth]{fig1_merge.pdf}}
\end{figure}

We implement a U-Net and the DeepLabV3+ \cite{chen2017rethinking} architecture as described previously \cite{kist2020efficient} in TensorFlow/Keras v1.15. Operations not supported by the Edge TPU, such as dilation in Conv2D layers, were replaced accordingly. We further ensured that BatchNorm layers were not fused, as this caused issues during compilation. We use the BAGLS dataset \cite{gomez2020bagls} for semantic segmentation of the glottis (binary pixel-wise classification). We train our networks with the Adam
optimizer, a Dice loss, a cyclic learning rate between $10^{-3}$ and $10^{-6}$, and a varying base filter size (Equation~\ref{eq:filter}) to scale the U-Net size dynamically.

\begin{equation}
    \label{eq:filter}
    \text{filter}_{\text{Depth}} = \text{filter}_{\text{base}} \cdot 2^{\text{Depth}-1}, \quad \text{Depth} \in [1, \text{Depth}_{\text{max}}]
\end{equation}

Images were rescaled to 512$\times$256 px. All networks were trained in quantization-aware mode and subsequently converted to TFLITE. Next, the Edge TPU compiler mapped the operations to CPU and Edge TPU. When activation maps in the decoder exceeded a given size, we applied a custom UpSampling2D routine that allowed us to fully map all operations to the Edge TPU, in contrast to the default UpSampling2D operation (see \figureref{fig:unet}(c)). We used the Intersection over Union (IoU) score as the evaluation metric.

\section{Results}

We found that a variety of U-Net configurations are able to tackle the glottis segmentation task (\figureref{fig:unet}(a)) as measured by the IoU score on the validation dataset (\figureref{fig:unet}(b)). Our results suggest that a reduction of the base filter count is better tolerated than a reduction of the network depth, indicating the importance of high-level features. Reducing the base filter count and using separable convolutions had the most impact on reducing the parameter space (\textgreater 99\% parameter reduction). We further found that Conv2D layers are more efficiently implemented in Edge TPUs than separable Conv2D layers, similar to previous reports.

We found that upsampling operations on large activation maps are partly mapped to the CPU, introducing a bottleneck across different network size configurations, here shown for the DeepLabV3+ architecture with the MobileNetV2 backbone (\figureref{fig:unet}(d)). However, using a custom UpSampling2D routine (\figureref{fig:unet}(c)), we were able to circumvent this issue. This affected the inference speed significantly and removed the CPU bottleneck (\figureref{fig:unet}(d)). In detail, we observed an inference speed of up to 30 fps compared to the constant 6 fps on the partially mapped configuration. In comparison, the same networks in the na\"ive Keras environment have a runtime performance between 2 and 10 fps (\figureref{fig:unet}(d)), resulting in a 3--8$\times$ speed-up. Additionally, we do not observe significant drops in segmentation quality by int8 conversion \cite{kist2020efficient}. Combining the architecture optimization and the use of the Edge TPU, we gained in total a \textgreater79$\times$ speed-up compared to our initial baseline \cite{kist2020efficient}.

\section{Conclusion}

In this work, we highlight the use of Edge TPUs across different architectures in biomedical image segmentation and show that slight modifications to the architectures result in significant performance boosts. We envision that these findings will influence the deployment of DNNs in constrained or remote environments, such as endoscopic imaging setups.

% Acknowledgments---Will not appear in anonymized version
%\midlacknowledgments{The project was partially funded by the BMWi (ZF4010105/BA8) to MD. AMK was further funded by a Joachim Herz foundation fellowship.}


\bibliography{kist21}


% \appendix
\let\clearpage\relax

\end{document}
