\documentclass{midl} 

\usepackage{amsmath}
\usepackage{bbm}
\usepackage{mathrsfs}
\usepackage{color}
\usepackage{natbib}
\usepackage{booktabs}
\usepackage{multirow}
\DeclareMathOperator*{\argmax}{argmax}

\newcommand{\mname}{SeqNet}
\newcommand{\Mname}{SeqNet}

\usepackage{mwe} 
\jmlryear{2020}
\jmlrworkshop{Full Paper -- MIDL 2020}

\title[Segmentation and A/V Classification on Retinal Images with Post-processing]{Joint Learning of Vessel Segmentation and Artery/Vein Classification with Post-processing}


  \midlauthor{\Name{Liangzhi Li} \Email{li@ids.osaka-u.ac.jp}\\
   \Name{Manisha Verma} \Email{mverma@ids.osaka-u.ac.jp}\\
   \Name{Yuta Nakashima} \Email{n-yuta@ids.osaka-u.ac.jp}\\
   \Name{Ryo Kawasaki} \Email{ryo.kawasaki@ophthal.med.osaka-u.ac.jp}\\
   \Name{Hajime Nagahara} \Email{nagahara@ids.osaka-u.ac.jp}\\
   \addr Osaka University, 1-1 Yamadaoka, Suita, Osaka, Japan 565-0871}
 
\begin{document}

\maketitle

\begin{abstract}
Retinal imaging serves as a valuable tool for diagnosis of various diseases. However, reading retinal images is a difficult and time-consuming task even for experienced specialists. The fundamental step towards automated retinal image analysis is vessel segmentation and artery/vein classification, which provide various information on potential disorders. To improve the performance of the existing automated methods for retinal image analysis, we propose a two-step vessel classification. We adopt a UNet-based model, \mname{}, to accurately segment vessels from the background and make prediction on the vessel type. Our model does segmentation and classification sequentially, which alleviates the problem of label distribution bias and facilitates training. To further refine classification results, we post-process them considering the structural information among vessels to propagate highly confident prediction to surrounding vessels. Our experiments show that our method improves AUC to 0.98 for segmentation and the accuracy to 0.92 in classification over DRIVE dataset.
\end{abstract}

\begin{keywords}
Medical imaging, retina images, vessel segmentation, vessel classification, deep learning, computer vision.
\end{keywords}

\section{Introduction}\label{section_introduction}

Retinal imaging is the only feasible way to directly inspect the vessels and the central nervous system in the human body \textit{in vivo}, which can give us informative signs and indications on possible disorders. Fundoscopy has thus become an important method and a routine examination to help diagnose many diseases, including diabetes, hypertension, arterial hardening, and so forth \cite{chatziralli2012value}. Fundoscopy is easy to operate, quick, accurate, and relatively low in cost. Medical doctors, not only ophthalmologists, are considering a wider use of fundoscopy.

However, similarly to other types of medical images, retina images exhibit high complexity and huge diversity \cite{JIN2019}. Sufficiently trained specialists are required to handle ever-increasing requests to read such images. Moreover, reading retinal images by specialists can potentially be error-prone under this highly demanded circumstance. To that end, computer-aided diagnosis can be a promising technical break-through that automatically analyzes such retina images. 

Various high-level tasks of retinal image analysis, such as the calculation of central artery equivalent, central vein equivalent, artery-to-vein diameter ratio \cite{huang2018artery}, as well as the detection of retinal artery occlusion and retinal vein occlusion \cite{woo2016associations}, which can reveal risks of stroke, cerebral atrophy, cognitive decline, and myocardial infarct, etc., are built on top of vessel segmentation and artery/vein (A/V) classification. A vast amount of research effort has been devoted to both components. For vessel segmentation, most of the earliest attempts are based on the local information of retinal images \cite{Cheng2014, 7042289}, including intensity, color, some hand-crafted features, etc. In recent years, UNet \cite{UNet}-based segmentation models have become more popular \cite{8036917, 8341481}. As for A/V classification, a classic approach is applied to segmented vessels in retinal images \cite{HUANG2018197}, where some structural prior on vessels has been leveraged for better performance \citep{alam2018combining,8598955}. Deep models have also been explored and have achieved state-of-the-art performance \cite{10.1007/978-3-319-93000-8_71}. Meanwhile, the lack of large-scale labeled datasets motivates data augmentation with generative adversarial networks \cite{8055572}.


Although many approaches have been proposed in this area, their performances are not satisfactory yet. This is because retina images are usually complicated and full of noise. It is hard to extract all vessels, including minor ones, while not introducing too many false vessel pixels. Moreover, the available training data are very limited. In most of the public datasets, the number of retina images for training is no more than $20$. Furthermore, things become more difficult when we need to classify the vessels into artery or vein, because this further increases the imbalance between the numbers of pixels on artery or vein vessels and the number of background (non-vessel) pixels.

\begin{figure}[t]
\floatconts
  {fig_story}
  {\caption{An example retina image from a public dataset \cite{staal:2004-855, 10.1007/978-3-642-40763-5_54}. (a) Raw image. (b) Vessel segmentation. (c) Artery (red) / Vein (blue) classification.}}
  {\subfigure[]{\includegraphics[width=0.25\textwidth]{zfig_story_raw.pdf}}
	\hfil
	\subfigure[]{\includegraphics[width=0.25\textwidth]{zfig_story_vessel.pdf}}
	\hfil
	\subfigure[]{\includegraphics[width=0.25\textwidth]{zfig_story_av.pdf}}}
\end{figure}

In this paper, we propose a method for automatically analyzing retinal images, such as the one in Fig.~\ref{fig_story}. Our method consists of two components: (i) a neural model, coined \mname{}, that segments vessels and classifies each pixel into artery and vein, and (ii) post-processing to refine the initial classification by \mname{}. The main idea behind our neural model is to jointly train the model, yet the segmentation and classification streams are sequential rather than simultaneous, as shown in Fig.~\ref{fig_structure}. The segmentation stream only cares about vessel extraction. Meanwhile, the classification stream utilizes segmentation results to immunize itself against cluttered backgrounds in input images. The existing methods that simultaneously do segmentation and classification suffer from the severe bias in label distributions since background pixels are dominant in retinal images. We remedy this imbalance by our sequential model, dividing the task into the background/vessel classification (i.e.,~segmentation) task and the artery/vein classification task, where we employ the state-of-the-art model \cite{li2019iternet} for the segmentation stream.

There may still be some errors in classification results. This is because fully convolutional network-like models (such as UNet-based ones \cite{10.1007/978-3-319-93000-8_71, hemelings2019artery, 8759380}), or more generally convolution operations, are more suitable to extract local features than handling global context. Hence all UNet-based models' prediction performances depend on local cues, such as color and contrast, rather than the structure of the whole vessel system. This locality leads to many minor errors, as shown in Fig.~\ref{fig_segments_analysis_2}(a) and (b).

We thus incorporate the global context, i.e., the structure of the vessel system, into our method via post-processing for further improving the performance. We divide extracted vessels into many small segments and unify the pixel-level predictions in each of them into a single prediction, which we call intra-segment label unification. We also propose a new strategy called inter-segment prediction propagation (PP). This strategy can further refine classification results among neighboring segments by propagating predictions to neighboring segments while judging whether they are connected with each other or merely belong to two different vessels that cross.

Our main contribution is three-fold:
\begin{itemize}
	\item We design a joint segmentation and classification model based on the UNet architecture \cite{UNet}, which sequentially handles respective tasks to balance the label distributions for better training.
	\item We propose to post-process classification results for refining them by leveraging global information, called intra-segment label unification and inter-segment prediction propagation, which smooths each pixel's label along the vessel system's structure.
	\item We experimentally demonstrate that our method, including \mname{} and the post-processing, achieves the state-of-the-art performance over two public datasets. The code is available here\footnote{https://github.com/conscienceli/SeqNet}.
\end{itemize}

\section{Methodology}

Our method consists of \mname{} (Fig.~\ref{fig_structure}) for initial segmentation/classification and PP for refinement. The following sections detail these two components.

\subsection{\Mname{}}\label{section_joint_learning}

Some existing methods for A/V classification actually formulate the problem as a ternary classification task, where each pixel is labeled as either artery, vein, or background. This can deteriorate the performance by imposing further imbalance among the labels, i.e., there are many more background labels than artery/vein labels. Most state-of-the-art models actually suffer from a poor segmentation ability, which is discussed in Section~\ref{section_performance_evaluation}. Unlike these methods, \mname{} sequentially applies segmentation into vessel/background and classification into A/V in a single network. Yet, training is done jointly.

\begin{figure}[t]
\floatconts
  {fig_structure}
  {\caption{The network architecture of \mname{}.}}
  {\includegraphics[width=\textwidth]{zfig_structure.pdf}}
\end{figure}

As shown in Fig.~\ref{fig_structure}, \mname{} mainly consists of two streams (the upper stream with the blue and green blocks and the lower stream with the orange block). The upper stream is for segmentation. We adopt IterNet \cite{li2019iternet}, which iteratively refines the segmentation results by smaller UNets (the green block in Fig.~\ref{fig_structure}) after initial segmentation by the blue block.
The state-of-the-art performance has been achieved with this model over the mainstream datasets \cite{staal:2004-855, 5740926}. In \mname{}, the green block is repeated three times, following the original implementation in \cite{li2019iternet}. Both two streams use separate cross entropy losses and are trained jointly with a batch size of $16$. For the target, IterNet uses the segmentation labels while the classification part uses the A/V labels. Adam \cite{kingma2014adam} is used as the optimizer with a learning rate of $0.001$. 

With input retinal image $\mathbf{x} \in \mathbb{R}^{W \times H}$ and refined vessel map $\mathbf{v} \in [0, 1]^{W\times H}$ by IterNet, where $W = 576$ and $H = 576$ are the width and height of the input image and vessel map, we apply another full-size UNet block, which is shown in orange in Fig.~\ref{fig_structure}, to classify each pixel into artery/vein. The possible output labels are \textit{background}, \textit{artery}, and \textit{vein}.
We mask \textit{background} pixels in input image $\mathbf{x}$ by
\begin{equation}
    \mathbf{x}' = \mathbf{x} \odot \mathbf{v},
\end{equation}
where $\odot$ is the element-wise multiplication. This masking reduces the complexity of the input retinal image, so that the classification stream can fully focus on finding the differences in color, thickness, shape, etc., among the vessels.
We put a block layer before the element-wise multiplication to prevent back-propagation from the classification stream to the segmentation stream, so that each stream can be responsible for its respective task and can be trained in a multi-task manner.

The output from the classification stream is merged with the segmentation result. Let $\mathbf{o}_l \in [0, 1]^{W \times H}$, where $l \in \{\text{\textit{background}, \textit{artery}, \textit{vein}}\}$, denote the softmax output of the classification stream.



\subsection{Intra-segment Label Unification}\label{section_category_unifying}

There are mainly two types of errors in classification results: The first one is inconsistency along one single vessel, i.e., both \textit{artery} and \textit{vein} labels appear in a vessel, as shown in Fig.~\ref{fig_segments_common_mistakes1}, because the underlying convolutional network does not account for the structure of the vessel system, making decisions mainly based on local features, such as color and shape. These local features can be easily influenced by environmental factors, e.g., illumination and the retinal camera used. The second type of error is mixed-up prediction that happens mostly near the crossing and branching points, as shown in Fig.~\ref{fig_segments_common_mistakes2}, because local features corresponding to both vessel types may be observed. To remedy these two kinds of errors, we design a post-processing algorithm, namely, \textit{intra-segment label unification} for the label inconsistency problem and \textit{inter-segment prediction propagation} for the mixed-up prediction problem.

Intra-segment label unification firstly generates a binary image $\mathbf{p}$ of detected vessels from \mname{}'s output $\mathbf{v}$ by:
\begin{equation}
    p_k = \mathbbm{1}\{v_k > \theta\},
\end{equation}
where $p_k$ and $v_k$ are the $k$-th pixels in $\mathbf{p}$ and $\mathbf{v}$, respectively; $\theta$ is a predefined threshold.
We then extract binary skeletons using a multiple-threshold method introduced in Appendix~\ref{appendix_mul_threshold}, as shown in Fig.~\ref{fig_segments_analysis}(a). We detect all \textit{key-points}, which include the crossing points between vessels and the terminal points (i.e., start and end points) of vessels (Fig.~\ref{fig_segments_analysis}(b)). Crossing points are detected by looking for vessel pixels on the skeleton image that have more than two neighbors, while terminal points have no more than one neighbor. Skeletal pixels between connected key-points are extracted as a \textit{segment} as in Fig.~\ref{fig_segments_analysis}(c).

Let $S = \{S_i|i=1,\dots,N\}$ be the set of all $N$ segments extracted from $\mathbf{p}$, where $S_i$ is the set of pixels in segment $i$. We compute the confidence $c_{li}$ that segment $S_i$ belongs to label $l \in \{\text{\textit{artery}, \textit{vein}}\}$ by
\begin{equation} \label{eq_sum_confidence}
    c_{li} = \sum_{k \in S_i} o_{lk},
\end{equation}
where $o_{lk}$ is the value in $\mathbf{o}_l$ corresponding to pixel $k$. $c_{li}$ can be viewed as the unified label confidence of $S_i$ corresponding to $l$, where the actual prediction can be done by comparing the $c_{li}$'s, i.e., $S_i$ is \textit{artery} if $c_{\textit{artery},i} > c_{\textit{vein},i}$ and \textit{vein} otherwise.

\begin{figure}[!t]
\floatconts
  {fig_segments_analysis}
  {\caption{An illustrative example of intra-segment label unification. (a) Extracted vessel skeleton. (b) Detected key-points (magenta dot for the cup center and circle for the cup area; blue for crossing points; yellow for terminal points). (c) Extracted segments.}}
  {\subfigure[]{\includegraphics[width=0.3\textwidth]{zfig_skeleton_vessel_vis.png}}
	\hfil
    \subfigure[]{\includegraphics[width=0.3\textwidth]{zfig_skeleton_vessel_points_vis.png}}
	\hfil
	\subfigure[]{\includegraphics[width=0.3\textwidth]{zfig_skeleton_vessel_segments_vis.png}}
    }
\end{figure}


\subsection{Inter-segment Prediction Propagation}

To address errors around crossing and branching points, we introduce additional post-processing, coined inter-segment prediction propagation, in which the label of a segment is propagated to its connected segments. This is based on the observation that classification failures usually come with low confidence on their labels and that they can be corrected by the influences from their connecting segments with high confidence. Propagation should happen depending on the similarity between connecting segments based on their shapes, directions, etc. If two segments share similar shapes, are located nearby, and flow in similar directions, it is highly possible that they belong to the same vessel. Therefore, the influence between these segments should be strong.

Based on this observation, we update confidence $c_{li}$ of segment $S_i$ according to the following rule:
\begin{equation}\label{eq_update_type}
    c_{li} \leftarrow c_{li} + \epsilon_{ij} c_{lj} 
\end{equation}
where $j$ is the index of a segment connected to $S_i$. $\epsilon_{ij}$ is the coefficient that determines the influence of $S_j$ on $S_i$, given by
\begin{equation}\label{eq_cal_coff}
    \epsilon_{ij} = A_{ij}  L_{ij} T_{ij} D_{ij}
\end{equation}

Let $\mathbf{u}_i$ be the unit tangent vector of $S_i$ at a certain key-point, which is computed using the key-point pixel position $\mathbf{p}_{i0}$ and the position $\mathbf{p}_{i5}$ of the fifth pixel along the skeleton, i.e., $\mathbf{u}_i = (\mathbf{p}_{i5} - \mathbf{p}_{i0})/ \|\mathbf{p}_{i5} - \mathbf{p}_{i0}\|$. $A$ involves the angle between $\mathbf{u}_i$ and $\mathbf{u}_j$, defined as
\begin{equation}
    A_{ij} = F_\mathrm{A}(|\alpha(\mathbf{u}_i, \mathbf{u}_j)-180|) \label{eq:a}
\end{equation}
where $\alpha(\mathbf{u}_i, \mathbf{u}_j)$ is the angle (in degrees) formed by vectors $\mathbf{u}_i$ and $\mathbf{u}_j$ and $F_\mathrm{A}$ is given by
\begin{equation}
    F_\mathrm{A}(x) = \frac{(x-m_\mathrm{A})^2}{m_\mathrm{A}^2}, \label{eq:F}
\end{equation}
where $m_\mathrm{A}$ is the pre-defined maximum value decided by observing the vessel systems on the training images. This function serves as normalization of $x$ into $[0, 1]$. $A_{ij}$ gives 1 if the tangent vectors are in opposite directions (i.e., $\alpha(\mathbf{u}_i, \mathbf{u}_j)$ gives 180 degrees).

$L$ handles a potential missing connection between two segments, which is  defined as
\begin{equation}
    L_{ij} = F_\mathrm{L}(\alpha(\mathbf{u}_i, \mathbf{w}_{ij})), 
\end{equation}
where $\mathbf{w}_{ij}$ is a unit vector from $S_i$'s key-point to $S_j$'s, and the angle computed by $\alpha$ is normalized by $F_\mathrm{L}$ in the same way as Eq.~(\ref{eq:F}). $L_{ij}$ gives a value close to 1 if one of $S_j$'s key-points is on the line described by $\mathbf{u}_i$.

Thickness of vessels can also be an informative cue to retrieve connecting vessels since they share a similar thickness when they are connected to each other. We encode this by $T_{ij}$, defined as
\begin{equation}
    T_{ij} = F_{\mathrm{T}} (\beta(S_i, S_j))
\end{equation}
where $\beta(S_i, S_j)$ gives the difference of mean thickness of $S_i$ and $S_j$, computed along the skeleton pixels. 
$D_{ij}$ gives a small value if $S_i$ and $S_j$ are far from each other. We define this as
\begin{equation}
    D_{ij} = F_\mathrm{D}(\|\mathbf{p}_{i0} - \mathbf{p}_{j0}\|).
\end{equation}
Both $F_\mathrm{T}$ and $F_\mathrm{D}$ are defined in the same way as Eq.~(\ref{eq:F}).

We apply this update rule to all extracted segments. The detailed algorithm is presented in Algorithm~\ref{alg:post-processing} in the appendix. The label confidence $c_{li}$ evolves as shown in Fig.~\ref{fig_propagation_steps}. We can see that several iterations correct the predicted labels. Note that a segment has two end points, while $A_{ij}$, $L_{ij}$, and $D_{ij}$ involve a single end point in each of segments $S_i$ and $S_j$. We update the confidence for all four combinations of end points.

\begin{figure}[t]
\floatconts
  {fig_propagation_steps}
  {\caption{An illustrative example of prediction propagation. (a) Initial prediction with several errors. (b)--(d) Resulting predictions after individual iterations.}}
  {\subfigure[]{\includegraphics[width=0.22\textwidth]{zfig_infection_1.pdf}}
	\hfil
	\subfigure[]{\includegraphics[width=0.22\textwidth]{zfig_infection_2.pdf}}
	\hfil
	\subfigure[]{\includegraphics[width=0.22\textwidth]{zfig_infection_3.pdf}}
	\hfil
	\subfigure[]{\includegraphics[width=0.22\textwidth]{zfig_infection_4.pdf}}}
\end{figure}

This propagation process is not allowed to change the segments in the cup area, which is indicated by the magenta circle in Fig.~\ref{fig_segments_analysis}(b). This is because vessels in this area are too dense for their relationships to be analyzed reliably, i.e., which segments are actually connected together and which segments are merely crossing, etc. Also, higher brightness in the cup area results in many segmentation failures, which may lead to the failure of PP.

\section{Performance Evaluation}\label{section_performance_evaluation}

We use two popular public datasets, namely DRIVE \cite{staal:2004-855}, and the artery/vein labels from \cite{10.1007/978-3-642-40763-5_54}, as well as LES-AV \cite{orlando2018towards}, to evaluate our method. We compare our method with two recent methods, \textit{i.e.}, uncertainty-aware (UA) \cite{8759380} and fully convolutional network (FCN) \cite{hemelings2019artery}, on the DRIVE dataset. 

One problem is that existing methods use different evaluation strategies. Although most of them use accuracy as the performance metric, they usually compute it with different pixel masks, including the whole image, the discovered vessel pixels, the ground-truth vessel pixels, the major vessel pixels, etc. To remove the barrier of reproducing and testing A/V classification methods, we adopt a newly-proposed evaluation procedure \cite{hemelings2019artery} which includes a series of pixel masks, such as full image, center-line of discovered vessels, center-line of major discovered vessels (width$_{\geq2\operatorname{px}}$), the amount of discovered vessels, etc.

Among the results shown in Table~\ref{table_results_drive} and Table~\ref{table_results_les_av}, we can see that our method achieves a better AUC value than other models, as our model avoids deterioration of the segmentation performance thanks to the isolation of segmentation and classification. Also, our full method (\mname{} \& LU \& PP) shows higher accuracy on both datasets.



\begin{table}[t]
\floatconts
  {table_results_drive}
  {\caption{Performance evaluation on DRIVE dataset.}}
  {\begin{tabular}{|l |c| c|c| c|c|  c|} \hline
 \multirow{2}{*}{Methods} & \multirow{2}{*}{Full Image} & \multicolumn{2}{c|}{Center} & \multicolumn{2}{c|}{Center$_{\geq2\operatorname{px}}$} & \multirow{2}{*}{Vessel}\\
 \cline{3-6}
 && Acc.&F1 & Acc.&F1 & 
  \\ \hline
  UA \cite{8759380} & 0.966 & 0.888&0.888 & 0.923&0.923 & 0.741 \\
  FCN \cite{hemelings2019artery} & - & -&- & 0.940 &- &  -\\
 \mname{} \textit{w.o.} post-processing & \textbf{0.967} & 0.914&0.914 & 0.946&0.946 & 0.774 \\
 \mname{} \textit{w.} post-processing  &\textbf{0.967} & \textbf{0.919}&\textbf{0.919} & \textbf{0.953}&\textbf{0.953} &\textbf{ 0.778} \\ 
 \hline
  \end{tabular}}
\end{table}

\begin{table}[t]
\floatconts
  {table_results_les_av}
  {\caption{Performance evaluation on LES-AV dataset.}}
  {\begin{tabular}{|l |c| c|c| c|c|  c|} \hline
 \multirow{2}{*}{Methods} & \multirow{2}{*}{Full Image} & \multicolumn{2}{c|}{Center} & \multicolumn{2}{c|}{Center$_{\geq2\operatorname{px}}$} & \multirow{2}{*}{Vessel}\\
 \cline{3-6}
 && Acc.&F1 & Acc.&F1 & 
  \\ \hline
 \mname{} \textit{w.o.} post-processing & \textbf{0.978} & 0.858&0.858 & 0.916&0.916 &  0.776 \\
 \mname{} \textit{w.} post-processing  &\textbf{0.978} & \textbf{0.874}&\textbf{0.874} & \textbf{0.930}&\textbf{0.930} &\textbf{ 0.785} \\ 
 \hline
  \end{tabular}}
\end{table}


\section{Conclusion}\label{conclusion_section}
In this paper, we propose \mname{} for accurate vessel segmentation and artery/vein classification in retinal images, together with a post-processing algorithm. \mname{} performs segmentation and classification sequentially rather than simultaneously, because simultaneous processing may deteriorate the segmentation performance due to the problem of imbalanced label distribution. Our post-processing algorithm then corrects classification results by propagating highly confident labels to their surrounding vessel segments. Experimental results showed that our method is effective and can achieve the state-of-the-art performance on two public datasets.

\midlacknowledgments{This work was supported by Council for Science, Technology and Innovation (CSTI), cross-ministerial Strategic Innovation Promotion Program (SIP), ``Innovative AI Hospital System'' (Funding Agency: National Institute of Biomedical Innovation, Health and Nutrition (NIBIOHN)). This work was also supported by JSPS KAKENHI Grant Number 19K10662.}


\bibliography{li20}

\newpage
\appendix
\section{Multiple Thresholds in Segments Extraction} \label{appendix_mul_threshold}

In order to propagate the influence correctly, we have to extract the vessel segments accurately. Otherwise, the vessel map may be erroneous, resulting in unreasonable propagation, as shown in Fig.~\ref{fig_multiple_thresholds}(a). Due to a missing important segment, a wrong label is propagated to the segment on the right hand side. Therefore, we should make several different binary skeletons with different thresholds and combine them into a complete vessel map. This is also detailed in Algorithm~\ref{alg:post-processing}.


\section{Example Results of Intra-Segment Label Unification}

Fig.~\ref{fig_segments_analysis_2}(a) shows the direct output from the classification stream, in which we can see many prediction errors. Figs.~\ref{fig_segments_analysis_2}(b) and (c) are the results of vessel skeleton extraction and label unification, respectively, where most label inconsistencies within a single vessel segment have been resolved.

\begin{figure}[h]
\floatconts
  {fig_multiple_thresholds}
  {\caption{Multiple thresholds and the propagation results.}}
  {\subfigure[]{\includegraphics[width=0.22\textwidth]{zfig_mul_tr1.pdf}}
	\hfil
	\subfigure[]{\includegraphics[width=0.22\textwidth]{zfig_mul_tr2.pdf}}
	\hfil
	\subfigure[]{\includegraphics[width=0.22\textwidth]{zfig_mul_tr3.pdf}}
	\hfil
	\subfigure[]{\includegraphics[width=0.22\textwidth]{zfig_mul_tr4.pdf}}}
\end{figure}

\begin{figure}[h]
\floatconts
  {fig_segments_analysis_2}
  {\caption{Example of label unification. (a) The initial prediction by \mname{}. (b) Vessel skeleton extracted from initial prediction. (c) Label unification result.}}
  {\subfigure[]{\includegraphics[width=0.3\textwidth]{zfig_rough_prediction.jpg}}
	\hfil
	\subfigure[]{\includegraphics[width=0.3\textwidth]{zfig_skeleton_vessel_color_vis.png}}
	\hfil
	\subfigure[]{\includegraphics[width=0.3\textwidth]{zfig_skeleton_segment_color_vis.png}}
    }
\end{figure}

\section{Common Prediction Errors}

Figs. \ref{fig_segments_common_mistakes1} and \ref{fig_segments_common_mistakes2} respectively show two common errors in classification, i.e., inconsistency along one single vessel segment and mixed-up prediction that happens around the crossing and branching points in most cases. 

\begin{figure}[h]
\floatconts
  {fig_segments_common_mistakes1}
  {\caption{Prediction errors happened along a vessel segment. (a) Ground-truth labels. (b) Initial prediction by \mname{}. (c) Post-processed result.}}
  {\subfigure[]{\includegraphics[width=0.3\textwidth]{zfig_error_in_label.png}}
	\hfil
	\subfigure[]{\includegraphics[width=0.3\textwidth]{zfig_error_in_before.png}}
	\hfil
	\subfigure[]{\includegraphics[width=0.3\textwidth]{zfig_error_in_after.png}}
    }
\end{figure}

\begin{figure}[h]
\floatconts
  {fig_segments_common_mistakes2}
  {\caption{Prediction errors happened around crossing or branching points. (a) Ground-truth labels. (b) Initial Prediction by \mname{}. (c) Post-processed result.}}
  {\subfigure[]{\includegraphics[width=0.3\textwidth]{zfig_error_betw_label.png}}
	\hfil
	\subfigure[]{\includegraphics[width=0.3\textwidth]{zfig_error_betw_before.png}}
	\hfil
	\subfigure[]{\includegraphics[width=0.3\textwidth]{zfig_error_betw_after.png}}
    }
\end{figure}

\newpage
\section{Post-Processing Algorithm}

We detail the proposed post-processing in Algorithm \ref{alg:post-processing}, including multiple thresholds fusion, segment extraction, label unification, and prediction propagation. 

The thresholds we select in our implementation are $0.5$, $0.3$, and $0.1$. They are in descending order because a higher threshold results in a skeleton with higher confidence by focusing more on major vessels, while the smaller thresholds cover minor vessels.

As introduced in Section \ref{section_category_unifying}, label unification is based on the confidence associated with each segment, which is actually the sum of the prediction confidence of pixels in that segment. The confidence value is also used in PP, which may need several iterations for a better result. In our experiment, the number of iterations is set to $5$.


\begin{algorithm2e}
    \caption{Segment extraction, label unification, and prediction propagation.}
    \label{alg:post-processing}
    \KwIn{Initial prediction result $P = \{P_1, P_2,..., P_n\}$}
    \KwOut{Refined prediction result $P' = \{P'_1, P'_2,..., P'_n\}$}
    \tcc{Start searching segments in the vessel map}
    segments $\leftarrow$ None;\\
    \For{tr in $[0.5,0.3,0.1]$}
    {
        BS $\leftarrow$ Skeletonize(Binarify($P$, threshold=tr));\\
        keypoints $\leftarrow$  FindEndPoints(BS) + FindCrossingPoints(BS);\\
        segments $\leftarrow$ segments + FindSegments(keypoints);
    }
    \tcc{Start unifying the segments}
    \For{$S$ in segments}
    {
        $t^{\boldsymbol{S}} \leftarrow$ CalculateTotalConfidence($S$) \tcp*{using Eq.~\ref{eq_sum_confidence}}
        UnifyResultAlongOneSegment($S$);
    }
    \tcc{Start prediction propagation}
    count $\leftarrow$ 0;\\
    \While{count $< 5$}{
        \For{$S$ in segments}
        {
            $t^{\boldsymbol{S}} \leftarrow$ UpdateConfidence($S$, segments) \tcp*{using Eq.~\ref{eq_update_type},\ref{eq_cal_coff}}
            ChangeSegmentCategory($S$, $t^{\boldsymbol{S}}$);
        }
        count $\leftarrow$ count $+ 1$;
    }
\end{algorithm2e}
\newpage

\section{Example Prediction Results}

Fig.~\ref{fig_experiment_figures_drive} shows an example result on the DRIVE dataset.

\begin{figure}[h]
\floatconts
  {fig_experiment_figures_drive}
  {\caption{Prediction results for a single retinal image from the DRIVE dataset. (a) The input image. (b) The corresponding ground-truth labels. (c) The output from the uncertainty-aware (UA) method \cite{8759380}. (d) The output from our method.}}
  {\subfigure[]{\includegraphics[width=0.45\textwidth]{zfig_experiment_raw.jpg}}
	\hfil
	\subfigure[]{\includegraphics[width=0.45\textwidth]{zfig_experiment_label.png}}
	
	\subfigure[]{\includegraphics[width=0.45\textwidth]{zfig_experiment_ua.jpg}}
	\hfil
	\subfigure[]{\includegraphics[width=0.45\textwidth]{zfig_experiment_ours.png}}}
\end{figure}

\end{document}
