\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images

% Header for extended abstracts
\jmlrproceedings{MIDL}{Medical Imaging with Deep Learning}
\jmlrpages{}
\jmlryear{2021}

% to be uncommented for submissions under review
\jmlrworkshop{Short Paper -- MIDL 2021}
% \jmlrvolume{-- Under Review}
% \editors{Under Review for MIDL 2021}

\title[Double adversarial domain adaptation for WSI classification]{Double adversarial domain adaptation for whole-slide-image classification}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

% \footnotetext[1]{Contributed equally}

% More complicated cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Yuchen Yang\nametag{$^{1}$}} \Email{yy17@ualberta.ca}\\
\Name{Amir Akbarnejad\nametag{$^{1}$}} \Email{ah8@ualberta.ca}\\
\Name{Nilanjan Ray\nametag{$^{1}$}} \Email{nray1@ualberta.ca}\\
\Name{Gilbert Bigras\nametag{$^{1,2}$}} \Email{gilbertbigras@gmail.com}\\
\addr $^{1}$ Department of Computing Science, University of Alberta \\
\addr $^{2}$ Department of Laboratory Medicine and Pathology, University of Alberta
}

\begin{document}

\maketitle

\begin{abstract}
Image classification on whole-slide images (WSIs) is a challenging task. A previous work based on Fisher vector encoding provided a novel end-to-end pipeline with promising accuracy and computational efficiency.
However, the pipeline suffers from an accuracy drop due to domain shift. This poses a limitation on the practical use of the pipeline especially when the diagnoses of WSIs are hard to obtain. This paper aims for a solution to mitigate the accuracy drop by using an unsupervised domain adaptation approach. We propose to insert the domain classifiers into the pipeline in two stages to align the features during training. 
We evaluate accuracy by calculating the confusion matrices before and after the adaptation on two datasets. We demonstrate that placing domain classifiers in different stages will boost accuracy.

\end{abstract}

\begin{keywords}
whole-slide-image, classification, domain adaptation, deep learning.
\end{keywords}

\section{Introduction}
Deep learning for classification of whole-slide images (WSIs) is challenging. The challenge comes from their high resolution and sparsely scattered diagnostic information. Recently, a new end-to-end embedding method---Deep Fisher vector coding (DFVC) \cite{Akbarnejad2021partial}---was proposed. In this paper, we enhance the DFVC pipeline to cope with data distribution shift.

Data distribution shift often results in an accuracy drop when a DNN model is trained on one WSI dataset and tested on another. Potential causes include different staining processes at different institutions, WSIs scanned in different periods or on different machines, and so on. Also, the cost of labeling WSIs is high: diagnoses such as HER2 or Gleason scores are expensive to obtain. In this paper, we use an unsupervised domain adaptation (UDA) approach to mitigate the accuracy drop across WSI datasets.

Domain adaptation for medical imaging has previously been discussed in works such as \cite{ren2018adversarial}. However, previous attempts only focus on adapting image patches extracted from the WSI to achieve higher patch-based accuracy. Their approaches are limited by the patch-based classification pipelines, without considering adapting and classifying the WSI as a whole. This paper builds on the DFVC pipeline for WSI classification. We propose a UDA solution that integrates the original pipeline with domain classifiers in two stages to minimize the accuracy decrease. A comparison of a model without adaptation, adapted models, and an oracle model is presented to show the effectiveness of our solution.

\section{Methods}

\begin{figure*}[ht]
\centering
\includegraphics[width=1.0\textwidth]{pdf/MIDL pipeline - Copy.pdf}
\caption{Overview of dual-stage adaptation for WSI classification.
}
\label{fig:dual_stage_pipeline}
\end{figure*}

We show our integrated UDA solution in Fig.~\ref{fig:dual_stage_pipeline}. First, the WSIs of both source and target domains are randomly sampled and augmented, following the original DFVC pipeline, and then fed into a CNN for feature encoding. Afterward, we forward the features to a domain classifier. This domain classifier works with the patch-wise features and is responsible for adapting local distribution shifts on patches from different domains. The domain classifier outputs one of two domain labels---source or target---and is trained with binary cross-entropy loss. A gradient reversal layer is attached on top of the domain classifier to enable adversarial training to adapt features from the two domains.

Besides forwarding the features to the local domain adaptation part, both source and target CNN-encoded features from stage one are passed to the next stages. The features are further processed by the Fisher vector encoding stage and then the global average pooling stage. The global average pooling stage aggregates the individual features so that each WSI is represented by a single vector. We insert a second domain classifier at this stage to align the aggregated features. This domain classifier adapts the feature distribution shift of the entire WSI. The structure of the domain classifier in this stage shares the same configuration as in the first stage, with an adjusted feature size.

During the training stage, the cross-entropy loss from the original pipeline and the adversarial losses are combined with a balance parameter $\lambda = 0.1$ to update the entire model. The overall loss of the proposed method is computed as:
\begin{align}
  \mathit{Loss} & = L_{CE} + \lambda (L_{adv}^{local} + L_{adv}^{global}),
\end{align}
where $L_{CE}$ is the cross-entropy loss from the original pipeline, and $L_{adv}^{local}$ and $L_{adv}^{global}$ represent the adversarial loss terms of the two domain classifiers attached after the first (local) and the third (global) stage.
The domain classifiers are only attached during training and are discarded during testing.\footnote{An implementation can be found at \url{https://github.com/yuchen2580/double_adaptation_WSI}}

\section{Experiments and conclusion}
We test our method on two HER2 IHC breast tissue datasets. One has 250 WSIs collected from Alberta Cross Cancer Institution (CCI dataset).
The other has 52 WSIs collected from the Warwick HER2 challenge \cite{qaiser2018her} (Warwick dataset). We split the Warwick dataset roughly in half to create a train set (24 WSIs) and a test set (28 WSIs). Ratios of categories are kept the same. For adaptation, we train the network with the labeled CCI data and the unlabeled Warwick train set, and evaluate the model on the Warwick test set.

The classification goal is to predict one of the 4 categories of HER2 scores (0, 1+, 2+, 3+) defined in \cite{qaiser2018her} for each WSI. Table~\ref{table:confusion_mtx}(c) and (d) show that adaptation at either the local stage or the global stage helps increase the accuracy. Compared to the global stage adaptation, the local stage adaptation has a better influence on all categories in the matrix. However, global stage adaptation provides better separation between category 0 and category 3+.
From the result in Table~\ref{table:confusion_mtx}(e), the double stage adaptation provides the best accuracy and confusion matrix compared to the single stage adaptations in (c) and (d). Note that the training set of the CCI data is significantly larger than the Warwick dataset; the increased accuracy in this experiment therefore also indicates that our solution applies to the scenario where the model is trained on a bigger dataset and adapted to a smaller dataset elsewhere.
\begin{table}[ht]
\centering
\includegraphics[width=1.0\textwidth]{pdf/confusion matrix - Copy.pdf}
\caption{Comparisons of confusion matrices.
} 
\label{table:confusion_mtx}
\end{table}

In conclusion, this paper focuses on the domain shift problem that exists in the WSI classification task. Built on a previous pipeline, we propose to integrate domain classifiers into two stages to cover local and global distribution shifts. The model adapted from a big HER2 dataset to a small one shows a significant accuracy boost in the experiment.

\bibliography{midl-samplebibliography}
\end{document}
