% LaTeX template: https://github.com/MIDL-Conference/MIDLLatexTemplate

\documentclass{midl} % Include author names
% \documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution


\usepackage{booktabs} 
\usepackage{multirow}
\usepackage{mwe} % to get dummy images
% \jmlrvolume{-- Under Review}
\jmlryear{2021}
\jmlrworkshop{Full Paper -- MIDL 2021}
% \editors{Under Review for MIDL 2021}

\title[Semi-supervised chest x-ray segmentation]{CheXseg: Combining Expert Annotations with DNN-generated Saliency Maps for X-ray Segmentation}

\author{\begin{NoHyper}\Name{Soham Gadgil{\nametag{\thanks{Contributed equally}}}} \Email{sgadgil@stanford.edu}\\
        \Name{Mark Endo{\nametag{\footnotemark[1]}}} \Email{markendo@stanford.edu}\\
        \Name{Emily Wen{\nametag{\footnotemark[1]}}} \Email{emilywen@stanford.edu}\\
        \Name{Andrew Y. Ng} \Email{ang@cs.stanford.edu}\\
        \Name{Pranav Rajpurkar} \Email{pranavsr@cs.stanford.edu}\\
        \addr Department of Computer Science, Stanford University
\end{NoHyper}}

% \addr Department of Computer Science, Stanford University

%   \midlauthor{\Name{Soham Gadgil\midljointauthortext{Contributed equally}} \Email{sgadgil@stanford.edu}\\
%   \Name{Mark Endo} \Email{markendo@stanford.edu}\\
%   \Name{Emily Wen\midlotherjointauthor} \Email{emilywen@stanford.edu}\\
%   \Name{Andrew Y. Ng} \Email{ang@cs.stanford.edu}\\
%   \Name{Pranav Rajpurkar} \Email{pranavsr@cs.stanford.edu}\\
%   \addr Department of Computer Science, Stanford University
%   }

\begin{document}

\maketitle

% \begin{figure}[htbp]
% \floatconts
%   {fig:combined_plot}
%   {\caption{Visual summary of our contributions. (a) Fully-supervised baseline outperforms weakly-supervised method (IRNet) for all encoder initializations. (b) Semi-supervised method combining pixel-level labels and image-level labels outperforms baseline with a pixel-level label sampling rate of 0.9. (c) Distilling fully-supervised baseline improves performance of model.}}
%   {\includegraphics[width=1\linewidth]{combined_plot.pdf}}
% \end{figure}

\begin{abstract}
Medical image segmentation models are typically supervised by expert annotations at the pixel-level, which can be expensive to acquire. In this work, we propose a method that combines the high quality of pixel-level expert annotations with the scale of coarse DNN-generated saliency maps for training multi-label semantic segmentation models. We demonstrate the application of our semi-supervised method, which we call CheXseg, on multi-label chest X-ray interpretation. We find that CheXseg improves upon the performance (mIoU) of fully-supervised methods that use only pixel-level expert annotations by 9.7\% and weakly-supervised methods that use only DNN-generated saliency maps by 73.1\%. Our best method is able to match radiologist agreement on three out of ten pathologies and reduces the overall performance gap by 57.2\% as compared to weakly-supervised methods.

% reduces the overall gap from weakly-supervised methods by {half}.

% We also experiment with knowledge distillation as another approach to improve upon the fully-supervised method and observe that self-distilling a fully-supervised model with unlabeled data results in performance improvement.


% Medical image segmentation models typically require lack a sufficient number pixel-level labels for training due to the cumbersome and expensive nature of the annotation process. We hypothesize that saliency maps generated from pre-trained classification models can be used in such situations to improve segmentation performance. FIX?
% In this work, we introduce CheXseg, a semi-supervised method combining DNN-generated saliency maps with expert pixel-level annotations. Our method 
\end{abstract}

\begin{keywords}
Semi-Supervised Segmentation, Saliency Maps,
%Knowledge Distillation,
Localization Performance
\end{keywords}

\section{Introduction}
% Project description / Hypothesis / Question

% Description of our work / contributions
The ``black box'' nature of neural networks represents a barrier to physicians' trust and model adoption in the clinical setting \cite{kelly2019key}. Saliency maps are a popular set of explanation methods that highlight regions of the image that are important for disease classification, but they have been shown to be untrustworthy for medical image interpretation \cite{eitel2019testing, crosbynetworkoutput, Young_2019, arun2020assessing}. Segmentation models can produce more accurate pixel-level maps, but their training is typically limited by the time-consuming process of collecting expert annotations. The combination of saliency maps generated from widely available classification models and a limited amount of expert annotations for training medical image segmentation models may be able to provide higher quality segmentations at a lower cost, but this approach remains relatively unexplored.

% [ summary of results and contributions ]
% comparison between supervision levels 
In this work, we develop \textit{CheXseg}, a semi-supervised method for multi-pathology segmentation that leverages both the pixel-level expert annotations and the saliency maps generated by image classification models. First, we find that CheXseg achieves a mean IoU of 0.270, outperforming both fully-supervised (mIoU of 0.246) and weakly-supervised (mIoU of 0.156) methods. Second, we find that initializing the segmentation encoder with weights learned from supervised classification of the same task leads to higher performance than using a self-supervised MoCo initialization \cite{he2020momentum} or ImageNet initialization \cite{deng2009imagenet}. Third, CheXseg reduces the overall gap to radiologist localization performance (mIoU) by 57.2\% compared to solely using DNN-generated saliency maps. We expect this method to be broadly useful for medical image segmentation, where classification models are widely available and expert annotations are expensive.

\section{Related Work}
% self-supervised learning: training fully-supervised segmentation model on the created pseudo pixel-level labels
\subsection{Weakly-Supervised Semantic Segmentation}
In this work, we focus on an approach in which classification models trained with image-level labels are used to create pixel-level pseudo-labels \cite{yao2020saliency, ciga2019learning, ouyang2019weakly}. These pseudo-labels can then be utilized to train a segmentation model. Our paper is closely related to \citet{viniavskyi2020weakly}, which proposes a deep CNN-based approach that generates pseudo-labels by applying an Inter-pixel Relation Network (IRNet) \cite{ahn2019weakly} to improve Grad-CAM++ \cite{chattopadhay2018grad} generated activation maps. This approach is evaluated on the SIIM-ACR Pneumothorax dataset.
In our work, we generate pseudo-labels with IRNet and extend the approach to the semi-supervised setting for a larger set of pathologies.

% TODO: change wording of "build upon" to make contribution seem more significant

Several other methods also propagate class activation from areas of high confidence to similar adjacent regions \cite{kolesnikov2016seed, huang2018weakly, ahn2018learning}. We choose to build upon \citet{viniavskyi2020weakly} rather than these approaches because it has the best performance on the PASCAL VOC 2012 \cite{everingham2010pascal} validation set (mIoU of 0.646) and was shown to perform well on a medical imaging task.

\subsection{Semi- and Fully-Supervised Semantic Segmentation}
% semi-supervised methods
Semi-supervised methods use a combination of expert pixel-level annotations and pseudo-labels to train semantic segmentation models. Some weakly-supervised methods have been extended to semi-supervised methods through the replacement of pseudo-labels. The weakly-supervised SGAN model \cite{yao2020saliency} was adapted to a semi-supervised setting by replacing a subset of the saliency maps with the corresponding manually annotated ground truth labels. In this work, we use a similar idea of utilizing radiologist annotated labels in addition to saliency maps to train the segmentation model, extending \citet{viniavskyi2020weakly}'s weakly-supervised model. 

% In this work, we build on this idea by sampling radiologist annotated labels and saliency maps to train the segmentation model, extending \citet{viniavskyi2020weakly}'s weakly-supervised model. 


% fully supervised
Though weakly- and semi-supervised methods can perform well, fully-supervised methods are still considered the upper-bound \cite{chan2020comprehensive}. Many fully-supervised semantic segmentation approaches have been proposed for chest X-rays \cite{sirazitdinov2019deep, jaiswal2019identifying}, but none of these extend their work to the semi-supervised setting.

\section{Methods}
\subsection{Setup}
The multi-label semantic segmentation task is to classify each pixel of a chest X-ray image into zero or more of 10 possible pathologies: Airspace Opacity, Atelectasis, Cardiomegaly, Consolidation, Edema, Enlarged Cardiomediastinum, Lung Lesion, Pleural Effusion, Pneumothorax, and Support Devices. 

% dataset
We utilize CheXpert \cite{irvin2019chexpert}, an existing large dataset with 224,316 chest X-rays of 65,240 patients. This dataset features image-level labels obtained using an automated labeler that detects the aforementioned pathologies from radiology reports. A subset of the dataset is hand-annotated by radiologists at the pixel level. In our work, we use a set of 200 radiologist-annotated chest X-rays to validate model performance of the weakly-supervised method. For the fully-supervised and semi-supervised methods, we use 150 of the radiologist-annotated labels as a train set and save 50 examples for a validation set. We selected this validation set to exclude scarce pathologies, as examples with those pathologies are most valuable in the training process. When evaluating performance on this validation set, we only look at the most common pathologies. For all methods, we use a test set of an additional 500 radiologist-annotated images. 

% The fully-labeled training set is composed of 200 chest X-rays and an additional test set contains annotations of 500 images.

% evaluation metric
Models are evaluated by their average performance on the semantic segmentation task across the ten pathologies of interest. For each of the pathologies, the IoU (Intersection-over-Union) score is computed. We report the mIoU (mean IoU) score, which is the average IoU score across all pathologies. 

\subsection{CheXseg}
We develop \textit{CheXseg}, a semi-supervised method for multi-pathology segmentation that leverages both the pixel-level expert annotations and the saliency maps generated by image classification models. In this method, a DenseNet121 \cite{huang2018densely} classification model, trained on the entire CheXpert train set, is first used to generate saliency maps using Grad-CAM \cite{selvaraju2017grad}. This approach uses the classification model outputs to create a coarse localization map highlighting the image regions important for prediction. The saliency maps are further processed to create per-pixel segmentation masks, referred to as weak pseudo-labels, by using either a thresholding scheme or an Inter-Pixel Relation Network (IRNet) \cite{ahn2019weakly}. IRNet takes these generated CAMs and tries to improve the seeds by training two output branches, a displacement vector field and a class boundary map. Details about these methods are provided in \appendixref{appendix:weak_pseudo_label_generation}.

Once the pseudo-labels have been generated, we combine them with pixel-level expert annotations in a semi-supervised manner to train semantic segmentation models. Due to the scarcity of high-quality pixel-level expert annotations, we implement a sampling strategy of the different label types to allow for a high level of contribution from the radiologist annotations. For comparison, we train fully-supervised segmentation models (solely using pixel-level annotations) and weakly-supervised segmentation models (solely using pseudo-labels).


% In order to adjust the level of contribution each label type has on model training, we alter the sampling rate of each respective label type.

All our methods utilize DeepLabv3+ as the core semantic segmentation model \cite{chen2018encoder}. We experiment with various encoder initializations to transfer knowledge from the classification task to the segmentation task. Our experiments utilize a ResNet encoder architecture \cite{he2016deep}. 
% Additionally, we implement a semi-supervised knowledge distillation method which utilizes extra unlabeled images and does not require access to a trained classification model to create pseudo-labels.
Figure \ref{fig:supervision_levels} shows a visual representation of the different supervision strategies used. 
% All training details are available in \appendixref{appendix:training_details}.

\subsection{Training Details}
Here we describe the training details of our classification and segmentation models. The IRNet training details are available in \appendixref{appendix:training_details}.
\subsubsection{Classification Model}
The pre-trained classification model used for generating the CAMs from image-level labels is a DenseNet121. We use the Adam optimizer with default $\beta$-parameters of $\beta_1 = 0.9$, $\beta_2 = 0.999$ and a learning
rate of $1 \times 10^{-4}$, which is fixed for the duration of the training.
Batches are sampled using a fixed batch size of 16 images.
We train for 3 epochs, saving checkpoints every 4800 iterations.
\subsubsection{Segmentation Model}
% segmentation model training details

The semantic segmentation model is trained using a class average dice loss and Adam optimizer. We use a learning rate of 0.001 when training on a small amount of data %or for distillation, 
and we decrease it to 0.0001 when training on a large amount of data. We train on up to four Nvidia GTX 1070s using a batch size of 8.

\begin{figure}[t]
\floatconts
  {fig:supervision_levels}
  {\caption{Workflows of
  the different methods analysed for chest X-ray segmentation}}
  {\includegraphics[width=0.85\linewidth]{supervision_levels.png}}
\end{figure}




\section{Experiments}
\subsection{Combining Weak and Full Supervision}
We investigate the segmentation performance of combining DNN-generated saliency maps and expert annotations with various sampling ratios. We use 100 saliency maps in combination with 200 annotated pixel-level labels and explore different weightings between the two types of labels. We vary the probability of selecting an expert annotation in a single batch during training, $p \in \{0, 0.2, 0.4, 0.6, 0.8, 0.85, 0.9, 1\}$. Thus, $p$ determines the expected fraction of images with expert annotations in a single batch. For each value of $p$, we perform three trials containing different sets of the 100 saliency maps. The results reported are the mIoU scores obtained by averaging across these 3 trials. The segmentation model is initialized with CheXpert encoder weights since this initialization performs the best, as observed in Section~\ref{enc_weights}. We also compare the performance of this semi-supervised model with the weakly-supervised and fully-supervised models.

\paragraph{Results} 
% results
We find that for both Grad-CAM and IRNet, there is an inverted U-shape trend in performance as we increase $p$. There is a sharp increase in performance as $p$ increases from 0 (mIoU score of 0.156), and then the curves remain relatively flat before dropping off when $p = 1$. Specifically, $p = 0.9$ (\textit{CheXseg}) gives the best mIoU performance of $0.270 \pm 0.00872$ and $0.267 \pm 0.00993$ (95\% CI) for CAM and IRNet respectively. This high weighting of pixel-level labels takes advantage of the more accurate information encoded within these labels as compared to the saliency maps. The reduced performance for the fully-supervised case ($p = 1$, mIoU score of $0.246 \pm 0.01837$, 95\% CI) is likely attributed to the weak labels no longer being utilized in training. The size of the train set shrinks, and the model does not benefit from the variation and the scale provided by the weak pseudo-labels. For the weakly-supervised case ($p = 0$, mIoU score of 0.156), the poor performance can be attributed to the absence of any pixel-level expert annotations to guide the model predictions. Detailed results are shown in Figure \ref{fig:pneumo_consolid_plot}.

\begin{figure}[htbp]
\floatconts{fig:pneumo_consolid_plot}{\caption{IoU scores of semi-supervised segmentation models (averaged across 3 trials) using either Grad-CAM or IRNet to generate weak labels. The DeepLabV3+ and ResNet18 setup is used with CheXpert encoder initialization. $p$ is the probability of selecting a pixel-level labeled training sample in the current batch. (a) is the average IoU score calculated across all pathologies. (b), (c), and (d) are IoU scores for Pleural Effusion, Consolidation, and Edema respectively. $p=0$ represents the weakly-supervised case while $p=1$ represents the fully-supervised case. Full results in \tableref{tab:weightings}.}}{ \includegraphics[width=1\linewidth]{semi_sup_plot.pdf}}
\end{figure}

\subsection{Comparing Encoder Initializations} 
\label{enc_weights}
% description
We investigate the impact of using various encoder initializations on segmentation performance. In the fully-supervised and weakly-supervised methods, we initialize the encoder weights to either a CheXpert classification model \cite{irvin2019chexpert}, MoCo-CXR \cite{sowrirajan2020moco}, ImageNet \cite{deng2009imagenet}, or random.

\paragraph{Results}
We find that for all methods, the best models are initialized with CheXpert encoder weights. This may be expected since CheXpert weights are learned from supervised learning on the same dataset that we use for segmentation. The models initialized with MoCo-CXR weights have similar performance to the models with ImageNet encoder initialization. Figure \ref{fig:weakly_supervised_results} shows the detailed results for fully-supervised and weakly-supervised encoder initializations.

\begin{figure}[htbp]
\floatconts
{fig:weakly_supervised_results}% label for whole figure
{\caption{Performance of fully-supervised and weakly-supervised methods}}% caption for whole figure
{%
\subfigure[mIoU of fully-supervised segmentation models with DeepLabV3+ and ResNet18 initialized with various weights. CheXpert encoder initialization results in best performance (average 0.246).]{%
\label{fig:pic1}% label for this sub-figure
\includegraphics[width=0.33\linewidth]{fully_sup_plot.pdf}%
} % space out the images a bit
\subfigure[mIoU of weakly-supervised segmentation models with DeepLabV3+ and ResNet18 setup using either Grad-CAM or IRNet pseudo-labels and various encoder initializations. IRNet outperforms CAMs when using CheXpert encoder initialization (0.156 vs 0.142), but underperforms when using other initializations (0.111 vs 0.136 with Random, 0.128 vs 0.139 with MoCo-CXR, and 0.124 vs 0.139 with ImageNet). Full comparisons with confidence intervals in \tableref{tab:weakresults}.]{%
\label{fig:pic2}% label for this sub-figure
\includegraphics[width=0.66\linewidth]{weakly_sup_plot.pdf}%
}%
}%
\end{figure}



\subsection{Comparison to Radiologists}

Compared to our best weakly-supervised method, CheXseg reduces the overall performance (mIoU) gap with radiologists by 57.2\%. CheXseg outperforms radiologists in terms of IoU score on the segmentation of Atelectasis (156\% higher), Airspace Opacity (70\% higher), and Pleural Effusion (30\% higher), while performing worse on the remaining pathologies. Detailed results are shown in Figure \ref{fig:human_comparison}.

\begin{figure}[htbp]
\floatconts{fig:qualitative_results}{\caption{Qualitative Results for Cardiomegaly and Airspace Opacity. Column (i) represents the Ground Truth segmentation map. Column (ii) represents the segmentation map obtained from CheXseg. Column (iii) represents the segmentation map obtained from using the best weakly supervised method (IRNet). }}{
\subfigure[Qualitative results for Cardiomegaly]{%
\label{fig:cardio_fig_1}% label for this sub-figure
\includegraphics[width=0.8\linewidth]{cardiomegaly_qualitative.pdf}}\\
\subfigure[Qualitative results for Airspace Opacity]{%
\label{fig:ao_fig_2}% label for this sub-figure
\includegraphics[width=0.8\linewidth]{ao_qualitative.pdf}}}
\end{figure}

\begin{figure}[t]
\floatconts
  {fig:human_comparison}
  {\caption{IoUs of radiologists, our best semi-supervised method (CheXseg), and our best weakly-supervised  method. CheXseg uses Grad-CAM while the weakly-supervised method uses IRNet. }}
  {\includegraphics[width=0.8\linewidth]{human_comp_plot.pdf}}
\end{figure}


% CheXseg achieves higher performance than radiologists on Atelectasis (161\% higher), Airspace Opacity (76\% higher), and Pleural Effusion (20\% higher). Compared to our best weakly-supervised method, CheXseg reduces the overall performance gap, in terms of the mIoU score, with radiologists by 61\%. We also observe that relative to radiologists, both models have correlated performances across the ten pathologies. This finding suggests that segmentation models can learn key identifying features of some pathologies more easily than others. Detailed results are shown in \appendixref{appendix:human_comparison}.
\section{Qualitative Results}
Figure~\ref{fig:qualitative_results} shows the qualitative results for two pathologies---Cardiomegaly and Airspace Opacity---for the best weakly-supervised (IRNet) and semi-supervised (CheXseg) methods. CheXseg gives better visualizations that are closer to ground truth as compared to the best weakly-supervised approach.
\section{Discussion} 
In this work, we develop \textit{CheXseg}, a semi-supervised method for multi-pathology segmentation that leverages the benefits of both available medical image classification models and expert pixel-level annotations.

\textit{How does CheXseg performance compare to fully-supervised and weakly-supervised methods?} % how much data is needed. added utility having 10x larger dataset
We find that with a weighted sampling of saliency maps and expert annotations, our proposed method outperforms both the fully-supervised and weakly-supervised methods alone. An expert annotation to saliency map sample ratio of 0.9 (CheXseg) gives the best mIoU score of 0.270, compared to 0.246 for fully-supervised and 0.156 for weakly-supervised. This suggests a tradeoff between emphasizing the accurate information of expert pixel-level annotations and incorporating additional but noisy cues from weak pseudo-labels. 

\textit{How do segmentation models initialized with CheXpert, ImageNet, MoCo-CXR, and random encoder weights compare?}
% key findings
We find that CheXpert encoder initialization achieves the highest performance, followed by self-supervised (MoCo-CXR) initialization and ImageNet initialization. Random encoder initialization has the worst performance, as no transfer learning occurs. Since CheXpert weights are pre-trained using image-level labels for the same tasks, it is expected that this knowledge transfers well to segmentation on the same dataset and the same set of tasks. Whereas classification models with MoCo-CXR encoder initialization have been found to outperform classification models with ImageNet encoder initialization \cite{sowrirajan2020moco}, we find that the two initializations have approximately the same performance for segmentation. 

In closing, our work proposes a simple semi-supervised method that combines the benefits of the wide availability of classification models with the quality of expert annotations. We expect our method may be broadly useful, able to lower the cost of development and improve the performance of medical image segmentation models.

% Acknowledgments---Will not appear in anonymized version
% \midlacknowledgments{We thank a bunch of people.}


\bibliography{gadgil21}


\newpage
\appendix

\section{Methods for Generating Weak Pseudo-Labels}
\label{appendix:weak_pseudo_label_generation}
\subsection{Grad-CAM} 
Grad-CAM is used to obtain saliency maps from the model predictions to highlight the areas that the model focuses on during classification. It makes use of the gradients of the output classes flowing into the last convolutional layer to make low-resolution heatmaps ($3 \times 3$ in case of ResNets). Specifically, the gradients flowing back are global-average-pooled over the width and height dimensions (indexed by $i$ and $j$ respectively) to obtain the importance of the $k$th feature map for target class $c$, $\alpha^c_k$:
\begin{equation}
\label{eq:1}
    \alpha^c_k = \frac{1}{Z}\sum_{i}\sum_{j}\frac{\partial y^c}{\partial A^k_{ij}}
\end{equation}
Here, $y^c$ is the score (before softmax) of the class $c$, $Z$ is the normalization factor, and $A^k$ is the $k$th feature map activation.

After this, a weighted sum of the final feature maps followed by a ReLU is performed:

\begin{equation}
\label{eq:2}
    L^c_{\text{Grad-CAM}} = \text{ReLU}\left(\sum_{k}\alpha^c_{k}A^k\right)
\end{equation}
A thresholding scheme is then used to convert the heatmap for each pathology into a segmentation map to use as pseudo-labels. The probability threshold is determined per pathology by maximizing the mIoU on the CheXpert train set.

\subsection{Inter-Pixel Relation Network (IRNet)} 
We follow the method IRNet, which takes the previously generated CAMs and tries to improve these seeds by training two output branches. The first branch predicts a displacement vector field in which each pixel is represented by a 2D vector pointing to the centroid of the instance that the pixel is a part of. This displacement field is converted to a class-agnostic instance map by grouping together pixels whose vectors point to the same centroid. The second branch is used to detect class boundaries by computing pairwise semantic affinities, which is a confidence score for class equivalence between a pair of pixels. The instance-wise CAMs obtained from the first branch are enhanced by propagating their attention scores to relevant areas using the computed affinities between neighboring pixels. Finally, pseudo-labels are generated independently for each pathology with positive pixels being the ones with higher class attention scores.


\section{Training Details}
% \appendixref{appendix:training_details}
\label{appendix:training_details}
% \subsection{Classification Model}
% The pre-trained classification model used for generating the CAMs from image-level labels is a DenseNet121. We use the Adam optimizer with default $\beta$-parameters of $\beta$1 = 0.9, $\beta$2 = 0.999 and learning
% rate $1 \times 10^{-4}$ which is fixed for the duration of the training.
% Batches are sampled using a fixed batch size of 16 images.
% We train for 3 epochs, saving checkpoints every 4800 iterations.
% \subsection{Segmentation Model}
% % segmentation model training details

% The semantic segmentation model is trained using a class average dice loss and Adam optimizer. We use a learning rate of 0.001 when training a small amount of data %or for distillation, 
% and we decrease it to 0.0001 when training a large amount of data. We train on up to four Nvidia GTX 1070s using a batch size of 8.

\subsection{IRNet}
% irnet training details
The two branches of IRNet share the same ResNet50 backbone and are jointly trained by minimizing the sum of three losses: 
\begin{enumerate}
    \item Loss for displacement field prediction, which consists of two losses:
    \begin{enumerate}
        
    \item L1 loss between the image coordinate displacement between a pair of nearby foreground pixels $(i, j)$, denoted by $\hat{\delta}(i, j) = x_j - x_i$, and the displacement obtained from their vector fields, $\mathcal{D}$, denoted by $\delta(i, j) = \mathcal{D}(x_j) - \mathcal{D}(x_i)$:
    
    \begin{equation}
        \mathcal{L}^{\mathcal{D}}_{\text{fg}} = \frac{1}{\mid\mathcal{P}^{+}_{\text{fg}}\mid} \sum_{(i, j) \in \mathcal{P}^{+}_{\text{fg}}} \left| \hat{\delta}(i, j) - \delta(i, j)\right|
    \end{equation}
    
    Here, $\mathcal{P}^{+}_{\text{fg}}$ refers to the set of neighboring foreground pixel pairs with the same pseudo label (pixels with attention scores larger than 0.3).
    
    \item Loss for background pixels as a normalized sum of their image coordinate displacements:
    
    \begin{equation}
        \mathcal{L}^{\mathcal{D}}_{\text{bg}} = \frac{1}{\mid\mathcal{P}^{+}_{\text{bg}}\mid} \sum_{(i, j) \in \mathcal{P}^{+}_{\text{bg}}}\left|\delta(i, j) \right|
    \end{equation}
    
     Here, $\mathcal{P}^{+}_{\text{bg}}$ refers to the set of neighboring background pixel pairs with the same pseudo label (pixels with attention scores less than 0.05).
     
    \end{enumerate}
     \item Loss for Class Boundary Detection, which makes use of the semantic affinities between a pair of pixels $x_i$ and $x_j$, $a_{ij}$:
     
    \begin{equation}
         a_{ij} = 1 - \max_{k \in \Pi_{ij}}\mathcal{B}(x_k)
    \end{equation}
     Here, $\Pi_{ij}$ is a set of pixels on the line between $x_i$ and $x_j$ and $\mathcal{B} \in \text{[0, 1]}^{w\times h}$ is the output. Then,  the loss is the cross-entropy loss between the binary affinity label, with value 1 for the same pseudo-class labels and 0 otherwise, and the predicted affinity of two pixels:
     
     \begin{equation}
         \mathcal{L}^{\mathcal{B}} = - \sum_{(i, j) \in \mathcal{P}^{+}_{\text{fg}}} \frac{\log(a_{ij})}{2\mid\mathcal{P}^{+}_{\text{fg}}\mid} - \sum_{(i, j) \in \mathcal{P}^{+}_{\text{bg}}} \frac{\log(a_{ij})}{2\mid\mathcal{P}^{+}_{\text{bg}}\mid} - \sum_{(i, j) \in \mathcal{P}^{-}} \frac{\log(1- a_{ij})}{\mid\mathcal{P}^{-}\mid}
     \end{equation}
     
     Here, $\mathcal{P}^{-}$ represents the set of pixel pairs with different pseudo labels.
\end{enumerate}
The two branches are jointly trained by minimizing all three losses at the same time:
\begin{equation}
    \mathcal{L} = \mathcal{L}^{\mathcal{D}}_{\text{fg}} + \mathcal{L}^{\mathcal{D}}_{\text{bg}} + \mathcal{L}^{\mathcal{B}}
\end{equation}
The model is trained with stochastic gradient descent using a learning rate of 0.1 with polynomial decay and a batch size of 16. The segmentation maps created are then used in the semantic segmentation model as weak pseudo-labels. 
\newpage
\section{Semi-Supervised Results}

\begin{table}[htbp]
\floatconts
{tab:weightings}
{\caption{IoU scores for semi-supervised segmentation using IRNet and CAMs by weighting of pixel-level and weak pseudo-labels. \textit{p} represents the probability of picking an expert-annotated training example in the current batch.}}
{%
\resizebox{\textwidth}{!}{
\begin{tabular}{cccccccccc}
    \toprule
    Task & Method & \textit{p}=0 & \textit{p}=0.2 & \textit{p}=0.4 & \textit{p}=0.6 & \textit{p}=0.8 & \textit{p}=0.85 & \textit{p}=0.9 & \textit{p}=1 \\
    \midrule
    \multirow{2}{*}{Mean IoU} & CAM & 0.142 & \textbf{0.238} & \textbf{0.246} & \textbf{0.248} & \textbf{0.255} & \textbf{0.261}  & \textbf{0.270} & \multirow{2}{*}{0.246}\\
                        & IRNet & \textbf{0.156} & 0.216 & 0.228 & 0.236 & 0.252 & 0.254 &  0.267 \\
    \midrule
    \multirow{2}{*}{Airspace Opacity}  & CAM & 0.161 & \textbf{0.364} & \textbf{0.381} & \textbf{0.382} & \textbf{0.376}  & 0.376 & 0.398 & \multirow{2}{*}{0.388} \\
                        & IRNet & \textbf{0.225} & 0.334 & 0.355 & 0.362 & 0.374 & \textbf{0.377} & \textbf{0.403} \\
    \midrule
    \multirow{2}{*}{Atelectasis}  & CAM & 0.099 & \textbf{0.299} & \textbf{0.295} & \textbf{0.297} & 0.306 & 0.299 & 0.310 & \multirow{2}{*}{0.307}\\
                        & IRNet & \textbf{0.182} & 0.269 & 0.278 & 0.287 & \textbf{0.316} & \textbf{0.308} & \textbf{0.323} \\
    \midrule
    \multirow{2}{*}{Cardiomegaly}  & CAM & \textbf{0.326} & \textbf{0.420} & \textbf{0.438} & \textbf{0.434} & \textbf{0.440} & \textbf{0.442} & 0.450 & \multirow{2}{*}{0.445}\\
                        & IRNet & 0.280 & 0.393 & 0.401 & 0.425 & 0.428 & 0.440 & \textbf{0.461} \\
    \midrule
    \multirow{2}{*}{Consolidation}  & CAM & 0.056 & 0.097 & 0.099 & 0.105 & \textbf{0.109} & 0.108 & \textbf{0.114} & \multirow{2}{*}{0.103} \\
                        & IRNet & \textbf{0.076} & \textbf{0.099} & \textbf{0.104} & \textbf{0.107} & 0.108 & \textbf{0.110} & 0.110 \\
    \midrule
    \multirow{2}{*}{Edema}  & CAM & \textbf{0.170} & \textbf{0.231} & \textbf{0.253} & \textbf{0.241} & \textbf{0.250} & \textbf{0.261} & \textbf{0.270} & \multirow{2}{*}{0.225} \\
                        & IRNet & 0.149 & 0.198 & 0.219 & 0.239 & 0.237 & 0.242 & 0.257 \\
    \midrule
    \multirow{2}{*}{Enlarged Cardiomediastinum}  & CAM & 0.236 & \textbf{0.516} & \textbf{0.494} & \textbf{0.494} & 0.514 & 0.526 & \textbf{0.543} & \multirow{2}{*}{0.549}\\
                        & IRNet & \textbf{0.327} & 0.466  & 0.480 & 0.489 & \textbf{0.528} & \textbf{0.531} & 0.535 \\
    \midrule
    \multirow{2}{*}{Lung Lesion}  & CAM & 0.002 & \textbf{0.004} & \textbf{0.003} & \textbf{0.003} & 0.005 & 0.008 & \textbf{0.008} & \multirow{2}{*}{0.002}\\
                        & IRNet & \textbf{0.003} & 0.002 & 0.002 & 0.003 & \textbf{0.014} & \textbf{0.014} & 0.007 \\
    \midrule
    \multirow{2}{*}{Pleural Effusion}  & CAM & 0.150 & \textbf{0.254} & \textbf{0.256} & \textbf{0.263} & \textbf{0.264} & \textbf{0.266} & 0.272 & \multirow{2}{*}{0.214}\\
                        & IRNet & \textbf{0.171} & 0.182 & 0.192 & 0.191 & 0.243 & 0.243 & \textbf{0.273} \\
    \midrule
    \multirow{2}{*}{Pneumothorax}  & CAM & \textbf{0.057} & \textbf{0.021} & 0.025 & \textbf{0.035} & \textbf{0.044} & \textbf{0.074} & \textbf{0.077} & \multirow{2}{*}{0.017}\\
                        & IRNet & 0.024 & 0.018 & \textbf{0.029} & 0.033 & 0.040 & 0.050 & 0.053 \\
    \midrule
    \multirow{2}{*}{Support Devices}  & CAM & \textbf{0.159} & \textbf{0.203} & \textbf{0.223} & \textbf{0.231} & \textbf{0.241} & \textbf{0.249} & \textbf{0.257} & \multirow{2}{*}{0.246}\\
                        & IRNet & 0.123 & 0.202 & 0.217 & 0.226 & 0.229 & 0.221 & 0.248 \\
  \bottomrule

\end{tabular}
}
}
\end{table}

\newpage
\section{Weakly-Supervised Results}

\begin{table}[htbp]
\floatconts
{tab:weakresults}
{\caption{IoU scores of weakly-supervised segmentation models using either CAM or IRNet pseudo-labels and varying encoder architectures and train set sizes. Confidence intervals are calculated using $\alpha=0.05$.}}
{
\begin{tabular}{cccc}
\toprule 
Method & Encoder Architecture & Train Set Size & Test mIoU \\
\toprule
\multirow{12}{*}{CAM}  & \multirow{3}{*}{ResNet18}          & 100   & $0.074 \pm 0.00783$\\
                       &                                    & 2000  & $0.126 \pm 0.00275$ \\
                       &                                    & 20000 & $0.136 \pm 0.00990$ \\ \cline{2-4} 
                       & \multirow{3}{*}{ResNet18-ImageNet} & 100   & $0.082 \pm 0.00396$ \\
                       &                                    & 2000  & $0.132 \pm 0.00687$ \\
                       &                                    & 20000 & $0.139 \pm 0.00095$ \\ \cline{2-4} 
                       & \multirow{3}{*}{ResNet18-MoCo-CXR} & 100   & $0.084 \pm 0.00238$ \\
                       &                                    & 2000  & $0.133 \pm 0.00002$ \\
                       &                                    & 20000 & $0.139 \pm 0.00076$ \\ \cline{2-4} 
                       & \multirow{3}{*}{ResNet18-CheXpert} & 100   & $0.129 \pm 0.00036$ \\
                       &                                    & 2000  & $0.141 \pm 0.00003$ \\
                       &                                    & 20000 & $0.142 \pm 0.00095$ \\ 
\midrule
\multirow{12}{*}{IRNet}& \multirow{3}{*}{ResNet18}          & 100   & $0.076 \pm 0.00133$ \\
                       &                                    & 2000  & $0.105 \pm 0.00292$ \\
                       &                                    & 20000 & $0.111 \pm 0.00103$ \\ \cline{2-4} 
                       & \multirow{3}{*}{ResNet18-ImageNet} & 100   & $0.081 \pm 0.00245$ \\
                       &                                    & 2000  & $0.117 \pm 0.00576$ \\
                       &                                    & 20000 & $0.124 \pm 0.00001$ \\ \cline{2-4} 
                       & \multirow{3}{*}{ResNet18-MoCo-CXR} & 100   & $0.082 \pm 0.00007$ \\
                       &                                    & 2000  & $0.120 \pm 0.00010$ \\
                       &                                    & 20000 & $0.128 \pm 0.00001$ \\ \cline{2-4} 
                       & \multirow{3}{*}{ResNet18-CheXpert} & 100   & $0.125 \pm 0.00031$ \\
                       &                                    & 2000  & $0.146 \pm 0.00002$ \\
                       &                                    & 20000 & $0.156 \pm 0.00000$ \\
\bottomrule
\end{tabular}
}
\end{table}
\end{document}
