%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

\usepackage{chngcntr}
\usepackage{tabularx}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{xie_130}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\newcommand\x{\mathbf{x}}
\newcommand\bC{\mathbf{C}}
\newcommand\bx{\mathbf{x}}
\title{Two-Stage Holistic and Contrastive Explanation of Image Classification\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<wxieai@cse.ust.hk>}{Weiyan Xie}\ }
\author[2]{\href{mailto:<lixiaohui33@huawei.com>}{Xiao-Hui Li}\ }
\author[1]{\href{mailto:<zlinaz@cse.ust.hk>}{Zhi Lin}\ }
\author[3]{\href{mailto:<leonard.poon@gmail.com>}{Leonard K. M. Poon}\ }
\author[1]{\href{mailto:<cao@ust.hk>}{Caleb Chen Cao}\ }
\author[1]{\href{mailto:<lzhang@cse.ust.hk>}{Nevin L. Zhang}\ }
% Add affiliations after the authors
\affil[1]{%
	The Hong Kong University of Science and Technology\\
	Hong Kong, China
}
\affil[2]{%
	Huawei Technologies Co., Ltd\\ Shenzhen, China
}
\affil[3]{%
	The Education University of Hong Kong,\\ Hong Kong, China
}
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\appendix
\counterwithin{figure}{section}
\counterwithin{table}{section}


\section{Comparison of Base Explainers}

We have evaluated CWOX-2s against SWOX and CWOX-1s in terms of contrastive faithfulness in the main paper, where Grad-CAM and RISE were used as the base explainers.  Next, we compare Grad-CAM and RISE with two
other base explainers,
MWP~\citep{zhang2018top}
and LIME~\citep{ribeiro2016should}, in the framework of CWOX-2s.  We show how the choice of base explainer influences the contrastive faithfulness of CWOX-2s to the target model. Like Grad-CAM, MWP is a model-dependent explanation method. It backpropagates from the probability score of the target class
to compute the marginal winning probability (MWP), over the pixels on a target layer, of a random walk defined using positive network weights and forward activations. Contrastive MWP (c-MWP) provides a contrast to the target
class by negating the network weights of the last layer. To avoid double contrasting, we only consider the use of MWP as the base explainer for CWOX-2s, but not
c-MWP.


% In Appendix B, the reader will see  explanation results on several examples when RISE, MWP or LIME is used as the base explainer. Together with the results with Grad-CAM given before, the reader should get an idea of how the choice of base explainer influences interpretability of the CWOX-2s explanations to human. In general, Grad-CAM, RISE and MWP all lead to informative and meaningful CWOX-2s explanations, and LIME is relatively weaker.


As discussed in Section 4.2 of the main paper, we use different layers as the pivot layer in the two stages of CWOX-2s in order to allow more fine-grained explanations in the second stage when applying back-propagation explanation methods like Grad-CAM as the base explainer. We use the same settings for Grad-CAM and MWP:
\begin{table}[h!]
\centering
\begin{tabular}{r|cc} %\hline
{\em \small  Grad-CAM/MWP}                 & {\small  CWOX-2s Stage 1}  & {\small  CWOX-2s Stage 2 } \\ \hline
{\small  ResNet50 Pivot Layer} &  {\small  ReLU of Conv5$\_$3 } & {\small  ReLU layer of Conv4$\_$6}\\
{\small  GoogleNet Pivot Layer}   &  {\small  Inception5b} & {\small  Inception4e } \\ \hline
\end{tabular}
\end{table}

%are picked as the target layer for the two stages of CWOX-2s and they are the same as the pivot layers for Grad-CAM as specified in Section \ref{sec.eval1}.

For the forward-propagation method RISE, we use a different number of masks and a different pixel mask probability in the two stages:

\begin{table}[h!]
	\centering
\begin{tabular}{r|cc} %\hline
{\em \small   RISE      }          &  {\small  CWOX-2s Stage 1}  &  {\small  CWOX-2s Stage 2}  \\ \hline
{\small  Number of Masks   }      & {\small  5,000}  & {\small   3,000} \\
%{\small  Mask Size  }             &   {\small  12}      & {\small  10} \\
{\small  Pixel Mask Probability }  & {\small  0.3}     & {\small  0.15}  \\ \hline
\end{tabular}
\end{table}


Like RISE, LIME is a model-agnostic explanation method. It
learns a surrogate linear regression model, from superpixels to class score, in
the neighborhoods of the input image.  We use the Quickshift algorithm to compute the superpixels.
LIME has a hyperparameter that determines the number of samples to use for the regression model. Quickshift has a hyperparameter called kernel size, where the larger the size, the larger the neighborhoods of pixels considered. The two hyperparameters  are set for CWOX-2s as follows:



\begin{table}[h!]
	\centering
	\begin{tabular}{rccll}
		
		\multicolumn{1}{r|}{\em \small LIME}                  &  {\small CWOX-2s Stage 1}         & {\small CWOX-2s Stage 2}           &  &  \\ \cline{1-3}
		\multicolumn{1}{c|}{\small Number of Samples}     &  {\small 3000}                 & {\small 1000  }                 &  &  \\
		\multicolumn{1}{c|}{\small Quickshift Kernel Size}           & {\small 8}                    &{\small 4 }                     &  &  \\ \cline{1-3}
		\multicolumn{1}{l}{}                       & \multicolumn{1}{l}{} & \multicolumn{1}{l}{}   &  &
	\end{tabular}
\end{table}



We have conducted experiments on 10,000 randomly selected images from the ImageNet validation set.
Each example is fed to a target model and the top classes in the outputs are explained using CWOX-2s.  The four base explainers are tested in turn, resulting in four heatmaps for each case.


%For the example shown in Figure \ref{c-wox}, the four CWOX-2s heatmaps for {\tt cello} against {\tt violin} are shown in Figure \ref{c-wox} (c.1) (Grad-CAM),  Figure  \ref{a.c-wox} (c.1) (RISE), Figure  \ref{c.c-wox} (c.1) (MWP), and Figure  \ref{d.c-wox} (c.1) (LIME).

%(the image shown in Figure \ref{c-wox} in the main paper).



Table \ref{b.table} displays the performance statistics of CWOX-2s when using each of the four base explainers. Regarding contrastive faithfulness metrics, RISE performs the best, having the lowest CAUC score and the highest CDROP scores. Additionally, RISE identifies the fewest salient pixels, indicating that it can help CWOX-2s accurately pinpoint essential evidence. Grad-CAM is the next best performer, followed by MWP. MWP results in a large number of salient pixels, making it less precise in identifying important pixels. Among the four base explainers, LIME leads to the worst performance for CWOX-2s.


We present in Figure \ref{diff-base1} the results of CWOX-2s explanation using four different base explainers for the {\tt cello-guitar} image. We now focus on the contrastive faithfulness of CWOX-2s heatmaps for {\tt cello} against {\tt violin} generated by the four different base explainers. Figure \ref{b.cf} demonstrates how the probabilities $P({\tt cello})$ and $P({\tt violin})$, as well as the contrastive score $P({\tt cello}) \times (1-P({\tt violin}))$, change as pixels are removed based on the orderings determined by each of the four heatmaps.


\begin{table}[h!]
	\centering
	\begin{tabular}{c|ccc|ccc}
		\hline
		& \multicolumn{3}{c|}{ResNet50} & \multicolumn{3}{c}{GoogleNet} \\ \hline
		{\small 	Base Explainer} & {\small $\bar{n}_{\delta}$} &  {\small CAUC $\downarrow$} & {\small CDROP $\uparrow$} & {\small  $\bar{n}_{\delta} $  } &  {\small CAUC $\downarrow$} &{\small CDROP $\uparrow$} \\ \hline
		{\small Grad-CAM  }     &         {\small  2,029 }     &  {\small $3.11\times 10^{-3}$    }                    &  {\small $8.01\times 10^{-2}$  }                    &        {\small  2,181   }      & {\small $1.80\times 10^{-3}$  }                       & {\small $7.46\times 10^{-2}$    }              \\
		{\small	MWP     }       &       {\small   4,194}   & {\small $3.12\times 10^{-3}$ }                     &  {\small $7.37\times 10^{-2}$ }                     &              {\small 3,026}
		& {\small $1.77\times 10^{-3}$    }                    & {\small $5.85\times 10^{-2}$ }                    \\
		{\small	LIME    }       &      {\small   2,464  }   &{\small  $3.40\times 10^{-3}$    }                         & {\small $4.89\times 10^{-2}$  }                       &               {\small 2,351}  & {\small $1.92\times 10^{-3}$  }                     & {\small $3.64\times 10^{-2}$   }                   \\
		{\small RISE   }        &      {\small      1,282 }   & {\small $3.07\times 10^{-3}$ }                      & {\small $8.97\times 10^{-2}$ }                     &           {\small   1,105  }  & {\small $1.72\times 10^{-3}$   }                       &  {\small $8.32\times 10^{-2}$ }                     \\ \hline
		
	\end{tabular}
	\caption{\small Performance of CWOX-2s with four base explainers. Here, $\bar{n}_{\delta}$ stands for the average number of $\delta$-salient pixels.}
	\label{b.table}
\end{table}


\begin{figure}[h!]
	\centering

	\begin{tabular}{cc}
		\begin{tabular}{cc}
			\includegraphics[width=5.8cm]{Fig/eval_result/grad-cam.png} &
			\includegraphics[width=5.8cm]{Fig/eval_result/MWP.png}\\
			{\small   (a) Grad-CAM:  $n_{\delta}=2,377$}  &
			{\small   (b) MWP:  $n_{\delta}=4,495$}\\
			
			
			{\small \hspace{0.5cm}  CAUC=0.0153, CDROP=0.505}    &  {\small \hspace{0.5cm} CAUC=0.0178, CDROP=0.461  }
			
			
		\end{tabular}
		\\
		
		
		\begin{tabular}{cc}
			\includegraphics[width=5.8cm]{Fig/eval_result/LIME.png} &
			\includegraphics[width=5.8cm]{Fig/eval_result/RISE.png} \\
			
			{\small   (c) LIME:  $n_{\delta}=3,131$}  &
			{\small   (d) RISE:  $n_{\delta}=1,223$}\\
			
			
			{\small \hspace{0.5cm}  CAUC=0.0184, CDROP=0.470}    &  {\small \hspace{0.5cm} CAUC=0.0902, CDROP=0.694  }
			
		\end{tabular}
		
	\end{tabular}
	

	
	
	\caption{Changes in the probabilities $P({\tt cello})$ and $P({\tt violin})$ and the contrastive score $P({\tt cello}) \times (1-P({\tt violin}))$ as $\delta$-salient pixels are deleted according to the order induced by the CWOX-2s heatmap
		that is obtained with the base explainer: (a) Grad-CAM, (b) MWP, (c) LIME, and (d) RISE.  Note that the CAUC score for Grad-CAM is lower than that in Figure
		\ref{fig.cf} because the number of pixels deleted is 1,223 --- the number of $\delta$-salient pixels in the RISE heatmap. It is smaller than the number of
		$\delta$-salient pixels in the Grad-CAM heatmap (2,337).
	}
	\label{b.cf}
	
\end{figure}


\section{More Details of the User Study}

The overall procedures and results of the user study have been discussed in Section 5.2 of the main paper. We provide some additional details in this appendix.

\subsection{Study Form and Participants}

The user study was carried out using a web-based survey via the Qualtrics Survey Tool. Two groups of participants were invited to take part in the survey through email invitations. The first group, known as the expert group, consisted of postgraduate students who were enrolled in a machine learning course at the time of the study. These students had experience with deep computer vision models, including training CNN models. However, they had not yet been exposed to XAI. The second group, the non-expert group, was made up of first-year undergraduate students who had no prior experience or knowledge in training deep learning models.

% This first postgraduate students who were taking machine learning courses at our university. 

\subsection{Detailed Procedures of the Study}

The procedure for the user study is detailed as follows.

\begin{enumerate}
\item Tutorial: Participants first received a tutorial on the basics of image classification and the explanations.

\item Training phase: Participants were shown a set of examples and explanations for a pair of confusing class labels (e.g. {\tt cello} and {\tt violin}). For each training example, the explanation results and confusing class labels were shown to the participants, but the matching relationship between the explanations and labels was initially unknown to them. They were asked to guess the features the model uses to distinguish the two confusing classes. They then received verification of their guesses on the next page. Screenshots of the user interface during the training phase can be seen in Figure \ref{fig.hs11}.

\item Evaluation phase: An evaluation phase followed the training phase, and was set up similarly to the guessing step in the training phase. The participants' understanding of the discriminative features was evaluated by testing how well they could tell the matching relationship between the explanations and the confusing class labels on new unseen examples.

\end{enumerate}

It is noted that the ``guess first, verify next'' setting in the training phase of the user study is similar to the training process of a deep neural network model. At the beginning of the model training, the model is initialized with random weights, which may result in random guesses and high training losses. However, if there are distinct features for different class labels in the training set, the model can learn these features and gradually reduce the training loss. Similarly, if the explanations provided can reveal the discriminative features used by the model, human subjects are expected to gain a better understanding of these features as they see more examples.







\subsection{The Confusing Class Labels and Examples Used}


Since there is a wide variety of images in ImageNet, to reduce the burden on users, we showed examples with the same pair of confusing class labels in one round of the training and evaluation. Each participant completed two rounds of the study, each with examples from a randomly assigned label pair.


To make the study manageable, we limited the choices of input images and confusing class labels. We selected the confusing class labels from the latent tree model built from the outputs of ResNet50. In the latent tree model, classes are grouped under the same node because they frequently appear together in the top prediction classes of ResNet50. Therefore, many pairs of confusing classes can be obtained from the level-1 latent nodes of the model. In order to determine which pairs to use in the human study, we first followed the three criteria\footnote{(1) Familiar, the class should be familiar to all the human subjects; (2) Unambiguous, the class should have only one clear connotation for the given object; (3) Non-specific, the class should not be a specialization or a potential sub-class of another class in ImageNet.} introduced in \cite{zhang2019dissonance} to filter out the unfamiliar, ambiguous, and expert-specific classes. After filtering those classes, we invited ten AI researchers to nominate ten pairs of confusing classes based on their interests from the remaining part of the latent tree model. The final ten confusing class pairs were selected through voting among the ten researchers and included \{{\tt cello}, {\tt violin}\}, \{{\tt ambulance}, {\tt police van}\}, \{{\tt harvester}, {\tt tractor}\}, \{{\tt folding chair}, {\tt rocking chair}\}, \{{\tt basketball}, {\tt volleyball}\}, \{{\tt acoustic guitar}, {\tt electric guitar}\}, and so on.

For each pair of confusing class labels, we collected examples from the ImageNet validation set with ground-truth labels as one of the two confusing class labels. Examples were used in the study if both classes in the confusing class pair appeared in the model's top prediction labels, regardless of whether the classification was correct or incorrect. The images were randomly divided into training and evaluation examples.

 
 
 \begin{figure*}[t]
 	\centering
 	\begin{tabular}{cc}
 		\includegraphics[height=6.4cm]{hs1114.png} &
 		\includegraphics[height=6.4cm]{hs2222.png}  \\
 		{\small (a) The guessing step of the training phase} & 	{\small (b) The verification step of the training phase } 
 	\end{tabular}	
 	\caption{An example of the training phase of the user study: (a) The user is presented with an example along with the associated CWOX-2s explanation and the two confusing class labels. They are instructed to match the class labels with the second-stage CWOX-2s heatmaps by dragging and dropping the labels into boxes below the heatmaps. (b) Once the user has made their guess, they are then able to verify their answer on the next page. The evaluation phase is structured in a similar manner, with participants performing the matching task on new, unseen examples.}
 	%with the guess results displayed at the top and ground-truth matching results shown in the boxes
 	\label{fig.hs11}
 \end{figure*}
 
%two rounds xxx

%\subsection{Procedure oth }

\subsection{Guideline to the Participants}

%To better understand the study procedures, the  guideline to the study participants is also provided below. The guideline is used as a script for the introductory video, which we  to participants at the beginning of the study. The video with the institution and contact information hidden is also provided in the supplementary material (\textbf{hs\_video\_information\_remove.mp4}).

The following is the guideline provided to the participants at the beginning of the study.
We also made an introductory video based on the guideline. %The video (\textbf{hs\_video\_information\_remove.mp4}) is included in \textbf{Other Supplementary Material}.

%This guideline was used as a script for the introductory video that was played to participants at the beginning of the study. The video with the institution and \textbf{hs\_video\_information\_remove.mp4}.

\textbf{\em Guideline to the Study Participants:}

{\bf 1.	General Background.}

Hi there, thank you for joining our study. We are a research team from [hidden institution], aiming to build better explanation tools for users to understand the behaviors of AI models. AI models are generally non-intuitive and difficult for humans to understand. They are considered black-box models. Explainable AI, or XAI, aims to provide explanations for model predictions to help users understand how the predictions are reached.

The purpose of this study is to evaluate how well different XAI methods can help YOU, as a model user, understand model behaviors. Your understanding will be assessed by asking you to predict the model’s predictions on new inputs. The study will be carried out in the context of image classification.

{\bf 2.	Tutorial: Image classification \& Saliency Map \& Confusion Classes.}

Image classification is a supervised learning problem: with a set of target classes, to train a model to recognize the class on the labeled example images. The output of image classification is a probability distribution over multiple classes. 

An explanation of the image classification reveals which regions of the input image the model relies on to predict the specific label.  An explanation is typically given as a saliency map. The saliency map is a heatmap, where the high-temperature regions are what a model considers important for a prediction. For instance, in the example of Figure \ref{example}, this image is classified as goldfish, and the bodies of goldfishes are highlighted.

 \begin{figure}[h!]
	\centering
	\includegraphics[width=4.5cm]{hs1.png}
	\caption{An example of saliency map.}
	\label{example}
\end{figure}

Although usually we only pick the class with highest prediction probability as the output label for the image, the other classes with high prediction probabilities also reflect some important aspects of the model behaviors. 

For instance, if two classes always have high prediction probabilities at the same time and co-occur as top classes in classification outputs, they are considered confusion classes where the model always has trouble determining which of them to use as the output class label. Cello and Violin are one pair of such classes for ResNet50. Understanding what features the model relies on to distinguish those confusion classes can help us understand the model behaviors better.


In this study, we will test how well saliency maps created by different methods enable YOU, as a model user, to understand the evidences that the model uses to distinguish between confusion classes. 


{\bf 3.	Study Procedure: Training Phase.}

The study is divided into two phases, the training phase and the evaluation phase.

In the training phase, a series of examples will be presented to you. Each example involves two class labels and a tree of saliency maps.  The task is to match the class labels with the saliency maps at the leaf nodes.  The guess-verify strategy is adopted so that you can complete the training phase efficiently.  You first make a guess about the matching, and drag/drop the class labels to the appropriate boxes below the saliency maps. Next, you click the “Next” button to see the ground-truth match.

Note that the model does not necessarily think like humans. In this human study, the task is \textbf{NOT} about how you feel the class labels should be matched with saliency maps based on your previous life experience. Rather, it is about {\em learning  how saliency maps created by an XAI method match predictions of a model}.

{\bf 4. Study Procedure: Evaluation Phase.}

In the evaluation phase, you are asked to do the matching for a series of new examples, just as in the guess step of the training phase.  Note that the matching must be one-to-one. Otherwise, you won’t be able to proceed. In most cases, you will not be 100\% sure. Just pick the one that you feel more likely according to what you have learned in the training phase.  Pick randomly if necessary.

{\bf 5. Overall Flow of the Study.}
 
You will experience two rounds of the study. Each round contains a training and evaluation phase with examples from a particular pair of confusion classes. The two pairs will be shown to you and you can pick one to start first and work on another pair later.

Although you are encouraged to complete both rounds of study in one sitting, you can leave in the middle and go back to study later. This web-based system will use cookies to keep track of your progress and you can return to the point where you left off, as soon as you access the survey link again with the same device.

%That is the end of this introduction video. You can go to the dry run session of the study now by clicking "Next". The dry run session is to help you familiarize yourselves with the study interface. It will guide you to go through one training and evaluation example with detailed illustrations.

If you have any questions, do not hesitate to contact us. Thanks again for joining our study.

\newpage
\section{More Visual Examples of CWOX-2s}

%In Figure \ref{c-wox}, \ref{fig.syrdinge.intro} and \ref{composite} of the main paper, we have provided several visual examples for CWOX. To supple with the human study to show the interoperability of the CWOX-2s, we provide more visual examples in this appendix. 


\subsection{Examples of Visually Similar Class Labels}
\label{example-11}

In Figure \ref{fig.syrdinge.intro} of the main paper, we have shown that CWOX-2s can provide more meaningful and discriminative explanations for the visually similar class labels {\tt screwdriver} and {\tt syringe}. We here provide more examples (Figures \ref{visual1}, \ref{visual2} and \ref{visual3}) to show how CWOX-2s can help users understand the discriminative features that the model uses to distinguish visually similar classes ({\tt violin} and {\tt cello} in Figure \ref{visual1}; {\tt electric guitar} and {\tt acoustic guitar} in Figure \ref{visual2}; {\tt pitcher} and {\tt water jug} in Figure \ref{visual3}).


\begin{figure*}[h!]
	\centering
	\begin{tabularx}{60em}{Xc}
		
		\multicolumn{1}{m{18em}}{\includegraphics[height=7.cm]{resnet/ILSVRC2012_val_00041209/jcctree1.png}} \vline &
		{\begin{tabular}{ccc} 
				%	\hline \\
				\includegraphics[width=1.4cm]{resnet/ILSVRC2012_val_00041209/SWOX_violin.png}& 
				\includegraphics[width=1.4cm]{resnet/ILSVRC2012_val_00041209/SWOX_cello.png}& 
				\includegraphics[width=1.4cm]{resnet/ILSVRC2012_val_00041209/SWOX_studio couch.png}
				\\	{\scriptsize violin }  & {\scriptsize cello }  &{\scriptsize  studio couch}  %\vspace{2cm} 
		\end{tabular}} \\
		\multicolumn{1}{m{23em}}{\  \ \ \ \ \  \  \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \  (a) CWOX-2s} & {(b) SWOX} 
	\end{tabularx}
	\caption{\small  Results of SWOX and CWOX-2s:  Input image with ground-truth label {\tt violin}.  The ResNet50 output on the input consists of three top classes
		{\tt violin} (0.62), {\tt cello} (0.37) and {\tt studio couch} (0.08). (a) CWOX-2s heatmaps, (b) SWOX heatmaps.
	}
	\label{visual1}

	\centering
	\begin{tabularx}{60em}{Xc}
		
		\multicolumn{1}{m{18em}}{\includegraphics[height=7.cm]{resnet/ILSVRC2012_val_00029593/jcctree1.png}} \vline &
		{\begin{tabular}{ccc} 
				%	\hline \\
				\includegraphics[width=1.4cm]{resnet/ILSVRC2012_val_00029593/SWOX_electric guitar.png}& 
				\includegraphics[width=1.4cm]{resnet/ILSVRC2012_val_00029593/SWOX_acoustic guitar.png}& 
				\includegraphics[width=1.4cm]{resnet/ILSVRC2012_val_00029593/SWOX_cowboy hat.png}
				\\	{\scriptsize electric guitar}  & {\scriptsize acoustic guitar}  &{\scriptsize cowboy hat}  %\vspace{2cm} 
		\end{tabular}} \\
		\multicolumn{1}{m{23em}}{\  \ \ \ \ \  \  \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \  (a) CWOX-2s} & {(b) SWOX} 
	\end{tabularx}
	\caption{\small  Results of SWOX and CWOX-2s: Input image with ground-truth label {\tt electric guitar}.  The ResNet50 output on the input consists of three top classes
		{\tt electric guitar} (0.90), {\tt acoustic guitar} (0.03) and {\tt cowboy hat} (0.02). (a) CWOX-2s heatmaps, (b) SWOX heatmaps.
	}
	\label{visual2}
\end{figure*}

\begin{figure*}[h!]
	\centering
	\begin{tabularx}{60em}{Xc}
		
		\multicolumn{1}{m{18em}}{\includegraphics[height=7.3cm]{resnet/ILSVRC2012_val_00016541/jcctree1.png}} \vline &
		{\begin{tabular}{cc} 
				%	\hline \\
				\includegraphics[width=1.4cm]{resnet/ILSVRC2012_val_00016541/SWOX_pitcher.png}& 
				\includegraphics[width=1.4cm]{resnet/ILSVRC2012_val_00016541/SWOX_water jug.png}\\
			{\scriptsize pitcher }  & {\scriptsize water jug } \\
		\end{tabular}}\\
		\multicolumn{1}{m{23em}}{\  \ \ \ \ \  \  \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \  (a) CWOX-2s} & {(b) SWOX} 
	\end{tabularx}
	\caption{\small  Results of SWOX and CWOX-2s:  Input image with ground-truth label {\tt pitcher}.  The ResNet50 output on the input consists of two top classes
		{\tt  pitcher} (0.73) and {\tt water jug} (0.21). (a) CWOX-2s heatmaps, (b) SWOX heatmaps.
	}
	\label{visual3}
\end{figure*}

\newpage


\subsection{Examples of Composite Object}
\label{example-22}


In Figure \ref{composite} of the main paper, we have shown that CWOX-2s can well identify the composite object {\tt mouse+computer-keyboard}.  We here provide more such examples (Figures \ref{composite1}, \ref{composite2} and  \ref{composite3}), where the composite object is  {\tt desk+desktop-computer} in Figure \ref{composite1}, {\tt street-sign+traffic-light} in Figure \ref{composite2} and {\tt drumstick+drum} in Figure \ref{composite3}.

  %({\tt violin} and {\tt cello} in Figure \ref{visual1}; {\tt electric guitar} and {\tt acoustic guitar} in Figure \ref{visual2}; {\tt pitcher} and {\tt water jug} in Figure \ref{visual2}).

\begin{figure*}[h!]
	\centering
	\begin{tabularx}{60em}{Xc}
		
		\multicolumn{1}{m{18em}}{\includegraphics[height=7.cm]{ILSVRC2012_val_00019908/jcctree1.png}} \vline &
		{\begin{tabular}{ccc} 
				%	\hline \\
				\includegraphics[width=1.4cm]{ILSVRC2012_val_00019908/SWOX_desk.png}& 
				\includegraphics[width=1.4cm]{ILSVRC2012_val_00019908/SWOX_desktop computer.png}& 
				\includegraphics[width=1.4cm]{ILSVRC2012_val_00019908/SWOX_monitor.png}
				\\	{\scriptsize desk }  & {\scriptsize desktop computer}  &{\scriptsize  monitor}  %\vspace{2cm} 
		\end{tabular}} \\
		\multicolumn{1}{m{23em}}{\  \ \ \ \ \  \  \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \  (a) CWOX-2s} & {(b) SWOX} 
	\end{tabularx}
	\caption{\small  Results of SWOX and CWOX-2s: Input image with ground-truth label {\tt desk}.  The ResNet50 output on the input consists of three top classes
		{\tt desk} (0.59), {\tt desktop computer} (0.28) and {\tt monitor} (0.09). (a) CWOX-2s heatmaps, (b) SWOX heatmaps.
	}
	\label{composite1}
\end{figure*}


\begin{figure*}[h!]
	\centering
	\begin{tabularx}{60em}{Xc}
		
		\multicolumn{1}{m{18em}}{\includegraphics[height=7.3cm]{resnet/ILSVRC2012_val_00024952/jcctree1.png}} \vline &
		{\begin{tabular}{cc} 
				%	\hline \\
				\includegraphics[width=1.4cm]{resnet/ILSVRC2012_val_00024952/SWOX_street sign.png}& 
				\includegraphics[width=1.4cm]{resnet/ILSVRC2012_val_00024952/SWOX_traffic light.png}
				\\	{\scriptsize street sign }  & {\scriptsize traffic light}  %\vspace{2cm} 
		\end{tabular}} \\
		\multicolumn{1}{m{23em}}{\  \ \ \ \ \  \  \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \  (a) CWOX-2s} & {(b) SWOX} 
	\end{tabularx}
	\caption{\small  Results of SWOX and CWOX-2s: Input image with ground-truth label {\tt traffic light}.  The ResNet50 output on the input consists of two top classes
		{\tt street sign} (0.73) and {\tt traffic light} (0.27). (a) CWOX-2s heatmaps, (b) SWOX heatmaps.
	}
	\label{composite2}
	
	\
	
		\begin{tabularx}{60em}{Xc}
		
		\multicolumn{1}{m{18em}}{\includegraphics[height=7.cm]{resnet/ILSVRC2012_val_00049569/jcctree1.png}} \vline &
		{\begin{tabular}{ccc} 
				%	\hline \\
				\includegraphics[width=1.4cm]{resnet/ILSVRC2012_val_00049569/SWOX_drumstick.png}& 
				\includegraphics[width=1.4cm]{resnet/ILSVRC2012_val_00049569/SWOX_drum.png}& 
				\includegraphics[width=1.4cm]{resnet/ILSVRC2012_val_00049569/SWOX_banjo.png}
				\\	{\scriptsize drumstick }  & {\scriptsize Drum}  &{\scriptsize  Banjo}  %\vspace{2cm} 
		\end{tabular}} \\
		\multicolumn{1}{m{23em}}{\  \ \ \ \ \  \  \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \  (a) CWOX-2s} & {(b) SWOX} 
	\end{tabularx}
	\caption{\small  Results of SWOX and CWOX-2s:  Input image with ground-truth label {\tt drumstick}.  The GoogleNet output on the input consists of three top classes
		{\tt drumstick} (0.58), {\tt drum} (0.32) and {\tt banjo} (0.05). (a) CWOX-2s heatmaps, (b) SWOX heatmaps.
	}
	\label{composite3}
\end{figure*}


\newpage
\subsection{Examples of Explaining Misclassification}
\label{example-33}


Figure~\ref{result-necklace}~(a)--(d) shows the contrastive heatmaps produced by CWOX-2s for the output of ResNet50 on the image with ground-truth label {\tt padlock}. ResNet50 is completely wrong in this case. Both SWOX (e.5) and CWOX-2s (d) reveal apparently reasonable evidence for {\tt wall clock}---the two keys look like the hands on a clock. However, CWOX-2s does a better job than SWOX at identifying the evidence for {\tt necklace} (a) and {\tt whistle} (b). Furthermore, heatmap (c) suggests that a part of the ring is evidence for {\tt magnetic compass} and {\tt stopwatch}. The contrastive evidence for distinguishing those two classes includes an area where one would expect a hanging ring for a compass (c.1), and an area where one would expect a button for a stopwatch (c.2).

Figure~\ref{result-gown} depicts an example with a ground-truth label of {\tt gown}.  ResNet50 misclassifies it as {\tt hoopskirt}  (0.54). Other top predicted classes for this example include {\tt gown} (0.22), {\tt groom} (0.14), and {\tt lakeside} (0.09). When Grad-CAM is applied to explain these top classes, the SWOX heatmaps for {\tt hoopskirt}, {\tt gown}, and {\tt  groom} (d.1--d.3) are essentially the same. In contrast, CWOX-2s provides clearer explanations for them, where (a) highlights the large skirt as evidence for {\tt hoopskirt}, (b.1) highlights the female body as evidence for {\tt gown}, and (b.2) highlights the male face as evidence for {\tt groom}.




\begin{figure*}[t]
	\centering
	\begin{tabular}{c}
\includegraphics[height=7.5cm]{resnet/necklace/jcctree1.png}  \\  \hline \\ 
	{\begin{tabular}{ccccc} 
			%	\hline \\
			\includegraphics[width=1.9cm]{resnet/necklace/SWOX_necklace.png}& 
		\includegraphics[width=1.9cm]{resnet/necklace/SWOX_whistle.png}& \includegraphics[width=1.9cm]{resnet/necklace/SWOX_magnetic compass.png}& \includegraphics[width=1.9cm]{resnet/necklace/SWOX_stopwatch.png}& \includegraphics[width=1.9cm]{resnet/necklace/SWOX_wall clock.png} 
			\\	{\scriptsize  (e.1) necklace }  & {\scriptsize  (e.2) whistle}  &{\scriptsize   (e.3) magnetic compass}   & {\scriptsize (e.4) stopwatch}  &{\scriptsize  (e.5) wall clock}  %\vspace{2cm} 
	\end{tabular}} 
	\end{tabular}

\caption{\small Results of SWOX and CWOX-2s: Input image with ground-truth label {\tt padlock}.  The ResNet50 output on the input consists of five top classes
	{\tt  necklace} (0.130), {\tt whistle} (0.127), {\tt magnetic-compass} (0.114), {\tt stopwatch} (0.089), and {\tt wall-clock} (0.069). (a--d) CWOX-2s heatmaps; (e) SWOX heatmaps.}
\label{result-necklace}

\end{figure*}
\begin{figure*}[t]
	\centering
	\begin{tabular}{c}
		\includegraphics[height=7.5cm]{resnet/ILSVRC2012_val_00022456/jcctree1.png}  \\  \hline \\ 
		{\begin{tabular}{cccc} 
				%	\hline \\
				\includegraphics[width=1.9cm]{resnet/ILSVRC2012_val_00022456/SWOX_hoopskirt.png}& 
				\includegraphics[width=1.9cm]{resnet/ILSVRC2012_val_00022456/SWOX_gown.png}& \includegraphics[width=1.9cm]{resnet/ILSVRC2012_val_00022456/SWOX_groom.png} 
				&  \includegraphics[width=1.9cm]{resnet/ILSVRC2012_val_00022456/SWOX_lakeside.png}
				\\	{\scriptsize  (d.1) hoopskirt }  & {\scriptsize  (d.2) gown}  &{\scriptsize   (d.3) groom}   & {\scriptsize (d.4) lakeside}    %\vspace{2cm} 
		\end{tabular}} 
	\end{tabular}
	
	\caption{\small Results of SWOX and CWOX-2s: Input image with ground-truth label {\tt gown}.  The ResNet50 output on the input consists of four top classes
		{\tt  hoopskirt} (0.54), {\tt gown} (0.22), {\tt groom} (0.14), and {\tt lakeside} (0.09). (a-c) CWOX-2s heatmaps; (d) SWOX heatmaps.}
	\label{result-gown}
\end{figure*}


\newpage




\subsection{Examples with Different Base Explainers}

%We have shown many CWOX-2s explanations with Grad-CAM as the base explainer in the main paper and Appendix \ref{example-11}, \ref{example-22}, \ref{example-33}. In order to compare different base explainers, we show some examples with CWOX-2s explanations from Grad-CAM, MWP, LIME and RISE in this section.  

%In this section, we present some examples of CWOX-2s explanations using different base explainers for comparison. 

While we used Grad-CAM as the CWOX-2s base explainer in the main paper and Appendices \ref{example-11}, \ref{example-22}, \ref{example-33}, we now include some examples in Figures~\ref{diff-base1}, \ref{diff-base2}, and~\ref{diff-base3} with three other base explainers, namely MWP, LIME and RISE.

%produced with MWP, LIME, and RISE.

\bibliography{xie_130}


\begin{figure}[t]
	\centering
	
	\begin{tabular}{cc}
		\begin{tabular}{c|c}
			\includegraphics[width=7.2cm]{cello_guitar//gd-jcctree1.png}  &
			\includegraphics[width=7.2cm]{cello_guitar//mwp-jcctree1.png}\\ 
			{\small   (a) Grad-CAM}  &	{\small   (b) MWP}\\
			
		\end{tabular}
		\\\hline
		\vline	
		\\
		\begin{tabular}{c|c}
			\includegraphics[width=7.2cm]{cello_guitar//lime-jcctree1.png} &
			\includegraphics[width=7.2cm]{cello_guitar//rise-jcctree1.png}\\
			{\small   (c) LIME}  &	{\small   (d) RISE}\\
		\end{tabular}
	\end{tabular}
	\caption{CWOX-2s explanations with the four different base explainers on the {\tt cello-guitar} example of Figures~\ref{fig.cello.intro} and~\ref{c-wox}.}
	\label{diff-base1}

\end{figure}

\begin{figure}[t]
	\centering
	
	\begin{tabular}{cc}
		\begin{tabular}{c|c}
			\includegraphics[width=7.1cm]{desktop//gd-jcctree1.png}  &
			\includegraphics[width=7.1cm]{desktop//mwp-jcctree1.png}\\ 
			{\small   (a) Grad-CAM}  &	{\small   (b) MWP}\\
			
		\end{tabular}
		\\\hline
		\vline	
		\\
		\begin{tabular}{c|c}
			\includegraphics[width=7.1cm]{desktop//lime-jcctree1.png} &
			\includegraphics[width=7.1cm]{desktop//rise-jcctree1.png}\\
			{\small   (c) LIME}  &	{\small   (d) RISE}\\
		\end{tabular}
	\end{tabular}
\caption{CWOX-2s explanations with the four different base explainers on the Figure \ref{composite} example.}
	\label{diff-base2}
	
\end{figure}

\begin{figure}[t]
	\centering
	
	\begin{tabular}{cc}
		\begin{tabular}{c|c}
			\includegraphics[width=7.2cm]{necklace//gd-jcctree1.png}  &
			\includegraphics[width=7.2cm]{necklace//mwp-jcctree1.png}\\ 
			{\small   (a) Grad-CAM}  &	{\small   (b) MWP}\\
			
		\end{tabular}
		\\\hline
		\vline	
		\\
		\begin{tabular}{c|c}
			\includegraphics[width=7.2cm]{necklace//lime-jcctree1.png} &
			\includegraphics[width=7.2cm]{necklace//rise-jcctree1.png}\\
			{\small   (c) LIME}  &	{\small   (d) RISE}\\
		\end{tabular}
	\end{tabular}
	\caption{CWOX-2s explanations with the four different base explainers on the Figure \ref{result-necklace} example.}
	\label{diff-base3}
	
\end{figure}




%\subsection{Examples with MWP as base explainer}



%\subsection{Examples with LIME as base explainer}



% \subsection{Dry-run session}

%\subsection{Ethical Considerations}







\end{document}
