% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{caption}
\usepackage{subcaption}

%for cross referencing
%\usepackage{xr}
%\externaldocument{corvelo-benz_514-supp}
\usepackage{nameref,zref-xr}
\zxrsetup{toltxlabel}
\zexternaldocument*{corvelo-benz_514-supp}

% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{bbm}

\usepackage{stackengine}
%\usepackage{Definitions}

%macros for notations
% --- Notation shorthands -----------------------------------------------------
% None of these macros takes an argument, hence the starred \newcommand* form.

% Conditioning bar, padded with thin spaces on both sides.
\newcommand*{\given}{\,\mid\,}

% Calligraphic symbols.
\newcommand*{\X}{\mathcal{X}}
\newcommand*{\Y}{\mathcal{Y}}
\newcommand*{\U}{\mathcal{U}}
\newcommand*{\D}{\mathcal{D}}
\newcommand*{\M}{\mathcal{M}}
\newcommand*{\Hu}{\mathcal{H}}
\newcommand*{\Loss}{\mathcal{L}}
% Kept disabled: \L is already defined by LaTeX, so \newcommand would fail.
%\newcommand{\L}{\mathcal{L}}

% Upright bold symbols, upper case.
\newcommand*{\vecX}{\mathbf{X}}
\newcommand*{\vecU}{\mathbf{U}}
\newcommand*{\vecF}{\mathbf{F}}
\newcommand*{\vecY}{\mathbf{Y}}

% Upright bold symbols, lower case.
\newcommand*{\vecy}{\mathbf{y}}
\newcommand*{\wb}{\mathbf{w}}
\newcommand*{\xb}{\mathbf{x}}
% \newcommand{\yb}{\mathbf{y}}
\newcommand*{\zb}{\mathbf{z}}

\usepackage{latexsym} 
\usepackage{bm} 
\usepackage{mathrsfs}
\usepackage{cancel}
% Bold (via bm) upper-case letters: \Ab ... \Zb.
% Note the irregular name \Sbb for the letter S.
\newcommand*{\Ab}{\bm{A}}
\newcommand*{\Bb}{\bm{B}}
\newcommand*{\Cb}{\bm{C}}
\newcommand*{\Db}{\bm{D}}
\newcommand*{\Eb}{\bm{E}}
\newcommand*{\Fb}{\bm{F}}
\newcommand*{\Gb}{\bm{G}}
\newcommand*{\Hb}{\bm{H}}
\newcommand*{\Ib}{\bm{I}}
\newcommand*{\Jb}{\bm{J}}
\newcommand*{\Kb}{\bm{K}}
\newcommand*{\Lb}{\bm{L}}
\newcommand*{\Mb}{\bm{M}}
\newcommand*{\Nb}{\bm{N}}
\newcommand*{\Ob}{\bm{O}}
\newcommand*{\Pb}{\bm{P}}
\newcommand*{\Qb}{\bm{Q}}
\newcommand*{\Rb}{\bm{R}}
\newcommand*{\Sbb}{\bm{S}}
\newcommand*{\Tb}{\bm{T}}
\newcommand*{\Ub}{\bm{U}}
\newcommand*{\Vb}{\bm{V}}
\newcommand*{\Wb}{\bm{W}}
\newcommand*{\Xb}{\bm{X}}
\newcommand*{\Yb}{\bm{Y}}
\newcommand*{\Zb}{\bm{Z}}
% Short alias for \nonumber.
\newcommand*{\nn}{\nonumber}
% Blackboard-bold I.
\newcommand*{\Ibb}{\mathbb{I}}
% Calligraphic upper-case letters: \Acal ... \Zcal.
% \Scal and \Tcal carry an extra brace group, reproduced here unchanged.
\newcommand*{\Acal}{\mathcal{A}}
\newcommand*{\Bcal}{\mathcal{B}}
\newcommand*{\Ccal}{\mathcal{C}}
\newcommand*{\Dcal}{\mathcal{D}}
\newcommand*{\Ecal}{\mathcal{E}}
\newcommand*{\Fcal}{\mathcal{F}}
\newcommand*{\Gcal}{\mathcal{G}}
\newcommand*{\Hcal}{\mathcal{H}}
\newcommand*{\Ical}{\mathcal{I}}
\newcommand*{\Jcal}{\mathcal{J}}
\newcommand*{\Kcal}{\mathcal{K}}
\newcommand*{\Lcal}{\mathcal{L}}
\newcommand*{\Mcal}{\mathcal{M}}
\newcommand*{\Ncal}{\mathcal{N}}
\newcommand*{\Ocal}{\mathcal{O}}
\newcommand*{\Pcal}{\mathcal{P}}
\newcommand*{\Qcal}{\mathcal{Q}}
\newcommand*{\Rcal}{\mathcal{R}}
\newcommand*{\Scal}{{\mathcal{S}}}
\newcommand*{\Tcal}{{\mathcal{T}}}
\newcommand*{\Ucal}{\mathcal{U}}
\newcommand*{\Vcal}{\mathcal{V}}
\newcommand*{\Wcal}{\mathcal{W}}
\newcommand*{\Xcal}{\mathcal{X}}
\newcommand*{\Ycal}{\mathcal{Y}}
\newcommand*{\Zcal}{\mathcal{Z}}
% Optimisation operators, declared with amsmath's \DeclareMathOperator*
% instead of a hand-built \mathop{\mathrm{...}}: the name is set in the
% operator font and subscripts go underneath in display style.
% The printed names ("argmin", "argmax") are unchanged.
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}
% Emphasised Latin/English abbreviations for running text.
% The trailing period is part of each macro; a following space is swallowed
% by TeX, so write e.g. "\ie, ..." or "\eg{} ..." at the use site.
\newcommand*{\ea}{\emph{et al.}}
\newcommand*{\eg}{\emph{e.g.}}
\newcommand*{\ie}{\emph{i.e.}}
\newcommand*{\iid}{\emph{i.i.d.}}
\newcommand*{\etc}{\emph{etc.}}

% Fallback end-of-proof mark: a small filled square followed by a paragraph
% break and a \medskip. It is defined only when \QED does not exist yet.
\ifx\QED\undefined
\def\QED{~\rule[-1pt]{5pt}{5pt}\par\medskip}
\fi

% Fallback proof environment, used only when no \proof exists yet. With
% amsthm loaded (see the package list above) \proof is already defined and
% this branch is skipped.
\ifx\proof\undefined
% \BlackBox is not defined anywhere in this preamble; provide a filled square
% so the fallback cannot fail with an undefined control sequence.
\providecommand*{\BlackBox}{\rule{1.5ex}{1.5ex}}
% \normalfont\bfseries replaces the obsolete {\bf ...} switch.
\newenvironment{proof}{\par\noindent{\normalfont\bfseries Proof\ }}{\hfill\BlackBox\\[2mm]}
%\newenvironment{proof}{\emph{Proof. }}{ \hfill \QED}
\fi

% Theorem-like environments, declared only if \theorem is not defined yet.
% theorem, example, property, lemma, proposition, claim and corollary each
% have their own counter; remark, definition, conjecture and assumption are
% numbered on the theorem counter.
\ifx\theorem\undefined
\newtheorem{theorem}{Theorem}
\newtheorem{example}{Example}
\newtheorem{property}{Property}
% \newtheorem{lemma}[theorem]{Lemma}
\newtheorem{lemma}{Lemma}  % separate counters for lemmas
%\newtheorem{proposition}[theorem]{Proposition} %old
\newtheorem{proposition}{Proposition} % separate counters for propositions
% \newtheorem{claim}[theorem]{Claim}
\newtheorem{claim}{Claim} % separate counters for claims
\newtheorem{corollary}{Corollary}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{conjecture}[theorem]{Conjecture}
\newtheorem{assumption}[theorem]{Assumption}
\fi

% Run-in paragraph heading: unindented bold title followed by a period.
% #1 is the heading text (without the final period).
% \textbf replaces the obsolete {\bf ...} font switch.
\newcommand{\xhdr}[1]{\noindent\textbf{#1.}}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

\title{Counterfactual Inference of Second Opinions}
% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{\href{mailto:<ninacobe@mpi-sws.org>?Subject=Counterfactual Inference of Second Opinions}{Nina~L.~Corvelo~Benz}{}}
\author[1]{Manuel~Gomez~Rodriguez}
% Add affiliations after the authors
\affil[1]{%
        Max Planck Institute for Software Systems\\
        Kaiserslautern, Germany
}
\affil[2]{%
        Department of Biosystems Science and Engineering\\
        ETH Zurich\\
        Zurich, Switzerland
}

\begin{document}

\maketitle

\begin{abstract}
Automated decision support systems that are able to infer second opi\-nions from experts can potentially facilitate a more efficient
allocation of resources---they can help decide when and from whom to seek a se\-cond opi\-nion.
%
In this paper, we look at the design of this type of support systems from the perspective of counterfactual inference.
%
We focus on a multiclass classification setting and first show that, if experts make predictions on their own, the underlying causal
mechanism generating their predictions needs to satisfy a desirable set invariant pro\-per\-ty.
%
Further, we show that, for any causal mechanism satisfying this property, there exists an equi\-va\-lent mechanism where the predictions
by each expert are generated by independent sub-mechanisms go\-ver\-ned by a common noise.
%
This motivates the design of a set invariant Gumbel-Max structural causal model where the structure of the noise governing the sub-mechanisms
underpinning the model depends on an intuitive notion of si\-mi\-la\-ri\-ty between experts which can be estimated from data.
%
Experiments on both synthetic and real data show that our model can be used to infer se\-cond opinions more accurately than its non-causal counterpart.
\end{abstract}

%################################
\section{Introduction}
\label{sec:introduction}
In decision making under uncertainty, seeking opinions from multiple human experts tends to improve the overall quality of the decisions.
%
For example, in medicine, second opinions have been shown valuable for establishing diagnoses and initiating treatment~\citep{burger2020outcomes} as 
well as reducing the number of unnecessary procedures~\citep{leape1989unnecessary,althabe2004mandatory}.
%
%
%
In machine learning, ground truth labels are determined by carefully aggregating multiple noisy labels provided by different experts~\citep{zhang2016learning} 
and inconsistencies between these noisy labels help develop more robust models~\citep{peterson2019human}.
%
Unfortunately, the timeliness and quality of the decisions are often compromised due to a shortage of experts, which prevents each decision from
being informed by multiple experts'{} opinions.

In this context, we argue that the development of automated decision support systems that, given an expert'{}s opinion on a decision instance and a set of features, 
are able to infer other experts'{} opinions will enable a more efficient allocation of resources.
%
%
On the one hand, these sys\-tems could prevent (prioritize) seeking other experts'{} opinions when they are unlikely (likely) to bring new 
perspectives.
%
On the other hand, these systems could also help identify those experts whose opinion is most likely to disagree with that of the expert 
sought first.
%
Here, it is worth noting that several studies have also argued that decision support systems that identify disagreement between experts may help identify 
when a decision instance would benefit most from a second opinion~\citep{raghu2019direct, lim2021finding}. 
%
However, these studies do not focus on inferring other experts'{} opinions given an expert'{}s opinion on a decision instance and a set of features, 
as we do in our work.
%
%

%
More specifically, we consider a multiclass classification setting where, for each instance, experts form their opinions on their own (\ie, without 
communicating).\footnote{This setting fits a variety of real-world applications. For example, when a patient is diagnosed by multiple doctors separately.}%, 
%each doctor diagnoses the patient separately.} 
In this setting, each expert'{}s opinion reduces to a label prediction.
%
Then, our goal is to design decision support systems that, given an expert'{}s prediction on an instance with a set of features, are able to infer 
other experts'{} predictions about the same instance, as illustrated in Figure~\ref{fig:intro_example}.
%
%%%nina: figure moved from appendix
\begin{figure*}[t]
        \centering
        \subfloat[Training dataset]{\includegraphics[width=0.38\textwidth]{data}}
        \hspace{4mm}
        \subfloat[Use case]{\includegraphics[width=0.51\textwidth]{motivation_both}}
        \caption{An example of a training dataset and use case of our decision support system on a medical application.
        %
        In panel (a), for each patient, multiple doctors assess the severity of a concussion on the basis of a set of features.
        %
        In panel (b), given a doctor'{}s assessment of the severity of a concussion and a set of features, our decision support system infers other fellow doctors'{}
        assessment of the severity of the concussion.}
        %
        % Predicting second opinions on a classification task. Illustration of a dataset and use case example. Doctors diagnose the severity of a concussion for different patients.
        % A doctor gives their diagnosis and asks the system to infer the diagnosis of fellow doctors (second opinions) or infer a consensus diagnosis from all doctors.}
\label{fig:intro_example}
\end{figure*}
%
To this end, one could resort to standard supervised learning. Under this perspective, for each instance, the given expert'{}s prediction would 
be just an additional feature about the instance.
%
Unfortunately, this would limit the applicability of the re\-sul\-ting supervised learning model to the unrealistic scenario where, for each 
possible pair of experts, we observe a sizeable number of instances where both experts made a prediction.
%
%
To circumvent this limitation, we look at the design of the above systems from the perspective of counterfactual inference.

\xhdr{Our contributions} We first show that, if experts form their opinions for each instance on their own, the underlying causal mechanism ge\-ne\-ra\-ting 
the experts'{} predictions needs to sa\-tis\-fy a certain set invariant property.
%
Moreover, we further show that any structural causal model satisfying the above set invariant property (in short, any SI-SCM) also satisfies two additional desirable 
properties: 
%
\begin{itemize}[noitemsep, topsep=0pt]
%
\item[(i)] there exists an equivalent SI-SCM where each expert'{}s predictions are generated by independent sub-mechanisms 
governed by a common (multidimensional) noise;
%
\item[(ii)] given an expert'{}s prediction on an instance with a set of features, the conditional interventional distribution and the counterfactual 
distribution of another expert'{}s predictions entailed by the SI-SCM are identical\footnote{Under the conditional interventional distribution, both experts have made a prediction but we only 
observe one of them. Under the counterfactual distribution, only one expert has made a prediction, which we observe.}.
%
\end{itemize}
%
These properties suggest the following natural strategy to design and train SI-SCM based decision support systems.
%
In a first step, we can use interventional data about each expert---her predictions on a set of instances---to determine the structure of 
each sub-mechanism separately.
%
One can view this step as multiple independent supervised learning problems, one per expert.
% 
In a second step, we can use a small amount of interventional data about multiple experts ma\-king predictions about a joint set of instances to 
characterize similarity across experts and factorize the noise governing the sub-mechanisms into a set of noise components. 
%
%
In a way, in this second step, we are adding a wrapper to the supervised learning models characterizing each expert'{}s sub-mechanism to be able to make 
counterfactual predictions about second opinions.

To implement the above strategy, we introduce a specific class of SI-SCMs based on the Gumbel-Max structural causal model~\citep{oberst2019} (in short, 
Gumbel-Max SI-SCM) and characterize similarity across pairs of experts using the concept of counterfactual stability\footnote{Counterfactual stability is, in general, 
an axiomatic requirement imposed to counterfactual distributions~\citep{oberst2019}. However, in SI-SCMs, it is verifiable from interventional data due to (ii), as 
shown in Theorem~\ref{thm:pcs_equivalence}.}.
%
In the Gumbel-Max SI-SCM, each expert'{}s sub-mechanism is go\-verned by a Gumbel-Max noise variable
and sub-mechanisms of similar experts may be governed by the same noise variable.
%
Further, we show that the problem of uniquely asso\-cia\-ting each of these noise variables with disjoint sets of mutually similar experts given data can be formulated as a known
clique par\-ti\-tion\-ing problem, an NP-hard problem~\citep{grotschel1989cutting,grotschel1990facets},
and propose a %simple 
randomized greedy algorithm with good performance.
 %

Finally, we experiment with synthetic and real data comprising $20{,}426$ expert predictions over $1{,}560$ natural images.
%
The results on synthetic data show that our randomized greedy algorithm can successfully recover the disjoint sets of mutually similar experts underpinning a
specific Gumbel-Max SI-SCM from data.
%
The results on real data show that the (counterfactual) predictions provided by the Gumbel-Max SI-SCM are more accurate than those provided by its non-causal 
counterpart.
%

\xhdr{Further related work}
%
Predictions by different experts have been typically studied separately, \ie, without conditioning on an observed prediction by a given expert~\citep{dawid1979maximum, welinder2010online, 
guan2018said, kerrigan2021combining, straitouri2022provably}.
%
One could think of the observed prediction just as an additional feature when inferring other experts'{} predictions; however, this would limit the applicability of existing inference
methods to scenarios where, for each pair of experts, we observe a sizeable number of instances where both experts made a prediction, as discussed previously.
%
More broadly, our work is not the first to use counterfactual reasoning in expert prediction~\citep{bica2020learning}. However, previous work has used counterfactual reasoning to 
quantify an expert'{}s preference over counterfactual outcomes rather than to infer other experts'{} predictions 
%conditioning on a given expert'{}s prediction.
given an observed expert'{}s prediction.

Counterfactual inference has a long and rich history~\citep{imbens2015causal}. However, it has mostly focused on estimating quantities related to the interventional 
distribution of interest such as, \eg, the conditional average treatment effect (CATE).
%
A few notable exceptions are by~\cite{oberst2019} and~\cite{tsirtsis2021counterfactual}, which use the Gumbel-Max SCM to reason about counterfactual distributions in Markov decision 
processes (MDPs), and by~\cite{lorberbom2021learning}, which introduces a parameterized family of causal mechanisms that generalize the Gumbel-Max SCM and are specifically-tuned to a distribution of observations and interventions of interest.
%
However, the Gumbel-Max structural causal model has not been used previously to reason about counterfactual expert predictions.

%################################
\section{Preliminaries}
\label{sec:preliminaries}
Given a set of random variables\footnote{We denote random variables with capital letters and realizations of random variables with lower case letters.} $\mathbf{X} = \{X_1, \dots, X_n\}$, a structural causal
model (SCM) $\M$ defines a complete data-generating process via a collection of assignments
%
\begin{equation*}
X_i = f_i(\mathbf{PA}_i, U_i),
\end{equation*}
%
where
%
$\mathbf{PA}_i\subseteq \mathbf{X}\setminus \{X_i\}$ are the direct causes of $X_i$,
%
$\mathbf{F}=\{f_1, \dots, f_n\}$ are deterministic causal mechanisms,
%
$\mathbf{U}=\{U_1, \dots, U_n\}$ are jointly independent noise random variables,
%
and $P(\mathbf{U})$ denotes the (prior) distribution of the noise variables.
%
%
Here, note that the noise variables $\mathbf{U}$ are the only source of stochasticity and, given an observational distribution $P(\mathbf{X})$, there always exists a distribution
$P(\mathbf{U})$ and mechanisms $\mathbf{F}$ so that $P = P^{\M}$, where $P^{\M}$ is the distribution entailed by $\M$.

Two SCMs $\M$ and $\tilde{\M}$ over variables $\vecX$ and $\vecU$, with noise distribution $P(\mathbf{U})$ and mechanisms $\vecF$ and $\tilde{\vecF}$ respectively, are equivalent if, for all $i \in [n]$, it holds
that
%
\begin{equation*}
 x_i=f_i(\mathbf{pa}_i,u_i) \Longleftrightarrow x_i=\tilde{f}_i(\mathbf{pa}_i,u_i)
\end{equation*}
%
for any realization $\mathbf{PA}_i=\mathbf{pa}_i$ and $P(\mathbf{U})$-almost every $u_i$.\footnote{
$P(\mathbf{U})$-almost everywhere means that the set of noise realizations $\mathcal{U'}$ for which the property does not hold has probability zero under the distribution $P(\mathbf{U})$, \ie, $P(\mathbf{U} \in \mathcal{U'})=0$.}
%
%

%%
Given a SCM $\M$, an atomic intervention $\Ical$ corresponds to assigning a fixed value to a variable.
%
For example, let $\Ical = \text{do}[X_i=x]$ be the intervention that assigns value $x$ to variable $X_i$, then the intervened SCM $\M^{\Ical}$ does not assign the value of $X_i$ according to $f_i(\mathbf{PA}_i,U_i)$
but instead assigns it the fixed value $x$. The interventional distribution entailed by the intervened SCM is denoted $P^{\M\,;\,\Ical}$.
%
Furthermore, given the (possibly partial) observation $\mathbf{X}=\mathbf{x}$,  we can also define a modified SCM $\M_{\vecX=\xb}$ where the
noise variables $\vecU$ are distributed according to the posterior distribution $P(\vecU \given \vecX=\xb)$.
Then, we can view a counterfactual statement as an intervention $\Ical$ in the SCM $\M_{\vecX=\xb}$ and denote the counterfactual distribution
entailed by the counterfactual SCM $\M^{\Ical}_{\vecX=\xb}$ as $P^{\M \given \mathbf{X}=\mathbf{x}\,;\,\Ical}$.

The Gumbel-Max SCM is a specific class of SCM in which the causal mechanism for a random categorical variable $V$ is defined as
%
\begin{equation}
	f_v(\mathbf{PA}, \mathbf{U}) := \argmax_j\{\log P(V=j \given \mathbf{PA}) + U_j\}
	\label{eq:gumbelscm}
\end{equation}
%
and each noise variable $U_j \sim \text{Gumbel}(0,1)$.
%
%
Here, note that the interventional distribution $P^{\M\,;\,do[\mathbf{PA} = \mathbf{pa}]}(V)$ entailed by a Gumbel-Max SCM $\M$ is exactly $P(V \given \mathbf{PA} = \mathbf{pa})$.

%################################

%################################
\section{Counterfactual Inference of Second Opinions}
\label{sec:problem}
%
We consider a multi-class classification task where, for each instance, a human expert $h \in \Hcal$ makes a label prediction
$y_h \in \Ycal = \{1, \ldots, k\}$ based on multiple sources of information, which are (imperfectly) summarized by a feature vector $x \in \Xcal$.
%
Here, we assume that experts make predictions on their own (\ie, without communicating with each other) and the assignment of experts
to instances is independent of the identity of the instances and their feature vectors.
%
%
Then, our goal is to design an automated decision support system that,
%
given a prediction $y_h$ from an expert $h$ about an instance summarized by a feature vector $x$,
%
is able to infer what prediction $y_{h'}$ another expert $h' \neq h$ would have made about the \emph{same} instance if she had
been asked.
%
Here, note that two different instances may be (imperfectly) summarized by the same feature vector $x$; however, we are interested
in a counterfactual prediction about the \emph{same} instance.

%
Our starting point is to view the above counterfactual statement as an intervention in a particular counterfactual SCM. More
specifically, let $\Mcal$ be a SCM defined by the assignments
%
\begin{equation} \label{eq:scm}
\vecY = f_\vecY(X, Z, U), \quad Z = f_{Z}(V), \quad \text{and}  \quad X = f_{X}(W)
\end{equation}
%
where $U$, $V$ and $W$ are (multidimensional) independent noise variables, $f_{\vecY}$, $f_{Z}$ and $f_{X}$ are given deterministic causal mechanisms (or functions),
and $\vecY = (Y_h)_{h \in Z}$ are the predictions by a set of human experts $Z \subseteq \Hcal$.
%
Then, we can express the above counterfactual statement as an intervention $\Ical = do[Z = \{h'\}]$ in the counterfactual SCM
$\Mcal_{X = x, Z = \{h\}, \vecY = y_h}$ and, to infer the label prediction $y_{h'}$, we just need to resort to the counterfactual
distribution $P^{\Mcal \given X = x, Z = \{h\}, \vecY = y_{h} \,;\, do[Z = \{h'\}]}(\vecY)$.
%

At this point, one may argue that, even if we find a noise distribution $P(U)$ and a function $f_{\vecY}$ under which the conditional distribution
$P^{\Mcal}(\vecY \given X)$ is a good fit for observed historical predictions by experts, we would be unable to validate how accurate our
counterfactual label predictions are using data.
%
In general, this is true since counterfactual reasoning lies within level three in the ``ladder of causation''~\citep{pearl2009causality}. In this context,
previous work resorts instead to axiomatic assumptions about the causal mechanism of the world~\citep{oberst2019, tsirtsis2021counterfactual, noorbakhsh2021counterfactual}.
In our setting, this would reduce to specifying how differences across experts may have led to a different prediction while holding ``everything else'' fixed.
%
However, in what follows, we will show that, if experts do not communicate with each other, the above SCM satisfies a set invariance property that sur\-pri\-singly
implies that the above counterfactual distribution coin\-cides with an interventional conditional distribution. This enables a data-driven design and validation of our SCM based decision support system.
%################################

%################################
\section{Relating the Counterfactual and Interventional Worlds}
\label{sec:cscm}
%
To build some intuition on the reasons why, if experts do not commu\-ni\-cate, certain types of counterfactual and interventional distributions are identical, we start
with a simple example.
%
Let $h, h' \in \Hcal$ be two different experts and consider the following two questions:
%
\begin{enumerate}
	\item Both experts have made a label prediction about an instance (\ie, $Z=\{h, h'\}$) but we only observe the prediction $Y_{h} = c$ made by $h$, what is the
	prediction made by $h'$?
		\label{it:q1}
	\item One of the experts has made a label prediction $Y_{h} = c$ about an instance (\ie, $Z=\{h\}$) and we observe it, what would the prediction made by $h'$ be
	if she had made a prediction?
		\label{it:q2}
\end{enumerate}
%
The first question is of conditional nature while the second is a counterfactual one.
%
In general, the answer to both questions may differ, for example, if experts % 
influence each other'{}s predictions by sharing and discussing their opinions % 
in the first case.
%
However, if experts do not commu\-ni\-cate, the answer to both questions should be identical.
%
More formally, the following conditional interventional distribution and counterfactual distribution of the expert should be equal:
%
%
\begin{multline}
	P^{\M \,;\, do[Z=\{h,h'\}]}(Y_{h'} \given X=x, Y_{h}=c) \\
	= P^{\M \given X=x, Z=\{h\},\vecY=c \,;\, do[Z=\{h'\}]}(\vecY).
	\label{eq:samedistr_example}
\end{multline}
%
More generally, we will now show that, if each expert forms their opinion on their own, the above equality is a direct consequence of a set invariance property satisfied
by the SCM defined in Eq.~\eqref{eq:scm}.

%
\xhdr{Set Invariant SCMs (SI-SCMs)}
%
If experts do not communicate before making a prediction and hence are unaware of and unaffected by other experts' opinions,
%
the me\-cha\-nism $f_{\vecY}$ has a set in\-va\-riant value over expansions (supersets) of $Z$.
%
For example, consider that a single expert $h$ has made a prediction $f_{\vecY}(x,\{h\},u)=c$ about a specific instance.
%
%
Then, one can conclude that, if instead of a single expert, a set of experts $\zeta \subseteq \Hcal$ such that $h \in \zeta$ would
have made predictions about the same instance (\ie, $x$ and $u$ do not change),
%
expert $h$ would have made the same prediction, \ie, $( f_{\vecY}(x,\zeta,u) )_h = c$.
%
More formally, we define the set invariance property as follows:
%
\begin{definition}[Set Invariance]\label{prop:set invariant}
	A mechanism $f_{\vecY}$ for variable $\vecY$ is set invariant with respect to $Z$ if, for any two realizations $Z=\zeta$ and $Z=\zeta'$ such that
	$\zeta\subseteq \zeta'$, it holds~that
	%
	\begin{equation*}
	f_{\vecY}(x, \zeta, u) = (f_{\vecY}(x,\zeta',u))_{\zeta}  \quad \text{ for all } x \in \X, u \in \U\ .
	\end{equation*}
	%
	A SCM $\M$ with such a mechanism is set invariant for $\vecY$.
\end{definition}

%
	A set-invariant SCM (SI-SCM) for $\vecY$ can be constructed by expressing the causal mechanism $f_{\vecY}$ with sub-mechanisms $f_{Y_h}$ governed by a common noise variable:\footnote{All proofs can be found in Appendix~\ref{app:awesomeproofs}.}
%
\begin{theorem}\label{thm:SCMsetinvariant}
	Any SCM $\M$ with mechanism $f_{\vecY}$ of the form $f_{\vecY}(X,Z,U)= (f_{Y_h}(X,U))_{h \in Z}$, where $f_{Y_h} \colon \X \times \U \to \Y$ are arbitrary functions,
	is set invariant for $\vecY$.
\end{theorem}
%
In fact, the following theorem shows that the class of SCMs with separate sub-mechanisms for $Y_h$ and a shared noise variable $U$ is not only a subclass but completely defines the class of SI-SCMs for $\vecY$.
Thus, any correlation between experts' predictions is caused by the common noise and features but not the causal mechanism.
\begin{theorem}\label{thm:SCMequivalence}
 For any SI-SCM $\M$, there exists an equivalent SI-SCM $\M'$ with causal mechanism
$f'_{\vecY}(X,Z,U) = (f'_{Y_h}(X,U))_{h \in Z}$ where for $h \in Z$
 %
\begin{equation*}
f'_{Y_h}(X,U) := (f_{\vecY}(X,\{h\},U))_{h}.
\end{equation*}
%
\end{theorem}
%
Here, we would like to emphasize that, if the mechanism $f_{\vecY}$ of an SCM is not explicitly decoupled into sub-mechanisms governed by the same noise, it may be challenging to check whether an arbitrary SCM is set invariant.
For arbitrary SCMs, Theorem~{\ref{thm:SCMsetinvariant}} cannot be applied directly and Theorem~{\ref{thm:SCMequivalence}} does not tell us how to verify that an equivalent SCM exists.
However, it tells us that the mechanism of a set invariant SCM can be decoupled and simplified. It would be interesting to develop methods to check for set invariance for arbitrary SCMs in future work.
%

\xhdr{Equality between the counterfactual distribution and the conditional interventional distribution}
%
%
Returning to our simple motivational example, note that, if a SCM is set invariant, the answers to the counterfactual and the conditional questions~\ref{it:q1} and~\ref{it:q2} are
the same as long as the noise $u \sim P(U \given X=x, Y_{h}=c)$ is the same.
%
In particular, for question~\ref{it:q1}, the answer is $Y_{h'}=(f_{\vecY}(x,\{h,h'\},u))_{h'}$, for question~\ref{it:q2}, the answer is $Y_{h'}=f_\vecY(x,\{h'\},u)$, and since $f_{\vecY}$ is set invariant,
both answers are equal.
%
More generally, for arbitrary sets of experts, we can easily conclude that equality holds if and only if $f_{\vecY}$ is set invariant. %(follows from the definition of set invariance).

Next, to show that, if a SCM is set invariant, then the equa\-li\-ty of distributions in Eq.~\eqref{eq:samedistr_example} holds, we first present a more
general theorem that states that, if we expand the set of experts who make predictions, the corresponding interventional distribution of $\vecY$
does not change:
%
\begin{theorem}\label{thm:equalprop}
	%
	Let SCM $\M$ be set invariant for $\vecY$. Then, for any $\zeta, \zeta' \subseteq \Hcal$ such that $\zeta \subseteq \zeta'$,
	it holds that
	\begin{multline*}
	P^{\M \,;\, do[Z=\zeta]}(\vecY=\vecy \given X) \\
	= P^{\M \,;\, do[Z=\zeta']}((\vecY)_{\zeta}=\vecy\given X)
	\end{multline*}
	%
	for any $\vecy \in \Y^{|\zeta|}$ where $(\vecY)_{\zeta}$ denotes the predictions by the experts in the
	subset $\zeta \subseteq \zeta'$.
\end{theorem}
%
The above theorem is straightforward to show using that, due to the set invariance property, the prediction values of mechanism $f_{\vecY}$ for $(x, \zeta, u)$ are equal to the values for $(x,\zeta',u)$ for experts
in $\zeta$ and, due to the independence between the noise and the intervention, the noise distribution does not change.
%
A direct conclusion
is that, no matter how many experts make predictions, the conditional interventional distribution of a single expert's prediction does not change,
as formalized by the following corollary:
%
\begin{corollary}\label{cor:equalprop_h}
Let SCM $\M$ be set invariant for $\vecY$. Then, for any $h \in \Hcal$ and $\zeta \subseteq \Hcal$ such that $h \in \zeta$, it
holds that
        \begin{equation*}
        P^{\M \,;\, do[Z=\{h\}]}(Y_h \given X) = P^{\M \,;\, do[Z=\zeta]}(Y_h\given X).
        \end{equation*}
\end{corollary}

Similarly, we can derive the desired equality between the counterfactual distribution and the conditional interventional distribution by using the set invariance of mechanism $f_{\vecY}$ and the fact that the noise
distribution changes equally in both scenarios. More formally, we have the following corollary:
%
\begin{corollary}\label{cor:equalcounterfactuals}
Let SCM $\M$ be set invariant for $\vecY$. Then, for any $h,h' \in \Hcal$ and $\zeta \subseteq \Hcal$ such that $h,h' \in \zeta$, it holds that
%
\begin{multline*}
	P^{\M\given X=x, Z=\{h\},\vecY=c \,;\, do[Z=\{h'\}]}(\vecY)
	\\= P^{\M \,;\, do[Z=\zeta]}(Y_{h'}\given X=x, Y_{h}=c)
\end{multline*}
%
for any $x \in \X$ and $c \in \Y$.
\end{corollary}
%
\xhdr{Remark} While we have introduced the notion of set invariance for SCMs in the context of inferring second opinions, we believe it may be of independent interest since, generally speaking, 
it allows us to identify counterfactual distributions from interventional data.

%################################

%################################
\section{Characterizing Mutually Similar Experts} 
\label{sec:pcs}
%
%
Given a SI-SCM model $\Mcal$ where each expert'{}s predictions $Y_h$ are generated by a sub-mechanism $f_{Y_h}$, our goal in
this section is to characterize mutually similar experts.
%
Later on, this will help us factorize the noise $U$ gover\-ning the sub-mechanisms $f_{Y_h}$ underpinning the model into a set of independent
noise components and uniquely associate each of these noise components with disjoint sets of mutually similar experts given data.
%

To this end, we first start by characterizing similarity between a pair of experts $h, h' \in \Hcal$.
%
To do so, we resort to the recently introduced notion of counterfactual sta\-bi\-li\-ty~\citep{oberst2019}. More specifically, we argue that two experts
$h$ and $h'$ are \emph{similar} if $\M$ satisfies counterfactual stability for $h, h'$ with respect to $\vecY$.
%
\begin{definition}[Counterfactual stability]
\label{def:counterfactual-stability}
A SCM $\M$ satisfies counterfactual stability for $h, h'$ with respect to $\vecY$ if, for all $\zeta, \zeta' \subseteq \Hcal$ such that $h \in \zeta$ and
$h' \in \zeta'$ and for all $c' \neq c$, the condition
%
\begin{equation*}
\frac{P^{\M \,;\, \text{do}[Z = \zeta']}(Y_{h'} = c \given X)}{P^{\M \,;\, \text{do}[Z = \zeta]}(Y_h = c \given X)} \geq
\frac{P^{\M \,;\, \text{do}[Z = \zeta']}(Y_{h'} = c' \given X)}{P^{\M \,;\, \text{do}[Z = \zeta]}(Y_h = c' \given X)}
\end{equation*}
%
	implies that $P^{\M \given X, Z=\zeta, Y_{h} = c \,;\, \text{do}[Z = \zeta']}(Y_{h'} = c') = 0$, where $Y_{h} = c$ is the observed outcome
under $\text{do}[Z = \zeta]$.
%
\end{definition}
%
For example, consider a scenario where a doctor needs to decide what treatment option---surgery ($Y = 0$), radiation ($Y = 1$) or chemotherapy ($Y = 2$)---will be
more beneficial for a patient with a tumor, imperfectly summarized by a feature vector $x$.
%
%
Assume doctor $h$ decides the most beneficial option is surgery, \ie, $Y_h = 0$, and we know that, for patients with similar $x$, doctor $h'$ is generally more likely
to operate and less likely to resort to therapy than doctor $h$.
%
Then, if doctors $h$ and $h'$ are similar, as defined in Definition~\ref{def:counterfactual-stability}, we expect doctor $h'$ would have also decided the most be\-ne\-fi\-cial option
is surgery for the given patient, if consulted, \ie, $Y_{h'} = 0$.
%
Here, whenever two doctors $h$ and $h'$ are \emph{not} similar, one could argue that it is because they weigh any (hidden) factor %specific to 
of the patient at hand
differently\footnote{In general, note that similarity between experts does not always deterministically enforce the observed expert'{}s prediction on the counterfactual prediction. In the
example above, this happens because the inequality in Def.~\ref{def:counterfactual-stability} holds for the two remaining label values.
%
Rather, it allows us to identify experts with different decision making criteria.}.

%
%
Unfortunately, in general, we cannot use data to verify if two experts $h$ and $h'$ are similar. This is because our no\-tion of similarity relies on a counterfactual distribution,
$P^{\M \given Y_{h} = c \,;\, \text{do}[Z = \zeta']}$, and counterfactual reasoning lies within level three in the ``ladder of causation''~\citep{pearl2009causality}.
%
However, we will now define a notion of conditional sta\-bi\-li\-ty that is verifiable using interventional data and, in the case of SI-SCMs, is both a sufficient and necessary
condition for counterfactual stability---if conditional sta\-bi\-li\-ty holds, we can conclude that two experts are similar.
%
\begin{definition}[Conditional stability]
	A SCM $\M$ satisfies conditional stability for two experts $h, h' \in \Hcal$ with respect to $\vecY$ if, for all $\zeta \subseteq \Hcal$ such
	that $h, h' \in \zeta$ and for all $c' \neq c$, the condition
	%
	\begin{equation}
		\frac{P^{\M \,;\, \text{do}[Z = \zeta]}(Y_{h'} = c \given X)}{P^{\M \,;\, \text{do}[Z = \zeta]}(Y_h = c \given X)} \geq
		\frac{P^{\M \,;\, \text{do}[Z = \zeta]}(Y_{h'} = c' \given X)}{P^{\M \,;\, \text{do}[Z = \zeta]}(Y_h = c' \given X)}
		\label{eq:conditional-stability}
	\end{equation}
	 implies that
	$P^{\M \,;\, \text{do}[Z=\zeta]}(Y_{h'}=c' \given X, Y_{h} = c)=0$.
\end{definition}
%
Here, note that, for SI-SCMs, we only need to verify the condition in Eq.~\eqref{eq:conditional-stability} for the sets $\zeta = \{h\}$ and $\zeta = \{h'\}$ because
no matter how many experts make predictions, the conditional interventional distributions in Eq.~\eqref{eq:conditional-stability} do not change, as shown in
Corollary~\ref{cor:equalprop_h}.
%
Then, the following Theo\-rem formalizes the equivalence between conditional and counterfactual stability:
%
\begin{theorem}\label{thm:pcs_equivalence}
	Let SCM $\M$ be set invariant for $\vecY$. Then, $\M$ satisfies counterfactual stability for $h, h' \in \Hcal$ with
	respect to $\vecY$ iff it satisfies conditional stability.
\end{theorem}
%
Once we have a notion of similarity between pairs of experts that we can verify from data, we can characterize groups of mutually similar
experts. In this context, it will be useful to introduce the following notion of pairwise counterfactual stability (in short, PCS), which extends
counterfactual stability to groups of experts $\zeta \subseteq \Hcal$ of arbitrary size.
%
\begin{definition}[Pairwise Counterfactual Stability]\label{def:pcs_def}
	A SCM $\M$ satisfies pairwise counterfactual stability for a group of experts $\zeta \subseteq \Hcal$ with respect to $\vecY$ if
	it satisfies counterfactual stability for any $h, h' \in \zeta$.
\end{definition}
%
Similarly as in the case with a pair of experts, one can also define pairwise conditional stability and it imme\-dia\-te\-ly follows
from Theorem~\ref{thm:pcs_equivalence} that, for SI-SCM, pairwise conditional and counterfactual stability are equivalent,
as formalized by the following Corollary.
%
\begin{corollary}\label{cor:pcs_equivalence}
	Let SCM $\M$ be set invariant for $\vecY$. Then, $\M$ satisfies pairwise counterfactual stability for $\zeta \subseteq \Hcal$ with
	respect to $\vecY$ iff it satisfies pairwise conditional stability.
\end{corollary}
%################################

%################################
\section{Gumbel-Max SI-SCM}
\label{sec:gumbel}
In this section, we build upon our theoretical results to develop the Gumbel-Max SI-SCM, a new class of SI-SCM based on the
Gumbel-Max SCM.

Given a set of experts $\Hcal$, the Gumbel-Max SI-SCM partitions $\Hcal$ into disjoint sets of experts $\Psi=\{\psi\}_{\psi \in \Psi}$,
as defined in Section~\ref{sec:pcs}, and associates all experts within each set with the same multidimensional noise variable.
%
%
More formally, the Gumbel-Max SI-SCM is defined as follows:
%
%
\begin{definition}[Gumbel-Max SI-SCM] % $\M(\Psi)$ for $\vecY$]
%
%
The Gumbel-Max SI-SCM $\M(\Psi)$ is a specific class of SCM in which the causal mechanism for $\vecY$ is defined as
%
%
\begin{equation*}
	f_{\vecY}(X, Z, U) = (f_{Y_h}(X, U))_{h \in Z}, \text{\quad with}
\end{equation*}
%
%
\begin{equation*}
	f_{Y_h}(X, U_{\psi(h)}) = \argmax_{c \in \Y}\{ \log P(Y_h=c \given X) + U_{\psi(h), c}\},
\end{equation*}
%
where $\psi(h) \in \Psi$ denotes the subgroup expert $h$ belongs to and each noise variable $U_{\psi(h), c} \sim \text{Gumbel}(0,1)$.
%
\end{definition}
%
%\vspace{-2mm}
By definition, the Gumbel-Max SI-SCM $\M(\Psi)$ is set invariant for $\vecY$ and,
for any $\zeta \subseteq \Hcal$ and $h \in \zeta$, it holds that
%
$P^{\M(\Psi);\text{do}[Z=\zeta]}(Y_h \mid X) = P(Y_h \mid X)$.
%
Moreover, all experts within each group $\psi \in \Psi$ are mutually similar, as formalized by the following Theorem:
%
\begin{theorem} \label{thm:gumbel}
	The Gumbel-Max SI-SCM $\M(\Psi)$ satisfies pairwise counterfactual stability (PCS) for each group $\psi \in \Psi$ with respect to $\vecY$. % of size greater than $1$.
\end{theorem}
%
%\vspace{-2mm}
Finally, note that, for $\Psi = \{\Hcal\}$, the Gumbel-Max SI-SCM reduces to the original Gumbel-Max SCM defined
in Eq.~\eqref{eq:gumbelscm}.
%
Therefore, one can view the Gumbel-Max SI-SCM as a generalization of the original Gumbel-Max SCM where,
instead of a single multidimensional noise variable $U$ for all $h \in \Hcal$, one has several noise variables $U_{\psi}$,
one per group.

%
\xhdr{Estimating counterfactual distributions} % entailed by $\M(\Psi)$}
%
\begin{figure}[t]
        \centering
                \includegraphics[width=0.45\textwidth]{inference.pdf}
        %\vspace{-2mm}
        \caption{ Illustration of the counterfactual sampling of experts' predictions with the Gumbel-Max SI-SCM $\Mcal(\Psi)$.
        }
        %\vspace{-2mm}
        %
\label{fig:flowdiagram_inference}
\end{figure}
%
%
Given a prediction $Y_h = y_h$ by an expert $h$, we can compute an unbiased finite sample Monte-Carlo estimator of the counterfactual
distribution $P^{\Mcal(\Psi) \given X = x, Z = \{h\}, \Yb = y_{h} \,;\, do[Z = \{h'\}]}(\Yb)$ for the prediction $Y_{h'}$ of another expert $h' \neq h$
as follows:
%
\begin{multline} \label{eq:cfc_distr_estimator}
P^{\Mcal(\Psi) \given X = x, Z = \{h\}, \Yb = y_{h} \,;\, do[Z = \{h'\}]}(\Yb = c) \\
	\approx \frac{1}{T} \sum_{t=1}^{T} \mathbbm{1}{[c=f_{Y_{h'}}(x,\mathbf{u}_t)]}, % \hat{f} -> f, we did not introduce \hat{f}
\end{multline}
%
%%\vspace{-2mm}
where $\mathbf{u}_1, \ldots, \mathbf{u}_T$ are samples from the posterior distribution $P^{\Mcal(\Psi) \given X = x, Z = \{h\}, \Yb = y_{h} \,;\, do[Z = \{h'\}]}(U_{\psi(h')})$
of the noise variable $U_{\psi(h')}$.
%
Here, we can use an efficient procedure to sample from the above noise posterior distribution, described elsewhere~\citep{oberst2019,maddison2015a}.
%
Moreover, note that, if $h \notin \psi(h')$, the posterior distribution coincides with the prior $P^{\Mcal(\Psi)}(U_{\psi(h')})$.
%
The sampling procedure is summarized in Figure~\ref{fig:flowdiagram_inference}.

\xhdr{Partitioning experts into mutually similar groups}
%
In the Gumbel-Max SI-SCM $\Mcal(\Psi)$, for each expert $h \in \Hcal$, we can estimate the conditional distribution $P(Y_h \given X)$
using any machine learning model trained using historical predictions made by the expert $h$.
%
However, to fully define $\Mcal(\Psi)$, we need to partition the set of experts $\Hcal$ into disjoint sets of experts $\Psi$ given a small
amount of historical data about multiple experts making predictions about a joint set of instances. To this end, we proceed as follows.

First, we look for violations of the conditional stability condition throughout the historical data.
%
Whenever there exists a sample for which the predictions by two different experts $h$ and $h'$ violate conditional
stability\footnote{A violation occurs whenever Eq.~\eqref{eq:conditional-stability} holds but we observe $Y_h=c$ and $Y_{h'}=c'$.},
we conclude that $h$ and $h'$ cannot belong to the same group $\psi$.
%
Further, we also conclude that any pair of experts whose predictions did not violate conditional stability and were at least once
observed for the same sample \emph{can} be similar.
%
However, since conditional stability is not a transitive property, there may be multiple valid partitions $\Pcal = \{\Psi\}$ of the experts into
disjoint sets that are % mutually similar and
consistent with the above conclusions.
%
To decide among them, we would like to pick the partition $\Psi \in \Pcal$ under which the counterfactual distributions $P^{\Mcal(\Psi) \given X = x, Z = \{h\}, \Yb = y_{h} \,;\, do[Z = \{h'\}]}(\Yb)$
provide the best goodness of fit. 
%
More formally, we would like to solve the following mini\-mi\-za\-tion problem:
%
%\vspace{-2mm}
\begin{equation} \label{eq:optproblem}
	\underset{\Psi \in \Pcal}{\text{minimize}} \quad \sum_{h,h' \in \Hu} \bigl[ \Loss(\M(\Psi),h',h) - \Loss(\M(\Hu),h',h) \bigr], % \\
\end{equation}
%
where $\Loss(\M(\cdot),h',h)$ denotes an average (empirical) loss whenever we observe $Y_{h}$ and infer the label prediction $Y_{h'}$ using the
counterfactual distribution $P^{\Mcal(\cdot) \given X = x, Z = \{h\}, \Yb = y_{h} \,;\, do[Z = \{h'\}]}(\Yb)$.
%
Here, we measure goodness of fit in terms of average loss reduction with respect to the counterfactual distributions entailed by the causal model
$\Mcal(\Hcal)$ because this will allow us to reduce the number of pairs $(h, h')$ we need to consider.
%%
%
\begin{figure}[t]
        %\vskip 0.2in
        \centering
        \includegraphics[width=0.3\textwidth]{training.pdf}
        %\vspace{-2mm} % Step-wise approach for p
        \caption{Partitioning experts into mutually similar groups $\Psi$.
        }
        %\vspace{-2mm}
        %
\label{fig:flowdiagram_training}
% \vskip -0.2in
\end{figure}
%
The step-wise approach for obtaining $\Psi$ is summarized in
Figure~\ref{fig:flowdiagram_training}.

Next, we formulate the above problem as a known clique partitioning problem~\citep{grotschel1989cutting,grotschel1990facets}.
%
More specifically, let $\Gcal = (\Hu,\mathcal{E})$ be an undirected graph where, if $\{h, h'\} \in \Ecal$, then $h$ and $h'$ \emph{can} be similar,
as concluded from the data.
%
Then, it readily follows that finding a partition $\Psi$ of $\Hu$ is equivalent to finding a clique cover for $\Gcal$\footnote{$\Psi$ is a clique cover
for $\Gcal$ iff $\Psi$ is a partition of $\Hcal$, \ie, $\bigcup_{\psi \in \Psi} \psi = \Hcal$ and $\psi \cap \psi' = \emptyset$ for all $\psi,\psi' \in \Psi$ with $\psi \neq \psi'$, and the vertices
in each $\psi \in \Psi$ form a clique in $\Gcal$.}.
%
Now,
let the weight $w(h, h')$ of each edge $\{h, h'\} \in \Ecal$ be given by:
\begin{multline*}
	w(h,h') = \Loss(\M(\Psi),h,h') - \Loss(\M(\Hu),h,h') \\
	+ \Loss(\M(\Psi),h',h) - \Loss(\M(\Hu),h',h).
\end{multline*}
%
%
Then, we can rewrite the minimization problem in Eq.~\eqref{eq:optproblem} as:
%
\begin{equation}
	\begin{split}
		\underset{\Psi}{\text{minimize}} & \quad \sum_{\psi \in \Psi} \quad \sum_{h, h' \in \psi} w(h,h')
		\\
		\text{subject to} &\quad \Psi \text{ is a clique cover for } \Gcal,
		\label{eq:optproblem_graph}
	\end{split}
\end{equation}
%
where we only need to consider pairs of experts $h, h' \in \psi$ because, otherwise, $w(h, h') = 0$ since the corresponding
counterfactual distributions entailed by $\Mcal(\Psi)$ and $\Mcal(\Hcal)$ coincide.
%
The minimization problem given by Eq.~\eqref{eq:optproblem_graph} is a known clique partitioning problem (CPP),%
\footnote{In most of the literature,
the problem is defined for complete graphs. However, for arbitrary graphs, one can simply include the missing edges and assign positive infinite weights so that
they are not included in a solution~\citep{brimberg2017solving}.}
for which the decision problem of CPP for arbitrary weights is NP-Hard~\citep{grotschel1989cutting,grotschel1990facets}.
However, we found that a simple
randomized greedy algorithm works well in our setting, as shown in Figure~\ref{fig:results_synthetic} in Appendix~\ref{sec:synthetic}. Refer
to Appendix~\ref{sec:algorithm} for more details about the algorithm.
%################################
%
%################################
\section{Experiments on Real Data}
\label{sec:real}
In this section, we compare the performance of the proposed Gumbel-Max SI-SCM at inferring second opinions against several competitive baselines using a
dataset with real expert predictions over natural images. Appendix~\ref{sec:synthetic} contains addi\-tio\-nal experiments where we assess the
performance of Algorithm~\ref{alg:greedyalg} at recovering groups of mutually similar experts on synthetic data.\footnote{To facilitate research in this area, we release an open-source implementation of our code at \href{https://github.com/Networks-Learning/cfact-inference-second-opinions}{https://github.com/Networks-Learning/cfact-inference-second-opinions}.}
%

\xhdr{Data description and experimental setup}
%
We experiment with the dataset CIFAR-10H~\citep{peterson2019human}, which contains $10{,}000$ images taken from the test set of the standard dataset CIFAR-10~\citep{krizhevsky2009learning}.
%
Each of these images belongs to one of $n = 10$ classes and contains label predictions from approxi\-ma\-te\-ly $50$ human annotators. In total, the images are annotated
by $2{,}571$ different human annotators (from now on, experts).\footnote{The dataset CIFAR-10H
is one of the few large public datasets containing multiple label predictions by different experts per sample, necessary
to train the proposed Gumbel-Max SI-SCM.
%
However, since our methodology and theoretical results are rather general, our model may also be useful in other applications.}
%
%
%
Since the classification task is relatively easy for humans, there are many images ($\sim$$35$\%) in which there is full agreement between experts---all experts make
the same label prediction.
%
Here, motivated by the empirical observation that, in medical diagnosis, there is typically a $20\%$ per-instance disagreement among experts~\citep{van2017extent, elmore2015diagnostic}, we filter out the
above mentioned images in which there is full agreement.
%
%
Moreover, we split the remaining images into two disjoint sets at random---a training set and a test set---and filter out data from any expert who made fewer than $130$ and $20$ predictions in
the training and test set, respectively, or whose predicted labels in the training data do not cover all class labels.
%
After these preprocessing steps, the resulting training and test sets contain $1{,}257$ and $303$ images, respectively, annotated by $|\Hcal| = 114$ experts, where each image in the training and test set is annotated by at least two experts.
%
%
%
\begin{table}
    \centering
    \small
    \setlength{\tabcolsep}{4pt}
    \caption{Overall test accuracy}\label{tab:acc_table}
    \begin{tabular}{rccc}
      \toprule % from booktabs package
	    \bfseries Model & \bfseries $h, h' \in \Hcal$ & \bfseries $h, h' \in \psi$ & \bfseries $h \in \psi$, $h' \in \psi'$ \\
      \midrule % from booktabs package
	    Gumbel-Max SI-SCM &  66.8\% & 79.9\% & 45.1\%\\
	    % Untrained Si-SCM
	    GNB & 48.9\% & 51.3\% & 45.1\%\\
	    GNB + CNB & 62.0\% & 66.0\% &55.2\%\\
%	    Gaussian NB & 56.5\% & & \\
      \bottomrule % from booktabs package
    \end{tabular}
    %\vspace{-3mm}
\end{table}
%
\begin{figure*}[t]
        \centering
        \subfloat[Gumbel-Max SI-SCM vs. GNB]{
        \stackunder{\includegraphics[width=0.233\textwidth]{compare_untrained_sc1.pdf}}{\scriptsize $h, h' \in \Hcal$}
        \stackunder{\includegraphics[width=0.233\textwidth]{compare_untrained_sc2.pdf}}{\scriptsize $h, h' \in \psi$}
        }
        % \hspace{1mm}
        \subfloat[Gumbel-Max SI-SCM vs. GNB+CNB]{
        \stackunder{\includegraphics[width=0.233\textwidth]{compare_naivebayes_sc1.pdf}}{\scriptsize $h, h' \in \Hcal$}
        \stackunder{\includegraphics[width=0.283\textwidth]{compare_naivebayes_sc2.pdf}}{\scriptsize $h, h' \in \psi$}
        }
        %\vspace{-2mm}
        \caption{Per-expert test accuracy achieved by our model and both baselines on the CIFAR-10H dataset. In each panel, the $y$-axis measures the per-expert test accuracy achieved by
        our method and the $x$-axis the per-expert accuracy achieved by a baseline.
        %
        For each cell, the darkness is proportional to the number of experts with the corresponding test accuracies.}
        %\vspace{-2mm}
        %
\label{fig:results_real}
\end{figure*}

To find the groups of mutually similar experts underpinning our Gumbel-Max SI-SCM, we run Algorithm~\ref{alg:greedyalg} on the trai\-ning set.
%
Within the Gumbel-Max SI-SCM, we estimate the conditional
distribution $P^{\M \,;\, \text{do}[Z=\{h\}]}(Y_h \given X)$ for each expert $h$ using a Gaussian Naive Bayes model (GNB) that is also trained on the training set (one GNB per expert)\footnote{In
the CIFAR-10H dataset, experts are assigned to images (presumably) at random. Therefore, it holds that $P^{\M \,;\, \text{do}[Z=\{h\}]}(Y_h \given X) = P(Y_h \given X, Z=\{h\})$
and we can use observational data to estimate the interventional conditional distribution $P^{\M \,;\, \text{do}[Z=\{h\}]}(Y_h \given X)$.}.
%
%
Each GNB model uses $20$-dimensional feature vectors computed by running PCA on a $512$-dimensional normalized feature vector extracted using VGG19~\citep{simonyan2014very}.
%
Both during training and test, given an observed label prediction $Y_h$ by an expert $h$, we infer the prediction $Y_{h'}$ by another expert $h'$ using the most likely label under (an estimate of) the corresponding counterfactual distribution.
%
%
%
To estimate each counterfactual distribution, we use $T=1{,}000$ samples from the noise posterior distribution.
%

\xhdr{Baselines and evaluation metrics}
%
We compare the performance of our trained Gumbel-Max SI-SCM with two baselines (see also Figure~{\ref{fig:flowdiag_experiments}} in Appendix~{\ref{app:real}}): % for an illustration of the setup):

\noindent \hspace{0mm} --- The ``\emph{GNB}'' baseline uses only the same Gaussian Naive Bayes models (GNB), one per expert, used by our trained Gumbel-Max SI-SCM.
%
More specifically, given an observed label prediction $Y_h$ by an expert $h$, it infers the prediction $Y_{h'}$ by another expert $h'$ using the most likely label under the estimate of the
conditional distribution $P^{\M \,;\, \text{do}[Z=\{h'\}]}(Y_{h'} \given X)$ given by the corres\-pon\-ding~GNB. % , \ie, $\hat{Y}_{h'} = \argmax_{\vecY} \hat{P}^{\M \,;\, \text{do}[Z=\{h'\}]}(Y_{h'} \given X)$.

\noindent \hspace{0mm} --- The ``\emph{GNB + CNB}'' baseline uses the same Gaussian\- Naive Bayes models (GNB), one per expert, used by our trained Gumbel-Max SI-SCM and
a Categorical Naive Bayes (CNB) model, one per expert, that estimates $P^{\M\,;\,\text{do}[Z = \{h, h'\}]}(Y_{h'} \given Y_{h})$.\footnote{The CNB uses a ``one-hot'' encoding of the observed prediction
$Y_h$ as a single $|\Hcal|$-dimensional feature where, for each dimension, it uses an additional label value to denote that an expert'{}s label prediction has not been observed.} More specifically,
given an observed label prediction $Y_h$ by an expert $h$, it infers the prediction $Y_{h'}$ by another expert $h'$ using the most likely label under the product of distributions
$P^{\M \,;\, \text{do}[Z=\{h'\}]}(Y_{h'} \given X) \times P^{\M \,;\, \text{do}[Z = \{h, h'\}]}(Y_{h'} \given Y_{h})$, as estimated by the corres\-pon\-ding GNB (first term) and CNB (second term).
%

To compare the performance of our trained Gumbel-Max SI-SCM and both baselines, for each sample in the test set, we pick each of the corresponding expert label predictions $Y_h$ as the observed
prediction in turn and infer the value of the other predictions $Y_{h'}$.
%
Here, we compute the overall accuracy as well as the per-expert accuracy and distinguish among three scenarios: (i) $h, h' \in \Hcal$; (ii) $h, h' \in \psi$; and, (iii) $h \in \psi, h' \in \psi', \psi \neq \psi'$.
%

\xhdr{Results}
%
We start by reporting that, during the training of our Gumbel-Max SI-SCM, Algorithm~\ref{alg:greedyalg} found $352$ violations of the conditional stability condition between pairs of
experts and partitioned the experts into fifteen disjoint groups of mutually similar experts, where seven of these groups were singletons. Refer to Appendix~\ref{app:real} for more
details regarding the groups identified by Algorithm~\ref{alg:greedyalg}.

Next, we report the overall accuracy achieved by our model and the baselines in Table~\ref{tab:acc_table}.
%
We find that, in general ($h, h' \in \Hcal$), our model infers the expert predictions more accurately than both baselines
%
%
and this competitive advantage comes from instances in which the observed prediction is by an expert $h$ who belongs to the same group
of mutually similar experts as the expert $h'$ whose prediction we infer ($h, h' \in \psi$).
%
In fact, the GNB+CNB baseline is more accurate whenever both experts $h$ and $h'$ do not belong to the same group ($h \in \psi$, $h' \in \psi'$, $\psi \neq \psi'$).
%
Moreover, we also find that the GNB+CNB baseline infers the expert predictions more accurately whenever both experts belong to the same group of mutually
similar experts identified by Algorithm~\ref{alg:greedyalg} than whenever they belong to different groups.
%
In Appendix~\ref{app:real}, we report the confusion matrix of the above counterfactual predictions.

Finally, we report the per-expert $h'$ accuracy achieved by our model and both baselines in Figure~\ref{fig:results_real}.\footnote{Whenever $h, h' \in \psi$, we could not compute the per-expert accuracy
for $11$ experts---seven of these experts belong to singleton groups and the remaining four do not predict any of the same test samples predicted by other experts in their mutually similar groups.}
%
%
The results show that, in general ($h, h' \in \Hcal$), our model infers the expert predictions more accurately than the baselines for a majority
of the experts ($103$ and $89$, out of $114$, compared to GNB and GNB+CNB, respectively).
%
Moreover, if we restrict our attention to observed label predictions by experts $h$ belonging to the same group of mutually similar experts as the
expert $h'$ whose prediction we infer ($h, h' \in \psi$),
%
our model infers the expert prediction more accurately for almost all experts $h'$ ($100$ and $101$, out of $103$, compared to GNB and
GNB+CNB, respectively).
%
Additionally, Figure~\ref{fig:results_real_2} in Appendix~\ref{app:real} shows that, for most experts ($87$ out of $103$), the GNB+CNB baseline infers the expert predictions $Y_{h'}$ more
accurately if the observed prediction $Y_{h}$ is by an expert $h$ belonging to the same group of mutually similar experts as the expert $h'$ ($h, h' \in \psi$) than if it is by an expert $h$ belonging to a different group ($h \in \psi$, $h' \in \psi'$, $\psi \neq \psi'$).
%################################
%################################
\section{Conclusion}
\label{sec:conclusions}
In this work, we have addressed the problem of inferring second opinions by experts from the perspective of counterfactual
inference.
%
We have focused on a multiclass classification setting and showed that, if experts make predictions on their own, the underlying
causal mechanism generating their predictions needs to satisfy a desirable set invariance property.
%
Moreover, we have introduced the set invariant Gumbel-Max structural causal model, a new class of structural causal model
whose structure and counterfactual predictions about second opinions by experts can be validated using interventional data.
%
%

Our work opens up many interesting avenues for future work.
%
For example, we assume experts do not communicate before forming their opinion. %---they make predictions on their own.
%
Although this assumption may be satisfied in some real-world applications, it would be interesting to relax it.
%
Moreover, we have validated our model using a single real dataset.
%
It would be valuable to validate %the counterfactual predictions about second opinions provided by
our model using additional datasets
from other applications. % (\eg, medical diagnosis).
%
Finally, it would be important to carry out user studies in which the inferred second opinions provided by our model are shared
with domain experts (\eg, medical doctors). % in a real-world application.
%################################
\begin{acknowledgements}
        M.~Gomez-Rodriguez acknowledges support from the European Research Council (ERC) under the European Union'{}s Horizon 2020 research and innovation programme (grant agreement No. 945719).
\end{acknowledgements}

\bibliography{corvelo-benz_514}

\end{document}
