% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{caption}
\usepackage{subcaption}

% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{bbm}

\usepackage{algorithm}
\usepackage{algorithmic}

%for cross referencing
%\usepackage{xr}
%\externaldocument{corvelo-benz_514}
\usepackage{nameref,zref-xr}
\zxrsetup{toltxlabel}
\zexternaldocument*{corvelo-benz_514}

\usepackage{stackengine}
%\usepackage{Definitions}

%macros for notations
\newcommand{\given}{\,\mid\,}
\newcommand{\X}{\mathcal{X}}
\newcommand{\Y}{\mathcal{Y}}
\newcommand{\U}{\mathcal{U}}
\newcommand{\D}{\mathcal{D}}
\newcommand{\M}{\mathcal{M}}
\newcommand{\Hu}{\mathcal{H}}
%\newcommand{\L}{\mathcal{L}}
\newcommand{\vecX}{\mathbf{X}}
\newcommand{\vecU}{\mathbf{U}}
\newcommand{\vecF}{\mathbf{F}}
\newcommand{\vecY}{\mathbf{Y}}
\newcommand{\vecy}{\mathbf{y}}
\newcommand{\Loss}{\mathcal{L}}

\newcommand{\wb}{\mathbf{w}}
\newcommand{\xb}{\mathbf{x}}
% \newcommand{\yb}{\mathbf{y}}
\newcommand{\zb}{\mathbf{z}}

\usepackage{latexsym} 
\usepackage{bm} 
\usepackage{mathrsfs}
\usepackage{cancel}
\newcommand{\Ab}{\bm{A}}
\newcommand{\Bb}{\bm{B}}
\newcommand{\Cb}{\bm{C}}
\newcommand{\Db}{\bm{D}}
\newcommand{\Eb}{\bm{E}}
\newcommand{\Fb}{\bm{F}}
\newcommand{\Gb}{\bm{G}}
\newcommand{\Hb}{\bm{H}}
\newcommand{\Ib}{\bm{I}}
\newcommand{\Jb}{\bm{J}}
\newcommand{\Kb}{\bm{K}}
\newcommand{\Lb}{\bm{L}}
\newcommand{\Mb}{\bm{M}}
\newcommand{\Nb}{\bm{N}}
\newcommand{\Ob}{\bm{O}}
\newcommand{\Pb}{\bm{P}}
\newcommand{\Qb}{\bm{Q}}
\newcommand{\Rb}{\bm{R}}
\newcommand{\Sbb}{\bm{S}}
\newcommand{\Tb}{\bm{T}}
\newcommand{\Ub}{\bm{U}}
\newcommand{\Vb}{\bm{V}}
\newcommand{\Wb}{\bm{W}}
\newcommand{\Xb}{\bm{X}}
\newcommand{\Yb}{\bm{Y}}
\newcommand{\Zb}{\bm{Z}}
\newcommand{\nn}{\nonumber}
\newcommand{\Ibb}{\mathbb{I}}
\newcommand{\Acal}{\mathcal{A}}
\newcommand{\Bcal}{\mathcal{B}}
\newcommand{\Ccal}{\mathcal{C}}
\newcommand{\Dcal}{\mathcal{D}}
\newcommand{\Ecal}{\mathcal{E}}
\newcommand{\Fcal}{\mathcal{F}}
\newcommand{\Gcal}{\mathcal{G}}
\newcommand{\Hcal}{\mathcal{H}}
\newcommand{\Ical}{\mathcal{I}}
\newcommand{\Jcal}{\mathcal{J}}
\newcommand{\Kcal}{\mathcal{K}}
\newcommand{\Lcal}{\mathcal{L}}
\newcommand{\Mcal}{\mathcal{M}}
\newcommand{\Ncal}{\mathcal{N}}
\newcommand{\Ocal}{\mathcal{O}}
\newcommand{\Pcal}{\mathcal{P}}
\newcommand{\Qcal}{\mathcal{Q}}
\newcommand{\Rcal}{\mathcal{R}}
\newcommand{\Scal}{{\mathcal{S}}}
\newcommand{\Tcal}{{\mathcal{T}}}
\newcommand{\Ucal}{\mathcal{U}}
\newcommand{\Vcal}{\mathcal{V}}
\newcommand{\Wcal}{\mathcal{W}}
\newcommand{\Xcal}{\mathcal{X}}
\newcommand{\Ycal}{\mathcal{Y}}
\newcommand{\Zcal}{\mathcal{Z}}
\newcommand*{\argmin}{\mathop{\mathrm{argmin}}}
\newcommand*{\argmax}{\mathop{\mathrm{argmax}}}
\newcommand{\ea}{\emph{et al.}}
\newcommand{\eg}{\emph{e.g.}}
\newcommand{\ie}{\emph{i.e.}}
\newcommand{\iid}{\emph{i.i.d.}}
\newcommand{\etc}{\emph{etc.}}

\ifx\QED\undefined
\def\QED{~\rule[-1pt]{5pt}{5pt}\par\medskip}
\fi

\ifx\proof\undefined
\newenvironment{proof}{\par\noindent{\bf Proof\ }}{\hfill\BlackBox\\[2mm]}
%\newenvironment{proof}{\emph{Proof. }}{ \hfill \QED}
\fi

\ifx\theorem\undefined
\newtheorem{theorem}{Theorem}
\newtheorem{example}{Example}
\newtheorem{property}{Property}
% \newtheorem{lemma}[theorem]{Lemma}
\newtheorem{lemma}{Lemma}  % separate counters for lemmas
%\newtheorem{proposition}[theorem]{Proposition} %old
\newtheorem{proposition}{Proposition} % separate counters for propositions
% \newtheorem{claim}[theorem]{Claim}
\newtheorem{claim}{Claim} % separate counters for claims
\newtheorem{corollary}{Corollary}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{conjecture}[theorem]{Conjecture}
\newtheorem{assumption}[theorem]{Assumption}
\fi


\newcommand{\xhdr}[1]{\noindent {\bf #1.}}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

\title{Counterfactual Inference of Second Opinions (Supplementary material)}
% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{\href{mailto:<ninacobe@mpi-sws.org>?Subject=Counterfactual Inference of Second Opinions}{Nina~L.~Corvelo~Benz}{}}
\author[1]{Manuel~Gomez~Rodriguez}
% Add affiliations after the authors
\affil[1]{%
        Max Planck Institute for Software Systems\\
        Kaiserslautern, Germany
}
\affil[2]{%
        Department of Biosystems Science and Engineering\\
        ETH Zurich\\
        Zurich, Switzerland
}

\begin{document}
\onecolumn
\maketitle
\section{Proofs}
\label{app:awesomeproofs}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\xhdr{Proof of Theorem~\ref{thm:SCMsetinvariant}}

Let $\zeta \subseteq \zeta' \subseteq \Hu$ and both non-empty. Then, for any $x \in \X, u \in \U$, we have that
	\begin{equation*}
	(f(x,\zeta',u))_\zeta = ((f_h(x,u))_{h\in \zeta'})_{\zeta} = \ (f_h(x,u))_{h\in \zeta} = f(x,\zeta,u).
	\end{equation*}

\xhdr{Proof of Theorem~\ref{thm:SCMequivalence}} % for Decoupling the Causal Mechanism}
%
	Let $\M'$ be constructed from $\M$ by changing causal mechanism $f$ with $f'$.
	To prove equivalence of $\M$ and $\M'$, we only need to show that, for any $u\in \U, x \in \X, \zeta \in \mathcal{P}(\Hu)\setminus \{\emptyset\}$, it holds that
	\begin{equation}
		\vecy = f(x,\zeta,u) \Longleftrightarrow \vecy = f'(x,\zeta,u),
	\end{equation}
as only the causal mechanism was altered in the construction of $\M'$.
%
Let $h \in \zeta$ be an arbitrary expert, then
	\begin{equation*} 
		(f'(x,\zeta,u))_h \overset{def.}{=} f(x,\{h\},u) = (f(x,\zeta,u))_h,
	\end{equation*}
where the last equality holds because $f$ is set invariant. 
Thus, $f(x,\zeta,u)=f'(x,\zeta,u)$ for all $x \in \X, \zeta \in \mathcal{P}(\Hu)\setminus \{\emptyset\}, u \in \U$.
% \end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\xhdr{Proof of Theorem~\ref{thm:equalprop}}
%
%
For clarity, we explicitly write $\vecY_{Z=\{h'\}}$ and $\vecY_{Z=\zeta}$ to better distinguish the two interventional outcomes. 
	For a discrete probability distribution $P(U)$, the right-hand probability is given by
	\begin{align*}
		& P^{M;do[Z:=\zeta']}(\vecY_\zeta=\vecy \mid X=x) 
		= \sum_{u \in \U} P(U=u) \cdot \mathbbm{1}{[(f(x,\zeta',u))_\zeta=\vecy]},
	\end{align*}
	whereas the left is given by
	\begin{align*}
		 P^{M;do[Z:=\zeta]}(\vecY=\vecy \mid X=x) 
		= \sum_{u \in \U} P(U=u) \cdot \mathbbm{1}{[f(x,\zeta,u)=\vecy]}.
	\end{align*}
	Because $f$ is a set invariant mechanism over $Z$, $(f(x,\zeta',u))_\zeta=f(x,\zeta,u)$, thus,
	\begin{align*}
		\sum_{u \in \U} P(U=u) \cdot \mathbbm{1}{[f(x,\zeta,u)=\vecy]}
		= & \sum_{u \in \U} P(U=u) \cdot \mathbbm{1}{[(f(x,\zeta',u))_\zeta=\vecy]}.
	\end{align*}
	The proof is analogous for continuous probability distributions $P(U)$.

\xhdr{Proof of Corollary~\ref{cor:equalprop_h}}
	Choose $\zeta=\{h\}$ in Theorem~\ref{thm:equalprop} and note that, abusing notation, $\vecY_{Z=\zeta}$ is in this case equivalent to $Y_h$.

\xhdr{Proof of Corollary~\ref{cor:equalcounterfactuals}}
%
	Using the definition of counterfactual distributions, the proof is analogous to the proof of Theorem~\ref{thm:equalprop} but using the posterior distribution $P(U\mid X=x, Z=\{h\},Y_h=c)$. 
	Let $\zeta \subseteq \Hu$ be such that $h, h' \in \zeta$.
For all $c \in \Y$, by definition, we have that
        \begin{align}
		P^{M\mid X=x, Z=\{h\},\vecY=c ;do[Z:=\{h'\}]}(\vecY=c')
		 = 
		  \sum_{u \in \U} P(U =u \mid X, (\vecY_{Z=\{h\}})_{h}=c) \cdot \mathbbm{1}{[f(x,\{h'\},u)=c']}. \label{eq:cprop}
        \end{align}
	Using that $\M$ is set invariant we get that
        \begin{align*}
		(\vecY_{Z=\{h\}})_{h}=f(x,\{h\} ,U) = (f(x,\zeta,U))_{h} = (\vecY_{Z=\zeta})_{h}
		\quad \text{ and } \quad
		\mathbbm{1}{[f(x,\{h'\},u)=c']} = \mathbbm{1}{[(f(x,\zeta,u))_{h'}=c']}.
        \end{align*}
	Thus, Eq.~\eqref{eq:cprop} is equal to
        \begin{align*}
		\sum_{u \in \U} P(U =u \mid X, (\vecY_{Z=\zeta})_{h}=c) \cdot \mathbbm{1}{[(f(x,\zeta,u))_{h'}=c']}
		\overset{def.}{=} P^{M;do[Z:=\zeta]}(Y_{h'}=c'\mid X=x, Y_{h}=c).
        \end{align*}
% \end{proof}
%
%
\iffalse 
(Remark: Following also holds 
        \[ P^{M\mid X=x, Z=\zeta\backslash\{h\},\vecY=\vecy ;do[Z:=\{h\}]}(Y_h) = P^{M\mid X=x, Z=\zeta\backslash\{h\},\vecY=\vecy;do[Z:=\zeta]}(Y_h).\] 
        However here the right distribution is also counterfactual, the notation implies we have observed the set $\zeta\backslash \{h\}$ and the counterfactual is set $\zeta$. 
)\fi

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\xhdr{Proof of Theorem~\ref{thm:pcs_equivalence}}
%
        Note that, for all $\zeta_1, \zeta_2,\zeta_3 \subseteq \Hu$ such that $h \in \zeta_1$, $h' \in \zeta_2$ and $h,h'\in \zeta_3$, it holds that
        \begin{align}
                P^{\M\mid X=x, Z=\zeta_1, Y_{h}=c;do[Z=\zeta_2]}(Y_{h'}=c')=0 &\iff P^{\M\mid X=x, Z=\{h\}, Y_{h}=c;do[Z=\{h'\}]}(Y_{h'}=c')=0,
                \label{eq:equivalence1}
                \\ 
                P^{\M\mid X=x, Z=\{h\}, Y_{h}=c;do[Z=\{h'\}]}(Y_{h'}=c')=0 &\iff P^{\M;do[Z=\zeta_3]}(Y_{h'}=c'\mid X =x, Z=\zeta_3, Y_{h}=c)=0,
                \label{eq:equivalence2}
        \end{align}
        %
        where Eq.~\eqref{eq:equivalence1} follows from the definition of set invariance and Eq.~\eqref{eq:equivalence2} follows from Corollary~\ref{cor:equalcounterfactuals}.
        %
        Recall that $p_\zeta(h,c) := P^{\M;do[Z=\zeta]}(Y_{h}=c\mid X)$ and $p_h(c) := P^{\M;do[Z=\{h\}]}(Y_{h}=c\mid X)$.

	It follows from Corollary~\ref{cor:equalprop_h} that, for all $h \in \Hu$, $c \in \Y$ and $\zeta,\zeta'$ such that $h \in \zeta$ and $h \in \zeta'$, we have that
        \begin{align*}
		p_{\zeta}(h,c) = p_h(c)= p_{\zeta'}(h,c).
        \end{align*}
	Thus, the following equivalences hold:
        \begin{align*}
		 \frac{p_{\zeta_2}(h',c)}{p_{\zeta_1}(h,c)} \geq \frac{p_{\zeta_2}(h', c')}{p_{\zeta_1}(h,c')}
		  \iff   
		 \frac{p_{h'}(c)}{p_{h}(c)} \geq \frac{p_{h'}(c')}{p_{h}(c')}
		 \iff  
		 \frac{p_{\zeta_3}(h',c)}{p_{\zeta_3}(h,c)} \geq \frac{p_{\zeta_3}(h', c')}{p_{\zeta_3}(h,c')}.
        \end{align*}
        With this set of equivalences, it is straightforward to derive one statement from the other.

\xhdr{Proof of Theorem~\ref{thm:gumbel}}
%
Let $\psi$ be a subgroup in $\Psi$ so that $|\psi|\geq 2$. Let $h$ and $h'$ denote two arbitrary experts in subgroup $\psi$.
        As the Gumbel-Max SI-SCM $\M(\Psi)$ is set invariant, it is enough to show that the pairwise conditional stability condition is satisfied for the pair $h$ and $h'$.
        Analogously to \citet{oberst2019}, we proceed by proving the contrapositive, that for all sets $\zeta$, so that $h,h' \in \zeta$, and $c\neq c'$
        \begin{align*}
                 P^{\M(\Psi);do[Z=\zeta]}(Y_{h'}=c' \mid X, Y_h=c) \neq 0
                 \implies \frac{p_\zeta(h',c)}{p_\zeta(h,c)}< \frac{p_\zeta(h',c')}{p_\zeta(h,c')}.
        \end{align*}
        %
        If the conditional probability is positive, almost surely there must exist Gumbel noise variables $g_{\psi,c}$ and $g_{\psi,c'}$ such that
	%
        \begin{align*}
                \log P(Y_h=c \mid X) + g_{\psi,c} &> \log P(Y_h=c' \mid X) + g_{\psi,c'} \\ 
                \log P(Y_{h'}=c \mid X) + g_{\psi,c} &< \log P(Y_{h'}=c' \mid X) + g_{\psi,c'},
        \end{align*}
        %
        as the sub-mechanisms $f_h$ and $f_{h'}$ of each expert share the noise vector of the subgroup $\psi$.

        Recall that, by set invariance, $p_\zeta(h,c)=P(Y_h=c \mid X)$ for all $\zeta,h,c$. Hence, we can substitute the probabilities in both inequalities.
        %
        Then, we further subtract the first inequality from the second which cancels out the Gumbel noises.
        %
        Finally, using the properties of the logarithm function, the inequality is rearranged deriving the implication.
        \begin{align*}
                \log p_\zeta(h',c) - \log p_\zeta(h,c) &< \log p_\zeta(h',c') -\log p_\zeta(h,c'), \\
                \frac{ p_\zeta(h',c)}{ p_\zeta(h,c)} &< \frac{p_\zeta(h',c')}{p_\zeta(h,c')}.
        \end{align*}
        %
        This proves that the Gumbel-Max SI-SCM $\M(\Psi)$ satisfies the pairwise conditional stability condition
        %
        \begin{equation*}
                \frac{p_\zeta(h',c)}{p_\zeta(h,c)}\geq \frac{p_\zeta(h',c')}{p_\zeta(h,c')}
                \implies P^{\M(\Psi);do[Z=\zeta]}(Y_{h'}=c' \mid X, Y_h=c) = 0,
        \end{equation*}
        %
        for any two experts in the same subgroup in $\Psi$.
% \end{proof}

\section{Randomized Greedy Algorithm for the Clique Partitioning Problem}
\label{sec:algorithm}
The idea behind the simple greedy randomized Algorithm~\ref{alg:greedyalg} is to sequentially grow a clique starting from a random vertex in $\Gcal$ until no vertices can be added, remove this clique from the graph and repeat this process on the remaining graph until no vertices are left.
For the current clique $\psi$, the set of vertices that can be added, called the candidate set, consists of vertices that
have edges to all the vertices in $\psi$ so that the sum of these edge weights is non-positive.
The expert with minimum sum is added next to the clique, and the candidate set is updated.

The updated candidate set is a subset of the previous set, thus, the sum of edge weights of a vertex connected to the updated clique can be computed in constant time by considering the previous value and the weight of the edge to the newly added vertex. 
If no edge exists, the vertex can be removed from the candidate set.
Algorithm~\ref{alg:greedyalg} can thus be implemented in $O(|\mathcal{E}|)$ time.
%
\begin{algorithm}[ht]
    \caption{Greedy Algorithm for the Clique Partitioning Problem,
	 $N(\psi)$ denotes the set of vertices not in $\psi$ with edges to all vertices in $\psi$ }
	\label{alg:greedyalg}
    \begin{algorithmic}
	    \STATE \textbf{Input:} weighted, undirected graph $\Gcal=(\Hcal,\Ecal,w)$
	    \STATE $\Psi \gets \emptyset$
	\WHILE{$\Gcal$ not empty}
	   \STATE pick random vertex $h$ from $\Gcal$
	   \STATE $\psi \gets \{h\}$
	   \WHILE{$N(\psi)$ not empty}
		\STATE $h^* \gets \arg \min_{h' \in N(\psi)} \sum_{h\in \psi} w(\{h',h\})$
		\IF{ $\sum_{h\in \psi} w(\{h^*,h\})\leq0$}
			\STATE $\psi \gets \psi \cup \{h^*\}$
		\ELSE	\STATE break
		\ENDIF
	    \ENDWHILE
	    \STATE $\Psi \gets \Psi \cup \{\psi\}$
	    \STATE delete $\psi$ from $\Gcal$
	    \STATE $\psi = \emptyset$
	\ENDWHILE
	\STATE \textbf{return} $\Psi$
	\end{algorithmic}
\end{algorithm}
%%%%%%%%%%%%%%%%%%%%%%%%%

We note that, since the algorithm minimizes the sum of weights for each clique sequentially, its performance in recovering a partition minimizing the overall sum depends on the sequence of sampled vertices which we start each clique from. To stabilize the algorithm's performance, one can rerun the algorithm a few times and choose among the returned partitions the one minimizing the overall sum of edge weights between vertices in the same set (see objective function in optimization problem~\ref{eq:optproblem_graph}). In the experiments on real (synthetic) data, we run Algorithm~\ref{alg:greedyalg} $10$ ($5$) times.

\section{Experiments on Synthetic Data}
\label{sec:synthetic}
In this section, we assess the performance of Algorithm~\ref{alg:greedyalg} at recovering the groups of mutually similar experts underpinning our Gumbel-Max SI-SCM
using synthetic data. 

\xhdr{Experimental setup}
%
We consider a synthetic prediction task with $k = 5$ labels and $20$ features per sample, whose values we sample uniformly 
at random from the interval $[0, 1]$, and a set $\Hcal$ of $48$ synthetic experts.
%
%
These synthetic experts make label predictions according to a Gumbel-Max SI-SCM with five disjoint groups of mutually
similar experts $\Psi$, \ie, each expert within a group $\psi \in \Psi$ uses the same Gumbel noise within the model\footnote{The 
groups in the partition $\Psi$ contain $6$, $7$, $11$, $11$ and $13$ experts.}.
%
Moreover, for each expert, the probability $P(Y_h=c \mid X=x)$ is given by a multinomial logit model with random weight coefficients 
$w=(w_1, \dots, w_5)$, which we also sample uniformly at random from the interval $[0, 1]$ independently for each expert, \ie,
%
\begin{equation} \label{eq:multdistr}
	P(Y_h=c \mid X=x) = \frac{\exp(w_c\cdot x)}{\sum_{j \in \Y} \exp(w_j\cdot x)}. 
\end{equation}
%

We measure the performance of Algorithm~\ref{alg:greedyalg} at recovering the partition $\Psi$ given the true probabilities $P(Y_h=c \mid X=x)$ under different amounts 
of training data $m$ and sparsity levels $s \in (0,1)$.
%
The sparsity level $s$ controls the average number of observed expert predictions per sample, \ie, for each sample, all experts make a prediction but we only
observe $\max\{2, (1-s) |\Hcal|\}$ of these predictions, picked at random.
%
Here, note that, as the sparsity level $s$ decreases (increases) and the amount of training data increases (decreases), it is easier (harder) to recover the partition $\Psi$.
%

As a measure of the difficulty of each inference problem, we will use the edge ratio $r$, defined as the fraction of pairs of experts who belong to the same group $\psi \in \Psi$, among all 
pairs whose predictions did not violate conditional stability and were at least once observed for the same sample.
%
As performance metrics, we will use:
%
\begin{itemize}
	\item The adjusted Rand index (ARI), which measures similarity between the partition $\hat{\Psi}$ returned by Algorithm~\ref{alg:greedyalg} and the true partition $\Psi$. 
	Its value lies in the interval $[0,1]$ where $1.0$ means full recovery and $0.0$ means a completely random partition was recovered with no similarity to the true one.

	\item The average $0/1$-loss on a held-out set (with $1000$ samples) of a predictor that, given an observed label $Y_h$, returns the most likely label $Y_{h'}$ under the inferred counterfactual distribution 
	$P^{\M(\hat{\Psi}) \given X=x, Z=\{h\}, \vecY=y_{h}; \text{do}(Z=\{h'\})}(\vecY)$. 
	%
	Here, to estimate the inferred counterfactual distributions, we use $500$ samples.
\end{itemize}
%
As a point of comparison, for the second performance metric, we also compute the average $0/1$-loss over the same held-out set of two other predictors that, given an observed label $Y_h$, return 
the most likely label $Y_{h'}$ under the true counterfactual distribution $P^{\M(\Psi) \given X=x, Z=\{h\}, \vecY=y_{h}; \text{do}(Z=\{h'\})}(\vecY)$ and the counterfactual distribution $P^{\M(\Hcal) 
\given X=x, Z=\{h\}, \vecY=y_{h}; \text{do}(Z=\{h'\})}(\vecY)$, respectively.
%
\begin{figure*}[t]
        \centering
        	\subfloat[Adjusted Rand index (ARI)]{\includegraphics[width=0.32\textwidth]{ARI_plot.pdf}}
	\hspace{1mm}
        \subfloat[Average 0/1-loss on a held-out set]{\includegraphics[width=0.32\textwidth]{comparison_2.pdf}}
        \hspace{1mm}
	\subfloat[Edge ratio]{\includegraphics[width=0.32\textwidth]{Inedge_plot.pdf}}
        \caption{Performance of Algorithm~\ref{alg:greedyalg} at recovering the partition $\Psi$ given the true probabilities $P(Y_h=c \mid X=x)$ under different amounts 
of data $m$ and sparsity levels $s \in (0,1)$. 
	%
	To compute the mean and the standard deviation in panels (a), (b) and (c), we run each experiment five times.
	%
        }
        %
\label{fig:results_synthetic}
\end{figure*}

\xhdr{Results}
%
%
Figure~\ref{fig:results_synthetic} summarizes the results, which show that, as long as the edge ratio $r > 0.3$, the inferred partition $\hat{\Psi}$ is 
very similar to the true partition $\Psi$ (\ie, the value of ARI is very close to $1$) and the $0/1$-losses of the predictors that use $\M(\hat{\Psi})$ and $\M(\Psi)$ respectively are very similar.
%
%
Here, note, however, that even the predictor that uses the true model $\M(\Psi)$ has a nonzero $0/1$-loss because, given an observed expert prediction $Y_h$ and feature vector $x$, 
the expert prediction $Y_{h'}$ is not deterministic.

\section{Additional Figures for Experiments on Real Data}
\label{app:real}
%
\begin{figure*}[!t]
        \centering
        \includegraphics[width=1.0\textwidth]{flowdiag_experiments.pdf}
        %\vspace{-1mm} 
        \caption{Different models used in our experiments on real data.}
        %
\label{fig:flowdiag_experiments}
\end{figure*}
%
\begin{figure*}[!t]
        \centering
        \includegraphics[width=0.7\textwidth]{groups_hist.pdf}
        %\vspace{-1mm}
	\caption{Size of the mutually similar expert groups returned by Algorithm~\ref{alg:greedyalg} for the preprocessed CIFAR-10H dataset.}
        %
\label{fig:results_groups}
\end{figure*}
%
\begin{figure*}[!t]
        \centering
                \subfloat[\hspace{10mm} Gumbel-Max SI-SCM]{\includegraphics[width=0.5\textwidth]{CM_Gumbel-Max-SI-SCM.pdf}}
        %\hspace{1mm}
        \subfloat[\hspace{10mm}GNB]{\includegraphics[width=0.5\textwidth]{CM_GNB.pdf}}
	\\
        %\hspace{1mm}
        \subfloat[\hspace{10mm} GNB+CNB]{\includegraphics[width=0.5\textwidth]{CM_GNB+CNB.pdf}}
        \caption{
		Confusion matrices of the counterfactual predictions of our model and the predictions of the two baselines for experts' labels on the test dataset.
        %
        }
        %
\label{fig:confusion_matrices}
\end{figure*}
%
\clearpage
\begin{figure*}[!t]
        \centering
        \includegraphics[width=0.5\textwidth]{nb_baseline.pdf}
        %\vspace{-1mm}
        \caption{Per-expert test accuracy achieved by the baseline GNB+CNB on the preprocessed CIFAR-10H dataset. For each expert $h'$, the $y$-axis measures 
        the test accuracy whenever the observed expert $h$ belongs to the same group of mutually similar experts as $h'$ and the $x$-axis measures 
        the test accuracy whenever $h$ does not belong to the same group.
        %
        For each cell, the darkness is proportional to the number of experts with the corresponding test accuracies.}
        %\vspace{-2mm}
        %
\label{fig:results_real_2}
\end{figure*}
\bibliography{corvelo-benz_514}

\end{document}
