\documentclass[accepted]{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
\usepackage[table,xcdraw]{xcolor}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{unsrt}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usetikzlibrary{positioning,shapes,arrows,calc}
\usepackage[normalem]{ulem}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros


% ============================================================================
\usepackage{researchpack}
\usepackage[capitalize, nameinlink]{cleveref}
\usepackage{enumitem}
\usepackage{xspace}
\usepackage{multirow}

 
\hypersetup{
    colorlinks=true,
    citecolor=blue,
    linkcolor=blue,
    filecolor=blue,
    urlcolor=blue,
}

\usepackage{pifont}
\newcommand{\xmark}{\ding{55}}%

\usepackage{thmtools}
\declaretheorem[name=Theorem]{theorem}
\declaretheorem[name=Lemma, numberlike=theorem]{lemma}
\declaretheorem[name=Proposition, numberlike=theorem]{proposition}
\declaretheorem[name=Corollary, numberlike=theorem]{corollary}
\declaretheorem[name=Remark, numberlike=theorem]{remark}
\declaretheorem[name=Definition]{definition}
\declaretheorem[name=Example]{example}
\declaretheorem[name=Lemma, numbered=no]{lemma*}
\declaretheorem[name=Proposition, numbered=no]{proposition*}

% \AV{what about DIERESIS = DIverse Ensembles for REasoning Shortcut Informed Systems?}
% \newcommand{\method}{\texttt{ke\$ha}\xspace}
% \newcommand{\acronym}{Knowledge $+$ Ensembles for SHortcut-Awareness\xspace}

% We can call it whatever you want, but you already know that deep in our hearts the method will always be biretta :). 
\newcommand{\method}{\texttt{bears}\xspace}
\newcommand{\acronym}{BE Aware of Reasoning Shortcuts\xspace}
% \newcommand{\acronym}{DIverse Ensembles for Shortcut-Informed Systems\xspace}
\newcommand{\DPL}{\texttt{DPL}\xspace}
\newcommand{\SL}{\texttt{SL}\xspace}
\newcommand{\LTN}{\texttt{LTN}\xspace}

\newcommand{\ST}[1]{\textcolor{magenta}{\textbf{[ST: #1]}}}
\newcommand{\EM}[1]{\textcolor{cyan}{\textbf{[EM: #1]}}}
\newcommand{\AP}[1]{\textcolor{green}{\textbf{[AP: #1]}}}
\newcommand{\AV}[1]{\textcolor{orange}{\textbf{[AV: #1]}}}
\newcommand{\SB}[1]{\textcolor{brown}{\textbf{[SB: #1]}}}
\newcommand{\EK}[1]{\textcolor{pink}{\textbf{[EvK: #1]}}}

\newcommand{\changed}[1]{\textcolor{black}{{#1}}}


\newcommand{\indep}{\ensuremath{\mathrel{\perp\!\!\!\perp}}}
\newcommand{\task}{\ensuremath{\calT}\xspace}
\newcommand{\dataset}{\ensuremath{\calD}\xspace}
\newcommand{\BK}{\ensuremath{\mathsf{K}}\xspace}
\newcommand{\KL}{\ensuremath{\mathsf{KL}}\xspace}
\newcommand{\CE}{\ensuremath{\mathsf{CE}}\xspace}
\newcommand{\Ent}{\ensuremath{\mathsf{H}}\xspace}
\newcommand{\MI}{\ensuremath{\mathsf{I}}\xspace}
\newcommand{\de}{\ensuremath{\mathrm{d}}\xspace}
\newcommand{\Vol}{\ensuremath{\mathrm{Vol}}\xspace}
\newcommand{\Var}{\ensuremath{\mathrm{Var}}\xspace}
\newcommand{\Cov}{\ensuremath{\mathrm{Cov}}\xspace}

\newcommand{\MZero}{\includegraphics[width=1.85ex]{figures/mnist-0.png}\xspace}
\newcommand{\MOne}{\includegraphics[width=1.85ex]{figures/mnist-1.png}\xspace}
\newcommand{\MTwo}{\includegraphics[width=1.85ex]{figures/mnist-2.png}\xspace}
\newcommand{\MThree}{\includegraphics[width=1.85ex]{figures/mnist-3.png}\xspace}
\newcommand{\MFour}{\includegraphics[width=1.85ex]{figures/mnist-4.png}\xspace}
\newcommand{\MFive}{\includegraphics[width=1.85ex]{figures/mnist-5.png}\xspace}
\newcommand{\MSix}{\includegraphics[width=1.85ex]{figures/mnist-6.png}\xspace}
\newcommand{\MSeven}{\includegraphics[width=1.85ex]{figures/mnist-7.png}\xspace}
\newcommand{\MEight}{\includegraphics[width=1.85ex]{figures/mnist-8.png}\xspace}
\newcommand{\MNine}{\includegraphics[width=1.85ex]{figures/mnist-9.png}\xspace}

\newcommand{\redlight}{\includegraphics[width=1.85ex]{figures/red-circle.png}\xspace}
\newcommand{\pedestrian}{\includegraphics[width=1.85ex]{figures/man-walking.png}\xspace}
\newcommand{\emergency}{\includegraphics[width=1.85ex]{figures/ambulance.jpeg}\xspace}

\newcommand{\MNISTAdd}{{\tt MNIST-Addition}\xspace}
\newcommand{\MNISTShortcut}{{\tt MNIST-Even-Odd}\xspace}
\newcommand{\MNISTHalf}{{\tt MNIST-Half}\xspace}
\newcommand{\Kandinsky}{{\tt Kandinsky}\xspace}
\newcommand{\BOIA}{{\tt BDD-OIA}\xspace}

\newcommand{\MCDrop}{{\tt MCDO}\xspace}
\newcommand{\Laplace}{{\tt LA}\xspace}
\newcommand{\DeepEns}{{\tt DE}\xspace}
\newcommand{\ProbCBM}{{\tt PCBM}\xspace}
\newcommand{\LaplaceTorch}{{\tt laplace-torch}\xspace}

\newcommand{\YAcc}{\ensuremath{\mathrm{Acc}_Y}\xspace}
\newcommand{\CAcc}{\ensuremath{\mathrm{Acc}_C}\xspace}
\newcommand{\YECE}{\ensuremath{\mathrm{ECE}_Y}\xspace}
\newcommand{\CECE}{\ensuremath{\mathrm{ECE}_C}\xspace}
\newcommand{\YCONF}{\ensuremath{\mathrm{Conf}_Y}\xspace}
\newcommand{\CCONF}{\ensuremath{\mathrm{Conf}_C}\xspace}
\newcommand{\mFY}{\ensuremath{\mathrm{mF_1}(Y)}\xspace}
\newcommand{\mFC}{\ensuremath{\mathrm{mF_1}(C)}\xspace}
\newcommand{\mYECE}{\ensuremath{\mathrm{mECE}_Y}\xspace}
\newcommand{\mCECE}{\ensuremath{\mathrm{mECE}_C}\xspace}
\newcommand{\RECE}{\ensuremath{\mathrm{ECE}_{C}(R)}\xspace}
\newcommand{\FSECE}{\ensuremath{\mathrm{ECE}_{C}(F,S)}\xspace}
\newcommand{\LECE}{\ensuremath{\mathrm{ECE}_{C}(L)}\xspace}
\newcommand{\YAccOod}{\ensuremath{\mathrm{Acc}_{Yood}}\xspace}
\newcommand{\CAccOod}{\ensuremath{\mathrm{Acc}_{Cood}}\xspace}
\newcommand{\YECEOod}{\ensuremath{\mathrm{ECE}_{Yood}}\xspace}
\newcommand{\CECEOod}{\ensuremath{\mathrm{ECE}_{Cood}}\xspace}

\newcommand{\bear}{\includegraphics[width=1.75ex]{arxiv-v1/figures/bear.png}\xspace}


\definecolor{goldenrod}{rgb}{0.85, 0.65, 0.13}

\newcommand{\shortparagraph}[1]{\vspace{-7pt}\paragraph{#1}}
\newcommand{\shortsubsection}[1]{\vspace{-3pt}\subsection{#1}\vspace{-5pt}}
\newcommand{\shortsection}[1]{\vspace{-3pt}\section{#1}\vspace{-5pt}}
\newcommand{\shortshortsection}[1]{\vspace{-5pt}\section{#1}\vspace{-7pt}}
% ============================================================================

% ST: the issue with this is we are not exactly Bayesian, and also it leaves us open to complaints like ``we know that Bayes fixes overconfidence'' and ``Bayes + NeSy feels very slow''.
\title{
% \textsc{Bears} 
\method Make Neuro-Symbolic Models Aware of their Reasoning Shortcuts}
\author[1,2,$\star$]{Emanuele Marconato}
\author[1,$\star$]{Samuele Bortolotti}
\author[3,$\star$]{Emile van Krieken}
\author[3]{Antonio Vergari}
\author[1]{Andrea Passerini}
\author[1,4]{Stefano Teso}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Department of Information Engineering and Computer Science \\
    University of Trento \\
    Trento, Italy
}
\affil[2]{%
    Department of Computer Science\\
    University of Pisa\\
    Pisa, Italy
}
\affil[3]{%
    School of Informatics\\
    University of Edinburgh\\
    Edinburgh, United Kingdom
  }
\affil[4]{%
    CIMeC\\
    University of Trento\\
    Rovereto, Italy
  }
\affil[$\star$]{
    Equal contribution.
}


\begin{document}
\maketitle


% ST: @AV: alternative take: Start by explaining that concept quality is essential in NeSy (for reliability) and CBMs (for explainability too), and then mention that RSs lower concept quality.  Since existing approaches are not RS-aware, we propose a relatively model-agnostic technique for making sota concept extractors RS-aware. etc.

\begin{abstract}
    Neuro-Symbolic (NeSy) predictors that conform to symbolic knowledge -- encoding, \eg safety constraints -- can be affected by Reasoning Shortcuts (RSs): They learn concepts consistent with the symbolic knowledge by exploiting unintended semantics.
    % \EK{Neuro-Symbolic (NeSy) predictors that conform to symbolic knowledge -- encoding, \eg safety constraints -- can be affected by Reasoning Shortcuts (RSs), \ie they learn concepts consistent with the symbolic knowledge by exploiting unintended semantics.}
    %
    RSs compromise reliability and generalization and, as we show in this paper, they are linked to NeSy models being overconfident about the predicted concepts.
    %
    %\EK{'sure-proof' isn't really a word afaik}
    Unfortunately, the only trustworthy mitigation strategy requires collecting costly dense supervision over the concepts.
    %
    Rather than attempting to avoid RSs altogether, we propose to ensure NeSy models are \textit{aware of the semantic ambiguity of the concepts they learn}, thus enabling their users to identify and distrust low-quality concepts.
    %
    Starting from three simple desiderata, we derive \method (\acronym), an ensembling technique that calibrates the model's concept-level confidence without compromising prediction accuracy, thus encouraging NeSy architectures to be uncertain about concepts affected by RSs.
    %
    We show empirically that \method improves RS-awareness of several state-of-the-art NeSy models, and also facilitates acquiring informative dense annotations for mitigation purposes.
\end{abstract}


\section{Introduction}

\begin{figure*}[!t]
    \centering
    \includegraphics[height=11.5em]{figures/page-2}
    \caption{
        \textbf{\method lessens overconfidence due to reasoning shortcuts.}
        %
        \textbf{Left}: In the \BOIA autonomous driving task \citep{xu2020boia, sawada2022concept}, NeSy predictors can attain high accuracy \textit{and} comply with the knowledge even when confusing the concepts of pedestrian (${\tt ped}$) and red light (${\tt red}$) \citep{marconato2023not}.
        %
        \textbf{Middle}: State-of-the-art NeSy architectures predict concepts affected by RSs with high confidence, making it impossible to discriminate between reliable and unreliable concept predictions.
        %$
        \textbf{Right}: \method encourages them to allocate probability to \textit{conflicting} concept maps, substantially lessening overconfidence.
    }
    \label{fig:second-page}
\end{figure*}


% \AV{perhaps it is worth stressing this is the first Bayesian NeSy model? BaNeSy}


Research in Neuro-Symbolic (NeSy) AI \citep{garcez2019neural, de2021statistical, garcez2022neural} has recently yielded a wealth of architectures capable of integrating low-level perception and symbolic reasoning.
%
Crucially, these architectures encourage \citep{xu2018semantic} or guarantee \citep{manhaeve2018deepproblog, giunchiglia2020coherent, ahmed2022semantic, hoernle2022multiplexnet} that their predictions conform to given prior knowledge encoding,
%\EK{which encodes}
\eg structural or safety constraints, thus offering improved reliability compared to neural baselines \citep{di2020efficient, hoernle2022multiplexnet, giunchiglia2020coherent}.

It was recently shown that, however, NeSy architectures can achieve high prediction accuracy by learning concepts -- \textit{aka} neural predicates \citep{manhaeve2018deepproblog} -- with unintended semantics \citep{marconato2023not, li2023learning}.
%
E.g., consider an autonomous driving task like \BOIA \citep{xu2020boia} in which a model has to predict safe actions based on the contents of a dashcam image, under the constraint that whenever it detects pedestrians or red lights the vehicle must stop.  Then, the model can achieve perfect accuracy \textit{and} comply with the constraint even when confusing pedestrians for red lights, precisely because both entail the correct (stop) action \citep{marconato2023not}.  See \cref{fig:second-page} for an illustration.

These so-called \textit{reasoning shortcuts} (RSs) occur because the prior knowledge and data may be insufficient to pin down the intended semantics of all concepts, and cannot be avoided by maximizing prediction accuracy alone.
%
They compromise in-distribution \citep{wang2023learning} and out-of-distribution generalization \citep{marconato2023not, li2023learning}, continual learning \citep{marconato2023neuro}, reliability of neuro-symbolic verification tools \citep{xie2022neuro}, and concept-based interpretability \citep{koh2020concept, marconato2023interpretability} and debugging \citep{teso2023leveraging}.
%
Importantly, unsupervised mitigation strategies either offer no guarantees or work under restrictive assumptions, while supervised ones involve acquiring costly side information, \eg concept supervision \citep{marconato2023not}.

Rather than attempting to avoid RSs altogether, we suggest NeSy predictors should be \textit{aware of their reasoning shortcuts}, that is, they should assign lower confidence to concepts affected by RSs, thus enabling users to identify and avoid low-quality predictions, all while retaining high accuracy.
%
Unfortunately, as we show empirically, state-of-the-art NeSy architectures are \textit{not} RS-aware.
%
We address this issue by introducing \method (\acronym), a simple but effective method for making NeSy predictors RS-aware that does \textit{not} rely on costly dense supervision.
%
\method replaces the concept extraction module with a diversified \textit{ensemble} specifically trained to encourage the concepts' uncertainty to be proportional to how strongly these are impacted by RSs.
%
Our experiments show that \method successfully improves RS-awareness of three state-of-the-art NeSy architectures on four NeSy data sets, including a high-stakes autonomous driving task, and enables us to design a simple but effective active learning strategy for acquiring concept annotations for mitigation purposes.


\textbf{Contributions}.  Summarizing, we:
%
\begin{itemize}[leftmargin=1em]

    \item Shift focus from RS mitigation to RS awareness and show that state-of-the-art NeSy predictors are not RS-aware.

    \item Propose \method, which improves RS-awareness of NeSy predictors without relying on dense supervision.

    \item Demonstrate that it outperforms SotA uncertainty calibration methods on several tasks and architectures.

    \item Show that it enables intelligent acquisition of concept annotations, thus lowering the cost of supervised mitigation.
    
\end{itemize}




\section{Preliminaries}
\label{sec:preliminaries}

\paragraph{Notation.}  We denote scalar constants $x$ in lower-case, random variables $X$ in upper case, and ordered sets of constants $\vx$ and random variables $\vX$ in bold typeface.
%
Throughout, we use the shorthand $[n] := \{1, \ldots, n\}$.


\textbf{Neuro-Symbolic Predictors.}  RSs have been primarily studied in the context of \textit{NeSy predictors} \citep{dash2022review, giunchiglia2022deep}, which we briefly overview next.
%
Given an input $\vx \in \bbR^n$, these models infer a (multi-)label $\vy \in \{0, 1\}^m$ by leveraging \textit{prior knowledge} $\BK$ encoding, \eg known structural \citep{di2020efficient} or safety \citep{hoernle2022multiplexnet} constraints.
%
During inference, they first extract a set of \textit{concepts} $\vc \in \{0, 1\}^k$ using a (neural) concept extractor $p_\theta(\vC \mid \vx)$.
%
Then, they reason over these to obtain a predictive distribution $p_\theta(\vy \mid \vc; \BK)$ that associates lower \citep{xu2018semantic, ahmed2022neuro, van2022anesi} or provably zero \citep{manhaeve2018deepproblog, ahmed2022semantic} probability to outputs $\vy$ that violate the knowledge \BK.
%
Taken together, these two distributions define a NeSy predictor of the form $p_\theta(\vy \mid \vx; \BK)$.  The complete pipeline is visualized in \cref{fig:second-page} (left).


\begin{example}
    \label{ex:boia} 
    In our running example (\cref{fig:second-page}), given a dashcam image $\vx$, we wish to infer what action $y \in \{ {\tt stop}, {\tt go} \}$ a vehicle should perform.  This task can be modelled using three binary concepts $C_1, C_2, C_3$ encoding the presence of green lights (${\tt grn}$), red lights (${\tt red}$), and pedestrians (${\tt ped}$).  The knowledge specifies that if any of the latter two is detected, the vehicle must stop:  $\BK = ({\tt ped} \lor {\tt red} \Rightarrow {\tt stop})$.
    % \hspace{-0.9em}
    % \includegraphics[scale=0.0005]{BOIA}
\end{example}


Inference amounts to solving a \textsf{MAP} \citep{koller2009probabilistic} problem $\argmax_{\vy} \ p_\theta(\vy \mid \vx; \BK)$, while learning amounts to maximizing the log-likelihood on a training set $\calD = \{ (\vx_i, \vy_i) \}$.
%
Architectures chiefly differ in how they integrate the concept extractor and the reasoning layer, and in whether inference and learning are exact or approximate, see \cref{sec:related-work} for an overview.
%
Despite these differences, RSs are a general phenomenon that can affect all NeSy predictors \citep{marconato2023not}.


\paragraph{Reasoning Shortcuts.}  In NeSy, usually only the labels receive supervision, while the concepts are treated as \textit{latent variables}.
%
It was recently shown that, as a result, NeSy models can fall prey to \textit{reasoning shortcuts} (RSs), \ie they often achieve high label accuracy by learning concepts with unintended semantics \citep{marconato2023not, marconato2023neuro, wang2023learning, li2023learning}.

To properly understand RSs, we need to define how the data is generated, cf. \cref{fig:generative-process-assumptions}.
%
Following \citet{marconato2023not}, we assume there exist $k$ unobserved \textit{ground-truth concepts} $\vg \in \{0, 1\}^k$ drawn from a distribution $p^*(\vG)$, which generate both the observed inputs $\vx$ and the label $\vy$ according to unobserved distributions $p^*(\vX \mid \vG)$ and $p^*(\vY \mid \vG; \BK)$, respectively.
%
We also assume all observed labels satisfy the prior knowledge $\BK$ given $\vg$.

In essence, a NeSy predictor is affected by an RS whenever the label distribution $p_\theta(\vY \mid \vX; \BK)$ behaves well, but the concept distribution $p_\theta(\vC \mid \vX)$ does not, that is, given inputs $\vx$ it extracts concepts $\vc$ that yield the correct label $\vy$ but do not match the ground-truth ones $\vg$.
%
RSs impact the reliability of learned concepts and thus the trustworthiness of NeSy architectures in out-of-distribution \citep{marconato2023not} scenarios, continual learning \citep{marconato2023neuro} settings, and neuro-symbolic verification \citep{xie2022neuro}.  They also compromise the interpretability of concept-based explanations of the model's inference process \citep{rudin2019stop, kambhampati2021symbols, marconato2023interpretability}.


\begin{example}
    \label{ex:boia-rs}
    In \cref{ex:boia}, we would expect predictors achieving high label accuracy to accurately classify all concepts, too.
    %
    It turns out that, however, predictors misclassifying pedestrians as red lights (as in \cref{fig:second-page}, middle) can achieve an equally high label accuracy, precisely because both concepts entail the (correct) ${\tt stop}$ action according to \BK.
    %
    To see why this is problematic, consider tasks where the knowledge allows for ignoring red lights when there is an emergency. This can lead to dangerous decisions when there are pedestrians on the road \citep{marconato2023not}.
\end{example}


\textbf{Causes and Mitigation Strategies.}  The factors controlling the occurrence of RSs include \citep{marconato2023not}:
%
$1$) The structure of the \textit{knowledge} $\BK$,
%
$2$) The distribution of the \textit{training data},
%
$3$) The learning \textit{objective}, and
%
$4$) The \textit{architecture} of the concept extractor.
%
For instance, whenever the knowledge \BK admits multiple solutions -- that is, the correct label $\vy$ can be inferred from distinct concept vectors $\vc \ne \vc'$, as in \cref{ex:boia-rs} -- the NeSy model has no incentive to prefer one over the other, as they achieve exactly the same likelihood on the training data, and therefore may end up learning concepts that do not match the ground-truth ones \citep{marconato2023neuro}.

All four root causes are natural targets for mitigation.
%
For instance, one can reduce the set of unintended solutions admitted by the knowledge via \textit{multi-task learning} \citep{caruana1997multitask}, force the model to distinguish between different concepts by introducing a \textit{reconstruction penalty} \citep{khemakhem2020variational}, and reduce ambiguity by ensuring the concept encoder is \textit{disentangled} \citep{scholkopf2021toward}.
%
It was shown theoretically and empirically that, while existing unsupervised mitigation strategies \textit{can} and in fact \textit{do} have an impact on the number of RSs affecting NeSy predictors, especially when used in combination, they are also insufficient to prevent RSs in all applications \citep{marconato2023not}.

The most direct mitigation strategy is that of supplying \textit{dense annotations} for the concepts themselves.  Doing so steers the model towards acquiring good concepts and can, in fact, prevent RSs \citep{marconato2023not}, yet concept supervision is expensive to acquire and therefore rarely available in applications.


\section{From Mitigation to Awareness}
\label{sec:method}

Mitigating RSs is highly non-trivial.
%
Rather than facing this issue head on, we propose to make NeSy predictors \textit{reasoning shortcut-aware}, \ie uncertain about concepts with ambiguous or wrong semantics.
%
To see why this is beneficial, consider the following example:


\begin{example}
    \label{ex:driving}
    Imagine a NeSy predictor that confuses pedestrians with red lights, as in \cref{ex:boia-rs}.
    %
    If it is always certain about its concept-level predictions, as in \cref{fig:second-page} (middle), there is no way for users to figure out that some predictions should not be trusted.
    %
    A model classifying pedestrians as both ${\tt ped}$estrians and ${\tt red}$ lights with equal probability, as in \cref{fig:second-page} (right), is just as confused, but is also calibrated, in that it is more uncertain about pedestrians and red lights, which are low quality, compared to green lights, which are classified correctly.
    %
    This enables users to distinguish between high- and low-quality predictions and concepts, and thus avoid the latter.
\end{example}


% -- that is, $ \textstyle p({\tt pedestrian} \mid \pedestrian) = p({\tt red\_light} \mid \pedestrian) = \frac{1}{2}$ -- then \ST{I'd prefer to say that users can decide not to trust the model whenever it is uncertain}  \ST{we need a third concept that is not confounded, say green light} leads to to correct but uncertain predictions under $\BK_2$:
%     \[
%         p({\tt go} \mid \pedestrian, \emergency; \BK_2) = p({\tt stop} \mid \pedestrian, \emergency; \BK_2) = \frac{1}{2}
%     \]


We say a NeSy predictor is \textit{reasoning shortcut-aware} if it satisfies the following desiderata:
%
\begin{itemize}

    \item[\textbf{D1}.] \textbf{Calibration}:  For all concepts \textit{not} affected by RSs, the system should achieve high accuracy and be highly confident.  Vice versa,  for all concepts that \textit{are} affected by RSs, the model should have low confidence.

    \item[\textbf{D2}.] \textbf{Performance}:  The predictor $p_\theta(\vY \mid \vX; \BK)$ should achieve high \textit{label accuracy} even if RSs are present.

    \item[\textbf{D3}.] \textbf{Cost effectiveness}:  The system should not rely on expensive mitigation strategies.

\end{itemize}
%
Calibration (\textbf{D1}) captures the essence of our proposal:  a model that knows which ones of the learned concepts are affected by RSs can prevent its users from blindly trusting and reusing them.
%
Naturally, this should not come at the cost of prediction accuracy (\textbf{D2}) or expensive concept-level annotations (\textbf{D3}), so as not to hinder applicability.


% \ST{is this necessary? what was missing from what I had already written?} To see why this is useful, notice that a NeSy model is affected by a RS whenever some of the ground-truth concepts $\vg$ are mispredicted, that is, $\argmax_{\vc} p_\theta (\vc \mid \vX) \ne \vg$.
% %
% Predictors unaware of such RSs will be very confident of their prediction $\vc$.
% %
% To see the danger in this, trusting a confident model that confuses some concepts for one another can lead to poor results outside the domain where RSs work.
% %
% On the other hand, other RSs may distribute the probability mass around distinct concepts, including the ground-truth values $\vg$. This is a less severe case, since now the confusion on concepts unlocks the possibility to spot what concepts are confounded and correct them on the fly. We view this as the best option when RSs cannot be reduced further with the available mitigation strategies.


\begin{figure}[!t]
    \centering
    \begin{tikzpicture}[
        scale=1,
        transform shape,
        node distance=.25cm and .75cm,
        minimum width={width("G")+15pt},
        minimum height={width("G")+15pt},
        mynode/.style={draw,ellipse,align=center}
    ]
        \node[mynode, fill=black!13!white] (X) {$\vY$};
        \node[mynode, left=of X] (G) {$\vG$};
        \node[mynode, below=of X, fill=black!13!white] (Y) {$\vX$};
        \node[mynode, right=of X, blue] (C) {$\vC$};
            
        % \path
        %     (G) edge[-latex] (X)
        %     (G) edge[-latex] (Y)
        %     (Y) edge[-latex] (C);
        
        \draw [->] (G) -- (Y);
        \draw [->, bend angle=45, bend left, dashed, red] (Y) to node[below, pos=.75] {\textcolor{red}{$f$}} (G);
        \draw [->] (G) to node[above, pos=.5] {\textcolor{red}{$\beta_\BK$}} (X);
        \draw [->, blue] (Y) to (C);
        \draw [->, blue] (C) to (X);
        % \draw [->, bend angle=60, bend left, dashed, magenta] (G) to (C); %to node[above, pos=.9] {{$\alpha$}} (C);

    \end{tikzpicture}
    \caption{\textbf{Data generating process}.
    %
    The (unobserved) ground-truth concepts $\vG$ cause both the inputs $\vX$ and the labels $\vY$ (in \textbf{black}).
    %
    A NeSy predictor learns to map inputs $\vX$ to concepts $\vC$ (in \textcolor{blue}{\textbf{blue}}), which ideally should match the concepts $\vG$ that caused $\vX$.
    %
    % Implicitly, it learns a possibly stochastic map from ground-truth to learned concepts (in \textcolor{magenta}{\textbf{pink}}):  whenever this differs from the identity, the model has learned a RS.
    %
    The maps $f$ and $\beta_\BK$ from assumptions \textbf{A1} and \textbf{A2} in \cref{sec:method} are shown in \textcolor{red}{\textbf{red}}.
    % \EK{Is this figure referenced in the main text now?}} \ST{yes}
    }
    \label{fig:generative-process-assumptions}
\end{figure}


\subsection{Awareness Via Entropy Maximization}

% C1 & C2
% Recall that NeSy predictors are affected by RSs when they output accurate label predictions by relying on the wrong concepts.
% %
% One way of understanding reasoning shortcuts is to look at the map from the ground-truth concepts $\vG$ to the learned concepts $\vC$ entailed by a NeSy predictor.
% %
% From this perspective, a RS occurs whenever
% %
% (\textbf{C1}) The distribution $\int p_\theta(\vY \mid \vx; \BK) \ p^*(\vx \mid \vG)$ matches the ground-truth distribution $p^*(\vY \mid \vG; \BK)$, and
% %
% (\textbf{C2}) The concept distribution entailed by the concept extractor, namely $p_\theta(\vC \mid \vG) := \int p_\theta(\vC \mid \vx) \ p^*(\vx \mid \vG) \ \de\vx$, differs from the identity function.

We start by introducing our basic intuition.
%
Consider an example $(\vx, \vy)$ and let $\vg$ be the underlying ground-truth concepts.
%
If $\vg$ is the only concept vector that entails the label $\vy$ according to the prior knowledge \BK, maximizing the likelihood steers the concept extractor towards predicting the correct concept $\vc = \vg$ with high confidence.  In this simplified scenario, NeSy predictors would automatically satisfy \textbf{D1}--\textbf{D3}.
%
In most NeSy tasks, however, there exist multiple concept vectors $\vc_1 \ne \ldots \ne \vc_u$ that \textit{all} entail the correct label $\vy$.  In this case, there is no reason for the model to prefer one to the others: all of them achieve the same (optimal) likelihood, yet only one of them matches $\vg$.
%
The issue at hand is that existing NeSy predictors tend to predict only one -- likely incorrect -- $\vc_i$, $i \in [u]$, and they do so \textit{with high confidence}, thus falling short of \textbf{D1}.

We propose an alternative solution.  Let $\Theta^*$ be the set of parameters $\theta$ attaining high accuracy (\textbf{D2}), \ie mapping inputs to concepts $\vc$ yielding good predictions $\vy$.
%
This set includes the (correct) predictor mapping $\vx$ into $\vg$ as well as all high-performance predictors mapping it to one or more unintended concept vectors $\vc_i \ne \vg$.
%
We wish to find one that is maximally uncertain about which $\vc_i$'s it should output, that is:
%
\[
    \max_{\theta \in \Theta^*} \; H (p_\theta( \vC \mid \vG ))
    \label{eq:optimization-d2}
\]
%
Here, $H (p_\theta( \vC \mid \vG )) = - \bbE_{p^*(\vg) p_\theta(\vc \mid \vg)}[ \log p_\theta (\vc \mid \vg)]$ is the conditional Shannon entropy, and:
\[
    p_\theta(\vC \mid \vG) := \int p_\theta(\vC \mid \vx) \ p^*(\vx \mid \vG) \ \de\vx
    \label{eq:rs-c2}
\]
%
is the distribution obtained by marginalizing the concept extractor $p_\theta(\vC \mid \vX)$ over the inputs $\vx$.
%
By construction, this $\theta$ achieves high accuracy (\textbf{D2}) but, despite being affected by RSs, it is less confident about its concepts $\vc$ (\textbf{D1}).
%
The issue is that, by \textbf{D3}, we have access to neither the ground-truth distribution $p_\theta(\vC \mid \vG)$ nor to samples drawn from it, so we cannot optimize \cref{eq:optimization-d2} directly.

% Therefore, we propose to model uncertainty over the concept extractor $p_\theta(\vC \mid \vx)$, instead.
% As we show next, this criteria is sufficient to maximise the entropy in \cref{eq:optimization-d2}.

% \begin{proposition}
%     For the generative process in \cref{fig:generative-process-assumptions}, it holds that: 
%     %
%     \[
%         \max_{\theta \in \Theta^*} H (p_\theta(\vC \mid \vX))
%             \leq  \max_{\theta \in \Theta^*}  H(p_\theta(\vC \mid \vG))
%     \]
%     %
%     \ST{When $\vG$ is \$foo, the bound is tight.}
% \end{proposition}

% %
% This result suggests that one can learn RS-aware predictors by jointly optimizing the label likelihood and the entropy of the concept extractor, that is, by solving:
% %
% \[
%     \label{eq:empirical-objective}
%     \max_{\theta} \, \sum_{(\vx, \vy) \in \calD} \log p_\theta(\vy \mid \vx; \BK) + \gamma \cdot H( p_\theta(\vC \mid \vx))
% \]
% %
% where $\calD$ is a training set and $\gamma > 0$ a hyperparameter.
% \EM{Add bridge to next section} Optimizing for this objective in practice, however, may be difficult, implying in some cases a trade-off between the label accuracy and the entropy on the concepts. \EM{maybe add here paper by Kareem on entropy reg.}


\subsection{Maximizing Entropy}

Next, we show that one can maximize \cref{eq:optimization-d2} by constructing a distribution $p_\theta(\vC \mid \vG)$ affected by \textit{multiple} but \textit{conflicting} RSs.
%
% Then, we propose to do so approximately by averaging over an ensemble of high-performance models affected by distinct RSs.
%
Our analysis builds on that of \citep{marconato2023not}, which relies on two simplifying assumptions:
%
\begin{itemize}

    \item[\textbf{A1}.] \textit{Invertibility}: Each $\vx$ is generated by a unique $\vg$, \ie there exists a function $f:\vx \mapsto \vg$ such that $p^*(\vG \mid \vX) = \Ind{ \vG = f(\vX) }$.\footnote{Works on the \textit{identifiability} of the latent variables in independent component analysis \citep{khemakhem2020variational, gresele2021independent, lachapelle2024nonparametric} and causal representation learning \citep{scholkopf2021toward, liang2023causal, buchholz2023learning} build on a similar assumption.}%, typically for continuous variables $\vG \in \bbR^k$ and $\vX \in \bbR^n$.}

    \item[\textbf{A2}.] \textit{Determinism}:  The knowledge $\BK$ is \textit{deterministic}, \ie there exists a function $\beta_\BK:\vg \mapsto \vy$ such that $p^*(\vY \mid \vG; \BK) = \Ind{\vY = \beta_\BK (\vg)}$.  This is often the case in NeSy tasks, \eg\ \BOIA.

\end{itemize}
%
The link between $\beta_\BK$, $f$, and the NeSy predictor is shown in \cref{fig:generative-process-assumptions}.
%
In the following, we use $\alpha: \{0,1\}^k \to \{0,1\}^k$ to indicate a generic map from $\vg$ to $\vc$, and denote by $\calA$ the set of all such maps and by $\calA^* \subseteq \calA$ that of $\alpha$'s that yield distributions $p_\theta(\vC \mid \vG)$ achieving perfect label accuracy (\textbf{D2}).
%
Each $\alpha$ encodes a corresponding \textit{deterministic} distribution $p_\theta(\vC \mid \vG) = \Ind{\vc = \alpha(\vg)}$:  if $\alpha$ is the identity, this distribution encodes the correct semantics. Otherwise, it captures an RS (cf. \cref{fig:second-page}).
%
Next, we show that every distribution $p_\theta(\vC \mid \vG)$ decomposes as a convex combination of maps $\alpha$.

% Marconato {\em et al.} \citep{marconato2023not} showed that, under invertibility (\textbf{A1}) and determinism (\textbf{A2}), one can describe every $p_\theta (\vC \mid \vG)$ in terms of such $\alpha$'s and that $\calA^*$ is not empty, cf. \cref{thm:mc-det-opts} in the appendix for details.
% %
% We will exploit these insights later on.
% %
% For now, we prove that the maps $\alpha \in \calA$ constitute a basis for expressing every possible distribution on the learned concepts:

% We are interested in expressing $p_\theta(\vC \mid \vG)$ as a combination of deterministic maps

% This is done by introducing, first, the maps , expressing how ground-truth concepts are mapped to the learned ones. % Intuitively, each map $\alpha$ associated to a ground-truth concept vector $\vg$ 
% We will denote with $\calA$ the set of all deterministic maps $\alpha$.
%
% \ST{unclear how $\alpha$ is related to the rest} 
% In the following, $\calA$ denotes the set of all deterministic maps $\alpha: \{0,1\}^k \to \{0,1\}^k$ from $\vg$ to $\vc$.
%
%
%
%
% that is:
% \[
%     \textstyle
%         \calA^* = \big\{ \alpha \in \calA: \; \bigwedge_{\vg \in \mathsf{supp}(\vG)} (\beta_\BK \circ \alpha)(\vg) = \beta_\BK(\vg)  \big\}
% \]

% \EM{Lemma is up here.}
%
% This decomposition allows us to convert the optimization problem in \cref{eq:optimization-d2} from a search over parameters $\theta$ into a search over weight vectors $\vomega$.  The question then becomes what combinations $\vomega$ of maps $\alpha \in \calA^*$ attain maximal entropy.  \ST{we should say the search automatically satisfies the constraint in \cref{eq:optimization-d2}.}
% %
% \ST{do we have formal proof of this?}  Intuitively, we expect distributions $p_\vomega(\vC \mid \vG)$ that mix together multiple optimal $\alpha$'s (including RSs \ST{this should be anticipated}) in $\calA^*$ to be highly accurate (\textbf{D2}), because all \ST{FINISHME!!!} capture different, conflicting reasoning shortcuts, and therefore to satisfy \textbf{D2} (by optimality of RSs) and \textbf{D1} (because these disagree with each other).

% The next result shows that, on the one hand, \cref{eq:optimization-d2} can in fact be cast as an optimization problem over $\vomega$, and on the other that the latter \ST{I don't see a clear link here, because the first part is about C|G, not C|X} can be approximately solved in practice:

\begin{lemma}
    \label{lemma:decomposition-of-p}
     For any $p(\vC \mid \vG)$, there exists at least one vector $\vomega$ such that the following holds:
    %
    \[
        p(\vC \mid \vG)
        = \sum_{\alpha \in \calA} \omega_\alpha \Ind{ \vC = \alpha(\vG) }
        := p_\vomega(\vC \mid \vG)
        \label{eq:whatever}
    \]
    %
    where $\vomega \ge 0$, $\norm{\vomega}_1 = 1$.
    %
    Crucially, under invertibility (\textbf{A1}) and determinism (\textbf{A2}), if $p_\theta(\vC \mid \vG)$ is optimal (\textbf{D2}), \cref{eq:whatever} holds even if we replace $\calA$ with $\calA^*$.  
\end{lemma}


All proofs can be found in \cref{sec:proofs}.
%
% Taken individually, the maps $\alpha \in \calA^*$ yield \textit{deterministic} distributions $p(\vC \mid \vG)$, which by definition are not calibrated (\textbf{D1}).
%
This means that most distributions $p(\vC \mid \vG)$ are mixtures of \textit{multiple} maps $\alpha \in \calA^*$, each potentially capturing a different RS.
%
Naturally, if $\omega_\alpha$ is non-zero only for those $\alpha$'s that fall in $\calA^*$, $p_\theta(\vC \mid \vG)$ achieves high performance (\textbf{D2}).
%
The question is what $\vomega$'s achieve calibration (\textbf{D1}).
%
Intuitively, if $\vomega$ mixes $\alpha$'s capturing RSs that disagree on the semantics of some concepts and agree on others, $p_\theta(\vC \mid \vG)$ is RS-aware.


\begin{example}
    Consider \cref{fig:second-page} and two high-performance maps in $\calA^*$:
    %
    $\alpha_1$ mapping green lights to ${\tt grn}$, and both pedestrians and red lights to ${\tt red}$;
    %
    $\alpha_2$ also mapping green lights to ${\tt grn}$, but pedestrians and red lights to ${\tt ped}$.
    %
    Clearly, both maps are affected by RSs and overconfident, yet their mixture $\alpha = \frac{1}{2} (\alpha_1 + \alpha_2)$ yields a distribution $p_\theta(\vC \mid \vG)$ that looks exactly like the one in \cref{fig:second-page} (right), which predicts ${\tt grn}$ correctly with high confidence, and ${\tt red}$ and ${\tt ped}$ with low confidence, and thus satisfies \textbf{D1} and \textbf{D2}.
    % A NeSy model can learn to map both the pedestrian and the red light to a single concept, \ie both on $\tt pedestrian$, say $\alpha_1$, or both on $\tt red\_light$, say $\alpha_2$. Both of these $\alpha$'s are uncalibrated since they fail to predict at least one concept correctly. Conversely, a model $p_\theta$ that captures the average of $\alpha_1$ and $\alpha_2$  will be calibrated, being uncertain on both the $\tt pedestrian$ and $\tt red\_light$ concepts. The result is a model like in \cref{fig:second-page} (right). 
\end{example}


%
% As long as these differ in how they allocate mass to predicted concepts $\vC$, the corresponding distribution will be more calibrated.  \ST{explain}

In other words, this allows us to leverage the model's uncertainty to estimate the extent to which concepts are affected by RSs \textit{without the need for dense annotations} (\textbf{D3}).
%
Due to space considerations, we report our formal analysis of the connection between uncertainty and RSs in \cref{sec:entropy-vs-rss}.
%
The next result indicates that this intuition is consistent with our original objective in \cref{eq:optimization-d2}:

% It turns out that searching for a high entropy model is equivalent to searching for a combination of $\alpha$'s whose mixture achieves high entropy:

% Formally, the decomposition allows us to convert the optimization problem in \cref{eq:optimization-d2} from a search over parameters $\theta \in \Theta^*$ into a search over weight vectors $\vomega^*$, where $\omega^*_\alpha = 0$ if $\alpha \not \in \calA^*$.  The question then becomes what combinations $\vomega^*$ of maps $\alpha \in \calA^*$ attain maximal entropy.  


\begin{proposition}
    \label{prop:ideal-obj}
    (Informal.)  %Consider only optimal parameters $\theta \in \Theta^*$ for $p_\theta (\vC \mid \vG)$.
    %
    % Assuming that 
    If $p_\theta$ is expressive enough, under invertibility (\textbf{A1}) and determinism (\textbf{A2}), it holds that:
    %
    \begin{equation}
        \label{eq:objective-d1-d2}
         \max_{\theta \in \Theta^*} H(p_\theta(\vC \mid \vG))
            = \max_{\vomega^*} H(p_{\vomega^*}(\vC \mid \vG))
    \end{equation}
    %
    %
\end{proposition}


This also tells us that we can solve \cref{eq:optimization-d2} by finding a combination of maps $\alpha$ with maximal entropy over concepts.


% This suggests that if the model is flexible enough to capture arbitrary convex combinations of RSs, the problem amounts to finding what combinations of RSs are mostly confused about the learned concepts. \EM{Expressiveness ahead->}


% Intuitively, \cref{prop:ideal-obj} (\textit{i}) shows that optimizing for \cref{eq:optimization-d2} amounts to finding a convex combination $\vomega^*$ of RSs are maximally uncertain over the concepts. On the other hand, point (\textit{ii}) shows that we can find a lower bound that we can optimize in practice.
% %
% Next, we show how to optimize this criterion in practice. \ST{not a practical criterion, we don't have G}


\subsection{RS-Awareness with \method}

Our results suggest that RS-awareness can be achieved by constructing an ensemble $\vtheta = \{ \theta_i \}$ of (deterministic) high-performance concept extractors affected by distinct RSs.
%
Ideally, we could construct such an ensemble by training multiple concept extractors $p_{\theta_i}(\vC \mid \vX)$ such that each of them picks up a different RS, and then defining an overall predictor $p_\vtheta$ as a convex combination thereof, that is, $p_\vtheta(\vC \mid \vX) = \sum_i \lambda_i p_{\theta_i} (\vC \mid \vX)$, where $\vlambda \ge 0$ and $\norm{\vlambda}_1 = 1$.
%
We next show that, if the ensemble is large enough, such a model does optimize our original objective in \cref{eq:optimization-d2}.


\begin{proposition}
    \label{prop:prior-posterior-optima}
    (Informal.)  Let $p(\vC \mid \vX)$ be a convex combination of models $p_{\theta_i}(\vC \mid \vX)$ with parameters $\vtheta = \{ \theta_i \}$ and weights $\vlambda =\{ \lambda_i \}$, such that $\theta_i \in \Theta^*$.
    %
    Under invertibility and determinism, there exists a $K \leq |\calA^*|$ such that for an ensemble with $K$ members, it holds that:
    %
    \[
        \max_{ \vtheta, \vlambda } H \Big(\sum_{i=1}^K \lambda_i p_{\theta_i}(\vC \mid \vX) \Big)
        = \max_{\vomega^*} H (p_{\vomega^*}(\vC \mid \vG))
    \]
    %
    Moreover, maximizing $H(p_\vtheta(\vC \mid \vX))$ amounts to solving:
    %
    \[  \label{eq:theoretical-obj}
        % \max_{ \{ (\theta_i, \lambda_i) \} } H(\vC \mid \vX) = 
        \begin{aligned}
            \max_{ \vtheta, \vlambda } \int p(\vx) \sum_{i=1}^K \lambda_i [& \KL( p_{\theta_i} (\vc \mid \vx)  \mid \mid \sum_{j=1}^K \lambda_j  p_{\theta_j} (\vc \mid \vx)) \\
            %
            & \quad + H(p_{\theta_i}(\vC  \mid \vx)) ]   \de \vx 
        \end{aligned}
    \]
    %
    where $\KL$ denotes the Kullback-Leibler divergence.
\end{proposition}


% This shows that, in principle, by capturing different $\theta_i \in \Theta^*$ we can construct a model that solves the ideal objective \cref{eq:optimization-d2}.

For the proposition to apply, it may be necessary to collect an enormous number of diverse, deterministic, high-performance models, potentially as many as $|\calA^*|$.
%
Naturally, constructing such an ensemble is highly impractical.
%
Thankfully, doing so is often unnecessary in practice:  as long as the ensemble contains models that disagree on the semantics of concepts, it will likely achieve high entropy on concepts affected by RSs and low entropy on the rest, as we show in our experiments.

\method exploits this observation to turn this into a practical algorithm.
%
In short, it grows an ensemble $\vtheta$ by optimizing a joint training objective combining label accuracy and diversity of concept distributions.
%
Each model $\theta_t$ is learned in turn by maximizing the following quantity:
%
\[
\begin{aligned} 
\label{eq:method-implementation}
    \calL(\vx, \vy; \BK, \theta_t)  \ 
        & + \gamma_1 \cdot
        \KL\big(
            p_{\theta_{t}}(\vC \mid \vx)\mid \mid \frac{1}{t}
            \sum_{j=1}^{t} p_{\theta_j}(\vC \mid \vx)
        \big)
    \\
        & + \gamma_2 \cdot H(p_{\theta_t} (\vC \mid \vx))
\end{aligned}
\]
%
over a training set $\calD$.
%
Here, $\calL(\vx, \vy; \BK, \theta_t)$ is the log-likelihood of member $\theta_t$, while the second term is a $\KL$ divergence -- obtained from \cref{eq:theoretical-obj} by taking a uniform $\vlambda$ -- encouraging $\theta_t$ to differ from $\theta_1, \ldots, \theta_{t-1}$ in terms of concept distribution.  Finally, $\gamma_1$ and $\gamma_2$ are hyperparameters.
%
Pseudocode and further details on the $\KL$ term can be found in \cref{sec:implementation-details}.
%
We remark that, despite learning $\theta_i$'s that are not necessarily optimal or deterministic, in practice \method still manages to drastically improve RS-awareness in our experiments.


\subsection{\method through a Bayesian Lens}

Bayesian inference is a popular strategy for lessening overconfidence of neural networks \citep{neal2012bayesian, kendall2017uncertainties, wang2020survey}.
%
It works by marginalizing over a (possibly uncountable) family of alternative predictors, each weighted according to a posterior distribution $p(\theta \mid \calD) \propto p(\calD \mid \theta) \cdot p(\theta)$ accounting for both data fit $p(\calD \mid \theta)$ and prior information $p(\theta)$.
%
Formally, the label distribution is given by:
%
\[
    \textstyle
    p(\vy \mid \vx; \calD) = \int p_\theta(\vy \mid \vx; \BK) \cdot p(\theta \mid \calD) \ \de \theta
    \label{eq:bayesian-averaging}
\]
%
The expectation is computationally intractable and thus often approximated in practice.
%
E.g., Monte Carlo approaches compute an unbiased estimate of \cref{eq:bayesian-averaging} by averaging the label distribution $p_{\theta_i}(\vy \mid \vx; \BK)$ of a (small) selection of parameters $\{ \theta_i \}$.  %The quality of the approximation hinges on how well the $\theta_i$'s cover the space of models \ST{ref}.
%
More advanced Bayesian techniques for neural networks \citep{wang2020survey}, like the Laplace approximation \citep{daxberger2021laplace} and variational inference methods \citep{osawa2019practical}, locally approximate the posterior around the (parameters of the) trained model.
%
Conceptually simpler techniques like deep ensembles \citep{deepesembles2017} average over a bag of diverse neural networks trained in parallel and have proven to be surprisingly effective in practice.

Recall that, by \cref{eq:method-implementation}, \method averages over models $\theta_i \in \Theta^*$ that achieve high likelihood but disagree in terms of concepts.
%
This can be viewed as a form of Bayesian inference.  Specifically, \cref{eq:bayesian-averaging} behaves similarly to \method if we select the prior and likelihood appropriately.
%
In fact, if 1) the prior $p(\theta)$ associates non-zero probability to all $\theta$'s encoding an RS, and
%
2) the likelihood $p(\calD \mid \theta)$ allocates non-zero probability only to $\theta$'s that match the data (almost) perfectly,
%
the resulting posterior $p(\theta \mid \calD)$ associates probability mass only to models that satisfy \textbf{D1} and \textbf{D2}, that is, those in $\Theta^*$.

Compared to stock Bayesian techniques, \method is specifically designed to handle RSs.
%
First, note that the likelihood $p(\calD \mid \theta)$ is highly multimodal, as it peaks on the ``optimal'' models in $\Theta^*$, thus Bayesian techniques that focus on the neighborhood of trained networks have trouble recovering all modes {\citep{jospin2022hands}}.
%
Moreover, the expectation in \cref{eq:bayesian-averaging} runs over parameters $\theta$, which may be redundant, in the sense that different $\theta_i$'s can entail similar or identical concept encoders $p_\theta(\vC \mid \vX)$.  This suggests that covering the space of $\theta$'s, as done in \citep{d2021repulsive, wild2023rigorous},
is sub-optimal compared to averaging over $\theta_i$'s that disagree on which concepts they predict.
%
\method is designed to avoid both issues, as it learns models $\{ \theta_i \}$ that have both high likelihood and disagree on the semantics of concepts, so as to capture multiple, different modes of the likelihood, thus encouraging RS-awareness. 

% \ST{I don't think the prior is what matters the most here: if its support does not cover any RSs, it is no surprise that bayes NeSy cannot work.  The focus should be on two other things: 1) The likelihood, which has to be very point, and thus different from what is done in practice in Bayesian ML, and 2) Multimodality.  Specifically, we should anticipate that bayesian neural nets are unlikely to work in our setting because they tend to be very local, while the space of optimal models is combinatorial in nature, and say that we will show these issues empirically.}
% Notice that, however, the choice of the prior may lead to trading off the actual performances on label predictions, \ie failing in \textbf{D2}, for more compliance to the prior on parameters. This is visible from the \textsf{MAP} estimate, consisting of those parameters that attain higher probability mass:
% %
% \[  \textstyle
%     \Theta_{\textsf{MAP}} (\calD; \BK) = \argmax_\theta \log p(\calD \mid \theta) p(\theta)
% \]
% %
% It is only under a suitable choice of the prior, like the uniform distribution or a normal distribution with high diagonal variance, that solutions of the \textsf{MAP} estimate are likely to fall within the optimal solutions: \EM{turn to proposition?}\EK{Yep, this is unclear rn}
% %
% \[
%     \lim_{\sigma \to \infty} \argmax_\theta p(\calD \mid \theta) \calN_\theta (0, \mathsf{diag}(\sigma) ) %\argmax_\theta p(\calD \mid \theta) \mathsf{unif}_\theta 
%     \subseteq \Theta^* ( \calD; \BK ) 
% \]
% %
% where $\calN_\theta (0, \mathsf{diag}(\sigma))$ denotes the normal distribution on weights $\theta$ which tends in distribution to the uniform on the parameters, $\mathsf{unif}_\theta$ \EK{Are you sure this is well-defined? This sounds like an improper prior} \ST{yeah it is improper, the uniform has to live in a ball/box.}. Therefore, approaches of this sort that sample the parameters $\theta$ in the neighbors of the \textsf{MAP} estimate will likely average over different optimal solutions with different weights. \EK{I don't get this argument either.}

% \[
%     p(\vy \mid \vx; \calD) \approx \frac{1}{M} \sum_{m = 1}^M p_{\theta_m}(\vy \mid \vx; \BK)
% \]
% where $\theta_m \sim p(\theta \mid \calD)$. 


\subsection{Active Learning with Dense Annotations}
\label{sec:active}

As mentioned in \cref{sec:preliminaries}, a surefire way of avoiding RSs is to leverage concept-level annotations, which are however expensive to acquire.
%
\method helps to address this issue.  Specifically, we propose to exploit the model's concept-level uncertainty -- which is higher for the concepts most affected by RSs -- to implement a cost-efficient annotation acquisition strategy.

We consider the following scenario:  given a NeSy predictor $p_\theta$ affected by RSs and a pool of examples $\calD = \{(\vx_i, \vy_i)\}$, we seek to mitigate RSs by eliciting concept-level annotations for as few data points as possible.
%
This immediately suggests leveraging active learning techniques to select informative data points \citep{settles2012active}.
%
Options include selecting examples $(\vx_i, \vy_i)$ in $\calD$ with the highest concept entropy $H(p(\vC \mid \vx_i))$ and requesting dense annotations for the entire concept vector $\vG_i$, or requesting supervision only for specific concepts $G_j$ by maximizing $H(p(C_j \mid \vx_i))$ for $i$ and $j$.
%
Both entropies are cheap to compute for most neural networks (as the predicted concepts $C_j$ are conditionally independent given the input $\vx$ \citep{vergari2021compositional}), making acquisition both practical and easy to set up.

Crucially, these strategies only work if the model is RS-aware and, in fact, they fail for state-of-the-art NeSy architectures unless paired with \method, as shown in \cref{sec:experiments}.


% We propose to use the entropy of $p(\vC \mid \vx)$ for a multiple-phase active learning strategy. For each round interaction in $\tau=1, \ldots, T$, we add a fixed number of test exemplars $\ell$ together with concept supervision to the training set:
% \[  \label{eq:active-learning-examples}
%     \calD_{\tau+1} \leftarrow \calD_\tau \cup \{(\vx_i, \vy_i, \vg_i), \; \text{for } i \in \calH(\vC \mid \vX; \ell)   \}
% \]
% where $\calH(\vC\mid \vX; \ell)$ denotes the indexes corresponding of the first $\ell$ mostly confused examples in $\calD_{test} \setminus \calD_{\tau}$. Then, the model is finetuned on $\calD_{\tau +1}$. 

% Alternatively, from the perspective of the learned concepts, we can have an estimate of the uncertainty for $p(C_i \mid \vx)$ by looking at its variance over the whole training set, namely:
% \[  \label{eq:estimate-variance-concept}
%     \langle \Var (C_i ) \rangle = \sum_{\vx \in \calD}  \Var(C_i \mid \vx) \Ind{ p(C_i \mid \vx) >0}
% \]
% where we compute explicitly the variance for the binary variable $p(C_i \mid \vx)$ for those examples that predict it with non-zero probability. Notice that, however, from \cref{eq:estimate-variance-concept} we may end up mistaking some concepts $C_i$ for being affected by RSs, even if they are not. This can happen when a RS that captures correctly $C_i$ with $G_i$ attributes the value of another concept to it, that is $C_j = G_i$. On average, the value of $\langle \Var (C_i ) \rangle$ will increase. 

% We then propose to supervise the concept $C_i$ with positive and negative values according to $p(G_i \mid \vx)$. 
% This brings to a slightly different strategy for active learning that includes supervision of the most confused concepts. For $\tau = 1, \ldots, T$ steps, we finetune the model on:
% \[  \label{eq:active-learning-concepts}
%     \textstyle
%     \calD_{\tau+1} = \calD_\tau \cup \{ (\vx, \vy, g_i), \; \text{for } i = \sup_j  \langle \Var (C_j) \rangle  \}
% \]
% Notice that, depending on the resources available, the active step may consider a combination of a finite number of samples $\ell$ like in \cref{eq:active-learning-examples} but for only the most confused concepts \cref{eq:active-learning-concepts}. Approaches that do not model any uncertainty on the concepts cannot be used for this active learning step, leaving to sort randomly what concepts or exemplars should be supervised. On the other hand, bayesian models and \method become pretty helpful in this regard.


\subsection{Benefits and Limitations}
\label{sec:benefits-limitations}

The most immediate benefit of \method\ -- and of RS-awareness in general -- is that it enables users to identify and avoid untrustworthy predictions $\vc$ or even individual concepts $c_i$, substantially improving the reliability of NeSy pipelines.
%
Moreover, compared to simpler Bayesian approaches for uncertainty calibration, it is specifically designed for dealing with the multimodal nature of the RS landscape and -- as shown by our experiments -- yields more calibrated concept uncertainty in practice.
%
Finally, \method enables leveraging the model's uncertainty estimates to guide elicitation of concept supervision.

A downside of \method is that training time grows (linearly) with the size of the ensemble $\vtheta$.
%
This extra cost is justified in tasks where reliability matters, such as high-stakes applications or when learning concepts for model verification \citep{xie2022neuro}.
%
Regardless, in our experiments, an ensemble of $5$--$10$ concept extractors is sufficient to dramatically improve RS-awareness compared to regular NeSy predictors, with a runtime cost comparable to alternative calibration approaches.
%
This is not too surprising:  in principle, even an ensemble of \textit{two} models is sufficient to ensure improved calibration, provided these capture RSs holding strongly contrasting beliefs.
%
Finally, \method involves two other hyperparameters: $\gamma_1$ and $\gamma_2$, which can be tuned, \eg via cross-validation on a validation split.
%
As for the relative importance of different members of the ensemble (that is, $\vlambda$), our experiments suggest that even taking a uniform average already substantially improves RS-awareness compared to existing approaches.

%
% More generally, we posit this is due to the combinatorial structure of the space of RSs.

% Inference is also slightly more expensive, depending on the model.  Specifically, for NeSy predictors based on probabilistic-logic reasoning, it is possible to perform 

% \ST{Not all RSs are equally bad.  The degree of badness is captured by the divergence $d( p(\vC \mid \vG), \Ind{\vC = \vG} )$, which simplifies in the factorized case.  I think the integral \textit{does} automatically take this into account, but it would be worth showing formally.}  \ST{some RSs are more basic than others: RSs form a combinatorial space.  It may be sufficient to learn two RSs per concept to ensure entropy is large enough.  This gives us some leeway when learning the ensemble.}



\section{Empirical Analysis}
\label{sec:experiments}

In this section, we tackle the following research questions:
%
\begin{itemize}

    \item[\textbf{Q1}.] Are existing NeSy predictors RS-aware?

    \item[\textbf{Q2}.] Does \method make NeSy predictors RS-aware?

    \item[\textbf{Q3}.] Does \method facilitate acquiring informative concept-level supervision?
    
\end{itemize}
%
To answer them, we evaluate three state-of-the-art NeSy architectures before and after applying \method and other well-known uncertainty calibration methods.
%
\changed{Our code can be found at: \href{https://github.com/samuelebortolotti/bears}{https://github.com/samuelebortolotti/bears}.}


\textbf{NeSy predictors.}  We consider the following architectures.

% \underline{Dee}p\underline{ProbLo}g (\DPL) \citep{manhaeve2018deepproblog} and the \underline{Semantic Loss} (\SL) \citep{xu2018semantic} leverage probabilistic-logic reasoning \citep{de2015probabilistic} -- implemented via knowledge compilation \citep{vergari2021compositional}, for efficiency -- to constrain or encourage the predicted labels to comply with the knowledge, respectively.
\changed{\underline{Dee}p\underline{ProbLo}g (\DPL) \citep{manhaeve2018deepproblog} instantiates one neural predicate for each of the binary concepts present in the knowledge, and implements them using one or more neural networks feeding on the input.  It then predicts a label by combining the neural predicates via probabilistic logic reasoning \citep{de2015probabilistic}.  It speeds up this step by compiling inference into a computational circuit using knowledge compilation techniques \citep{vergari2021compositional}.  This circuit contains no trainable parameter in our experiments.}

\changed{\underline{Semantic Loss} (\SL) \citep{xu2018semantic} is a penalty term used to encourage deep neural networks to output predictions consistent with given prior knowledge.  In our experiments, we use a setup similar to Concept Bottleneck Models \citep{zarlenga2022concept}:  we employ a neural network to predict the concepts, whose logits are connected to an MLP inferring the labels.  The \SL is applied to the outputs of the two networks.  Like \DPL, the \SL also exploits probabilistic-logic reasoning and knowledge compilation, but differs in that it cannot constrain predictions to satisfy the knowledge $\BK$ at test time.
}

\changed{\underline{Lo}g\underline{ic Tensor Networks} (\LTN) \citep{donadello2017logic, badreddine2022logic} softens the prior knowledge \BK using fuzzy logic to define a differentiable measure of label consistency and actively maximizes it during learning.  At inference time \citep{LTNtorch}, labels are predicted by first predicting the concepts by computing a \textsf{MAP} solution of $p_\theta(\vC \mid \vx)$ and then combining it with the logic.
}

% \underline{Lo}g\underline{ic Tensor Networks} (\LTN) \citep{donadello2017logic, badreddine2022logic} softens the prior knowledge \BK using fuzzy logic to define a differentiable measure of label consistency and actively maximizes it during learning.


\textbf{Competitors.}  We evaluate each architecture in isolation and in conjunction with \method and the following well-known calibration methods.
%

\underline{MC Dro}p\underline{out} (\MCDrop) \citep{gal2016dropout} \changed{consists in training a network with a dropout term and then averaging} %averages 
over an ensemble of concept extractors obtained by randomly deactivating neurons during inference;
%

The \underline{La}p\underline{lace a}pp\underline{roximation} (\Laplace) \citep{daxberger2021laplace} approximates the Bayesian posterior by placing a normal distribution around the trained concept extractor, applying a covariance proportional to the inverse of the Hessian matrix computed on the label loss;
%

\underline{Dee}p \underline{Ensembles} (\DeepEns) \citep{deepesembles2017}, like \method, \changed{trains an ensemble of models (with different hyperparameters, like the random seed and learning rate, to ensure different optimization between models) under the same objective and a noise adversarial term, but it does not contain any knowledge-unaware diversification penalty. After training, the mean concept extractor is given by the average of the ensemble members.}

We also consider a P\underline{robabilistic Conce}p\underline{t-bottleneck Models} (\ProbCBM) \citep{kim2023probabilistic} backbone, an interpretable neural network architecture that outputs a normal distribution for each concept, implicitly improving uncertainty calibration. 
\changed{Concept probabilities are predicted by instantiating two sets of prototypical vectors, one for positive values of each concept $C_i$ and one for the negative values, similar to concept embedding models \citep{zarlenga2022concept}. The sigmoid of the relative distance of the network embedding to each of these prototypes then gives the concept probability $p_\theta(\vC \mid \vx)$.
}
%
Hyperparameters and further implementation details are reported in \cref{sec:implementation-details}. 

For both labels and concepts, we report -- averaged over $5$ seeds -- both \textit{prediction quality} and \textit{calibration}, measured in terms of $F_1$ score (or accuracy) and Expected Calibration Error (ECE), respectively.
%
\changed{A higher ECE (reported explicitly in \cref{eq:ece-definition}) indicates that a model is overconfident in giving wrong predictions, which we expect to be the case for methods not modeling uncertainty explicitly.  The subscript $Y$ (resp. $C$) indicates we are measuring label (resp. concept) calibration error.}
%
See \cref{sec:metrics-details} for definitions.
%
We also report a runtime comparison in  \cref{sec:runtimes}.


\textbf{Data sets.}  We consider two variants of MNIST addition \citep{manhaeve2018deepproblog}, which requires predicting the sum of two MNIST \citep{lecun1998mnist} digits, except that only selected pairs of digits are observed during training.
%
\underline{\MNISTHalf} includes only sums of digits $0$ through $4$ chosen so that only the semantics of the digit $0$ can be unequivocally determined from data. \changed{The combinations include:}
\[
    \changed{\begin{cases}
        \MZero + \MZero &= 0\\ 
        \MZero + \MOne &= 1
    \end{cases} \quad
    \begin{cases}
        \MTwo + \MThree &= 5\\ 
        \MTwo + \MFour &= 6
    \end{cases}
    }
\]
%
\underline{\MNISTShortcut} is similar, except that \changed{it covers all digits}; \changed{all in-distribution combinations are reported explicitly in \cref{sec:datasets-and-architecture}}. \changed{It was introduced in \citep{marconato2023not} to evaluate the impact of RSs in \DPL, \SL, and \LTN.}

\underline{{\tt Kandinsk}}{\tt y} is a variant of the Kandinsky Patterns task \citep{muller_kandisnky_2021}, where given three images containing three simple colored shapes each (\eg two red squares and a blue triangle) and a logical combination of rules like ``the three objects have different shapes'' or ``they have the same color'', the goal is to predict whether the third image satisfies the same rules as the first two. \changed{
The example in \cref{fig:kand}
provides an idea of this task.}
\begin{figure}[h]
    \centering
    \includegraphics[width=0.3\textwidth]{figures/kand-illustration}
    \caption{\changed{An example of a test sample for the \Kandinsky task. At inference time, the NeSy model has to choose according to the previous two images the third that completes the \textit{pattern}.
    The model computes a series of predicates for each image, like $\texttt{same\_colors}$, $\texttt{same\_shapes}$.
    In the running example, the first two images have different colors, so the model should pick the first option. }
    }
    \label{fig:kand}
\end{figure}
% \changed{where the correct figure completes the pattern.}
%

\underline{\BOIA} \citep{xu2020boia, sawada2022concept} is a real-world multi-label prediction task in which the goal is to predict what actions, out of $\{ {\tt forward}, {\tt stop}, {\tt right}, {\tt left} \}$, are safe based on objects (like pedestrians and red lights) that are visible in a given dashcam image and prior knowledge akin to that in \cref{ex:boia}.
%
\changed{The data set comprises $21$ concepts, indicating the presence of pedestrians, red and green traffic lights, and other kinds of objects common in road traffic.  The rules prevent the model from predicting actions whenever these are unsafe due to, \eg presence of obstacles in the corresponding direction. The \texttt{forward} and \texttt{stop} actions do share concepts, \eg
\begin{equation*}
\begin{cases}
    &\text{\texttt{red\_light}} \lor \text{\texttt{stop\_sign}} \lor \text{\texttt{obstacle}} \Rightarrow \text{\texttt{stop}}\\
    & \text{\texttt{stop}} \Rightarrow \lnot \text{\texttt{move\_forward}}
\end{cases}
\end{equation*}
}
See \cref{sec:datasets-and-architecture} for a longer description \changed{of all data sets}.


\begin{table}[!t]
    \centering
    \scriptsize
    \caption{\textbf{\method dramatically improves RS-awareness across the board}.  All tested architectures achieve substantially better concept-level ECE and out-of-distribution label-level ECE, with comparable in-distribution label-level ECE. Results for \MNISTHalf are shown. \MNISTShortcut shows a similar trend (see \cref{tab:mnistshort-full} in the Appendix).}
    \input{tables/mnist-table-1}
    \label{tab:mnisthalf}
\end{table}


\begin{table}[!t]
    \caption{\textbf{\method dramatically improves RS-awareness in the real-world.}
    Results on \BOIA with \DPL show substantial ECE improvements both jointly ($\mCECE$) and for different classes of concepts (F={\tt forward}, 
 S={\tt stop}, R={\tt turn right}, L={\tt turn left}).}
    \scriptsize
    \input{tables/boia-table-2}
    \label{tab:boia}
\end{table}





\textbf{Q1: RSs make NeSy predictors overconfident.} \cref{tab:mnisthalf} lists the label and concept ECE of all competitors on \MNISTHalf, measured both in-distribution (sums in the training set) and out-of-distribution (all other sums).  The label and concept accuracy are reported in the appendix (\cref{tab:mnisthalf-full}) due to space constraints.
%
%\EM{double check, there were results on\MNISTShortcut} \ST{read further down below}
Overall, all NeSy predictors achieve high label accuracy ($\ge 90\%$) but fare poorly in terms of concept accuracy (approx. $43\%$ for \DPL, \SL, and \LTN), meaning they are affected by RSs, as expected. 
%
Our results also show that they are \textit{not RS-aware}, as they are very confident about their concept predictions (\CECE of approx. $69\%$ for \DPL, $71\%$ for \SL, and $70\%$ for \LTN).  Moreover, the label predictions are well calibrated (\YECE is approx. $2\%$ for \DPL and \LTN, $1\%$ for \SL), meaning that label uncertainty is not a useful indicator of RSs. 
%
In general, model performance worsens out-of-distribution in terms of label accuracy (barely above $0$ for all models) and label and concept calibration (ECE around $90\%$), despite concept accuracy remaining roughly stable (about $40\%$).
%
The results for \MNISTShortcut follow a similar trend, cf. \cref{tab:mnistshort-full} in the appendix.

As for \BOIA, we only evaluate \DPL as it is the only model that guarantees predictions comply with the safety constraints out of the ones we consider.  
%
The results in \cref{tab:boia} and \cref{tab:boia-full} show that \DPL achieves good label accuracy ($72\%$ macro $F_1$) in this challenging task by leveraging poor concepts ($34\%$ macro $F_1$) with high confidence ($\mCECE \approx 84\%$).
%
This supports our claim that NeSy architectures are not RS-aware.


\textbf{Q2: Combining NeSy predictors with \method dramatically improves RS-awareness in all data sets while retaining the same prediction accuracy.}
%
For \MNISTHalf (\cref{tab:mnisthalf}, \cref{tab:mnisthalf-full}), \method shrinks the concept ECE from $69\%$ to $37\%$ for \DPL, from $71\%$ to $38\%$ for \SL, and from $70\%$ to $36\%$ for \LTN in-distribution.  The out-of-distribution improvement is even more substantial, as \method improves both concept calibration (\DPL: $87\% \to 38\%$, \SL: $88\% \to 37\%$, \LTN: $87\% \to 32\%$) \textit{and} label calibration (\DPL: $92\% \to 39\%$, \SL: $95\% \to 75\%$, \LTN: $94\% \to 36\%$).
%
No competitor comes close.  The runner-up, \Laplace, improves concept calibration on average by
% ((69 - 65) + (71 - 59) + (70 - 55) + (87 - 82) + (88 - 75) + (87 - 73)) / 6
$10.5\%$ and at best by $15\%$ (for \LTN in-distribution), while \method averages 
% ((69 - 37) + (71 - 38) + (70 - 36) + (87 - 38) + (88 - 37) + (87 - 32)) / 6
$42.3\%$ and up to $55\%$ (for \LTN out-of-distribution).
%
\cref{fig:mnisthalf-entropy} shows that \method correctly assigns high uncertainty to all digits but the zero, which is the only one not affected by RSs, while all competitors are largely overconfident on these digits.

In \BOIA the trend is largely the same:  \method improves the test set concept ECE for all concepts jointly (\mCECE) from $84\%$ to $58\%$ ($-26\%$).  The improvement becomes even clearer if we group the various concepts based on what actions they entail:  concepts for {\tt forward}/{\tt stop} improve by $-61\%$, those for {\tt right} by $-69\%$, and those for {\tt left} by $-57\%$.
%
Here, \Laplace performs quite poorly (in fact, it yields \textit{worse} calibration), and the runner-up \ProbCBM, which fares well ($-16\%$ $\mCECE$), is also substantially worse than \method.
%
Finally, we note that, despite their similarities, \DeepEns underperforms overall, showcasing the importance of our knowledge-aware ensemble diversification strategy.



\begin{figure}[!t]
    \centering
    \includegraphics[width=0.9\linewidth]{figures/results/mnist/concept_uncertainty_DPL_mod}
    \caption{\textbf{Per-concept entropy shows \method is more uncertain about concepts affected by RS} on \MNISTHalf compared to regular \DPL and alternative uncertainty calibration methods.  \SL and \LTN show similar trends, see \cref{sec:all-results}. Importantly, these improvements do not require concept annotations.
    }
    \label{fig:mnisthalf-entropy}
\end{figure}

\begin{figure}[!t]
    \centering
        \includegraphics[width=0.8\linewidth]{figures/results/kandinsky/i-o_kand_bears-fixed}
    \caption{\textbf{\method allows selecting informative concept annotations faster.} A substantial improvement in concept accuracy is achieved by performing active learning guided by RS-aware concept uncertainty (\DPL+\method) with respect to plain concept uncertainty (\DPL) and random selection.
    %(\DPL+random).
    }
    \label{fig:kand-results}
\end{figure}

\textbf{Q3: \method allows for selecting better concept annotations.} \cref{fig:kand-results} reports the results in terms of concept accuracy on the \Kandinsky dataset when using an active learning strategy to acquire concept supervision. Results are obtained by pre-training DPL with 10 examples of red squares, and selecting additional objects for supervision based on their concept uncertainty. Results show that standard DPL has the same behaviour as a random sampling strategy, likely because of its poor estimation of concept uncertainty. On the other hand, DPL with \method manages to substantially outperform both alternatives in improving concept accuracy, achieving an accuracy of more than 90\% with just 50 queries, while the other strategies level off at around 75\% accuracy. Note that because of the presence of reasoning shortcuts, all models achieve high label-level accuracy regardless of their concept-level accuracy. See \cref{appendix:active-learning} for the details.


% \EM{To be reintegrated.} During our evaluation, we noticed a pattern: regardless of the seed employed for weight initialization and despite pushing newly learned models far from their predecessors, the models tend to employ the least amount of concepts possible to solve the task. \SB{We suppose that either there are RSs that are easier to learn with respect to the others (some of these structures may serve as fundamental building blocks, allowing for interpolation to derive all others) or using correctly less concepts is advantageous over employing a larger set of concepts} \EM{Good, but this result should be justified once we show the CMs. Move below.}



% \begin{figure}[!t]
%     \centering
%     \begin{tabular}{cc}
%         \includegraphics[width=0.475\linewidth]{figures/results/kandinsky/i-o_kandy_y_bears}
%         &
%         \includegraphics[width=0.475\linewidth]{figures/results/kandinsky/i-o_kand_bears}
%     \end{tabular}
%     \caption{\textbf{\method allows selecting informative concept annotations faster.} \ST{WRITEME} \EM{SBAM!}} 
%     \label{fig:kand-results}
% \end{figure}



% The reported results refer to \DPL, however, as we have seen during our evaluation, they can be generalized to \SL and \LTN. \SB{It should give us evidence that \method is model agnostic as RSs are model agnostic, right?} \SB{As pointed out by Stefano, it could be suitable to try out Laplace with a weak concept supervision on the concept loss (just to avoid stress from reviewers)}

% \begin{figure}
%     \centering
%     \includegraphics[width=\linewidth]{figures/results/halfmnist_entropy_on_c_hc_bar_plot.png}
%     \caption{Mean OVA entropy on the test set, for each concept.}
%     \label{fig:entropy-c}
% \end{figure}

% \cref{fig:entropy-c} reports the entropy of the various concepts $C_i$, averaged over the test set.  In order to compute the per-concept entropy, we treat each concept as a binary variable (in a one-vs-all fashion):

% As shown by the figure, Deep Ensembles shows a low entropy for the concept $0$, which is forced to correctly learn, and a consistent entropy for all the others. \SB{Even if $1$ is correctly learned, some models in the ensemble collapse either $2$, or $3$ or $4$ to $1$, which explains why it is peaked.}. Conversely, the frequentist model, Laplace and MC dropout are all extremely confident.

% \begin{figure}
%     \centering
%     \includegraphics[width=\linewidth]{figures/results/halfmnist_c_acc.png}
%     \caption{Accuracy per concept on the test dataset.}
%     \label{fig:accuracy-concept}
% \end{figure}

% Deep Ensemble does not worse than the frequentist model as shown in Figure~\ref{fig:accuracy-concept}, provided the first does not learn the correct solution.

% \begin{figure}
%     \centering
%     \includegraphics[width=\linewidth]{figures/results/alpha_plot_ensemble.png}
%     \caption{Alpha plot of the deep ensemble. In this case, the probability of the world is not factorized.}
%     \label{fig:alpha-de}
% \end{figure}

% Deep Ensemble interpolates different reasoning shortcuts as portrayed in Figure~\ref{fig:alpha-de}.

% As a further analysis, we have tested our approach on the full \MNISTAdd dataset without disentanglement. Hence, the only only possible reasoning shortcuts involve swapping symmetric worlds incorrectly.

% In this setting, contrary to the previous, the Diversified Deep Ensemble successfully recovers one model representing the intended solution. The concept accuracy is almost perfect for the Diversified Deep Ensemble technique, outperforming the frequentist model and all the others Bayesian alternatives. Furthermore, Diversified Deep Ensemble has high on all concept and the concept calibration is far more pronounced than in the previous experiment.

% As a further experiment, we run on the experiment on the full \MNISTAdd dataset with disentanglement. Thus, since no reasoning shortcut are present, we expect the bayesian models to match the performance of the frequentist model. Indeed, this happens for \DPL, \SL and  \LTN.

% Finally, we studied how the bayesian methods affect the calibration of the output label in OOD scenarios. Interestingly, deep ensembles showed to be more calibrated on output labels when tested out of distribution with respect to the frequentist model and the other bayesian counterparts. \SB{This happens for Halfmnist and Shortcutmnist, both for \SL and for \DPL and both in dissentangled and not-dissentagled setting} 

% \SB{What's next? 1) Fix LTNs underfitting problem (fixed?). 2) Setup a suitable \BOIA experiment for \method (done). 3) Evaluate biretta on \BOIA (almost done. It basically gives all better performances with respect to frequentist apart from "right"). 4) Evaluate different ensemble strategies? (Should be fine according to what we want, so no changes needed. Right?) Or define a way in which, for concepts affected by RSs, the ensemble captures at least one conflict in its bag? (is it possible without prior knowledge on concepts?)}

% Shifting focus to a more realistic dataset, we have experimented DeepEnsemble, Laplace and MC Dropout on \BOIA. Due to the consistent amount of concept (21), which would lead to a massive world probability matrix of dimension $2^{21}$, we decided to group each the concepts according to the prediction they influences, namely: forward\-stop, left and right. \SB{To be motivated! In practice, \DPL will only use these subsets to make the prediction}. Up to know, the model seems to always pick up reasoning shortcuts and bayesian methods successfully decreases the ECE for forward\-stop and left, while this does not happen for right. \SB{Not clear why, the ensemble picks always the same solution for the "right" scenario}


\section{Related Work}
\label{sec:related-work}

\textbf{Neuro-Symbolic Integration.}  NeSy AI \citep{dash2022review, giunchiglia2022deep} spans a broad family of models and tasks -- both discriminative and generative -- involving perception and reasoning \citep{de2021statistical, garcez2022neural, di2020efficient, misino2022vael, AhmedKLR23}.
%
Given discrete reasoning is not differentiable, NeSy architectures support end-to-end training either by imbuing the prior knowledge with probabilistic \citep{lippi2009prediction, manhaeve2018deepproblog, yang2020neurasp, huang2021scallop, marra2021neural, ahmed2022semantic, van2022anesi, skryagin2022neural} or fuzzy \citep{diligenti2017semantic, van2022analyzing, donadello2017logic} semantics, by implementing reasoning in embedding space \citep{rocktaschel2016learning}, or through a combination thereof \citep{pryor2022neupsl}.
%
Another difference is whether they encourage \citep{xu2018semantic, fischer2019dl2, pryor2022neupsl} vs. guarantee \citep{manhaeve2018deepproblog, giunchiglia2020coherent, ahmed2022semantic, hoernle2022multiplexnet, giunchiglia2024ccn} predictions to be consistent with the knowledge.
%
Despite their differences, all NeSy approaches can be prone to RSs, which occur whenever prior knowledge -- including label supervision -- is insufficient to pin down the correct concept semantics.

\textbf{Dealing with Reasoning Shortcuts.} Existing works on RSs focus on unsupervised mitigation, often by discouraging learned concepts from collapsing onto each other.
%
Examples include using a batch-wise entropy loss \citep{manhaeve2021neural}, a reconstruction loss \citep{marconato2023not}, a bottleneck maximization approach \citep{sansone2023learning}, and encouraging constraint satisfaction via non-trivial assignments \citep{li2023learning}. 
%
Our work builds on the insights from Marconato \textit{et al.} \citep{marconato2023not}, who recently showed that unsupervised mitigation only works in specific cases, and that only expensive strategies -- like multi-task learning and dense annotations \citep{marconato2023neuro} -- can provably avoid RSs in all cases.
%
\changed{Other works based on \textit{abductive learning} \citep{dai2020abductive, tao2024deciphering} constitute promising avenues for lessening the impact of RSs.}


%
Our key contribution is that of switching focus from mitigation to awareness, which -- as we show -- \textit{can} be achieved in an unsupervised manner.
%
In this sense, \method is closely related to unsupervised mitigation heuristics \citep{manhaeve2021neural, sansone2023learning, li2023learning}, but differs in the goal (awareness vs. mitigation). \method specifically averages over neural networks that capture conflicting RSs to achieve knowledge-dependent uncertainty calibration.
%
It is also related to the neuro-symbolic entropy of \citep{ahmed2022neuro}, which, however, \emph{minimizes} instead of maximizing the entropy of the NeSy predictor, and as such it can exacerbate the negative effects of RSs.
%
\changed{In our analysis, we characterize awareness of RSs in the limit case of infinite data, 
and future work can provide statistical guarantees about the uncertainty of concepts and final task performance, for example adapting results from \citep{tao2024deciphering, wang2023grounding}.
}

    % Finally, SPL \citep{ahmed2022semantic} also uses a mix of distributions to create the NeSy predictor, but this mix uses a single neural network encoder instead of an ensemble and does not use a repulsion term to encourage diversity among the components. \AP{weak, seems we need to compare.. do we really need to single out SPL from the NeSy literature here? if so we need to clarify why we don't compare} \EK{Yeah that's fair. About singling out: As a reviewer I'd have liked a comparison.}

% RSs differ from statistical shortcuts affecting neural networks \citep{geirhos2020shortcut}.  Both compromise generalization and interpretability and are difficult to detect and prevent based on training accuracy alone, but RSs occur even when the data is unbiased \citep{marconato2023not} and cannot be avoided using standard debiasing techniques \citep{ross2017right, teso2023leveraging}. \EK{Not sure about this paragraph. Your paper is about \method, not about RSs, so I don't think you need to compare RSs to other 'shortcuts' here.}
% %
% \citep{stammer2021right} address statistical shortcuts in NeSy architectures via knowledge augmentation, but this is also insufficient to prevent all RSs \citep{marconato2023not}. \EK{Feels out of place here. Also argument is weak.} 


\textbf{Uncertainty calibration in deep learning.}  Overconfidence of deep learning models is a well-known issue \citep{abdar2021review}.
%
Many strategies for reducing overconfidence of label predictions exist \citep{muller2019does, li2020closed, wei2022mitigating, mukhoti2020calibrating, carratino2022mixup}, many of which are based on Bayesian techniques \citep{gal2016dropout,daxberger2021laplace,deepesembles2017}.  Our experiments show that applying them to NeSy predictors fails to produce RS-aware models, whereas \method succeeds.
%
Techniques from concept-based models for imbuing concepts with probabilistic semantics \citep{kim2023probabilistic, marconato2022glancenets} also can improve calibration, but  underperform in our experiments compared to \method.

% \AV{there are some works by the Leuven people on putting higher-order uncertainty in DeepProblog and circuits, e.g., \citep{cerutti2022handling}}
% \EK{Needs a much more thorough comparison to Semantic Probabilistic Layers. Itt's very similar to how you're increasing expressivity: Check out the mixture of independents. It's also a mix, like the deep ensemble.}  \AV{SPL is not Bayesian, the DE are used to model a posterior (the mixture is in that space)}


\section{Conclusion}
\label{sec:conclusion}

NeSy models tend to be unaware of RSs affecting them, hindering reliability.
%
We address this by introducing \method, which encourages NeSy models to be more uncertain about concepts affected by RSs, enabling users to identify and distrust bad concepts.
%
\method vastly improves RS-awareness compared to NeSy baselines and state-of-the-art calibration methods while retaining high prediction accuracy, and lowers the cost of supervised mitigation via uncertainty-based active learning of dense annotations.
%
In future work, we will explore richer knowledge acquisition strategies to encourage RS-awareness and reduce their impact, \changed{and look into leveraging causal representation learning \citep{scholkopf2021toward, liang2023causal, lippe2023biscuit} to define provably effective mitigation strategies. Furthermore, concurrent work proves that the NeSy predictors studied in this paper are fundamentally limited in expressing uncertainty, and that this can be overcome by increasing model expressivity using ensembling \cite{van2024Independence}. We will explore this relation with \method.  }
%
%In future work, we plan to improve \method by directly optimizing over the ensemble mixture $\vlambda$ and to evaluate alternative techniques for encouraging RS-awareness. 
% In future work, we plan to explore richer knowledge acquisition strategies to encourage RS-awareness and reduce their impact. 



\begin{contributions}
    E.v.K. conceived the idea of tackling RSs using uncertainty.
    All authors contributed to the conceptualization and the writing.
    E.M. and S.B. implemented the code and carried out the empirical evaluation.
    E.M. and E.v.K. analyzed the theoretical backing of our approach.
    S.T., A.V., and A.P. supervised the work.
    A.V. and A.P. managed fund acquisition.
\end{contributions}


\begin{acknowledgements}
    The authors are grateful to Zhe Zeng for useful discussion.
    Funded by the European Union. The views and opinions expressed are however those of the author(s) only and do not necessarily reflect those of the European Union or the European Health and Digital Executive Agency (HaDEA). Neither the European Union nor the granting authority can be held responsible for them. Grant Agreement no. 101120763 - TANGO.
    AV is supported by the ``UNREAL: Unified Reasoning Layer for Trustworthy ML'' project (EP/Y023838/1) selected by the ERC and funded by UKRI EPSRC.
    Emile van Krieken was funded by ELIAI (The Edinburgh Laboratory for Integrated Artificial Intelligence), EPSRC (grant no. EP/W002876/1).
\end{acknowledgements}



\bibliography{references, explanatory-supervision}


\newpage
\appendix
\onecolumn


\section{Implementation Details}
\label{sec:implementation-details}


In this Section, we provide additional details about all metrics, datasets and models useful for reproducibility.


\subsection{Implementation}

All the experiments are implemented using Python 3.8 and PyTorch 1.13 and run on one A100 GPU.
%
The implementations of \DPL, \SL, and \LTN were taken verbatim from \citep{marconato2023not}.
%
We implemented \MCDrop and \DeepEns by adapting the code to capture the original algorithms~\citep{gal2016dropout, deepesembles2017}.
For \ProbCBM, we followed the original paper~\citep{kim2023probabilistic}. For \Laplace, we adapted the original library from \LaplaceTorch~\citep{daxberger2021laplace}.
In our experiments, we computed the Laplace approximation on the second-to-last layer, mapping the embeddings to the concept layer. The images for \Kandinsky patterns were synthetically generated using the resource provided in \citep{muller_kandisnky_2021}.
 



\subsection{Metrics}
\label{sec:metrics-details}

For all datasets, we evaluate the predictions on the labels by measuring the accuracy and the $F_1$-score with macro average. We assess calibration using the Expected Calibration Error (ECE), which measures how accurately the model-predicted probabilities align with actual data likelihood. Specifically, for the $i$-th label $y_i \in \bbN$, the label calibration error \YECE$(i)$ is evaluated as:
\[ 
    \YECE (i) = \sum_{\ell=1}^{M} \frac{|B_\ell|}{n} |\YAcc(B_\ell) - \YCONF(B_\ell)|, \quad \forall i \in [m]
\]
where $M$ is the number of bins, $B_\ell$ represents the $\ell$-th bin, $n$ is the total number of data points, and \YCONF denotes the predicted probability. Essentially, the predicted probabilities are categorized into intervals, denoted as bins. Each data point is assigned to a bin based on its predicted probability. Within each bin, the average predicted probability and accuracy are computed. Ultimately, the ECE value is obtained as the sum, weighted by the fraction of data points falling in each bin, of the absolute differences between the average predicted probability and the accuracy.
%
Similarly, we evaluate \CECE$(j)$ as:
\begin{equation} \label{eq:ece-definition}
    \CECE(j) = \sum_{\ell=1}^{M} \frac{|B_\ell|}{n} |\CAcc(B_\ell) - \CCONF(B_\ell)|, \quad \forall j \in [k]
\end{equation}
%
In \MNISTHalf and \MNISTShortcut we use the very same network to extract the first and second digits, and similarly in \Kandinsky for extracting the color and shape of each object.  For this reason, \CECE was evaluated by stacking the concepts predicted by the architecture for each object.  \YECE was evaluated on the final predictions. 

In contrast, \BOIA images involve multiple concepts and multiple labels. In this case, we  adopted a softer approach, specifically we averaged over the performances on each separate component: 
%
\[ 
    \mYECE = \frac{1}{m} \sum_{i = 1}^{m} \YECE(i)
\]
%
\[ 
    \mCECE = \frac{1}{k} \sum_{i = 1}^{k} \CECE(i)
\]
%
where $m$ and $k$ are the numbers of labels and concepts, respectively.

In \MNISTAdd and its variations, we evaluate all metrics both in-distribution and out-of-distribution.
%
In \Kandinsky, labels and concepts are both balanced, so we report accuracy for both.
%
In \BOIA the data is not as balanced, so we report the mean-$F_1$ score, as in \citep{sawada2022concept, marconato2023not}, that is, we first compute the $F_1$-score for each action and then average them:
%
\[
    \mFY = \frac{F_1(\text{forward}) + F_1(\text{stop})+ F_1(\text{left}) +F_1(\text{right})}{4}
\]

%Similarly, for the evaluation of $k$ concepts, we consider two versions of the \CECE \EM{@SB similarly here}:
%\[
%\begin{aligned}
%    \text{soft-}\CECE (\vC) &= \frac{1}{k} \sum_{i=1}^k \CECE(C_i) \\
%
%    \text{hard-}\CECE &= \CECE(\vC)  \\
%\end{aligned}
%\]

For all datasets, to measure uncertainty concept-wise for a specific model \(p_\theta\), we rely on the one-vs-all entropy. We evaluate the average entropy of $p_\theta (C = c \mid \vx)$ as:

\[
    H_{OVA}(p_{\theta}(C = c| X)) = - \frac{1}{|\calD|}\sum_{\vx \in \calD} \big[  p_{\theta}(C = c | \vx) \log(p_{\theta}(C = c | \vx)) + (  1- p_{\theta}(C = c | \vx)) \log (1- p_{\theta}(C = c | \vx)) \big]
\]

% where \(n\) is the size of the dataset. 
% For Bayesian models, the entropy formula remains the same, while \(p_\theta\) is defined as the factorized probability.


\subsection{\method implementation}

Implementation-wise, \method is an extension of \DeepEns with a new concept-level repulsive term.
%
In short, \method works as follows.
%
For each new model $\theta_t$ to be added to the ensemble, we compute the following loss by considering all other members in $\vtheta = \{ \theta_j \}_{j=1}^{t-1} $:
%
\[
  \max_{\theta_t} \frac{1}{|\calD|}  \sum_{ (\vx, \vy) \in \calD } \big[ \log p_{\theta_t} (\vy \mid \vx; \BK) + \gamma_1 \KL \big( p_{\theta_t} (\vC \mid \vx) \mid \mid  \frac{1}{t} \sum_{j = 1}^t p_{\theta_j} (\vC \mid \vx)  \big) +  \gamma_2 H( p_{\theta_t}(\vC \mid \vx) )  \big]
\]


We can analyze further the expression of the $\KL$ divergence to express it differently:
\begin{align}
    \KL \big( p_{\theta_t} (\vC \mid \vx) \mid \mid \frac{1}{t} \sum_{j = 1}^t p_{\theta_j} (\vC \mid \vx)  \big) &= \sum_{\vc \in \{0,1\}^k } p_{\theta_t} (\vc \mid \vx) \log \frac{p_{\theta_t} (\vc \mid \vx) }{\frac{1}{t}\sum_{j = 1}^{t-1} p_{\theta_j} (\vc \mid \vx) + \frac{1}{t} p_{\theta_t} (\vc \mid \vx) } \\
    %
    &= - \sum_{\vc \in \{0,1\}^k } p_{\theta_t} (\vc \mid \vx) \log \frac{1}{t} \cdot \frac{\sum_{j = 1}^{t-1} p_{\theta_j} (\vc \mid \vx) + p_{\theta_t} (\vc \mid \vx) }{p_{\theta_t} (\vc \mid \vx)}\\
    %
    &=  \sum_{\vc \in \{0,1\}^k } p_{\theta_t} (\vc \mid \vx) \left( \log {t} - \log  \left[ 1 + (t-1) \cdot \sum_{j=1}^{t-1} \frac{1}{t-1} \frac{p_{\theta_j} (\vc \mid \vx)}{p_{\theta_t} (\vc \mid \vx)}  \right] \right) \\
    %
    &= \log {t} - \sum_{\vc \in \{0,1\}^k } p_{\theta_t} (\vc \mid \vx) \log \left[ 1 + (t-1) \cdot \frac{p_{rest} (\vc \mid \vx)}{p_{\theta_t} (\vc \mid \vx)}  \right]
\end{align}  

where in the second line we introduced a minus sign to flip the term in the logarithm, in the third line we have taken out $p_{\theta_t} (\vc \mid \vx) $ from the numerator and multiplied and divided the remaining terms by $(t-1)$, and in the last line we denoted with $p_{rest} (\vc \mid \vx) $ the average on the members of the ensemble up to $t-1$.

In general, the $\KL$ divergence is unbounded from above, but since the same distribution $p_{\theta_t}(\vC \mid \vx)$ appears in both of its arguments, the derivation above yields the upper bound $\log t$. Moreover, since the \KL is always greater than or equal to zero, we have that:
\[
    0 \leq \KL \big( p_{\theta_t} (\vC \mid \vx) \mid \mid \frac{1}{t} \sum_{j = 1}^t p_{\theta_j} (\vC \mid \vx)  \big) \leq \log t
\]

In the following, we consider the composite expression with the term proportional to the entropy on the concepts $p_\theta (\vC \mid \vx)$, without accounting for the $\log t$ term.  
% \begin{align}
%  &\gamma_1 \cdot H(p_\theta(\vC \mid \vx)) + \gamma_2 \cdot \KL \big( p_{\theta_t} (\vC \mid \vx) \mid \mid \frac{1}{t} \sum_{j = 1}^t p_{\theta_j} (\vC \mid \vx)  \big) \\
%  &= \gamma_2\Big[ \big( \frac{\gamma_1}{\gamma_2} -1 \big) H(p_{\theta_t} (\vC \mid \vx)) + H(p_{\theta_t} (\vC \mid \vx)) + \frac{1}{t} \KL \big( p_{\theta_t} (\vC \mid \vx) \mid \mid \frac{1}{t} \sum_{j = 1}^t p_{\theta_j} (\vC \mid \vx)  \big) \Big] \\
%  %
%  &\propto  \big( \frac{\gamma_1}{\gamma_2} -1 \big) H(p_{\theta_t} (\vC \mid \vx)) -\sum_{\vc \in \{0,1\}^k } p_{\theta_t} (\vc \mid \vx) \log p_{\theta_t} (\vc \mid \vx) \cdot \left[ 1 + (t-1) \cdot \frac{p_{rest} (\vc \mid \vx)}{p_{\theta_t} (\vc \mid \vx)}  \right] \\
%  %
%  &= \big( \frac{\gamma_1}{\gamma_2} -1 \big) H(p_{\theta_t} (\vC \mid \vx)) -\sum_{\vc \in \{0,1\}^k } p_{\theta_t} (\vc \mid \vx) \log \left[ p_{\theta_t} (\vc \mid \vx) + (t-1) \cdot {p_{rest} (\vc \mid \vx)} \right] \\
% &  
% \end{align}
% 
% 
In our implementation, we minimize the term:
\[
\begin{aligned}
    \min_{\theta_t} \frac{1}{|\calD|} \sum_{(\vx, \vy) \in \calD} \Big[ - \log p_{\theta_t}(\vy \mid \vx; \BK) 
    &+
    \frac{\gamma_1}{\log t} \sum_{\vc \in \{0,1\}^k } p_{\theta_t} (\vc \mid \vx) \log \left[1+ (t-1) \cdot \frac{p_{rest} (\vc \mid \vx)}{p_{\theta_t}(\vc \mid \vx)} \right] \\
    &+ \gamma_2 \Big( 1 - \frac{H(p_{\theta_t} (\vC \mid \vx)) }{k \log 2} \Big) \Big]
\end{aligned}
\]
for each new member of the ensemble, where we divided the $\KL$ term by $\log t$ to ensure its normalization, and we normalized the entropy by its maximal value $k \log 2$. The pseudo-code of \method is shown in~\cref{alg-biretta}. 

\begin{algorithm}[ht]
\caption{\method}
\begin{algorithmic}[1]
    \Procedure{\method}{$\texttt{n}, \texttt{seeds}, \gamma_1, \gamma_2, \texttt{epochs}, \texttt{train\_loader}$}
        \State Initialize empty \texttt{ensemble}\;
        \For{$i = 1 \ldots \texttt{n}$}
            \State $\texttt{seed} \leftarrow \texttt{seeds}[i]$ \textcolor{teal}{\# Set seed using $\texttt{seeds}[i]$} \;
            \State ${\texttt{model} =\tt  get\_neq\_model(\texttt{seed})} $ \textcolor{teal}{\# Create a new ANN model from the seed} \;
            \For{$e =1, \ldots, \texttt{epochs}$}
                \For{\texttt{data} $(x, y)$ \textbf{in} $\texttt{train\_loader}$}
                    \State $\hat{y}, pcx = \texttt{model}(x)$ \textcolor{teal}{\# Compute $\hat{y}$ and $p(c \mid x)$} \;
                    \State $\text{loss} = \text{C}(y, \hat{y})$ \textcolor{teal}{\# Calculate the loss in classification for the NeSy model}
                    \;
                    \If{$i > 1$}
                        \State $\overline{pcx} = \text{mean}(pcx)$ \textcolor{teal}{\# Compute the ensemble average $\overline{p(c|x)}$}
                        \;
                        \State $\text{loss} = \text{loss} + \gamma_1 \; \KL(pcx \mid \mid \overline{pcx}) + \gamma_2 \; H(pcx)$
                        \textcolor{teal}{\# Update loss with the KL term and entropy penalty}
                        \;
                    \EndIf
                    \State loss.backprop() \textcolor{teal}{\# Backpropagate the loss and update model parameters} \;
                \EndFor
            \EndFor
            \State \texttt{ensemble}$[i] \leftarrow \texttt{model}$ \textcolor{teal}{\# Add $\texttt{model}$ to $\texttt{ensemble}$} \;
        \EndFor
        \State \textbf{return} $\texttt{ensemble}$\;
    \EndProcedure
\end{algorithmic}
\label{alg-biretta}
\end{algorithm}


\subsection{Datasets details}
\label{sec:datasets-and-architecture}

In our experiments, when possible, we processed different digits and objects with the same neural network. This happens in both \MNISTAdd tasks and \Kandinsky, whereas for \BOIA this choice is not available.

\subsubsection{\MNISTShortcut} 

As done in \citep{marconato2023not}, we considered the \MNISTShortcut dataset, initially introduced in \citep{marconato2023neuro}. This variant of \MNISTAdd has only a few specific combinations of digits, containing either only even or only odd digits:

\[
    \begin{array}{ccc}
        \begin{cases}
            \MZero + \MSix &= 6\\ 
            \MTwo + \MEight &= 10\\ 
            \MFour + \MSix &= 10\\ 
            \MFour + \MEight &= 12\\
        \end{cases}
        &
        \land
        &
        \begin{cases}
            \MOne + \MFive &= 6\\ 
            \MThree + \MSeven &= 10\\ 
            \MOne + \MNine &= 10\\ 
            \MThree + \MNine &= 12\\
        \end{cases}
    \end{array}
\]

\MNISTShortcut consists of a total of 6720 fully annotated samples in the training set, 1920 samples in the validation set, and 960 samples in the in-distribution test set. Additionally, there are 5040 samples in the out-of-distribution test dataset comprising all other sums that are not observed during training.

\paragraph{\textbf{Reasoning Shortcuts}:} As described in \citep{marconato2023not}, the number of deterministic RSs can be calculated by finding the integer values for the digits $\MZero, \ldots, \MNine$ that solve the above linear system. In total, it was shown that the number of deterministic RSs amounts to 49.

% \[
%     \begin{array}{ccc}
%         \begin{cases}
%             \alpha(0) + \alpha(6) &= 6\\ 
%             \alpha(2) + \alpha(8) &= 10\\ 
%             \alpha(4) + \alpha(6) &= 10\\ 
%             \alpha(4) + \alpha(8) &= 12\\ 
%         \end{cases}
%         &
%         \land
%         &
%         \begin{cases}
%             \alpha(1) + \alpha(5) &= 6\\ 
%             \alpha(3) + \alpha(7) &= 10\\ 
%             \alpha(1) + \alpha(9) &= 10\\ 
%             \alpha(3) + \alpha(9) &= 12\\ 
%         \end{cases}
%     \end{array}
% \]

\subsubsection{\MNISTHalf} This dataset constitutes a biased version of \MNISTAdd, including only half of the digits, specifically, those ranging from $0$ to $4$. Moreover, we selected the following combinations of digits:

\begin{equation} \label{eq:mnist-half-combs}
    \begin{cases}
        \MZero + \MZero &= 0\\ 
        \MZero + \MOne &= 1\\ 
        \MTwo + \MThree &= 5\\ 
        \MTwo + \MFour &= 6\\ 
    \end{cases}
\end{equation}
This allows introducing several RSs for the system.  Unlike \MNISTShortcut, two digits are not affected by reasoning shortcuts: namely $0$ and $1$. The remaining, $2$, $3$, and $4$ can be predicted differently, as shown below.

In total, \MNISTHalf comprises 2940 fully annotated samples in the training set, 840 samples in the validation set, 420 samples in the test set, and an additional 1080 samples in the out-of-distribution test dataset. These only comprise the remaining sums with these digits, like $\MOne+\MThree = 4$.

\paragraph{\textbf{Reasoning shortcuts}:}

We identify all the possible RSs empirically, since the system of observed sums can be written as a linear system from \cref{eq:mnist-half-combs}.
There are in total three possible optimal solutions, of which two are reasoning shortcuts. Explicitly:
\[
    \begin{aligned}
            \MZero  \mapsto 0, \
            \MOne   \mapsto 1, \
            & \MTwo   \mapsto 2, \
            \MThree \mapsto 3,  \
            \MFour  \mapsto 4 \\
        &\lor \\
            \MZero \mapsto 0, \ 
            \MOne \mapsto 1,  \
            &\MTwo \mapsto 3, \ 
            \MThree \mapsto 2, \
            \MFour \mapsto 3,  \\
        &\lor \\
            \MZero \mapsto 0,  \
            \MOne \mapsto 1,  \
            &\MTwo \mapsto 4,  \
            \MThree \mapsto 1, \  
            \MFour \mapsto 2,   
    \end{aligned}
\]


\subsubsection{\Kandinsky}

This dataset, introduced in \citep{muller_kandisnky_2021}, consists of visual patterns inspired by the artistic works of Wassily Kandinsky. These patterns are made of geometric figures, with several features. In our experiment, we propose a variant of \Kandinsky where each image has a fixed number of figures, and the associated concepts are shape and color. 
%
In total, each object can take one among three possible colors $(\tt red, blue, yellow)$ and one among three possible shapes $(\tt square, circle, triangle)$.


We propose our \Kandinsky variant for an active learning setup resembling an IQ test for machines. The task is to predict the pattern of a third image given two images sharing a common pattern.

% The example below provides an idea of this task.
% \begin{figure}[h]
%     \centering
%     \includegraphics[width=0.5\textwidth]{figures/kand-illustration}
%     \caption{An example of a test sample for the \Kandinsky task. At inference time, the NeSy model has to choose according to the previous two images the third that completes the \textit{pattern}.
%     For each image, the model computes a series of predicates, like $\texttt{same\_cs}$, $\texttt{same\_ss}$, and so on.
%     In this case, the first two images have different colors, so the model should pick the first option. 
%     }
%     \label{fig:kand}
% \end{figure}



Formally, let $x$ be an object in the figure, $S(x)$ the shape of $x$, and $C(x)$ its color.
%
Let the image be denoted as $Figure$. In total, each figure contains three objects with possibly different colors and shapes. To enhance the clarity and conciseness of our logical expressions, we introduce the following shorthand predicates:

\[
    \begin{cases}
        & \text{\texttt{diff\_s}}(Figure) \equiv \forall x, y \in Figure : \left( x \neq y \rightarrow \neg \left( S(x) = S(y) \right) \right) \\
        & \text{\texttt{diff\_c}}(Figure) \equiv \forall x, y \in Figure : \left( x \neq y \rightarrow \neg \left( C(x) = C(y) \right) \right) \\
        & \text{\texttt{same\_s}}(Figure) \equiv \forall x, y \in Figure : \left( S(x) = S(y) \right) \\
        & \text{\texttt{same\_c}}(Figure) \equiv \forall x, y \in Figure : \left( C(x) = C(y) \right) \\
        & \text{\texttt{pair\_c}}(Figure) \equiv \neg \text{\texttt{same\_c}}(Figure) \land \neg \text{\texttt{diff\_c}}(Figure)\\
        & \text{\texttt{pair\_s}}(Figure) \equiv \neg \text{\texttt{same\_s}}(Figure) \land \neg \text{\texttt{diff\_s}}(Figure)\\
    \end{cases}
\]

% Then the following logical predicates identify the patterns:
% \[
%     \begin{cases}
%         & \text{\texttt{same\_c\_same\_s}}(Figure) \equiv \text{\texttt{same\_c}}(Figure) \land \text{\texttt{same\_s}}(Figure)\\
%         & \text{\texttt{same\_c\_pair\_s}}(Figure) \equiv \text{\texttt{same\_c}}(Figure) \land \text{\texttt{pair\_equal\_shape}}(Figure)\\
%         & \text{\texttt{same\_c\_diff\_s}}(Figure) \equiv \text{\texttt{same\_c}}(Figure) \land \text{\texttt{diff\_s}}(Figure)\\
%         & \text{\texttt{pair\_c\_same\_s}}(Figure) \equiv \text{\texttt{pair\_equal\_color}}(Figure) \land \text{\texttt{same\_s}}(Figure)\\
%         & \text{\texttt{pair\_c\_pair\_s}}(Figure) \equiv \text{\texttt{pair\_equal\_color}}(Figure) \land \text{\texttt{pair\_equal\_shape}}(Figure)\\
%         & \text{\texttt{pair\_c\_diff\_s}}(Figure) \equiv \text{\texttt{pair\_equal\_color}}(Figure) \land \text{\texttt{diff\_s}}(Figure)\\
%         & \text{\texttt{diff\_c\_same\_s}}(Figure) \equiv \text{\texttt{diff\_c}}(Figure) \land \text{\texttt{same\_s}}(Figure)\\
%         & \text{\texttt{diff\_c\_pair\_s}}(Figure) \equiv \text{\texttt{diff\_c}}(Figure) \land \text{\texttt{pair\_equal\_shape}}(Figure)\\
%         & \text{\texttt{diff\_c\_diff\_s}}(Figure) \equiv \text{\texttt{diff\_c}}(Figure) \land \text{\texttt{diff\_s}}(Figure)\\
%     \end{cases}
% \]

Let $Sample$ represent a training sample consisting of two figures for the sake of simplicity; the extension to more figures is trivial. The final logic statement, that determines the model output, is:

\[
    \begin{aligned}
        &\text{\texttt{shared\_pattern}} \Rightarrow \forall f_1, f_2 \in \textit{Sample} : \\
        &   (\texttt{same\_c}(f_1) \land \texttt{same\_c}(f_2) )
        \lor  
            (\texttt{pair\_c}(f_1) \land 
        \texttt{pair\_c}(f_2))
        \lor 
            (\texttt{diff\_c}(f_1) \land \texttt{diff\_c}(f_2)) \\
        & \hspace{20em} \lor\\ 
        &   (\texttt{same\_s}(f_1) \land     \texttt{same\_s}(f_2))
        \lor 
            (\texttt{pair\_s}(f_1) \land \texttt{pair\_s}(f_2))
        \lor 
            (\texttt{diff\_s}(f_1) \land \texttt{diff\_s}(f_2))
    \end{aligned}
\]

Our \Kandinsky dataset version comprises $4$k examples in training, $1$k in validation, and $1$k in test.

We create our dataset to include a balanced number of positive and negative examples. Positive examples consist of three images sharing the same pattern, while in negative examples the third image does not match the pattern which is shared by the first two images. The order of examples does not introduce bias into the neural network learning procedure, as the network treats each figure independently.

\paragraph{\textbf{Preprocessing}:} When processing an entire figure at once, we empirically observed that the model faces challenges in achieving satisfactory accuracy. Consequently, we opted to process one object at a time. Therefore, we employed a simplified version of the dataset that comprises rescaled objects manually extracted via bounding boxes. Thus, each example of the dataset consists of 9 objects, namely 3 objects for each figure, ordered based on their distance from the origin of the figure.

\paragraph{\textbf{Reasoning shortcuts}:} The knowledge we build for \Kandinsky admits many RSs. As there are no constraints on specific colors or shapes, in principle, each permutation of colors and shapes can achieve perfect accuracy. Furthermore, the logic is symmetrical; hence, the concepts of colors and shapes could be swapped. Working on this dataset, we have observed various RSs. An example is illustrated below:

\begin{figure}[h]
    \centering
    \includegraphics[width=0.4\textwidth]{figures/results/kandinsky/mondi_kandsky_colors.pdf}
    \caption{This plot shows an example of a RS in the \Kandinsky task. The model achieves perfect accuracy by predicting shapes based on their colors. In this scenario, all red objects are correctly identified as squares, blue ones as circles, and yellow ones as triangles.}
    \label{fig:kand-rs}
\end{figure}

\subsubsection{\BOIA}

This dataset is made of frames retrieved from driving scene videos for autonomous driving predictions~\citep{xu2020boia}. Each frame is labeled with four binary actions (\texttt{move\_forward}, \texttt{stop}, \texttt{turn\_left}, \texttt{turn\_right}). Scenes are annotated with $21$ binary concepts, providing explanations for the chosen actions. The training set includes $16$k fully labeled frames, while the validation and test sets have $2$k and $4.5$k annotated data, respectively.

The prior knowledge employed is the same as in \citep{marconato2023not}.
We report it here for the sake of completeness. 
For the \texttt{move\_forward} and \texttt{stop} actions, the rules are:

\[
    \begin{cases}
        & \text{\texttt{red\_light}}  \Rightarrow \lnot\text{\texttt{green\_light}}\\
        & \text{\texttt{obstacle}} =  \text{\texttt{car}} \lor \text{\texttt{person}} \lor \text{\texttt{rider}}
        \lor \text{\texttt{other\_obstacle}}\\
        & \text{\texttt{road\_clear}} \Longleftrightarrow \lnot \text{\texttt{obstacle}}\\
        & \text{\texttt{green\_light}} \lor \text{\texttt{follow}} \lor \text{\texttt{clear}} \Rightarrow \text{\texttt{move\_forward}} \\
        & \text{\texttt{red\_light}} \lor \text{\texttt{stop\_sign}} \lor \text{\texttt{obstacle}} \Rightarrow \text{\texttt{stop}}\\
        & \text{\texttt{stop}} \Rightarrow \lnot \text{\texttt{move\_forward}}\\
    \end{cases}
\]

While for the \texttt{turn\_left} and the \texttt{turn\_right} action, the rules are:

\[
    \begin{cases}
        & \text{\texttt{can\_turn}} = \text{\texttt{left\_lane}} \lor \text{\texttt{left\_green\_lane}} \lor \text{\texttt{left\_follow}}\\
        & \text{\texttt{cannot\_turn}} = \text{\texttt{no\_left\_lane}} \lor \text{\texttt{left\_obstacle}} \lor \text{\texttt{left\_solid\_line}}\\
        & \text{\texttt{can\_turn}} \land \lnot \text{\texttt{cannot\_turn}} \Rightarrow \text{\texttt{turn\_left}} \\
    \end{cases}
\]

Moreover, for convenience in metric computations, we decided to group the concepts into three classes. Specifically, we define $F\text{-}S$, which groups concepts concerning the \texttt{move\_forward} and \texttt{stop} actions, $L$, which groups concepts concerning the \texttt{turn\_left} action, and $R$, which groups concepts concerning the \texttt{turn\_right} action. The classes are shown in \cref{tab:boia-concept-classses}.

\begin{table}[h]
    \centering
    \input{tables/BOIA-CONCEPTS}
    \caption{Concept classes in \BOIA}
    \label{tab:boia-concept-classses}
\end{table}

\subsection{Hyperparameters and Model selection}

In our work, we opted for the widely used Adam optimizer~\citep{KingmaB14@adam}. For \MNISTHalf and \MNISTShortcut, the learning rate follows an exponential decay with $\gamma = 0.95$. Regarding \BOIA, the weight decay is $\omega = 1 \cdot 10^{-3}$ for all \DPL variants, except for \ProbCBM where we set it to $0.01$. The learning-rate decay factor $\gamma$ is set to $0.2$ for \DPL and its variants. However, we observed that $\gamma = 1$ works best for \ProbCBM since this model does not converge very early. In the active learning experiment on \Kandinsky, we applied exponential decay with $\gamma = 0.9$.

To choose the hyperparameters, we conducted a grid search over a predefined set of values, and selected the best values based on both qualitative and quantitative results from a validation set. The learning rate for all experiments was fine-tuned within the range of $10^{-4}$ to $10^{-2}$. Specifically, for \MNISTHalf, we set the learning rate to $5 \cdot 10^{-4}$ for \DPL, and $1 \cdot 10^{-3}$ for \SL, \LTN, and \ProbCBM. For \Kandinsky, the learning rate was set to $1 \cdot 10^{-3}$. In the case of \BOIA, we explored a learning rate range between $10^{-4}$ and $10^{-2}$ and selected $10^{-3}$ for all the models.

Regarding batch sizes, we observed that $64$ worked well for \MNISTShortcut and \MNISTHalf, and $512$ for \BOIA. For \Kandinsky, a smaller batch size of $16$ was chosen, as more frequent updates helped with model convergence.

Empirically, for \method, we discovered that optimizing $\gamma_1$ and $\gamma_2$ significantly influenced ensemble diversity, leading to different outcomes. Specifically, when these hyperparameters are much lower compared to the classification loss, the ensemble models tend to converge toward a single reasoning shortcut, reducing the impact of \method. Conversely, if these hyperparameters are bigger, the ensemble may consist of entirely different solutions, but potentially sub-optimal ones. These hyperparameters should be carefully tuned to strike a balance. We performed a grid search for both parameters over $\eta = \{0.1, 0.5, 0.8, 1, 2, 5, 10\}$ and selected the best values based on minimizing the classification objective and maximizing ensemble diversity.

For \MNISTHalf and \MNISTShortcut, we observed that the impact of entropy is negligible. Consequently, we set $\gamma_2 = 0$ for all experiments. In contrast, relying solely on the \KL term in \BOIA and \Kandinsky does not effectively explore a consistent space of reasoning shortcuts. Thus, for \method, we set $\gamma_1 = 0.1$ for \LTN and $\gamma_1 = 5$ for \SL. For \DPL, we set $\gamma_1 = 0.8$ for \MNISTHalf and \MNISTShortcut, $\gamma_1 = 0.1$ and $\gamma_2 = 1$ for \BOIA, and $\gamma_1 = 0.01$ for \Kandinsky.

Concerning the number of ensemble members, a shared hyperparameter for \DeepEns and \method, we chose $5$ for \MNISTHalf, \MNISTShortcut, and \Kandinsky, and $20$ for \BOIA, considering the larger number of reasoning shortcuts in the latter. In \BOIA there are $7^{21} \cdot 57^{114} \cdot 280^{280}$ reasoning shortcuts, compared to the $49$ present in \MNISTShortcut~\cite{marconato2023not}.

Additionally, we observed that \LTN behavior is quite unstable, resulting in sub-optimal models regardless of hyperparameter choices. To address this, we introduced an entropy penalization of $0.3$ to aid model convergence. The same approach was applied to \DPL on \Kandinsky, where this value was set to $0.2$.

Regarding the number of samples drawn for \Laplace and \MCDrop, we observed no big difference, thus we selected $30$ for our experiments. 

Specifically for \Laplace, we applied the Laplace approximation to the concept layer of the pre-trained frequentist model. 
%
For \MNISTHalf and \MNISTShortcut, we used the Kronecker approximation of the Hessian matrix, but we could not use it for \BOIA due to excessive time and memory requirements.  For \BOIA, we switched to the diagonal approximation.

For \ProbCBM, the optimization involves the sum of two losses: a cross-entropy loss and a concept loss. The concept loss, denoted as $\calL_{\text{concept}}$, is defined as $\calL_{\text{concept}} = \calL_{BCE} + \lambda_{\text{KL}} \calL_{KL}$~\citep{kim2023probabilistic}. Here, $\calL_{BCE}$ represents the standard binary cross-entropy, and $\calL_{KL}$ serves as a regularization term for the Gaussian distribution, defined as $\calL_{KL} = \KL (N(\mu_c, \text{diag}(\sigma_c))||N(0, I))$. Since we lack concept supervision during training, the weight associated with the binary cross-entropy is set to $0$. The regularization weight $\lambda_{\text{KL}}$ is maintained at $0.001$ for both examples, as setting it too high led to sub-optimal models in our specific context.

Finally, concerning the active learning example on \Kandinsky, we found that to achieve optimal convergence while still learning concepts, effective parameters for the concept supervision loss are $25$ for \DPL and $10$ for \DPL + \method.

\subsection{Architectures and Model Details}

\paragraph{\MNISTAdd:} The architectures employed for \MNISTShortcut and \MNISTHalf are essentially the ones implemented in \citep{marconato2023not}, outlined in \cref{tab:encoder-mnist}. The only difference between the two datasets is the size of the bottleneck, which depends on the number of concepts. For \MNISTHalf, the last layer dimension is 10, while for \MNISTShortcut it is 20. For \ProbCBM, the architecture is shown in \cref{tab:encoder-pcbm}. Additionally, for \SL only, we introduced an MLP with a hidden size of 50 neurons. 
This MLP takes the logits of both concepts as input and processes them to produce the final label.


\begin{table}[!t]
    \centering
    \caption{Encoder architecture for \MNISTHalf, \MNISTShortcut.}
    \begin{tabular}{lccccc}
        \toprule
        %
        & \textsc{Input}  & \textsc{Layer Type} & \textsc{Parameter} & \textsc{Activation} \\
        \midrule
        & $(28, 56, 1)$ & Convolution & depth=32, kernel=4, stride=2, padding=1  & ReLU   \\
        & $(32, 14, 28)$ & Dropout & $p = 0.5$ & & \\
        & $(32, 14, 28)$ & Convolution & depth=64, kernel=4, stride=2, padding=1  & ReLU   \\
        & $(64, 7, 14)$ & Dropout & $p = 0.5$ & \\
        & $(64, 7, 14)$ & Convolution & depth=$128$, kernel=4, stride=2, padding=1  & ReLU   \\
        & $(128, 3, 7)$ & Flatten & \\
        & $(2688)$ & Linear & dim=20, bias = True & \\
        \bottomrule
    \end{tabular}
    \label{tab:encoder-mnist}
\end{table}

\begin{table}[!t]
    \centering
    \caption{Encoder architecture for \Kandinsky}
    \begin{tabular}{lcccccp{7cm}}
        \toprule
        %
        & \textsc{Input}  & \textsc{Layer Type} & \textsc{Parameter} & \textsc{Activation} \\
        \midrule
        & $(28, 28, 3)$ & Flatten & & \\
        & $(2352)$ & Linear & dim=256, bias=True & ReLU \\
        & $(256)$ & Dropout & $p = 0.5$ & & \\
        & $(256)$ & Linear & dim=128, bias=True & ReLU \\
        & $(128)$ & Dropout & $p = 0.5$ & & \\
        & $(128)$ & Linear & dim=8, bias = True & & \\
        \bottomrule
    \end{tabular}
    \label{tab:encoder-kand}
\end{table}


\begin{table}[!t]
    \centering
    \caption{Encoder architecture for \ProbCBM}
    \begin{tabular}{lcccccp{7cm}}
        \toprule
        %
        & \textsc{Input}  & \textsc{Layer Type} & \textsc{Parameter} & \textsc{Activation} & \textsc{Note} \\
        \midrule
        & $(28, 56, 1)$ & Convolution & depth=32, kernel=4, stride=2, padding=1  & ReLU &  \\
        & $(32, 14, 28)$ & Dropout & $p = 0.5$ & & \\
        & $(32, 14, 28)$ & Convolution & depth=64, kernel=4, stride=2, padding=1  & ReLU &  \\
        & $(64, 7, 14)$ & Dropout & $p = 0.5$ & \\
        & $(64, 7, 14)$ & Convolution & depth=$128$, kernel=4, stride=2, padding=1  & ReLU &  \\
        & $(128, 3, 7)$ & Flatten & \\
        & $(2688)$ & Linear & dim=160, bias = True & & Head for $\mu$\\
        & $(2688)$ & Linear & dim=160, bias = True & & Head for $\sigma$\\
        \bottomrule
    \end{tabular}
    \label{tab:encoder-pcbm}
\end{table}


\paragraph{\BOIA:} As in \citep{marconato2023not}, \BOIA images have been preprocessed, as detailed in~\citep{sawada2022concept}, employing a Faster-RCNN~\citep{ren2015fasterrcnn} pre-trained on MS-COCO and fine-tuned on BDD-100k. Subsequently, we employ a pre-trained convolutional layer from~\citep{sawada2022concept} to extract linear features with a dimensionality of 2048. These linear features serve as inputs for the NeSy model, implemented with a fully-connected classifier network as outlined in~\cref{tab:dpl-classifier-boia} for \DPL, and in~\cref{tab:pcbm-classifier-boia} for \ProbCBM.

\paragraph{\Kandinsky:} For \Kandinsky, we chose to use an MLP-based encoder, as depicted in \cref{tab:encoder-kand}.

\begin{table}[!t]
    \centering
    \caption{\DPL architecture for \BOIA}
    \begin{tabular}{cccccccp{7cm}}
        \toprule
        & \textsc{Input}  & \textsc{Layer Type} & \textsc{Parameter} & \textsc{Activation} & \textsc{Note} \\
        \midrule
        & $(2048, 1) $ & Linear & dim=512, bias=True & ReLU &  \\
        & $(512) $ & Dropout &  &  &  \\
        & $(512) $ & Linear & dim=21, bias=True &  & Head for \texttt{move\_forward} action \\
        & $(512) $ & Linear & dim=12, bias=True &  & Head for \texttt{stop} action \\
        & $(512) $ & Linear & dim=12, bias=True &  & Head for \texttt{turn\_left} action \\
        & $(512) $ & Linear & dim=12, bias=True &  & Head for \texttt{turn\_right} action \\
        \bottomrule
    \end{tabular}
    \label{tab:dpl-classifier-boia}
\end{table}

\begin{table}[!t]
    \centering
    \caption{\ProbCBM architecture for \BOIA}
    \begin{tabular}{llllllcp{7cm}}
        \toprule
        & \textsc{Input}  & \textsc{Layer Type} & \textsc{Parameter} & \textsc{Activation} & \textsc{Note} \\
        \midrule
        & $(2048, 1) $ & Linear & dim=336, bias=True & & Head for $\mu$ \\
        & $(2048, 1) $ & Linear & dim=336, bias=True & & Head for $\sigma$ \\
        \bottomrule
    \end{tabular}
    \label{tab:pcbm-classifier-boia}
\end{table}

\subsection{Active Learning Setup}
\label{appendix:active-learning}

The active learning setup proposed is based on \Kandinsky. The examples consist of three figures, each composed of three objects. Each object is characterized by shape and color properties. In this setup, the model processes each object independently, producing a 6-dimensional vector that includes the one-hot encoding of shapes and colors, with each dimension representing one of the three shapes or colors. Overall, for each figure the model produces an 18-dimensional vector. The supervision is provided for a single object and consists of its shape and color.

To configure the experiment, we masked all the concepts in the training set, revealing them only when the object is chosen for supervision. Therefore, the active learning setup does not involve adding new examples to the training set but rather unveiling concepts in the existing ones.

The model was initialized by providing supervision on 10 \texttt{red-squares}, which was sufficient to allow it to achieve optimal accuracy (by learning a reasoning shortcut). Notice that without any initial concept-level supervision, the model was incapable of achieving decent accuracy results because of the complexity of the knowledge.

% This was done for two primary reasons. First, to provide the model with initial supervision, ensuring it learns correctly at least one of the concepts (\texttt{red} or \texttt{square}). This establishes a scenario where both concepts are affected and unaffected by reasoning shortcuts, a condition verified empirically on a validation set before the active learning experiment. Second, to help the model in reaching optimality on labels. The model, operating alone without supervision, struggles to learn a combination of concepts leading to perfect accuracy on the label due to the complexity of the knowledge.

At each step of the active learning setup, both \DPL and \method compute the Shannon entropy on an object, defined as:
\[
    H(\mathbf{s}, \mathbf{c} | x) = - \sum_{i,j} p_{\theta}(s_i, c_j | x) \log p_{\theta}(s_i, c_j | x)
\]

where $p_{\theta}(s_i, c_j | x)$ is the probability of shape $s_i$ and color $c_j$ for object $x$. Plain \DPL computes the probability of a certain configuration of concepts for an object $x$ as: 
\[
    p_{\theta}(s_i, c_j | x) = p_{\theta}(s_i | x) p_{\theta}(c_j | x)
\]

while \method computes it as: 

\[
    p_{\vtheta}(s_i, c_j | x) = \cfrac{1}{|\vtheta|} \sum_{\theta' \in \vtheta} p_{\theta'}(s_i | x) p_{\theta'}(c_j | x)
\]

where $\vtheta$ is the learned ensemble. 
%
The top 10 elements with largest entropy are then selected to acquire concept-level supervision. The baseline method \DPL + \texttt{random} ignores concept uncertainty altogether and simply chooses 10 random elements from the training set.


\subsection{Runtime Comparison}
\label{sec:runtimes}

\begin{table*}[!h]
    \centering
    \scriptsize
    \caption{Wall-clock time for a single batch in \MNISTHalf}
    \input{tables/time_magnitude_halfmnist}
    \label{tab:halfmnist-time}
\end{table*}

To estimate the order of magnitude of \method, we measured the wall-clock time of a run on \MNISTHalf. Specifically, we computed the wall-clock time of the model inference on a single batch (all batches have the same dimension, i.e. $64$). For both \DeepEns and \method, we evaluated only a single model of the ensemble, namely the last model out of $5$. In this way, we isolate the time from the number of ensembles. We do not report the training time for \MCDrop and \Laplace as they are applied on pre-trained models. Additionally, we account for the pre-processing time of \Laplace, which is needed to compute the Hessian matrix for the Laplace approximation. However, it is important to note that this step is done only once. 

As shown in \cref{tab:halfmnist-time}, the inference time of \method is comparable to all the competitors, as long as the ensemble is not too big. In terms of training, although we take more time due to the overhead associated with the retrieval of the $p(C|x)$ for ensemble members and the computation of the loss function, we are comparable with \DeepEns.



\section{Theoretical Material}
\label{sec:proofs}

In this section, we include the proofs and the theoretical material needed for the main text. Before moving to the proofs of the main text claims, we report the statement of Lemma 1, Theorem 2, and Proposition 3 from \citep{marconato2023not} for ease of comparison. These rely on two assumptions, cf. \cref{sec:method}:
%
\begin{itemize} 

    \item[\textbf{A1}.] \textit{Invertibility}: Each $\vx$ is generated by a unique $\vg$, \ie there exists a function $f:\vx \mapsto \vg$ such that $p^*(\vG \mid \vX) = \Ind{ \vG = f(\vX) }$.

    \item[\textbf{A2}.] \textit{Determinism}:  The knowledge $\BK$ is \textit{deterministic}, \ie there exists a function $\beta_\BK:\vg \mapsto \vy$ such that $p^*(\vY \mid \vG; \BK) = \Ind{\vY = \beta_\BK (\vg)}$. 

\end{itemize}
%
We begin by reporting three useful results from \citep{marconato2023not} that will be used in our proofs. First of all, we will indicate with $\mathsf{supp} (\vG)$ the support of the probability distribution given by $p^*(\vG)$. 


\begin{lemma}
    \label{lemma:abstraction-from-lh}
    It holds that:
    %
    %
    (\textit{ii}) Under \textbf{A1}, there exists a bijection between the deterministic concept distributions $p_\theta(\vC \mid \vX)$ {that are constant over the support of $p(\vX \mid \vg)$, for each $\vg \in \mathrm{supp}(\vG)$,} and the deterministic distributions of the form $p_\theta(\vC \mid \vG)$.
\end{lemma}


\begin{theorem}
    \label{thm:mc-det-opts}
    Let $\calA$ be the set of mappings $\alpha: \vg \mapsto \vc$ induced by all possible deterministic distributions $p_\theta(\vC \mid \vG)$, \ie each $p_\theta(\vC \mid \vG) = \Ind{\vC = \alpha(\vG)}$ for exactly one $\alpha \in \calA$.
    %    
    Under \textbf{A1} and \textbf{A2}, the number of deterministic optima $p_\theta(\vC \mid \vG)$ is: 
    %
    \[ 
        \textstyle
        \sum_{\alpha \in \calA} \Ind{
            \bigwedge_{\vg \in \mathsf{supp}(\vG)}
                (\beta_\BK \circ \alpha)(\vg) = \beta_\BK(\vg)
        }
        \label{eq:model-count}
    \]
    In particular, the set of optimal maps $\calA^*$ is given by:
    \[  \textstyle
        \calA^* = \big\{ \alpha \in \calA: \; \bigwedge_{\vg \in \mathsf{supp}(\vG)} (\beta_\BK \circ \alpha)(\vg) = \beta_\BK(\vg)  \big\}
    \]
\end{theorem}


\begin{proposition}
    \label{prop:structure-of-nondet-ops}
    For probabilistic logic approaches (including DPL and SL):
    %
    (\textit{i}) All convex combinations of two or more deterministic optima $p_\theta(\vC \mid \vX)$ of the likelihood are also (non-deterministic) optima. {However, not all convex combinations can be expressed in DPL and SL.}
    %
    (\textit{ii}) Under \textbf{A1} and \textbf{A2}, all optima of the likelihood can be expressed as a convex combination of deterministic optima.
    %
    (\textit{iii}) If \textbf{A2} does not hold, there may exist non-deterministic optima that are not convex combinations of deterministic ones. These may be the only optima.
\end{proposition}


\subsection{Proof of Lemma~\ref{lemma:decomposition-of-p}}

\begin{lemma*}
    Take any input-concept distribution $p(\vC \mid \vX)$ and let $p(\vC \mid \vG)$ be the concept-concept distribution entailed by it.  Then there exists (at least one) vector $\vomega$ such that $p$ is a convex combination of maps $\alpha \in \calA$, that is:
    %
    \[
        p(\vC \mid \vG)
        = \sum_{\alpha \in \calA} \omega_\alpha \Ind{ \vC = \alpha(\vG) }
        := p_\omega(\vC \mid \vG)
    \]
    %
    parameterized by $\vomega \ge 0$, $\norm{\vomega}_1 = 1$.
    %
    Moreover, under invertibility (\textbf{A1}) and determinism (\textbf{A2}), the set of all maps $\calA$ restricts to the set of optimal maps $\calA^*$.  
\end{lemma*}

% \textit{Claim:} % [Decomposition of $p_\theta(\vC \mid \vG)$]
% %
% Any distribution $p(\vC \mid \vG)$ entailed by a distribution $p(\vC \mid \vX)$ can be decomposed as a weighted sum of maps $\alpha \in \calA$:
% \[
%     p(\vC \mid \vG) = \sum_{\alpha \in \calA} \omega_\alpha \Ind{ \vC = \alpha(\vG) }
% \]
% however, the parameters $\omega (\alpha)$ may not be unique. 
% In particular, under \textbf{A1} and \textbf{A2}, an optimal distribution $p(\vC \mid \vG)$ satisfying \textbf{D2} reduces $\calA$ to $\calA^*$ the set of optimal maps $\alpha$. 

    % Moreover, for a Bayesian model with posterior $p(\theta \mid \calD)$ the weights become $ \omega_\alpha = \bbE_{p(\theta \mid \calD)} [ \omega_\theta(\alpha) ] $.

% \begin{lemma} % [Decomposition of $p_\theta(\vC \mid \vG)$]
%     \EM{Not reported in the main text, we need to assume A1 and A2 here}
%     The map $p_\theta(\vC \mid \vG)$ entailed by the NeSy predictor $p_\theta(\vC \mid \vX)$ can be decomposed as a weighted sum of maps $\alpha: \vG \mapsto \vC$:
%     \[
%         p_\theta(\vC \mid \vG) = \sum_{\alpha \in \calA} \omega_\theta(\alpha) \Ind{ \vC = \alpha(\vG) }
%     \]
%     however, the parameters $\omega_\theta (\alpha)$ may not be unique. 
%     In particular, in probabilistic logic when $\theta \in \Theta^*$, $\calA$ restricts to $\calA^*$, the set of optimal solutions. 
%     % Moreover, for a Bayesian model with posterior $p(\theta \mid \calD)$ the weights become $ \omega_\alpha = \bbE_{p(\theta \mid \calD)} [ \omega_\theta(\alpha) ] $.
% \end{lemma}

\begin{proof}
    By definition, $p(\vC \mid \vG)$ is given by:
    \[  
        p(\vC \mid \vG) := \bbE_{p(\vx \mid \vG)} \big[ p(\vC \mid \vx) \big]
    \]
    For each $\vg \in \{0,1\}^k$, $p(\vC \mid \vg)$ can be written as a convex combination of the maps $\alpha \in \calA$: 
    \[  
    \begin{aligned}
        p (\vC \mid \vg) &=  \sum_{\vc} p(\vc \mid \vg) \Ind{ \vC = \vc } \\
        &= \sum_\vc \left[  \sum_{\alpha \in \calA} \omega_\alpha  \Ind{\alpha(\vg) = \vc} \right] \Ind{\vC = \vc} \\
        & = \sum_{\alpha \in \calA} \omega_\alpha \Ind{ \vC = \alpha(\vg)}
    \end{aligned}
    \]
    where in the last line we swapped the summation and used the condition that $\alpha(\vg)= \vc$. Altogether, this yields for the single $\vg$ that the following must hold: 
    \[
        \sum_{\alpha \in \calA: \, \alpha(\vg) = \vc }  \omega_\alpha = p(\vc \mid \vg) 
    \]
    Combining all the cases, this gives a system of $2^k \cdot {2^k}$ equations, one for each $\vg$ and for each $\vc$, in $(2^k)^{2^k}$ variables $\vomega$:
    \[
        \sum_{\alpha \in \calA: \, \alpha(\vg) = \vc }  \omega_\alpha = p(\vc \mid \vg), \quad \forall \vg \in \{ 0, 1\}^k, \; \forall \vc \in \{ 0, 1\}^k 
        % \sum_{\vc} \sum_{\alpha \in \calA: \, \alpha(\vg)= \vc} \omega (\alpha) = 1, & \forall \vg \in \{ 0, 1 \}^k
    \]
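    This linear system can always be solved: for instance, the choice $\omega_\alpha = \prod_{\vg \in \{0,1\}^k} p(\alpha(\vg) \mid \vg)$ is non-negative, sums to one, and satisfies every equation. This proves that $\calA$ spans the space of $p(\vC \mid \vG)$. %\EM{@ST check.} \ST{looks good, thanks!}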
    
    % a basis for $p (\vC \mid \vG)$ that includes fewer elements of $\calA$. This happens because there are corner cases where $\omega_\alpha$ is uniquely determined.
    % When considering a deterministic solution $\alpha'$, there is only one choice corresponding to that solution which is $\omega_{\alpha'}=1$ and the remaining zero. Therefore, all maps $\alpha \in \calA$ constitute a complete basis for $p(\vC \mid \vG)$.
    % \ST{not super sure: how can $\calA$ be a ``basis'' (of *orthogonal* elements) if $\vomega$ is non-unique?  seems like those elements cannot be orthogonal.  this is a terminological thing rather than a conceptual mistake: it is clear tha $p(c|g)$ lives in a space spanned by $\calA$, what is not clear is that $\calA$ is a basis (minimal, orthogonal, ...)} \EM{a basis can also be non-orthogonal nor minimal.} \ST{check telegram}
    
    Next, under \textbf{A1} and \textbf{A2}, we show that an optimal model only needs to place weight on the optimal elements $\alpha \in \calA^*$, where the set $\calA^*$ is defined in \cref{thm:mc-det-opts}:
    \[ \textstyle
        \calA^* = \big\{\alpha \in \calA: \; \bigwedge_{\vg \in \mathsf{supp}(\vG)} (\beta_\BK \circ \alpha) (\vg) = \beta_\BK (\vg)   \big\}
    \]
    We proceed by contradiction. Suppose there exists one $\alpha' \not \in \calA^*$ with $\omega_{\alpha'} > 0$ such that: 
    \[
        p(\vC \mid \vG) = \omega_{\alpha'} \Ind{\vC = \alpha' (\vG)} + \sum_{\alpha \in \calA^*} \omega_\alpha \Ind{\vC = \alpha (\vG)}
    \]
    is still optimal. Since $\alpha' \not\in \calA^*$, by definition there exists at least one $\vg \in \mathsf{supp}(\vG)$ such that $(\beta_\BK \circ \alpha') (\vg) \neq \beta_\BK (\vg)$, and hence $\alpha'(\vg) \neq \alpha(\vg)$, $\forall \alpha \in \calA^*$. 
    Therefore, the NeSy predictor will result in a suboptimal model, since for those values $\vg$ it places non-zero mass on concepts that do not attain the correct label. This is a contradiction, which yields the claim.
    % 
    % We now turn to a Bayesian model implementing the conditional distribution on concepts as $p(\vC \mid \vx ) = \bbE_{p(\theta \mid \calD)} [ p_\theta(\vC \mid \vx) ] $. Thus, the map from ground-truth concepts to the learned ones becomes:
    % \[  
    % \begin{aligned}
    %     p(\vC \mid \vG) &= \bbE_{p( \vx \mid \vG) } \int p_\theta(\vC \mid \vx) p(\theta \mid \calD) \de\theta  \\
    %     &= \int   p_\theta(\vC \mid \vG) p(\theta \mid \calD) \de\theta 
    % \end{aligned}
    % \]
    % where in the second line we swapped the expectation and the integral, obtaining $p_\theta(\vC \mid \vG)$. From the previous point, we consider the decomposition of each $p_\theta(\vC \mid \vG)$ in terms of the maps $\alpha \in \calA$: \EM{fissare $ \omega_\theta$ }
    % \[  
    % \begin{aligned}
    %     p(\vC \mid \vG) &= \int p_\theta(\vC \mid \vG) p(\theta \mid \calD) \de\theta \\
    %     &= \int p(\theta \mid \calD) \sum_{\alpha \in \calA^*} \omega_\theta(\alpha) \Ind{ \vC = \alpha(\vg) }   \de\theta \\
    %     &=  \sum_{\alpha \in \calA^*} \Big[ \int p(\theta \mid \calD) \omega_\theta(\alpha) d \theta \Big] \Ind{ \vC = \alpha(\vG) }  \\
    %     &= \sum_{\alpha \in \calA^*} \omega_\alpha \Ind{ \vC = \alpha(\vG) } 
    % \end{aligned}
    % \]
    %This yields the claim.
\end{proof}

\subsection{Entropy on concept vectors and Reasoning Shortcuts}
\label{sec:entropy-vs-rss}

% \EM{This prop was removed from the main text. This essentially tells us what happens to concepts affected by RSs, showing that vanishing entropy implies those $\vg$ are not affected by RSs.} \EK{I renamed concept to concept vector throughout here.} \EM{thanks, i'll add here the definition affected by RSs}\EK{This looks much more readable than before!}

Under \textit{invertibility} (\textbf{A1}) and \textit{determinism} (\textbf{A2}), it is possible to describe entirely the set of optimal maps $\alpha:\vG \mapsto \vC$ through \cref{thm:mc-det-opts}, which we denote with $\calA^*$.
Before moving on to prove the main results in the main text, it is useful to introduce here the notion of ``equivalence set'' of a concept vector $\vg$ given the optimal maps $\alpha \in \calA^*$:
\[
    \calE (\vg; \BK) = \{ \alpha(\vg): \; \alpha \in \calA^* \}
\]
that contains all the concepts $\vc \in \{ 0,1 
\}^k$ that are predicted by the maps $\alpha \in \calA^*$. With this, we can formally define when a ground-truth concept $\vg$ is mispredicted by RSs:
\begin{definition}
    We say that a concept vector $\vg \in \{ 0,1\}^k$ is ``mispredicted'' by RSs when $|\calE(\vg; \BK)| >1$, \ie there exist at least two different $\alpha_i, \alpha_j \in \calA^*$, such that $\alpha_i(\vg) \neq \alpha_j(\vg)$. Conversely,  a concept vector is ``correctly predicted'' if $\alpha(\vg) = \vg$, $\forall \alpha \in \calA^*$.
\end{definition}

Next, we can use the decomposition in terms of the maps $\alpha$ to inspect which combinations with optimal weights $\vomega^* = (\omega_1, \ldots, \omega_N)$, where $N = |\calA^*|$ and $\norm{\vomega^*}_1=1$, 
give high entropy on single concept vectors $\vg \in \{0,1\}^k$:

\begin{proposition} 
\label{prop:0-maximal-entropy}
    Suppose that $p_\vomega (\vC \mid \vG)$ admits a decomposition as a weighted sum of at least two distinct $\alpha \in \calA$, with weights $\vomega$. Then,
    \begin{itemize}
        \item[(\textit{i})] for any $\vg \in \{0,1 \}^k $ it holds that
        $
            H( p_\vomega(\vC \mid \vg) ) = 0
        $,
    when $\forall \alpha_i, \alpha_j \in \calA$ such that $\omega_{\alpha_i} > 0$ and $\omega_{\alpha_j} > 0$, it holds  $\alpha_i(\vg) = \alpha_j(\vg)$. 
    \end{itemize}
    %
    %
    Assuming that \textbf{A1} and \textbf{A2} hold: 
    \begin{itemize}
        \item[(\textit{ii})] If a concept vector $\vg \in \{0,1\}^k $ is not mispredicted by RSs, then all combinations  $\vomega^*$ of $\alpha \in \calA^*$ will give zero entropy $H (p_{\vomega^*} (\vC \mid \vg))$.
        %
        \item[(\textit{iii})] Vice versa, if $\vg$ is mispredicted by RSs,
        there is always at least one combination $\vomega^*$ such that the entropy $H ( p_{\vomega^*}(\vC \mid \vg)  )$ attains a maximal value of: 
        \[
            H ( p_{\vomega^*}(\vC \mid \vg)  ) = \log |\calE(\vg; \BK)|
        \]
    \end{itemize}
    
    
    
\end{proposition}



% \textit{Claim:}
% Suppose that $p (\vC \mid \vG)$ admits a decomposition as a weighted sum of at least two $\alpha \in \calA$. Then, (\textit{i}) for any $\vg \in \calG$ it holds that:
% \[  \label{eq:entropy-on-optimal-concepts}
%     H( p(\vC \mid \vg) ) = 0
% \]
% if and only if $\alpha_i(\vg) = \alpha_j(\vg), \forall \alpha_i, \alpha_j \in \calA$, for those $\alpha_i$ such that $\omega(\alpha_i) > 0$. 
% %
% Assume that \textbf{A1} and \textbf{A2} hold.(\textit{ii}) If \cref{eq:entropy-on-optimal-concepts} holds for a ground-truth concept $\vg \in \calG$ for all possible choices of $\vomega^*$ involving combinations of only $\alpha \in \calA^*$, then $\alpha(\vg) = \vg$. 
% %
% (\textit{iii}) If a concept $\vg$ is affected by RSs, 
% % so that there exist at least two $\alpha_i(\vg) \neq \alpha_j (\vg)$, 
% there is always at least one combination $\vomega^*$ such that $H ( p(\vC \mid \vg)  )$ is maximal. \EM{I think we can have a more general form by relaxing A2.}
% \begin{proposition} \label{prop:0-maximal-entropy}
%     \EM{Not reported in the main text, we need to assume A1 and A2 here}
%     Suppose that $p_\theta (\vC \mid \vG)$ admits a decomposition as a weighted sum of at least two $\alpha \in \calA^*$, \ie $p_\theta(\vC \mid \vG) = \sum_{\alpha \in \calA^*} \omega_\theta(\alpha) \Ind{ \vC = \alpha(\vG) }$. Then, (\textit{i}) for any $\vg \in \calG$ it holds that:
%     \[  \label{eq:entropy-on-optimal-concepts-supp}
%         H( p_\theta(\vC \mid \vg) ) = 0
%     \]
%     if and only if $\alpha_i(\vg) = \alpha_j(\vg), \forall \alpha_i, \alpha_j \in \calA^*$, for those $\alpha_i$ such that $\omega_\theta(\alpha_i) > 0$. 
%     %
%     (\textit{ii}) If \cref{eq:entropy-on-optimal-concepts-supp} holds for a ground-truth concept $\vg \in \calG$ for all possible choices of $\vomega^*$, \ie $\forall \theta \in \Theta^*$, then $\alpha(\vg) = \vg$ for all $\alpha \in \calA^*$. 
%     %
%     (\textit{iii}) If a concept $\vg$ is affected by RSs, 
%     % so that there exist at least two $\alpha_i(\vg) \neq \alpha_j (\vg)$, 
%     there is always at least one combination $\vomega^*$ such that $H ( p(\vC \mid \vg)  )$ is maximal. 
% \end{proposition}

% \begin{proposition}
%     Suppose that $p_\theta (\vC \mid \vG)$ admits a decomposition as a weighted sum of at least two $\alpha \in \calA^*$. Then, for all $\vg \in \calG$ it holds that:
%     \[  \label{eq:entropy-on-optimal-concepts-supp}
%         H( p_\theta(\vC \mid \vg) ) = 0
%     \]
%     if and only if $\alpha_i(\vg) = \alpha_j(\vg), \forall \alpha_i, \alpha_j \in \calA^*$, for those $\alpha_i$ such that $\omega(\alpha_i) > 0$ (\EM{general case is that it must be also $\omega (\alpha) = 1$ being a solution.}). 
%     % Here, we denoted with $H ( \bullet )$ the conditional entropy on the equivalence class $\calE(\vg;\BK)$, given by:
%     % \[ 
%     %     H ( p_\theta(\vC \mid \vg)  ) = - \sum_{\vc \in \calE(\vg;\BK)} p_\theta(\vc \mid \vg) \log p_\theta(\vc \mid \vg)
%     % \]
%     Moreover, it holds that if a ground-truth concept $\vg \in \calG$ achieves \cref{eq:entropy-on-optimal-concepts-supp} for all possible choices of $\vomega^*$, then $\alpha(\vg) = \vg$ for all $\alpha \in \calA^*$. On the other hand, if a concept $\vg$ is affected by RSs so that there exist at least two $\alpha_i(\vg) \neq \alpha_j (\vg)$, there is always at least one combination $\vomega^*$ such that $H ( p(\vC \mid \vg)  )$ is maximal. 
% \end{proposition}

\begin{proof}
    (\textit{i}) We start by considering a $p_\vomega(\vC \mid \vG)$ given by a fixed convex combination of maps $\alpha \in \calA$, with a vector $\vomega$. 
    %
    % 
    % 
    We proceed to show that, for any $\vg \in \{0,1 \}^k$, the entropy is zero \textit{if and only if}, $\forall \alpha_i, \alpha_j \in \calA$ with $\omega_{\alpha_i} > 0$ and $\omega_{\alpha_j} > 0$, we have that $\alpha_i(\vg) = \alpha_j(\vg)$. 
    
    The conditional entropy $H ( p(\vC \mid \vg) ) $ vanishes only when $ p(\vC \mid \vg) = \Ind{\vC = \vc}$, for some $\vc \in \{ 0,1 \}^k$. 
    % Since the convex combination with weights $\omega^*$ restricts to optimal solution only, we have that $\vc \in \calE(\vg;\BK)$. 
    This occurs only if ($1$) $ p(\vC \mid \vg ) = \Ind{ \vC = \alpha(\vg)}$, for a single $\alpha \in \calA$, or if ($2$) $ p(\vC \mid \vg) = \sum_{\alpha \in \calA} \omega_\alpha \Ind{ \vC = \alpha(\vg) }$, where all the maps $\alpha$ with $\omega_\alpha > 0$ share the same value $\alpha(\vg)$. Since we are considering probabilities $p(\vC \mid \vg)$ with at least two $\alpha$'s, only ($2$) applies, proving the result.

    (\textit{ii}) Next, under \textbf{A1} and \textbf{A2}, we consider the case where we have optimal maps $\alpha \in \calA^*$. 
    For those $\vg$'s that are \textit{correctly predicted} even by RSs, by definition $|\calE(\vg; \calA^*)| = 1$, and in particular $\alpha(\vg) = \vg$ for all $\alpha \in \calA^*$. This means that whatever combination of weights $\vomega^*$ is chosen, there will be only one element for $p(\vC \mid \vg)$ with all the probability mass. Therefore:
    \[
        p_{\vomega^*}(\vC \mid \vg) = \sum_{\alpha \in \calA^*} \omega^*_\alpha \Ind{ \vC = \alpha(\vg) } = \Ind{\vC = \vg } \sum_{\alpha \in \calA^*} \omega^*_\alpha = \Ind{\vC = \vg }
    \]
    that leads to a vanishing entropy.

    (\textit{iii}) 
    For any optimal solution, it holds that:
    \[
        \mathsf{supp}(p(\vC \mid \vg)) \subseteq \calE(\vg;\calA^*), \quad \forall \vg \in \{ 0, 1 \}^{k}
    \]
    since all concept vectors having non-zero mass in $p(\vC \mid \vg)$ must be optimal. 
    %
    We now consider a concept vector $\vg$ that is affected by RSs, in that there exist $\alpha_i, \alpha_j \in \calA^*$ such that $\alpha_i (\vg) \neq \alpha_j (\vg) $. 
    %
    We then rewrite $p_{\vomega^*} (\vC \mid \vg)$ as follows:
    \[
    \begin{aligned}
        p_{\vomega^*} (\vC \mid \vg) &= \sum_{\alpha \in \calA^*} \omega^*_\alpha \Ind{\vC = \alpha (\vg)} \\
        %
        &= \sum_{\vc \in \calE(\vg; \calA^*)  } \big[ \sum_{\alpha \in \calA^*: \, \alpha(\vg) =\vc} \omega^*_\alpha \big] \Ind{\vC = \vc} \\
        %
        &= \sum_{\vc \in \calE(\vg; \calA^*)} \lambda_\vc \Ind{\vC = \vc}
    \end{aligned}
    \]
    where we denoted by $\lambda_\vc = \sum_{\alpha \in \calA^*: \, \alpha(\vg)=\vc} \omega^*_\alpha$ the weight associated to $\Ind{\vC = \vc}$.  
    When plugging this into the entropy we have that:
    \[ 
    \begin{aligned}
        H(p(\vC \mid \vg)) &= - \sum_{\vc \in \{0,1\}^k} p(\vc \mid \vg) \log p(\vc \mid \vg) \\
        %
        &= - \sum_{\vc \in \{ 0,1\}^k} \sum_{\vc' \in \calE(\vg; \calA^*)} \lambda_{\vc'} \Ind{\vc' = \vc} \log \sum_{\vc' \in \calE(\vg; \calA^*)} \lambda_{\vc'} \Ind{\vc' = \vc} \\
        %
        &= - \sum_{\vc \in \calE(\vg; \calA^*)} \lambda_\vc \log \lambda_\vc \\
        %
        &\leq - \sum_{\vc \in \calE(\vg; \calA^*)} \frac{1}{| \calE(\vg; \calA^*) |} \log \frac{1}{| \calE(\vg; \calA^*) |} \\
        %
        &= \log {| \calE(\vg; \calA^*) |} 
    \end{aligned}
    \]
    where the equality holds if and only if $ \lambda_\vc = \frac{1}{| \calE(\vg; \calA^*) |}  $ for all $\vc \in \calE(\vg; \calA^*)$.
    %
    We can therefore choose $\vomega^*$ such that:
    \[
        \sum_{\alpha \in \calA^*: \, \alpha(\vg)  = \vc} \omega^*_\alpha = |\calE(\vg; \calA^*)|^{-1}, \quad \forall \vc \in \calE(\vg; \calA^*)
    \]
    which gives $|\calE(\vg; \calA^*)|$ equations in at least $|\calE(\vg; \calA^*)|$ variables $\omega^*_\alpha$. In fact, the number of maps $\alpha \in \calA^*$ is equal to $|\calE(\vg; \calA^*)|$ only when $\alpha_i(\vg) \neq \alpha_j(\vg)$ for all distinct $\alpha_i, \alpha_j \in \calA^*$, \ie no two different maps predict the same concept vector for $\vg$.
    %
    This shows that by choosing the coefficients $\omega_\alpha$ correctly, it is possible to obtain a maximally entropic distribution $p(\vC \mid \vg)$. This concludes the proof.
\end{proof}

Point (\textit{ii}) of \cref{prop:0-maximal-entropy} essentially captures the intuition that concept vectors that are ``correctly predicted'' even by RSs will not contribute to increasing the entropy of the distribution $p (\vC \mid \vG)$.  Conversely, when a concept is ``mispredicted'' by RSs, there is always a combination attaining maximal entropy from point (\textit{iii}). Achieving maximal entropy for one ground-truth concept, however, is not enough to guarantee the others will also display maximal entropy.  This can happen because a combination $\vomega^*$ may increase the entropy of one $\vg_i$ while decreasing that of another $\vg_j$. 



% \EK{Irrelevant for the paper, but this last result is really interesting to me. It doesn't consider how many reasoning shortcuts map $\vg$ to $\vc$! So maybe, 999 map $\vg$ just to $\vg$, and then one more to a different one $\vc$. But then the maximum entropy solution is to provide equal mass to $\vg$ and $\vc$!} \EM{Yes exactly, that was a result I wanted to show in the main paper, but we can comment it here if you want. Also put reference in the main text?} \EK{I'm not sure. Do you use this result in another proof? It's interesting, but I'm not sure how it connects to your method, making the story unclear. If you can't really use it in this paper, you can always think about dropping it here for another more theory focused paper lol.} \EM{No in fact, we don't need for the main material. Nonetheless, it is very connected to the idea "few" combinations of RSs that allow to maximize entropy of each concept, but I have also other preliminary results. Better start think to a new paper :D }

% \EM{We are missing the general case, for all $\vg$}
    
% In practice, we want to find the convex combination that maximizes the quantity:
% \[
%     \sum_{\vg \in \calG} p(\vg) H(p(\vC \mid \vg) \mid \calE(\vg; \BK)) \leq \sum_{\vg \in \calG} p(\vg) \log {| \calC(\vg; \calA^*) |}
% \]
% where the upper-bound follows, and the equality holds when $\lambda_\vc = \frac{1}{{| \calC(\vg; \calA^*) |}}$. This means that: 
% \[
%     \forall \vg \in \calG \; \text{ and } \; \forall \vc \in \calC(\vg; \calA^*), \; \sum_{\alpha \in \calA^*: \, \alpha(\vg)= \vc} \omega_\alpha = \frac{1}{{| \calC(\vg; \calA^*) |}} 
% \]

% Note also that we are looking for:
% \[
% \max_{\omega} \sum p(\vg) H(p(\vC \mid \vg) \mid \calE(\vg; \BK) ) = \max_\omega H (\vC \mid \vG) \; \text{such that} \; p(\vC \mid \vG) \; \text{optimal}
% \]

% \begin{proposition}[Ideal Objective]
%     Consider optimal solutions capturing \textbf{D2}, such that $p(\vC \mid \vG)$ can be decomposed as a convex combination with coefficients $\vomega^*$. Then, the ideal objective for \textbf{D1} amounts to solve:
%     \[ \label{eq:objective-d1-d2}
%          \max_{ \theta \in \Theta^*} H(\vC \mid \vG) = \max_{\vomega^*} \sum_{\vg \in \calG} p(\vg) H( p(\vC \mid \vg  ) \mid \calE(\vg; \BK))
%     \]
%     Moreover, we can directly optimize for that by resorting to the fact that by the causal decomposition of the generative process it holds: 
%     \[
%         \max_{\theta \in \Theta^*} H(\vC \mid \vX) \leq  \max_{\theta \in \Theta^*}  H(\vC \mid \vG)
%     \]
% \end{proposition}

\subsection{Proof of Proposition~\ref{prop:ideal-obj}}
% \EK{Update notation. Here $\omega$ is a function from RSs to weight ($\omega(\alpha)$), while in the previous appendix, $\omega$ is a vector indexed by RSs ($\omega_\alpha$)} \EM{sure, the old version was w(alpha) now we have the other way round.}

Before proceeding, it is useful to pin down precisely what we mean by the set of optimal parameters. Based on the generative process with $p^*(\vY \mid \vG;\BK)$, we define as ``optimal'' those parameters $\theta$ that meet the following criterion:
\[
    p_\theta (\vY \mid \vG; \BK) := \int p_\theta(\vY \mid \vx; \BK) p(\vx \mid \vG)  \de \vx = p^*(\vY \mid \vG;\BK)
\]
and denote the whole set with $\Theta^*$.

\begin{proposition*}
    Consider only optimal parameters $\theta \in \Theta^*$ for $p_\theta (\vC \mid \vG)$.
    %
    Assuming that $p_\theta$ is expressive enough to capture every possible combination $p_\vomega$, \ie for each $\vomega$ there exists $\theta$ s.t. $p_\theta(\vC \mid \vG) =p_\vomega(\vC \mid \vG)$,
    under invertibility (\textbf{A1}) and determinism (\textbf{A2}), it holds that:
    %
    \begin{equation*}
        \label{eq:objective-d1-d2}
         \max_{\theta \in \Theta^*} H(p_\theta(\vC \mid \vG))
            = \max_{\vomega^*} H(p_{\vomega^*}(\vC \mid \vG))
    \end{equation*}
    %
    %
\end{proposition*}

% \textit{Claim:}
%     %
%     \EM{update}
%     Consider optimal parameters $\theta \in \Theta^*$, such that $p_\theta (\vC \mid \vG)$ satisfies. 
%     Under \textbf{A1} and \textbf{A2}, 
%     \EM{when the parametric distribution is flexible enough all possible} $p_{\omega^*}(\vC \mid \vG)$, 
%     the objective in \cref{eq:optimization-d2} amounts to solve:
%     \[ \label{eq:objective-d1-d2-app}
%          \max_{ \theta \in \Theta^*} H( p_\theta(\vC \mid \vG)) = \max_{\vomega^*} H( p_{\vomega^*}(\vC \mid \vG  ))
%     \]
    
\begin{proof}
    We start from the fact that by \cref{lemma:decomposition-of-p} we can always express $p_\theta (\vC \mid \vG)$ as a convex combination of maps $\alpha \in \calA^*$ for some weights $\vomega^* (\theta)$. 
    Vice versa, since $p_\theta$ is flexible enough to capture any combination $\vomega^*$ for $p_{\vomega^*} (\vC \mid \vG)$, there will exist some parameters $\theta(\vomega^*)$ associated to any vector $\vomega^*$. 
    Notice that, in general, neither $\vomega^* (\theta)$ nor $\theta(\vomega^*)$ are unique.
    This, nonetheless, allows us to convert a problem formulated in terms of $\theta \in \Theta^*$ to one in terms of $\vomega^*$: 
    \[
    \begin{aligned}
        \max_{ \theta \in \Theta^* } H(p_\theta(\vC \mid \vG))
        &= \max_{ \theta \in \Theta^* } - \sum_{\vg \in \{ 0,1 \}^{k} } p^*(\vg) \sum_{\vc \in \{ 0,1 \}^{k} } 
        p_\theta(\vc \mid \vg) \log p_\theta(\vc \mid \vg)  \\
        %
        &= \max_{ \theta \in \Theta^* } - \sum_{\vg \in \{ 0,1 \}^{k} } p^*(\vg) \sum_{\vc \in \{ 0,1 \}^{k} } \left[ 
        \sum_{\alpha \in \calA^*} \omega^*_\alpha(\theta) \Ind{ \vc = \alpha(\vg) } \log \sum_{\alpha' \in \calA^*} \omega^*_{\alpha'}(\theta) \Ind{ \vc = \alpha'(\vg) }
        \right]  \\
        %
        &=  \max_{ \vomega^*, \ \norm{\vomega^*}_1=1} - \sum_{\vg \in \{ 0,1 \}^{k} } p^*(\vg) \sum_{\vc \in \{ 0,1 \}^{k} } \left[ 
        \sum_{\alpha \in \calA^*} \omega^*_\alpha \Ind{ \vc = \alpha(\vg) } \log \sum_{\alpha' \in \calA^*} \omega^*_{\alpha'} \Ind{ \vc = \alpha'(\vg) }
        \right] \\
        %
        &=  \max_{ \vomega^*, \ \norm{\vomega^*}_1=1} - \sum_{\vg \in \{ 0,1 \}^{k} } p^*(\vg) \sum_{\vc \in \{ 0,1 \}^{k} } 
        p_{\vomega^*}(\vc \mid \vg) \log p_{\vomega^*}(\vc \mid \vg)
 \\
        %
        &=  \max_{ \vomega^*, \ \norm{\vomega^*}_1=1} H(p_{\vomega^*} (\vC \mid \vG))
    \end{aligned}
    \]
    where in the third line we converted the maximization problem on the parameters $\theta \in \Theta^*$ to the weights $\vomega^*$.
    %
     This concludes the proof.
%
\end{proof}
%
% \textit{Claim:}
%     Moreover, (\textit{ii}) from the causal decomposition of the generative process, it holds: 
%     \[
%         \max_{\theta \in \Theta^*} H(\vC \mid \vX) \leq  \max_{\theta \in \Theta^*}  H(\vC \mid \vG)
%     \]
%    
% \begin{proof}
        
%     Next, we use the structure of the generative process from \cref{fig:generative-process-assumptions} to obtain the relation between $H (\vC \mid \vX) $ and $H(\vC \mid \vG)$. We have that $\vG \to \vX \to \vC$, where $p^*(\vX \mid \vG)$ is given by the generative process and the model gives $p_\theta (\vC \mid \vX)$.
%     Therefore, we can derive a variant of the data processing inequality \citep{cover1999elements}: \EK{Mention that $I$ is the mutual information (right?)} \EK{'using the chain rule of mutual information and noting that the conditional mutual information given x is 0 due to the conditional independence assumption...} \EM{todo, thanks!}
%     \[
%     \begin{aligned}
%         I(\vC : \vX, \vG) &= I(\vC : \vX) + \stackrel{=0}{I(\vC : \vG \mid \vX)} \\ 
%         &= I(\vC : \vG) + I(\vC : \vX \mid \vG) \\ 
%     \end{aligned}
%     \]
%     therefore obtaining $ I(\vC : \vX) \geq I(\vC : \vG) $. We make use of this fact to look at the conditional entropies:
%     \[
%     \begin{aligned}
%         I( \vC : \vX) &\geq I (\vC : \vG) \\
%         H(\vC) - H (\vC \mid \vX) &\geq  H(\vC) - H (\vC \mid \vG) \\
%         H (\vC \mid \vX) &\leq H (\vC \mid \vG)
%     \end{aligned}
%     \]
%     where $H(\vC \mid \vX) = - \bbE_{p^*(\vx)p_\theta (\vc \mid \vx)} \big[ \log p_\theta (\vc \mid \vx) \big]$ and $H(\vC \mid \vG) = - \bbE_{p^*(\vg) p_\theta (\vc \mid \vg)} \big[ \log p_\theta (\vc \mid \vg) \big]$. Then, it also holds that:
%     \[
%         \max_{\theta \in \Theta^*} H (\vC \mid \vX) \leq \max_{\theta \in \Theta^*} H (\vC \mid \vG)
%     \]
%     This concludes the proof.
% \end{proof}
%
\subsection{Proof of Proposition~\ref{prop:prior-posterior-optima}}

\begin{proposition*} % [Optimization of $H(\vC \mid \vX)$] 
    Let $p(\vC \mid \vX)$ be given by a convex combination of models $p_{\theta_i}(\vC \mid \vX)$, for $i \in [K]$, where $K$ denotes the total number of components of $\vtheta = \{ \theta_i \}$, and each $\theta_i \in \Theta^*$.
     %
    Let also $\vlambda =\{ \lambda_i \}$ contain all the weights $\lambda_i$ associated to each component $\theta_i$. 
    %
    Under invertibility (\textbf{A1}) and determinism (\textbf{A2}), there exists $K \leq |\calA^*|$ such that
    maximizing the entropy of $p_{\vomega^*}(\vC \mid \vG)$ can be solved by maximizing $H(p_\vtheta(\vC \mid \vX))$ on $\vtheta$ and $\vlambda$, that is:
    % In particular, when all $\theta_i$ entail a deterministic distribution $p_{\theta_i}(\vC \mid \vG)= \Ind{\vC = \alpha_i(\vG)}$, it holds that:
    %
    \[
        \max_{ \vtheta, \vlambda } H \Big(\sum_{i=1}^K \lambda_i p_{\theta_i}(\vC \mid \vX) \Big)
        = \max_{\vomega^*} H (p_{\vomega^*}(\vC \mid \vG))
    \]
    %
    Furthermore, we can write the maximization of $H(p_\vtheta(\vC \mid \vX))$ as:
    %
    \[
        % \max_{ \{ (\theta_i, \lambda_i) \} } H(\vC \mid \vX) = 
        % \begin{aligned}
            \max_{ \vtheta, \vlambda } \int p(\vx) \sum_{i=1}^K \lambda_i \Big[ %&
            \KL \Big( p_{\theta_i} (\vC \mid \vx) \, \Big\| \, \sum_{j=1}^K \lambda_j  p_{\theta_j} (\vC \mid \vx) \Big) 
            %
            %& 
            + H(p_{\theta_i}(\vC  \mid \vx)) \Big]   \de \vx 
        % \end{aligned}
    \]
    %
    where $\KL$ denotes the Kullback--Leibler divergence.
\end{proposition*}

\begin{proof}
    We start with $p(\vC \mid \vX) = \sum_{i} \lambda_i p_{\theta_i} (\vC \mid \vX) $ given by a convex combination of optimal models with parameters $\theta_i$, each entailing a deterministic distribution $p_{\theta_i} (\vC \mid \vG) = \Ind{\vC = \alpha_i(\vG)}$.
    %
    
    Recall that, by invertibility (\textbf{A1}), there exists $f:\vx \mapsto \vg$, entailing the inverse of $p^*(\vG \mid \vX)$.
    We know that by \cref{lemma:abstraction-from-lh} (\textit{ii}), if $p_{\theta_i}$ entails a deterministic distribution $p_{\theta_i}(\vC \mid \vG) = \Ind{\vC = \alpha_i(\vG)}$, then it is in one-to-one correspondence with $p_{\theta_i}(\vC \mid \vX)$. Formally, the latter is: 
    \[
        p_{\theta_i} (\vC \mid \vx') = \Ind{ \vC  = \alpha_i (\vg) }, \quad \forall \vx' \in \mathsf{supp}( p^*(\vX \mid \vg) ), \; \text{where} \; \vg = f(\vx')
    \]
    %
    Now, from the above equation, we can rewrite $H(p_\vtheta(\vC \mid \vX))$ as follows: 
    %
    \[
    \begin{aligned}
        H(p_\vtheta(\vC \mid \vX)) &= - \bbE_{p^*(\vx)} \Big[ \sum_{\vc \in \{ 0,1 \}^{k} } p_\vtheta(\vc \mid \vx) \log p_\vtheta(\vc \mid \vx) \Big]  \\
        %
        &=  - \sum_{\vg \in \{ 0,1 \}^{k} } p^*(\vg) \bbE_{p^*(\vx \mid \vg)} \Big[ \sum_{\vc \in \{ 0,1 \}^{k} } \sum_{i=1}^K \lambda_i  p_{\theta_i} (\vc \mid \vx) \log  \sum_{j=1}^K \lambda_j p_{\theta_j} (\vc \mid \vx) \Big] \\
        %
        &=  - \sum_{\vg \in \{ 0,1 \}^{k} } p^*(\vg)  \sum_{\vc \in \{ 0,1 \}^{k} } \bbE_{p^*(\vx \mid \vg)} \Big[ \sum_{i=1}^K \lambda_i p_{\theta_i} (\vc \mid \vx) \log  \sum_{j=1}^K \lambda_j p_{\theta_j} (\vc \mid \vx) \Big] \\
        %
        %
        &= - \sum_{\vg \in \{ 0,1 \}^{k} } p^*(\vg) \sum_{\vc \in \{ 0,1 \}^{k} } \bbE_{p^*(\vx \mid \vg)} \Big[ \sum_{i=1}^K \lambda_i  \Ind{\vc = \alpha_i(\vg)} \log  \sum_{j=1}^K \lambda_j \Ind{\vc = \alpha_j(\vg)} \Big] \\
        %
        &= - \sum_{\vg \in \{ 0,1 \}^{k} } p^*(\vg) \sum_{\vc \in \{ 0,1 \}^{k} }  \sum_{i=1}^K \lambda_i  \Ind{\vc = \alpha_i(\vg)} \log  \sum_{j=1}^K \lambda_j \Ind{\vc = \alpha_j(\vg)}  \\
        %
        &= - \sum_{\vg \in \{ 0,1 \}^{k} } p^*(\vg) \sum_{\vc \in \{ 0,1 \}^{k} } p_\vtheta(\vc \mid \vg) \log p_\vtheta (\vc \mid \vg) \\
        &= H(p_\vtheta(\vC \mid \vG))
    \end{aligned}
    \]
    where the second line follows from the fact that the marginal over the input variables can be written as $p^*(\vx) = \bbE_{p^*(\vg)}[p^*(\vx \mid \vg)]$, the fourth line uses the deterministic form of each $p_{\theta_i}(\vC \mid \vx)$ given above, and $p_\vtheta (\vC \mid \vG)$ is the distribution with convex weights $\vlambda$, where each $\lambda_i$ is associated to the reasoning shortcut $\alpha_i$, entailed by $\theta_i$. This means that maximizing $H(p_\vtheta(\vC \mid \vX))$ directly maximizes $H(p_\vtheta(\vC \mid \vG))$. 
    
    % in the second last line we substituted the weighted sum $ \sum_i \lambda_i$ with the sum $\sum_{\alpha \in \calA^*} \omega^*(\alpha)$, where $\omega^*(\alpha) = \sum_i \lambda_i \Ind{ \alpha_i = \alpha }$, and the expectation on $p^*(\vx \mid \vg)$ becomes trivial. \EK{I can't follow a lot of this explanation.}
    %
    % In particular, maximizing the members of a deterministic ensemble such that they attain maximal
    % entropy $H(\vC \mid \vX)$ is equivalent to maximizing for the subset $\calB \in \calA^*$ with equal weights, namely:
    % \[
    %     \max_{ \{ \theta_i, \lambda_i \} } H(\vC \mid \vX) \equiv \max_{ \lambda^* } H(\vC \mid \vG)
    % \]
    Next, suppose that $\vtheta$ is fixed and contains a number $K = |\calA^*|$ of members, such that each deterministic RS $\alpha \in \calA^*$ is captured by exactly one member $\theta_i \in \vtheta$.  This means that each $\theta_i$ captures $p_{\theta_i} (\vC \mid \vG) = \Ind{\vC = \alpha_i (\vG)}$, and it holds that if $\theta_i \neq \theta_j$, then $\alpha_i (\vg) \neq \alpha_j(\vg)$ for at least one $\vg \in \{0,1\}^k$.
    
    We prove that maximizing over $\vlambda$ when $\vtheta$ is fixed and contains all possible deterministic RSs amounts to maximizing over the combinations of RSs. The proof follows a similar derivation to \cref{prop:ideal-obj}:
    \[
    \begin{aligned}
        \max_{ \vlambda, ||\vlambda||_1 = 1 } H(p_\vtheta(\vC \mid \vG))
        %
        &= \max_{ \vlambda, ||\vlambda||_1 = 1 } - \sum_{\vg \in \{ 0,1 \}^{k} } p^*(\vg) \sum_{\vc \in \{ 0,1 \}^{k} } 
        p_\vtheta(\vc \mid \vg) \log p_\vtheta(\vc \mid \vg)  \\
        %
        &= \max_{ \vlambda, ||\vlambda||_1 = 1 } - \sum_{\vg \in \{ 0,1 \}^{k} } p^*(\vg) \sum_{\vc \in \{ 0,1 \}^{k} } \left[ 
        \sum_{i = 1}^K \lambda_i \Ind{ \vc = \alpha_i(\vg) } \log \sum_{j = 1}^K \lambda_j \Ind{ \vc = \alpha_j(\vg) }
        \right]  \\
        %
        % &= \max_{ \vlambda, ||\vlambda||_1 = 1 } - \sum_{\vg \in \{ 0,1 \}^{2k} } p^*(\vg) \sum_{\vc \in \{ 0,1 \}^{2k} } \left[ 
        % \sum_{\alpha \in \calA^*} \omega^*_\alpha \Ind{ \vc = \alpha(\vg) } \log \sum_{\alpha' \in \calA^*} \omega^*_{\alpha'}(\theta) \Ind{ \vc = \alpha'(\vg) }
        % \right]  \\
        %
        &=  \max_{ \vomega^*, \ ||\vomega^*||_1=1} - \sum_{\vg \in \{ 0,1 \}^{k} } p^*(\vg) \sum_{\vc \in \{ 0,1 \}^{k} } \left[ 
        \sum_{\alpha \in \calA^*} \omega^*_\alpha \Ind{ \vc = \alpha(\vg) } \log \sum_{\alpha' \in \calA^*} \omega^*_{\alpha'} \Ind{ \vc = \alpha'(\vg) }
        \right] \\
        %
        &=  \max_{ \vomega^*, \ ||\vomega^*||_1=1} - \sum_{\vg \in \{ 0,1 \}^{k} } p^*(\vg) \sum_{\vc \in \{ 0,1 \}^{k} } 
        p_{\vomega^*}(\vc \mid \vg) \log p_{\vomega^*}(\vc \mid \vg) \\
        %
        &=  \max_{ \vomega^*, \ ||\vomega^*||_1=1} H(p_{\vomega^*} (\vC \mid \vG))
    \end{aligned}
    \]
    where in the third line we substituted $\lambda_i$ with $\omega^*_{\alpha}$ and the summation over the ordered components with the summation over $\alpha \in \calA^*$. 
    Notice that this also means that an ensemble containing all different deterministic RSs with parameters $\theta_i$ can express arbitrary combinations of them via $\vlambda$. 
    % \EK{This is close to the result I have for mixtures, except over possible worlds rather than over RSs} \EM{good!}

    % \EK{I'm not sure how to consider this case. Is this an existence proof? You're still inside a 'proof' environment, so I'd expect formal arguments here rather than a discussion.} \EM{Does it work? or more rigorous?} \EK{Yeah still not rigorous enough. There is no existence proof that there is such an $\vomega^ *$ where at least one component is 0. I think you can refer to Equation 46 here: If there are solutions to omega where some are 0, which exists when this system of equations is undetermined... Then you can form such an $\vomega^ *$ with maximum entropy. If I understand it correctly.}
    
    Now, consider the case where a few elements of $\calA^*$ contribute to achieving maximum entropy for $p_{\vomega^*}(\vC \mid \vG)$. Therefore, there exists at least one $\omega_{\alpha'}^*=0$, while the remaining lead to the maximum entropy for $p_{\vomega^*}(\vC \mid \vG)$. 
    %
    It holds that, similarly, the maximum of $H(p_\vtheta (\vC \mid \vG))$ can be obtained by considering a smaller number of components $\vtheta$ since the weight associated with a specific $\theta_j$ capturing $\alpha'$ must be $0$.
    This also means that the ensemble dimension $K$ can be strictly smaller than $|\calA^*|$, while still achieving maximal entropy. 
    
    We now maximize the entropy on $\vtheta$ and $\vlambda$ together:
    \[
        \max_{\vtheta, \vlambda} H(p_\vtheta (\vC \mid \vG))
    \]
    Since the number of components $K$ is upper-bounded by $|\calA^*|$, we can always find a solution by getting all different $\theta_i$, each capturing different deterministic distributions $\alpha_i$. 
    On the other hand, when fewer $\alpha$'s are required, it suffices to find those $K$ components $\theta_i$ that are combined with a non-zero weight $\lambda_i$. In this case, $K < |\calA^*|$.
    %
    This means, altogether, that:
    \[
        \max_{\vtheta, \vlambda} H(p_\vtheta (\vC \mid \vX)) = \max_{\vtheta, \vlambda} H(p_\vtheta (\vC \mid \vG)) = \max_{\vomega} H(p_\vomega (\vC \mid \vG)) 
    \]
    proving our first point. 
    

    We proceed by analyzing the conditional entropy $H(\vC \mid \vX)$, which can be written as:
    \[
    \begin{aligned}
        H(\vC \mid \vX) &= - \int p(\vx) \sum_{\vc \in \{ 0,1\}^k } p(\vc \mid \vx ) \log p(\vc \mid \vx ) \de \vx \\
        %
        &= - \int p(\vx) \sum_{\vc \in \{ 0,1\}^k } \sum_i \lambda_i p_{\theta_i} (\vc \mid \vx) \log \sum_j \lambda_j p_{\theta_j} (\vc \mid \vx) \de \vx \\
        %
        &= \int p(\vx) \sum_{\vc \in \{ 0,1\}^k } \sum_i \lambda_i p_{\theta_i} (\vc \mid \vx) \left[ \log \frac{p_{\theta_i}(\vc \mid \vx) }{\sum_j \lambda_j p_{\theta_j} (\vc \mid \vx)} - \log p_{\theta_i}(\vc \mid \vx) \right] \de \vx \\
        %
        &= \int p(\vx) \sum_i \lambda_i \big[  \KL ( p_{\theta_i}(\vc \mid \vx) \mid \mid \sum_j \lambda_j p_{\theta_j} (\vc \mid \vx) ) + H (p_{\theta_i}(\vc \mid \vx) ) \big] \de \vx
    \end{aligned}
    \]
    where in the third line we multiplied and divided by the members of the ensemble $p_{\theta_i}(\vc \mid \vx)$, and in the last line we grouped the expressions of the \KL divergence and of the conditional entropy.
    %
    Therefore, for the maximization on $\vtheta$ and $\vlambda$:
    \[
        \max_{ \vlambda, \vtheta } H(\vC \mid \vX) = \max_{ \vlambda, \vtheta  } \int p(\vx) \sum_i \lambda_i \big[ \KL ( p_{\theta_i} (\vc \mid \vx)  \mid \mid \sum_j \lambda_j  p_{\theta_j} (\vc \mid \vx)) + H(p_{\theta_i}(\vC  \mid \vX)) \big]   \de \vx 
    \]
    as claimed. This concludes the proof.
\end{proof}

\newpage

\section{Additional Results}
\label{sec:all-results}

\subsection{\MNISTAdd}

We report here additional results for the experiments shown in \cref{sec:experiments}. Along with $\YECE$ and $\CECE$, we also report the performance of \method compared to its competitors in terms of label accuracy (\YAcc) and concept accuracy (\CAcc), both in-distribution and out-of-distribution.

% In the first table, we report the complete results for \MNISTHalf:

\begin{table*}[!h]
    \centering
    \scriptsize
    \caption{Complete evaluation on \MNISTHalf. The values of \YAcc \textit{in-distribution} show that \method and all competitors achieve optimal predictions on labels. The values of \CAcc \textit{in-distribution}, on the other hand, show that all methods pick up a RS. This holds for \DPL, \SL, and \LTN. The pattern completely changes \textit{out-of-distribution}, where all methods struggle in terms of label accuracy $\YAccOod$.}
    \input{tables/mnist-half-complete}
    \label{tab:mnisthalf-full}
\end{table*}

Next, we include the results on \MNISTShortcut. Likewise, \method, when paired with any of the NeSy models, shows drastic improvements in terms of \YECE and \CECE, both \textit{in-} and \textit{out-of-distribution}.


\begin{table*}[!h]
    \centering
    \scriptsize
    \caption{Complete evaluation on \MNISTShortcut. All competitors struggle in terms of \YAcc \textit{in-distribution} when not paired with \SL, while \method shows noticeable improvements when paired with both \DPL and \LTN. The accuracy on concepts \CAcc \textit{in-distribution} shows that all methods pick up a RS, despite being generally suboptimal. In the \textit{out-of-distribution} setting, we observe a drastic degradation of both \YAccOod and \CAccOod.}
    \input{tables/short-mnist-complete}
    \label{tab:mnistshort-full}
\end{table*}

\newpage

\subsection{\BOIA}

We report here the complete evaluation on \BOIA. The values of \mFY show that \method does not noticeably worsen the scores w.r.t.\ \DPL and \DPL paired with \DeepEns, despite being trained with an extra term (conflicting in principle with the optimization on label accuracy). \Laplace and \ProbCBM, on the other hand, perform worse compared to other methods. In terms of \mFC, both \ProbCBM and \method improve the scores compared to \DPL alone. 

\begin{table*}[!h]
    \centering
    \scriptsize
    \caption{Full results on \BOIA.}
    \input{tables/boia-complete}
    \label{tab:boia-full}
\end{table*}

\subsection{\Kandinsky}

We include here the evaluation curves for the active learning experiment on both \YAcc and \CAcc for \DPL paired with the entropy strategy (in \textcolor{goldenrod}{yellow}), with the random baseline (in \textcolor{blue}{blue}), and with \method (in \textcolor{red}{red}).

\begin{figure}[!h]
    \centering
    \begin{tabular}{cc}
        \includegraphics[width=0.475\linewidth]{figures/results/kandinsky/i-o_kandy_y_with_labels-bears-fixed}
        &
        \includegraphics[width=0.475\linewidth]{figures/results/kandinsky/i-o_kand_bears-fixed}
    \end{tabular}
    \caption{\textbf{\method allows selecting informative concept annotations faster.} (\textit{left}) Label accuracy. (\textit{right}) Concept accuracy.} 
    \label{fig:kand-results-complete}
\end{figure}

\newpage

\subsection{Concept-wise Entropy scores for \MNISTHalf}

We report the entropy scores for each concept for all NeSy models we tested. \method performs as desired, whereas the runner-up, \Laplace, struggles to assign low entropy to $0$, especially when paired with \SL and \LTN. 

\begin{figure}[!h]
    \centering
    \begin{tabular}{cc}
    \includegraphics[width=0.475\linewidth]{figures/results/mnist/concept_uncertainty_sighazzo-2.pdf} &
    \includegraphics[width=0.475\linewidth]{figures/results/mnist/concept_uncertainty_dpl_ood-2}
    \end{tabular}
    %
    \caption{\textbf{\DPL + \method shows high entropy for concepts affected by RSs while it does not for others in out-of-distribution settings.} (\textit{left}) In-distribution. (\textit{right}) Out-of-distribution.} 
    \label{fig:dpl-halfmnist-entropy}
\end{figure}

\vspace{2em}

\begin{figure}[!h]
    \centering
    \begin{tabular}{cc}
        \includegraphics[width=0.475\linewidth]{figures/results/mnist/concept_uncertainty_slol-2}
        &
        \includegraphics[width=0.475\linewidth]{figures/results/mnist/concept_uncertainty_slol_ood-2}
    \end{tabular}
    \caption{\textbf{\SL + \method shows high entropy for concepts affected by RSs while it does not for others.} (\textit{left}): in-distribution. (\textit{right}): out-of-distribution.} 
    \label{fig:sl-halfmnist-entropy}
\end{figure}

\vspace{2em}

\begin{figure}[!h]
    \centering
    \begin{tabular}{cc}
        \includegraphics[width=0.475\linewidth]{figures/results/mnist/concept_uncertainty_LTN_2}
        &
        \includegraphics[width=0.475\linewidth]{figures/results/mnist/concept_uncertainty_ltn_ood-2}
    \end{tabular}
    \caption{\textbf{\LTN + \method shows high entropy for concepts affected by RSs while it does not for others.} (\textit{left}): in-distribution. (\textit{right}): out-of-distribution.} 
    \label{fig:ltn-halfmnist-entropy}
\end{figure}


\newpage

\subsection{Confusion Matrices \Kandinsky}

We report the confusion matrices (CMs) for the active learning experiment on the \Kandinsky dataset. At the beginning, \DPL picks a RS showing that only a few concept vectors $\vc$ can be used to solve the classification task. At the last iteration, corresponding to collecting a total of $70$ objects with concept annotation, \DPL and \DPL + \method show very different CMs. Both show that colors have been learned correctly, although the concept annotations collected with \method make \DPL align more to the diagonal, corresponding to the intended solution.

\begin{figure}[!h]
    \centering
    \includegraphics[width=0.4\linewidth]{figures/results/kandinsky/mondi_kandsky_dpl_beginningcolors.pdf}
    \caption{\textbf{\DPL at iteration $0$ in active learning settings on \Kandinsky}}
    \label{fig:dpl-halfmnist-active}
\end{figure}

\begin{figure}[!h]
    \centering
    \begin{tabular}{ccc}
        \includegraphics[width=0.4\linewidth]{figures/results/kandinsky/mondi_kandsky_dpl_end_colors}
        & &
        \includegraphics[width=0.4\linewidth]{figures/results/kandinsky/mondi_kandsky_biretta_end_colors}
    \end{tabular}
    \caption{\textbf{Iteration $70$ in active learning settings on \Kandinsky.} (\textit{left}) \DPL. (\textit{right}) \DPL + \method.}
    \label{fig:dpl-halfmnist-active-end}
\end{figure}

\newpage

\subsection{Confusion Matrices on \BOIA}

\begin{figure}[!h]
    \centering
    \includegraphics[width=\linewidth]{figures/results/boia/frequentist_cm}
    \caption{\textbf{\DPL confusion matrices per concept classes on \BOIA}}
    \label{fig:dpl-frequentist-boia-cm}
\end{figure}

\begin{figure}[!h]
    \centering
    \includegraphics[width=0.9\linewidth]{figures/results/boia/frequentist_cm_2}
    \caption{\textbf{\DPL multilabel confusion matrix on \BOIA}}
    \label{fig:dpl-frequentist-boia-cm-2}
\end{figure}

\newpage

\begin{figure}[!h]
    \centering
    \includegraphics[width=\linewidth]{figures/results/boia/biretta_cm}
    \caption{\textbf{\DPL + \method confusion matrices per concept classes on \BOIA}}
    \label{fig:biretta-frequentist-boia-cm}
\end{figure}

\begin{figure}[!h]
    \centering
    \includegraphics[width=\linewidth]{figures/results/boia/biretta_cm_2}
    \caption{\textbf{\DPL + \method multilabel confusion matrix on \BOIA}}
    \label{fig:biretta-frequentist-boia-cm-2}
\end{figure}

\newpage

\subsection{Confusion Matrices on \MNISTHalf}

\begin{figure}[!h]
    \centering
    \begin{tabular}{ccc}
        \includegraphics[width=0.4\linewidth]{figures/results/mnist/confusion_matrix_combined_concept_frequentist_.pdf} 
        & \phantom{POLLO} &
        \includegraphics[width=0.4\linewidth]{figures/results/mnist/confusion_matrix_combined_concept_resense_.pdf.pdf}
    \end{tabular}
    
    \caption{\textbf{(\textit{left}) \DPL and (\textit{right}) \DPL + \method concepts confusion matrix on \MNISTHalf}}
    \label{fig:dpl-biretta-mnist-cm}
\end{figure}

\end{document}

\section{Evaluation without independent distributions}

We consider the problem of evaluating a model that does not respect the independence assumption over different concepts. The independence assumption reads as:
\[
    p_\theta(\vC \mid \vx) = \prod_{i = 1}^k p_\theta( C_i \mid \vx)
\]
In the current implementation of DPL (as well as SL), we are making use of the following trick to compute the probability on labels:
\[
\begin{aligned}
    p_\theta(\vy = y \mid \vx; \BK) &= \sum_\vc u_\BK (\vy = y \mid \vc) p_\theta(\vc \mid \vx) \\
    &= \sum_{i_1, \ldots, i_k} W^y_{i_1, \ldots, i_k} p(C_1 = i_1 \mid \vx) \times \ldots \times p(C_k = i_k \mid \vx)
\end{aligned}
\]
where $W_{i_1, \ldots i_k}^y $ is the tensor underlying $u_\BK (\vy = y \mid \vc = (i_1, \ldots, i_k)^\top)$. The whole computation can be reformulated with tensors, by introducing
$ P^{i_1, \ldots, i_k} (\vx) = p_1^{i_1}(\vx) \otimes \ldots \otimes p_k^{i_k}(\vx) $, where $p^{i_1}_1 (\vx) = p(C_1 = i_1 \mid \vx)$ etc. 
%
\AV{that's a circuit! In a sense you are reinventing the conditional circuit distribution (e.g. in SPL) : )
}   \EM{Yes, I think this is the connection with circuit notation. We also implement DPL with this!}
%
In the case of two concepts $C_1$ and $C_2$ (\eg MNIST-addition) we can compute the tensor of probabilities as:
\[
    P(\vx) = p(C_1 \mid \vx) p(C_2 \mid \vx)^\top
\]

Now, we tackle the general case where $P^{i_1 i_2}(\vx)$ is not factorized, but we want to compute quantities like $p_\theta(\vy \mid \vx; \BK) $, $p_\theta (\vC \mid \vG)$, and $\arg \max p_\theta(\vC \mid \vx) $. 
%
We consider the Bayesian approximation where we compute:
\[
    p(\vC \mid \vx; \calD) = \frac{1}{M} \sum_{m=1}^M p_{\theta_m} (\vC \mid \vx) \approx \int p_\theta (\vC \mid \vx) p(\theta \mid \calD) \mathrm{d} \theta
\]
We make use of the fact that each $p_{\theta_m}$ is factorized, so that: 
\[
    P^{i_1 \ldots i_k} (\vx; \calD) = \frac{1}{M} \sum_{m=1}^M p^{i_1}_1(\vx; \theta_m) \otimes \ldots \otimes p^{i_k}_k(\vx; \theta_m) 
\]
In general, the tensor $P^{i_1, \ldots i_k}$ is not factorized, \ie cannot be expressed as the tensor product of different elements. We now compute the label prediction as:
\[
    \begin{aligned}
        p(\vY=y \mid \vx; \BK, \calD) &= \sum_{i_1, \ldots, i_k} W^y_{i_1 \ldots i_k} P^{i_1 \ldots i_k}(\vx; \calD) \\
        &= \sum_{i_1, \ldots, i_k} W^y_{i_1 \ldots i_k} \frac{1}{M} \sum_{m=1}^M p^{i_1}_1(\vx; \theta_m) \otimes \ldots \otimes p^{i_k}_k(\vx; \theta_m) \\
        &= \frac{1}{M} \sum_{m=1}^M \sum_{i_1, \ldots, i_k} W^y_{i_1 \ldots i_k} p^{i_1}_1(\vx; \theta_m) \otimes \ldots \otimes p^{i_k}_k(\vx; \theta_m) \\
        &= \frac{1}{M} \sum_{m=1}^M p_{\theta_m} (\vY = y \mid \vx; \BK)
    \end{aligned}
\]
Eventually, the same term can be derived by the linearity of probabilistic logic, that is:
\[  
\begin{aligned}
    p(\vY \mid \vX; \calD) &= \sum_\vc u_\BK (\vY \mid \vc) \int p_\theta (\vc \mid \vX) p(\theta \mid \calD) \mathrm{d} \theta \\
    &= \int p(\theta \mid \calD) \sum_\vc u_\BK (\vY \mid \vc) p_\theta (\vc \mid \vX) \mathrm{d} \theta \\
    &= \int p_\theta(\vY \mid \vX) p(\theta \mid \calD) \mathrm{d} \theta
\end{aligned}
\]
We now turn to computing the form of $\alpha: \vG \to \vC$, which we can frame as:
\[
\begin{aligned}
    p_\theta(\vC \mid \vg) &= \frac{1}{|\calD_\vg|} \sum_{\vx \in \calD_\vg}  p_\theta(\vC \mid \vx)  \\
\end{aligned}
\]
where $\calD_\vg = \{ \vx \in \calD: f^{-1}_{1:k} (\vx) = \vg \}$ is the set of input variables having the same ground-truth concepts $\vg$. We compute the Bayesian estimate with:
\[
\begin{aligned}
    P^{i_1 \ldots i_k} (\vg;\calD) &:=  \frac{1}{|\calD_\vg|} \sum_{\vx \in \calD_\vg}  P^{i_1 \ldots i_k} (\vx ; \calD) \\
    &=  \frac{1}{|\calD_\vg|} \sum_{\vx \in \calD_\vg} \frac{1}{M} \sum_{m=1}^M p^{i_1}_1(\vx; \theta_m) \otimes \ldots \otimes p^{i_k}_k(\vx; \theta_m) \\
    &= \frac{1}{M} \sum_{m=1}^M \frac{1}{|\calD_\vg|} \sum_{\vx \in \calD_\vg} p^{i_1}_1(\vx; \theta_m) \otimes \ldots \otimes p^{i_k}_k(\vx; \theta_m) \\
    &=\frac{1}{M} \sum_{m=1}^M  P^{i_1 \ldots i_k} (\vg ; \theta_m)
\end{aligned}
\]
so the map $\alpha$ can be safely evaluated as the average of the different models. Finally, let us consider how to extract the $\argmax$ for the concepts, this is given by:
\[
    \argmax_{\vc} p(\vc \mid \vx; \calD) = \argmax_\vc \frac{1}{M} \sum_m  p_{\theta_m} (\vc \mid \vx)
\]
this can be computed as:
\[
    \vc = \argmax_{i_1, \ldots, i_k} P^{i_1 \ldots i_k} (\vx; \calD)
\]
We summarize in \cref{tab:key-quantities} how to implement these operations in practice by leveraging averages of factorized models. 

\begin{table*}[!t]
    \centering
    \begin{tabular}{llcp{7cm}}
        \toprule
        %
         & \sc tensor & \sc averaging & \sc implementation \\
         \midrule \\
         %
         $p_{\theta_m}(\vc \mid \vx)$ & $P^{i_1 \ldots i_k} (\vx; \theta_m)$ & $\xmark$ & \textbf{1.} Since $p_{\theta_m}(\vc \mid \vx)$ is factorized, evaluate the tensor product $p_{\theta_m}(C_1 \mid \vx) \otimes \ldots \otimes p_{\theta_m}(C_k \mid \vx) $ to obtain the tensor of probabilities of possible worlds.
         \\
        $p(\vc \mid \vx; \calD)$ & $P^{i_1 \ldots i_k} (\vx; \calD) $ & $\checkmark$ & \textbf{1.} Evaluate the probability $P^{i_1 \ldots i_k}(\vx; \theta_m) $ for each $\theta_m$, then perform the average. \\
        %
        $p(\vy \mid \vx; \BK, \calD)$ & $ W^y_{i_1 \ldots i_k} P^{i_1 \ldots i_k}(\vx;\calD)$  & $\checkmark$ & \textbf{1.} Evaluate the probability $P^{i_1 \ldots i_k}(\vx; \calD)$ and use it in the forward pass. {\textbf{2.} Evaluate the label for each model, then average them. 
        } \\
        %
        $p(\vc \mid \vg; \calD)$ & $P^{i_1 \ldots i_k} (\vg; \calD)$ & $\checkmark$ & \textbf{1.} Evaluate the probability $P^{i_1 \ldots i_k} (\vx; \calD) $  then average over $\calD_\vg$ \textbf{2.} Evaluate the probability $P^{i_1 \ldots i_k} (\vg; \theta_m) $ for each $\theta_m$ then average on them. \\
        \bottomrule
    \end{tabular}
    \caption{Computing key quantities for evaluation}
    \label{tab:key-quantities}
\end{table*}


\begin{remark}[Choice of $\calB$] 
    Notice that the choice may seem arbitrary since given $p(\vC \mid \vG)$ there can be different subsets $ \calB $'s giving the same probability distribution. We show that they are in a specific relationship with one another and it holds that when considering bigger $|\calB'| > |\calB|$, for $\calB, \calB' \subseteq \calA^*$, we have that the rightmost term in eq 40 matches. We do that by ... 
\end{remark}


\begin{proposition}[Optima of the Bayes estimator]
    Given a uniform prior $p(\theta)$ of the form:
    \[
    p(\theta) = 
    \begin{cases}
        \frac{1}{V_n (k) } & \text{for} \; ||\theta||^2 \leq k^2 \\
        0 & \text{otherwise}
    \end{cases}
    \]
    it holds that the \textsf{MAP} parameters are also optimal, that is $ \Theta_{\textsf{MAP}} (\calD; \BK) \subseteq \Theta^*(\calD, \BK)$.
\end{proposition}
    
\begin{proof}
    We consider the optimization given by posterior distribution:
    \[
        \argmax_\theta \log p(\theta \mid \calD) = \argmax_\theta [\log p(\calD \mid \theta) + \log p(\theta)]
    \]
    With this choice of the prior, the parameters are constrained to be within a ball of radius $k$. The optimization problem hence amounts to solving:
    \[
        \argmax_\theta \log p(\calD \mid \theta), \quad \text{s.t.} \; ||\theta||^2 \leq k^2
    \]
    The only solutions to this problem are of the form $ \theta \in \Theta^*(\calD, \BK) $ such that $||\theta||^2 \leq k^2$. Therefore, the posterior distribution must have support within $\Theta^*(\calD, \BK)$. This proves our claim.
\end{proof}

\begin{remark}
    As customary in Bayesian inference, the choice of the prior influences the optimal solutions obtained in the learning. By taking a normal prior over the weights $p(\theta) = \calN (\theta \mid 0, 1)$ this leads to the $l_2$ regularization of the weights \EM{add citation}. This prior, however, may lead to optima that are sub-optimal for the labels, thus obtaining a posterior $p(\theta \mid \calD)$ that contains not proper Reasoning Shortcuts. In practice, we do not require a specific distributional form for $p(\theta)$ given that our tested methods sample from the posterior by (\textit{i}) sampling the weights in \MCDrop, (\textit{ii}) approximating the posterior distribution around the MLE with \Laplace, and (\textit{iii}) creating an ensemble of different models with \DeepEns and \method. 
\end{remark}



\EM{At this point, the following is optional...}

\begin{proposition}[Solving $\max H(\vC \mid \vG)$]
    Consider the subset $\calB \subseteq \calA^*$, such that $\omega_\alpha = 1 / |\calB|$ if $\alpha \in \calB$ and $\omega_\alpha = 0$ otherwise, then we have:
    \[
        \max_{\vomega^*} H(\vC \mid \vG) \geq \max_{\calB \subseteq \calA^*} \sum_{\vg} p(\vg) \sum_{\alpha \in \calB}  \frac{1}{|\calB|} \big[ \log |\calB| - \log \big(\sum_{\alpha' \in \calB} \Ind{\alpha'(\vg) = \alpha(\vg)} \big) \big]
    \]
    Similarly, for an ensemble $\{ \theta_i \}$, for $\theta \in \Theta^*$, we obtain that when the members of the ensembles entail deterministic distributions $p_{\theta_i} (\vC \mid \vG)$, the objective yields:
    \[
        \max_{ \{ \theta_i\} } H(\vC \mid \vX) 
        \equiv 
        \max_{\calB \subseteq \calA^*} \sum_{\vg} p(\vg) \sum_{\alpha \in \calB}  \frac{1}{|\calB|} \big[ \log |\calB| - \log \big(\sum_{\alpha' \in \calB} \Ind{\alpha'(\vg) = \alpha(\vg)} \big) \big]
    \]
\end{proposition}

\begin{proof}
    We show the first point of the proposition by calculation. Consider the set of solutions $ \Omega^* = \argmax_{\vomega^*} H(\vC \mid \vG)$ and suppose that there exists one $\Vec{\omega^*}$ whose components are all equal to each other, except for some that are zero. Then, the following holds for the maximization problem:
    \[  
    \begin{aligned}
        & \max_{ \vomega^* } - \sum_{\vg \in \{ 0,1 \}^{2k} } p(\vg) \sum_{\vc \in \{ 0,1 \}^{2k} } \left[ 
        \sum_{\alpha \in \calA^*} \omega_\alpha \Ind{ \vc = \alpha(\vg) } \log \sum_{\alpha' \in \calA^*} \omega_{\alpha'} \Ind{ \vc = \alpha'(\vg) }
        \right]  = \\
        &= \max_{\calB \subseteq \calA^*} - \sum_{\vg \in \{ 0,1 \}^{2k} } p(\vg) \sum_{\vc \in \{ 0,1 \}^{2k} } \left[ 
        \sum_{\alpha \in \calB} \frac{1}{|\calB|} \Ind{ \vc = \alpha(\vg) } \log \sum_{\alpha' \in \calB} \frac{1}{|\calB|} \Ind{ \vc = \alpha'(\vg) }
        \right]
    \end{aligned}
    \]
    On the other hand, if there is no $\Vec{\omega^*}$ that has components all equal to each other, except some to zero, the equivalence does not hold, and in general we obtain that optimizing for $\calB \subseteq \calA^*$ does not match the best entropy possible:
    \[  
    \begin{aligned}
        &= \max_{\calB \subseteq \calA^*} - \sum_{\vg \in \{ 0,1 \}^{2k} } p(\vg) \sum_{\vc \in \{ 0,1 \}^{2k} } \left[ 
        \sum_{\alpha \in \calB} \frac{1}{|\calB|} \Ind{ \vc = \alpha(\vg) } \log \sum_{\alpha' \in \calB} \frac{1}{|\calB|} \Ind{ \vc = \alpha'(\vg) }
        \right] \\
        &< \max_{ \vomega^* } - \sum_{\vg \in \{ 0,1 \}^{2k} } p(\vg) \sum_{\vc \in \{ 0,1 \}^{2k} } \left[ 
        \sum_{\alpha \in \calA^*} \omega_\alpha \Ind{ \vc = \alpha(\vg) } \log \sum_{\alpha' \in \calA^*} \omega_{\alpha'} \Ind{ \vc = \alpha'(\vg) }
        \right] 
    \end{aligned}
    \]
    
    We can further simplify the expression for $\max_{\calB \subset \calA^*} H(\vC \mid \vG) $ by resorting to the fact that the coefficients have the same magnitude:
    \[  \label{eq:evaluating-cond-ent-b}
    \begin{aligned}
        H(\vC \mid \vG)  &= - \sum_{\vg \in \{ 0,1 \}^{2k} } p(\vg) \sum_{\vc \in \{ 0,1 \}^{2k} } \left[ 
        \sum_{\alpha \in \calB} \frac{1}{|\calB|} \Ind{ \vc = \alpha(\vg) } \log \sum_{\alpha' \in \calB} \frac{1}{|\calB|} \Ind{ \vc = \alpha'(\vg) }
        \right]   \\
        %
        &= - \sum_{\vg \in \{ 0,1 \}^{2k} } p(\vg) \sum_{\alpha \in \calB} \frac{1}{|\calB|}
        \left[ 
        \sum_{\vc \in \{ 0,1 \}^{2k} } \Ind{ \vc = \alpha(\vg) } \log \sum_{\alpha' \in \calB} \frac{1}{|\calB|} \Ind{ \vc = \alpha'(\vg) }
        \right]  \\
        %
        &= - \sum_{\vg \in \{ 0,1 \}^{2k} } p(\vg) \sum_{\alpha \in \calB} \frac{1}{|\calB|}
        \left[ 
        \log \sum_{\alpha' \in \calB} \frac{1}{|\calB|} \Ind{ \alpha'(\vg) = \alpha(\vg)}
        \right]  \\
        %
        &= \sum_{\vg \in \{ 0,1 \}^{2k} } p(\vg) \sum_{\alpha \in \calB} \frac{1}{|\calB|}
        \Big[ 
        \log |\calB| - \log \sum_{\alpha' \in \calB} \Ind{ \alpha'(\vg) = \alpha(\vg)}
        \Big]
    \end{aligned}
    \]
    This proves the first point. Following, we consider the case where $p(\vC \mid \vX) = \frac{1}{M} \sum_{ i=1 }^M p_{\theta_i} (\vC \mid \vX) $ is the average of $M$ models, each that entails a deterministic $p_{\theta_i} (\vC \mid \vG) = \Ind{\vC = \alpha_i(\vG)}$. By writing the expression of $H(\vC \mid \vX)$ we obtain:
    \[
    \begin{aligned}
        H(\vC \mid \vX) &= - \sum_{\vg \in \{ 0,1 \}^{2k} } p(\vg) \bbE_{p^*(\vx \mid \vg)} \sum_{\vc \in \{ 0,1 \}^{2k} } p(\vc \mid \vx) \log p(\vc \mid \vx)  \\
        %
        &=  - \sum_{\vg \in \{ 0,1 \}^{2k} } p(\vg) \bbE_{p^*(\vx \mid \vg)} \sum_{\vc \in \{ 0,1 \}^{2k} } \frac{1}{M} \sum_{ i=1 }^M p_{\theta_i} (\vc \mid \vx) \log \frac{1}{M} \sum_{ j=1 }^M p_{\theta_j} (\vc \mid \vx) \\
        %
        &= - \sum_{\vg \in \{ 0,1 \}^{2k} } p(\vg) \bbE_{p^*(\vx \mid \vg)} \sum_{\vc \in \{ 0,1 \}^{2k} } \frac{1}{M} \sum_{ i=1 }^M \Ind{\vc = \alpha_i(\vg)} \log \frac{1}{M} \sum_{ j=1 }^M \Ind{\vc = \alpha_j(\vg)} \\
        %
        &= - \sum_{\vg \in \{ 0,1 \}^{2k} } p(\vg) \sum_{\vc \in \{ 0,1 \}^{2k} } \frac{1}{M} \sum_{ i=1 }^M \Ind{\vc = \alpha_i(\vg)} \log \frac{1}{M} \sum_{ j=1 }^M \Ind{\vc = \alpha_j(\vg)} \\
        &= H(\vC \mid \vG)
    \end{aligned}
    \]
    where the second line follows from the fact that deterministic $p_\theta(\vC \mid \vG)$ are in one-to-one correspondence w.r.t. $p_\theta(\vC \mid \vX)$ from Prop. 3 in \citep{marconato2023not}, and
    $H(\vC \mid \vG)$ is evaluated as in \cref{eq:evaluating-cond-ent-b}. In particular, maximizing the members of a deterministic ensemble such that they attain maximal
    entropy $H(\vC \mid \vX)$ is equivalent to maximizing for the subset $\calB \subseteq \calA^*$ with equal weights, namely:
    \[
        \max_{ \{ \theta_i \} } H(\vC \mid \vX) \equiv \max_{ \calB \subseteq \calA^* } H(\vC \mid \vG)
    \]
    This concludes the proof.
\end{proof}