%------------------------------------------------------------------------------
% UAI 2023: Fred & Lounes
%------------------------------------------------------------------------------

\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%------------------------------------------------------------------------------
% Packages
%------------------------------------------------------------------------------

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{microtype}
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage{amsbsy}
\usepackage{amssymb}
\usepackage{amsfonts}
\usepackage{nicefrac}
\usepackage{bm}
\usepackage{bbm}
\usepackage{adjustbox}
\usepackage{multirow}
\usepackage{booktabs}
\usepackage{bbm, dsfont}
\usepackage{bbold}
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{caption}
\usepackage{subcaption}
\usepackage[most]{tcolorbox}
% Algorithms
\usepackage[ruled,noend,vlined]{algorithm2e}
%Tikz
\usepackage{pgf,tikz}
\usetikzlibrary{shapes,arrows,automata,positioning,calc,matrix}
% Savetrees
%\usepackage[subtle,mathspacing=normal,wordspacing=normal]{savetrees}
% \usepackage{siunitx} % for proper typesetting of numbers and units

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)


%------------------------------------------------------------------------------
% Theorem Styles
%------------------------------------------------------------------------------

% Custom amsthm style used for the `example' environment: 9pt above and below,
% no body-font override, no indentation, bold head followed by a period.
\newtheoremstyle{example}% name
  {9pt}%      Space above, empty = `usual value'
  {9pt}%      Space below
  {}%         Body font
  {}%         Indent amount (empty = no indent, \parindent = para indent)
  {\bfseries}% Thm head font
  {.}%        Punctuation after thm head
  { }%         Space after thm head: \newline = linebreak
  {}%         Thm head spec
% Activate the style; it applies to every \newtheorem declared until the next \theoremstyle.
\theoremstyle{example}
\newtheorem{example}{Example}
% Second custom style, with exactly the same nine arguments as `example' above.
\newtheoremstyle{theorem}% name
  {9pt}%      Space above, empty = `usual value'
  {9pt}%      Space below
  {}%         Body font
  {}%         Indent amount (empty = no indent, \parindent = para indent)
  {\bfseries}% Thm head font
  {.}%        Punctuation after thm head
  { }% Space after thm head: \newline = linebreak
  {}%         Thm head spec
% All environments below use the `theorem' style. None of them is given an
% optional counter argument, so each one is numbered with its own counter.
\theoremstyle{theorem}
\newtheorem{theorem}{Theorem}
\newtheorem{conjecture}{Conjecture}
\newtheorem{corollary}{Corollary}
\newtheorem{lemma}{Lemma}
\newtheorem{problem}{Problem}
\newtheorem{proposition}{Proposition}
\newtheorem{definition}{Definition}
\newtheorem{remark}{Remark}
\newtheorem{note}{Note}
\newtheorem{open}{Open question}

%------------------------------------------------------------------------------
% Commands
%------------------------------------------------------------------------------

%% Self-defined macros
% \swap[sep]{a}{b}: typesets b, then the separator (default `-'), then a.
\newcommand{\swap}[3][-]{#3#1#2} % just an example


% \mathbb shorthands; the p / pp suffixes append a `+' / `++' subscript.
\newcommand{\B}{\mathbb{B}}
\newcommand{\PP}{\mathbb{P}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\Rp}{\mathbb{R}_+}
\newcommand{\Rpp}{\mathbb{R}_{++}}
% NOTE: this replaces the existing \S (the LaTeX kernel's section sign) with \mathbb{S}.
\renewcommand{\S}{\mathbb{S}}
\newcommand{\Q}{\mathbb{Q}}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\Zp}{\mathbb{Z}_+}
\newcommand{\Zpp}{\mathbb{Z}_{++}}
% Typeset the empty set with the \varnothing glyph everywhere.
\renewcommand{\emptyset}{\varnothing}
% Identity macro: expands to its argument unchanged.
\newcommand{\Id}[1]{#1}

% Starred form of \DeclareMathOperator: sub/superscripts are set as limits
% (below/above the operator name) in display style.
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\Argmax}{Argmax}
\DeclareMathOperator*{\Argmin}{Argmin}

% \Binom{n}: binomial-style symbol with [n] on top and 2 below.
\newcommand{\Binom}[1]{\binom{[#1]}{2}}
% Vectors are typeset in bold (bm package); this replaces the default \vec accent.
\renewcommand{\vec}[1]{\bm{#1}}
% \Var{X}: italic multi-letter identifier `Var' applied to its argument.
\newcommand{\Var}[1]{\mathit{Var}(#1)}
% Angle-bracket delimiters: sequences and inner products.
\newcommand{\seq}[1]{\langle #1 \rangle}
\newcommand{\inner}[2]{\langle #1, #2 \rangle}
% Norms: \norm auto-sizes its bars with \left/\right, \bignorm uses fixed \big bars,
% and the remaining variants use plain bars with an upright subscript label.
\newcommand{\norm}[1]{\left\|#1\right\|}
\newcommand{\bignorm}[1]{\bigl\|#1\bigr\|}
\newcommand{\norma}[1]{\|#1\|_{\mathrm{a}}}
\newcommand{\normas}[1]{\|#1\|_{\mathrm{a^*}}}
\newcommand{\normb}[1]{\|#1\|_{\mathrm{b}}}
\newcommand{\normbs}[1]{\|#1\|_{\mathrm{b^*}}}
% Sign function, declared as a proper math operator so that it gets operator
% spacing (a bare \mathrm{sgn} is spaced like an ordinary symbol).
\DeclareMathOperator{\sgn}{sgn}
% \size{S}: cardinality bars.
\newcommand{\size}[1]{|#1|}
% Bold Greek shorthands for vector-valued quantities.
\newcommand{\vecphi}{\bm{\phi}}
\newcommand{\vecPhi}{\bm{\Phi}}
\newcommand{\vecbeta}{\bm{\beta}}
\newcommand{\vectheta}{\bm{\theta}}
\newcommand{\vecTheta}{\bm{\Theta}}

% Manual page-length tweaks: lengthen (\longpage) or shorten (\shortpage)
% the current page by one \baselineskip.
\newcommand{\longpage}{\enlargethispage{\baselineskip}}
\newcommand{\shortpage}{\enlargethispage{-\baselineskip}}

%------------------------------------------------------------------------------
% Hyphenation
%------------------------------------------------------------------------------


%------------------------------------------------------------------------------
% Front Page
%------------------------------------------------------------------------------

\title{Approximating Probabilistic Explanations via Supermodular Minimization}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<bounia@cril.fr>?Subject=Your UAI 2023 paper}{Louenas Bounia}{}}
\author[1]{\href{mailto:<koriche@cril.fr>?Subject=Your UAI 2023 paper}{Frederic Koriche}{}}
% Add affiliations after the authors
\affil[1]{%
    CRIL UMR CNRS 8188\\
    Université d'Artois, France
}
  
\begin{document}
\maketitle

\begin{abstract}
  Explaining in accurate and intelligible terms the predictions made by classifiers 
  is a key challenge of eXplainable Artificial Intelligence (XAI).
  To this end, an \emph{abductive explanation} for the predicted label of
  some data instance  is a subset-minimal collection of features such that the restriction of the instance 
  to these features is sufficient to determine the prediction. 
  However, due to cognitive limitations, abductive explanations are often too large to be interpretable. 
  In those cases, we need to reduce the size of abductive explanations, while still 
  determining the predicted label with high probability. In this paper, we show that finding such 
  \emph{probabilistic explanations} is NP-hard, even for decision trees.
  In order to circumvent this issue, we investigate the \emph{approximability} of probabilistic explanations 
  through the lens of supermodularity. We examine both greedy descent and greedy ascent 
  approaches for supermodular minimization, whose approximation guarantees  
  depend on the curvature of the ``unnormalized'' error function that evaluates the precision of the explanation.
  Based on various experiments for explaining decision tree predictions, 
  we show that our greedy algorithms provide an efficient alternative to 
  the state-of-the-art constraint optimization method.   
\end{abstract}


%------------------------------------------------------------------------------
% Introduction
%------------------------------------------------------------------------------

\section{Introduction}\label{sec:introduction}

Basically, the \emph{classification} problem is to extrapolate from a set of labeled data instances a 
hypothesis, or \emph{classifier}, that accurately predicts the labels of new, incoming data instances. 
Decision trees, random forests, support vector machines, and neural nets are common examples 
of classifiers for which theoretical properties have been extensively studied in the machine learning 
literature (see, e.g., \cite{Sayed.2022} for a recent survey). The spectrum of applications for these classifiers 
is wide, ranging from document and image classification, to customer profiling and medical diagnosis. 
However, with the increasing deployment of data-driven learning models in our society comes the issue of 
\emph{explaining} predictions in human intelligible terms. As a key topic of eXplainable Artificial Intelligence (XAI), 
this issue is exacerbated in sensitive domains, such as cybersecurity and healthcare, where explanations 
are crucial for building trust and confidence in the classifier \citep{GuidottiMRTGP.2019,Miller.2019,SamekMVHM.2019,Molnar.2020}.  

Among the various types of explanations proposed in the XAI literature, 
\emph{formal} explanations are particularly interesting, since their soundness  
can be mathematically validated \citep{MarquesSilvaI.2022}. Notably, when 
the classifier $h$ is a Boolean function, a common explanation for predicting the output $h(\vec x)$
of some data instance $\vec x$ is a subset-minimal collection of features $I$ such that the restriction $\vec x_I$ of 
$\vec x$ to $I$ determines $h(\vec x)$. Such an \emph{abductive} explanation \citep{IgnatievNM.2019}, 
also called \emph{sufficient reason} \citep{DarwicheH.2020}, is logically sound, 
because $\vec x_I$ can be viewed as a prime implicant of the hypothesis $h$ that covers the instance 
$\vec x$ \citep{ShihCD.2018}. Although finding an abductive explanation is $\mathrm{NP}$-hard in general, 
tractable cases have been identified for various hypothesis classes 
\citep{MarquesSilvaGCIN.2020,AudemardBBKLM.2021,HuangIIM.2021,CooperM.2023}. 

However, the soundness of explanations is not the only criterion for clarifying in intelligible terms 
the predictions made by classifiers. The \emph{conciseness} property is also important, 
since an abductive explanation involving too many features cannot be understood by human users.
Indeed, in cognitive psychology, it has long been recognized that there is an upper limit on our 
ability to reason about simultaneously interacting elements. As conjectured 
by \cite{Miller.1956}, this limit is seven plus or minus two elements and, since then, it has been confirmed by 
many experiments in cognitive science. Thus, restricting the size of explanations 
appears as a \emph{constraint} for ensuring their intelligibility.

Based on these considerations, how can we reduce the size of explanations 
while retaining much of their soundness? This is where \emph{probabilistic explanations} \citep{WaldchenMHK.2021} 
come into the equation. Namely, suppose we are given an abductive explanation $I$ for a classifier 
$h$ and some instance $\vec x$, together with a size limit $k \leq \size{I}$. For any candidate subset $S$ of $I$, 
let $\epsilon_{h,\vec x}(S)$ denote the probability that a random instance $\vec y$ covered by $\vec x_S$ 
is classified differently from $\vec x$ by $h$. In other words, 
$\epsilon_{h,\vec x}(S)$ is the probability of making an ``explanation mistake''
for inferring $h(\vec x)$, using $\vec x_S$ instead of $\vec x$.
With this notion in hand, the main problem considered in this study is to find a 
probabilistic explanation $S \subseteq I$ of size at most $k$ such that $\epsilon_{h,\vec x}(S)$ is minimized.   

Unfortunately, this optimization task is very expensive from a computational viewpoint. 
Indeed, the problem of finding a minimizer $S$ of $\epsilon_{h,\vec x}(\cdot)$ subject to 
some cardinality constraint $\size{S} \leq k$ is $\mathrm{NP}^{\mathrm{PP}}$-hard 
for general classifiers \citep{WaldchenMHK.2021}, and $\mathrm{NP}$-hard for decision trees \citep{ArenasBOS.2022}. 
As shown in the present study, this problem remains $\mathrm{NP}$-hard for decision trees 
even in the restricted case where $S$ is a subset of some given abductive explanation $I$. 

In order to overcome such a computational barrier, this paper investigates the \emph{approximability}
of probabilistic explanations through the lens of supermodularity. As $\epsilon_{h,\vec x}(S)$ 
can be viewed as the number $\mu_{h,\vec x}(S)$ of mistakes induced from the choice of $S$, 
averaged over the number of instances covered by $\vec x_S$, our results exploit two key properties:
(i) the unnormalized error function $\mu_{h,\vec x}(\cdot)$ is \emph{supermodular} and \emph{non-increasing},
and (ii) the normalization factor is \emph{constant} for all subsets $S$ with the same size.
Thus, even if $\epsilon_{h,\vec x}(\cdot)$ is not supermodular, we can still use approximation 
algorithms for supermodular minimization, by coupling them with a level-wise selection method, 
in order to derive probabilistic explanations endowed with approximation guarantees.

To this point, it is well-known that the task of maximizing a non-decreasing submodular  
function subject to a cardinality constraint is $(1 - \frac{1}{e})$-approximable \citep{NemhauserW.1978}. 
The situation is however different 
for minimizing non-increasing supermodular functions: the problem is not approximable  
to within a constant, unless $\mathrm{P} = \mathrm{NP}$ \citep{MittalS.2013}. Still, approximation factors
can be provided by taking into account the \emph{curvature} of the objective function 
\citep{Ilev.2001,SviridenkoVW.2017}. 

In this paper, we present two conceptually simple and easy-to-implement algorithms, 
whose approximation factors depend on the curvature $c$ of the function
$\mu_{h,\vec x}(\cdot)$. The first algorithm is a \emph{greedy descent} 
method that achieves a $\frac{e^{p} - 1}{p}$-approximation, where $p = \frac{c}{1 - c}$, 
and the second algorithm is a \emph{greedy ascent} method that achieves a $\frac{1}{1 - c}$-approximation.
The sizes of the greedy descent and greedy ascent solutions are bounded by $k$ and $k\ln\left(\frac{2e}{c}\right)$, respectively.  

Both algorithms are empirically compared with the 
constraint-based approach suggested in \citep{ArenasBOS.2022}, 
which aims at inferring \emph{optimal} probabilistic explanations for decision tree predictions. 
Experimental results indicate that our greedy algorithms can efficiently find accurate explanations 
and, unlike the constraint-based approach, they are able to scale up on high-dimensional explanation tasks.   

This paper is organized as follows. The main concepts relating to probabilistic explanations and
supermodular minimization are introduced in Section~\ref{sec:explanations} and Section~\ref{sec:supermodular}, 
respectively. Our approximation algorithms are theoretically analyzed in Section~\ref{sec:algorithms},
and empirically validated in Section~\ref{sec:experiments}. Finally, the related work and some perspectives 
of further research are discussed in Section~\ref{sec:discussion}.

%------------------------------------------------------------------------------
% Probabilistic Explanations
%------------------------------------------------------------------------------

\section{Probabilistic Explanations}\label{sec:explanations}

In this section, we start with some background about probabilistic explanations,
and then, we examine some computational aspects related to their evaluation and optimization. 

\subsection{Notation and Problem Formulation}

For a positive integer $d$, we use $[d]$ to denote the set $\{1,\cdots,d\}$.
The classifiers under consideration in this study are hypotheses 
of the form $h: \{0,1\}^d \rightarrow \{0,1\}$. Thus, any input of $h$ is a 
$d$-dimensional Boolean vector $\vec x$, called an \emph{instance}, and the output $h(\vec x)$ 
is a Boolean value, classifying $\vec x$ as a negative example or a positive one.  
A \emph{partial instance} is a vector $\vec z \in \{0,1,*\}^d$, 
where $z_i = *$ indicates that the $i$th feature of $\vec z$ is left undefined.
An instance $\vec x$ is \emph{covered} by $\vec z$, if 
$x_i = z_i$ for all features $i \in [d]$ such that $z_i \neq *$. For a subset $S \subseteq [d]$,
the \emph{restriction} of $\vec x$
to $S$, denoted $\vec x_S$, is the partial instance in $\{0,1,*\}^d$ such that, 
for each $i \in [d]$, $(x_S)_i = x_i$ if $i \in S$, and $(x_S)_i = *$ otherwise. 
Clearly, any instance $\vec y \in \{0,1\}^d$ is covered by $\vec x_S$ if and only if $\vec y_S = \vec x_S$. 

Given a classifier $h$, and an instance $\vec x$ for which  
the prediction $h(\vec x)$ must be explained, 
let $\epsilon_{h,\vec x}: 2^{[d]} \rightarrow \mathbb R$ denote the \emph{error function} given by
\begin{align}
\label{def:error}
\epsilon_{h,\vec x}(S) \!=\! \frac{\size{\{\vec y \in \{0,1\}^d: h(\vec y) \neq h(\vec x), \vec y_S = \vec x_S\}}}
{\size{\{\vec y \in \{0,1\}^d: \vec y_S = \vec x_S\}}} 
\end{align}  
As indicated above, $\epsilon_{h,\vec x}(S)$ can be thought of as the probability of making an ``explanation mistake''
when using the partial instance $\vec x_S$ instead of the complete instance $\vec x$.
Given a precision parameter $\varepsilon \in [0,1]$, an explanation $S$ is called  
\emph{$(1 - \varepsilon)$-probable} if  $\epsilon_{h,\vec x}(S) \leq \varepsilon$. 
We say that $S$ is \emph{abductive} if $\epsilon_{h,\vec x}(S) = 0$, and $\epsilon_{h,\vec x}(S') > 0$ 
for every proper subset $S'$ of $S$. Note that (\ref{def:error}) can be rewritten as
\begin{align}
  \label{def:decomposition}
  \epsilon_{h,\vec x}(S) = \frac{\mu_{h,\vec x}(S)}{2^{d - \size S}}
\end{align}
where $\mu_{h,\vec x}(S)$ is the number of mistakes induced from the choice of $S$, that is,
\begin{align}
\mu_{h,\vec x}(S)  = \size{\{\vec y \in \{0,1\}^d: h(\vec y) \neq h(\vec x), \vec y_S = \vec x_S\}}
  \label{def:mistake}
\end{align}  


\begin{figure}[t]
    \centering
    % Hasse diagram of the subsets of [3]. Each subset S is annotated with the
    % error (blue, left) and the number of mistakes (magenta, right).
    \begin{tikzpicture}[scale=0.5]
      % Rows 1-3 of the matrix hold the subsets of size 2, 1 and 0;
      % the full set {1,2,3} is placed above the matrix as node (link).
      \matrix (A) [matrix of nodes, row sep=0.75cm, nodes={minimum width=2.25cm}]
      {
          $\{1,2\}$ & $\{1,3\}$ & $\{2,3\}$ \\
          $\{1\}$ & $\{2\}$ & $\{3\}$ \\
          & $\emptyset$ \\
      };
      \path (A-1-1)--(A-1-2) node[above=1cm] (link) {$\{1,2,3\}$};
      % Covering relations: the top element to each pair ...
      \foreach \i in {1,...,3}
        \draw[densely dotted] (link) -- (A-1-\i);
      % ... each pair (row 1, column \i) to the singletons (row 2, column \j) it contains ...
      \foreach \i/\j in {1/2, 3/2, 2/1, 1/1, 3/3, 2/3}
        \draw[densely dotted] (A-1-\i)--(A-2-\j);
      % ... and each singleton to the empty set.
      \foreach \i/\j in {1/2, 2/2, 3/2}
        \draw[densely dotted] (A-2-\i)--(A-3-\j);
      \footnotesize
      % Annotation nodes: every node has a unique, dot-free name
      % (a dot in a TikZ node name is read as an anchor separator when referenced).
      \node[left=-0.15cm of link, text=blue] (linkEps) {$0$}; 
      \node[right=-0.15cm of link, text=magenta] (linkMu) {$0$}; 
      \node[left=-0.75cm of A-1-1, text=blue] (A11Eps) {$\frac{1}{2}$}; 
      \node[right=-0.75cm of A-1-1, text=magenta] (A11Mu) {$1$}; 
      \node[left=-0.75cm of A-1-2, text=blue] (A12Eps) {$\frac{1}{2}$}; 
      \node[right=-0.75cm of A-1-2, text=magenta] (A12Mu) {$1$}; 
      \node[left=-0.75cm of A-1-3, text=blue] (A13Eps) {$\frac{1}{2}$}; 
      \node[right=-0.75cm of A-1-3, text=magenta] (A13Mu) {$1$}; 
      \node[left=-0.85cm of A-2-1, text=blue] (A21Eps) {$\frac{3}{4}$}; 
      \node[right=-0.85cm of A-2-1, text=magenta] (A21Mu) {$3$}; 
      \node[left=-0.85cm of A-2-2, text=blue] (A22Eps) {$\frac{3}{4}$}; 
      \node[right=-0.85cm of A-2-2, text=magenta] (A22Mu) {$3$}; 
      \node[left=-0.85cm of A-2-3, text=blue] (A23Eps) {$\frac{1}{2}$}; 
      \node[right=-0.85cm of A-2-3, text=magenta] (A23Mu) {$2$}; 
      \node[left=-0.85cm of A-3-2, text=blue] (A32Eps) {$\frac{5}{8}$}; 
      \node[right=-0.85cm of A-3-2, text=magenta] (A32Mu) {$5$}; 
    \end{tikzpicture}
    \caption{The error $\epsilon_{h,\vec x}(S)$ (in blue) and the number of mistakes $\mu_{h,\vec x}(S)$ 
    (in magenta) for each $S \subseteq [3]$, using the classifier $h$ given by  
    (\ref{ex:hypothesis}) and the instance $\vec x = (1,1,1)$.}
    \label{fig:diagram}
\end{figure}
  
  \begin{example}
    Consider the classifier $h: \{0,1\}^3 \rightarrow \{0,1\}$ specified by the polynomial threshold function:
    \begin{align}
    \label{ex:hypothesis}
    h(\vec x) = 1 & \Leftrightarrow x_1x_2x_3 + x_1x_2 - x_1 - x_2 \geq 0 
    \end{align}
    Given the instance $\vec x = (1,1,1)$ for which we need to explain $h(\vec x) = 1$,
    and using the Hasse diagram in Figure~\ref{fig:diagram}, it follows that $\{1,2,3\}$ 
    is the only abductive explanation for $h$ and $\vec x$. Yet, 
    $\{1,2\}$ and $\{3\}$ are both subset-minimal $\frac{1}{2}$-probable explanations for $h$ and $\vec x$.  
  \end{example}

With these notions in hand, we are now in position to formulate 
the main problem considered in this study.

\begin{problem}
  \label{def:problem}
Given a classifier $h: \{0,1\}^d \rightarrow \{0,1\}$, an instance $\vec x \in \{0,1\}^d$,
a set $I \subseteq [d]$ of features, and a size limit $k \leq \size{I}$, 
find a subset $S \subseteq I$ of size at most $k$ such that $\epsilon_{h,\vec x}(S)$ is minimized. 
\end{problem}

\subsection{Evaluating Explanation Errors}
\label{subsec:evaluation}

It is easy to see that the problem of evaluating $\epsilon_{h,\vec x}(S)$ is $\#\mathrm{P}$-hard in general. 
However, \cite{IzzaHINCM.2022} have shown that $\epsilon_{h,\vec x}(S)$ can be computed in polynomial time, 
when $h$ is described by a decision tree. For completeness, we show here that 
$\epsilon_{h,\vec x}(S)$ can be evaluated in linear time for decision trees, 
using the orthogonality of decision trees, and the fact 
that this property is closed under conditioning.

% Indeed, a polynomial-time reduction from the standard $\#\mathrm{SAT}$ problem easily follows, 
% using a conjunctive normal form (CNF) representation of $h$, an instance $\vec x$ 
% such that $h(\vec x) = 0$, and the set $S = \emptyset$. 

To this end, recall that a \emph{(Boolean) decision tree} is a binary tree $\mathcal T$, 
each of whose internal nodes is labeled with one 
of $d$ Boolean variables from $X_d = \{x_1,\cdots,x_d\}$, and whose leaves are labeled $0$ or $1$.   
The value $h(\vec x) \in \{0,1\}$ of a hypothesis $h$ described by $\mathcal T$ on an instance $\vec x$ 
is given by the label of the leaf reached from the root of $\mathcal T$ as follows: 
at each node, go to the left or right child depending on whether the input value of the corresponding variable 
is $0$ or $1$, respectively. The \emph{size} of $\mathcal T$, denoted $\size{\mathcal T}$, 
is given by the number of nodes in $\mathcal T$. 
For illustration, a decision tree representing the classifier (\ref{ex:hypothesis}) is given in Figure~\ref{fig:dt}.

% Node styles shared by the decision-tree figures.
\tikzset{
  % Internal (test) node: circled label.
  internal/.style={
    solid,
    align=center,
    inner sep=0pt,
    text centered,
    circle,
    draw=black,
    text width=1.5em
  },
  % Leaf node: label only, no border.
  leaf/.style={
    solid,
    align=center,
    inner sep=2pt,
    text centered,
    text width=1.5em
  },
  % Placeholder for a whole subtree: upward-pointing triangle, shifted down by 10mm.
  subtree/.style={
    isosceles triangle,
    isosceles triangle stretches=true,
    inner sep=0,
    shape border rotate=90,
    yshift={-10mm},
    solid,
    align=center,
    minimum size=1.5cm,
    draw=black,
    text width=1.5em
  }
}

\begin{figure}[t]
  \centering
  % Dotted edges labelled 0 lead to the left child, solid edges labelled 1 to the right child.
  \begin{tikzpicture}[
      ->,
      >=stealth',
      level/.style={sibling distance=3.75cm/#1, level distance=1.4cm}
    ]
    \node [internal] {$x_1$}
      % Branch x_1 = 0
      child {
        node [internal] {$x_2$}
          child {
            node [leaf] {$1$}
            edge from parent[densely dotted] node[above left] {\tiny $0$}
          }
          child {
            node [leaf] {$0$}
            edge from parent[solid] node[above right] {\tiny $1$}
          }
        edge from parent[densely dotted] node[above left] {\tiny $0$}
      }
      % Branch x_1 = 1
      child {
        node [internal] {$x_2$}
          child {
            node [leaf] {$0$}
            edge from parent[densely dotted] node[above left] {\tiny $0$}
          }
          child {
            node [internal] {$x_3$}
              child {
                node [leaf] {$0$}
                edge from parent[densely dotted] node[above left] {\tiny $0$}
              }
              child {
                node [leaf] {$1$}
                edge from parent[solid] node[above right] {\tiny $1$}
              }
            edge from parent[solid] node[above right] {\tiny $1$}
          }
        edge from parent[solid] node[above right] {\tiny $1$}
      }
    ;
  \end{tikzpicture}
  \caption{A decision tree representation of (\ref{ex:hypothesis}).}
  \label{fig:dt}
\end{figure}

As usual, a \emph{literal} over $X_d$ is a variable $x_i$, or its 
negation $\overline x_i$. The negation of a literal $l$ is given by 
$\neg l = x_i$ if $l = \overline x_i$, and $\neg l = \overline x_i$ if $l = x_i$. 
A \emph{term} is a conjunction of literals, and a \emph{Disjunctive Normal Form (DNF)}  
formula is a disjunction of terms. Here, DNF formulas are viewed as sets of terms, 
and terms are viewed as sets of literals. 
A term is \emph{inconsistent} if it includes a pair $\{l,\neg l\}$ of opposite literals. 
A DNF formula $F = \{t_1,\cdots,t_m\}$ is \emph{orthogonal} if $t_i \cup t_j$ is inconsistent 
for all pairs $i,j \in [m]$ such that $i \neq j$. The \emph{conditioning} \citep{Darwiche.1999} of $F$ 
by a term $t$, denoted $F \mid t$, is the formula obtained by removing from 
$\{t_1 \cup t,\cdots,t_m \cup t\}$ any term that is inconsistent. 


\begin{proposition}
\label{prop:evaluation}
    Given a classifier $h: \{0,1\}^d \rightarrow \{0,1\}$ represented by some decision tree $\mathcal T$, an instance $\vec x \in \{0,1\}^d$, 
    and a set of features $S \subseteq [d]$, evaluating $\epsilon_{h,\vec x}(S)$ can be done in $\mathcal O(\size{S} \cdot \size{\mathcal T})$ time.
\end{proposition}
\begin{proof}
    It is well-known that $\mathcal T$ can be transformed in linear time into 
    an equivalent orthogonal DNF formula, denoted $\mathrm{DNF}(\mathcal T)$, where each term corresponds to 
    a path from the root to a leaf labeled with $1$. Given an instance $\vec x \in \{0,1\}^d$ and a set $S$ of features, 
    let $t_{\vec x_S}$ be the term associated with the partial instance $\vec x_S$, that is,
    \begin{align*}
      t_{\vec x_S} = \bigcup_{i=1}^d \{x_i: (x_S)_i = 1\} \cup \{\overline x_i: (x_S)_i = 0\}
    \end{align*}
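    By construction, $\mathrm{DNF}(\mathcal T) \mid t_{\vec x_S}$ is orthogonal, so the sets of instances satisfying 
    its terms are pairwise disjoint, and each term $t$ is satisfied by exactly $2^{d - \size{t}}$ instances. 
    Summing over the terms thus counts the instances $\vec y$ covered by $\vec x_S$ for which $h(\vec y) = 1$. 
    Hence, for decision trees, (\ref{def:mistake}) can simply be rewritten as:
    \begin{align*}
      \mu_{h,\vec x}(S) =  
      \begin{cases}
        2^{d - \size{S}} - \sum_{t \in \mathrm{DNF}(\mathcal T) \mid t_{\vec x_S}} 2^{d - \size{t}} & \mbox{ if } h(\vec x) = 1\\
        \sum_{t \in \mathrm{DNF}(\mathcal T) \mid t_{\vec x_S}} 2^{d - \size{t}} & \mbox{ if } h(\vec x) = 0
      \end{cases}
    \end{align*}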
    The result follows from (\ref{def:decomposition}), together with the fact that $\mathrm{DNF}(\mathcal T) \mid t_{\vec x_S}$ 
    can be derived in $\mathcal O(\size{S} \cdot \size{\mathcal T})$ time.
\end{proof}

\subsection{Minimizing Explanation Errors}

The next result shows that the decision version of Problem~\ref{def:problem} is generally hard to solve 
for decision trees, even when the set $I$ of candidate features is an abductive explanation.  

\begin{proposition}
  \label{prop:hardness}
  Given a classifier $h$ represented by some decision tree $\mathcal T$, 
  an instance $\vec x \in \{0,1\}^d$, an abductive explanation $I \subseteq [d]$ for $h$ and $\vec x$,
  an integer $k < \size{I}$, and a threshold $\varepsilon \in (0,\frac{1}{2})$, the problem of finding 
  a subset $S \subseteq I$ of size at most $k$ satisfying $\epsilon_{h,\vec x}(S) \leq \varepsilon$ 
  is $\mathrm{NP}$-hard.
\end{proposition}
\begin{proof}
  We consider three problems $\vec P_1,\vec P_2$ and $\vec P_3$, each taking as input a decision tree representation 
  $\mathcal T$ of some classifier $h$, an instance $\vec x \in \{0,1\}^d$, and two parameters $k$ and $\varepsilon$.
  The third problem is also given an abductive explanation $I$ for $h$ and $\vec x$.
  For $\vec P_1$, $k \leq d$ and $\varepsilon \in (0,\frac{1}{2})$, 
  for $\vec P_2$, $k \leq d$ and $\varepsilon \in (\frac{1}{2},1)$,
  and for $\vec P_3$, $k < \size{I}$ and $\varepsilon \in (0,\frac{1}{2})$. 
  The corresponding tasks are given as follows:
  \begin{itemize}
    \item[$\vec P_1$:] Find $S \subseteq [d]$ such that $\size{S} \leq k$ and $\epsilon_{h,\vec x}(S) \leq \varepsilon$;
    \item[$\vec P_2$:] Find $S \subseteq [d]$ such that $\size{S} \leq k$ and $\epsilon_{h,\vec x}(S) \geq \varepsilon$;
    \item[$\vec P_3$:] Find $S \subseteq I$ such that $\size{S} \leq k$ and $\epsilon_{h,\vec x}(S) \leq \varepsilon$.
  \end{itemize}  
  By Theorem 2 in \citep{ArenasBOS.2022}, $\vec P_1$ is $\mathrm{NP}$-hard. Based on this result, we give here a chain 
  of polynomial-time reductions $\vec P_1 \preceq_p \vec P_2 \preceq_p \vec P_3$.

  Given an instance $(\mathcal T_1,\vec x_1,k_1,\varepsilon_1)$ of $\vec P_1$, we build an instance 
  $(\mathcal T_2,\vec x_2,k_2,\varepsilon_2)$ of $\vec P_2$, where $\mathcal T_2$ is the negation of $\mathcal T_1$,
  $\vec x_2 = \vec x_1$, $k_2 = k_1$ and $\varepsilon_2 = 1 - \varepsilon_1$. Note that $\mathcal T_2$ can 
  be constructed in polynomial time by simply switching the label of each leaf in $\mathcal T_1$. Let 
  $h_1$ and $h_2$ denote the hypotheses associated with $\mathcal T_1$ and $\mathcal T_2$, respectively. 
  Since by construction, $\epsilon_{h_2,\vec x}(S) = 1 - \epsilon_{h_1,\vec x}(S)$, we have $\vec P_1 \preceq_p \vec P_2$,
  and hence, $\vec P_2$ is $\mathrm{NP}$-hard. 

  \begin{figure}[t]
    \centering
    % Root x_0: the 0-branch leads to the triangle standing for the negated tree,
    % the 1-branch to a chain testing x_1, ..., x_d whose 0-branches all end in a 0-leaf.
    \begin{tikzpicture}[
        scale=0.75,
        ->,
        >=stealth',
        level/.style={sibling distance=5cm/#1, level distance=1.2cm}
      ]
      \node [internal] {$x_0$}
        % Branch x_0 = 0: negated copy of T_2, drawn as a subtree triangle.
        child[child anchor=north] {
          node [subtree] {$\neg \mathcal T_2$}
          edge from parent[densely dotted] node[above left] {\tiny $0$}
        }
        % Branch x_0 = 1: chain over x_1, ..., x_d.
        child {
          node [internal] {$x_1$}
            child {
              node [leaf] {$0$}
              edge from parent[densely dotted] node[above left] {\tiny $0$}
            }
            child {
              % Ellipsis standing for the intermediate tests.
              node {$\vdots$}
                child {
                  node [leaf] {$0$}
                  edge from parent[densely dotted] node[above left] {\tiny $0$}
                }
                child {
                  node [internal] {$x_d$}
                    child {
                      node [leaf] {$0$}
                      edge from parent[densely dotted] node[above left] {\tiny $0$}
                    }
                    child {
                      node [leaf] {$1$}
                      edge from parent[solid] node[above right] {\tiny $1$}
                    }
                  edge from parent[solid] node[above right] {\tiny $1$}
                }
              edge from parent[solid] node[above right] {\tiny $1$}
            }
          edge from parent[solid] node[above right] {\tiny $1$}
        }
      ;
    \end{tikzpicture}
    \caption{The decision tree $\mathcal T_3$ in the proof of Proposition~\ref{prop:hardness}.}
    \label{fig:proof}
  \end{figure}

  Now, consider an instance $(\mathcal T_2,\vec x_2,k_2,\varepsilon_2)$ of $\vec P_2$. Without loss of generality, 
  we assume here that $\vec x_2$ is the $d$-dimensional all-ones vector $\vec 1$, and $h_2(\vec x_2) = 1$ where 
  $h_2$ is the hypothesis associated with $\mathcal T_2$. We construct an instance 
  $(\mathcal T_3,\vec x_3,I,k_3,\varepsilon_3)$ of $\vec P_3$ in the following way. 
  Let $\mathcal T_3$ be the decision tree defined according to Figure~\ref{fig:proof}.
  The root node of $\mathcal T_3$ is labeled with $x_0$, the subtree rooted at the left child of $x_0$ 
  is the negation of $\mathcal T_2$, and the subtree rooted at the right child of $x_0$ 
  is the caterpillar encoding the conjunction $x_1 \land \cdots \land x_{d}$.
  Let $\vec x_3$ be the all-ones vector $\vec 1$ over $\{0,1,\cdots,d\}$, and $I = \{0,1,\cdots,d\}$.
  The remaining parameters are set to $k_3 = k_2$ and $\varepsilon_3 = 1 - \varepsilon_2$. 
  
  Let $h_3$ be the hypothesis associated with $\mathcal T_3$.
  For any $i \in \{0,1,\cdots,d\}$, let $\vec y_{i \leftarrow 0}$ be the instance in $\{0,1\}^{d+1}$ obtained by 
  flipping the value $x_{3,i}$ and leaving all other values of $\vec x_3$ unchanged. 
  Since $h_3(\vec x_3) = 1$, and $h_3(\vec y_{i \leftarrow 0}) = 0$ for all $i \in \{0,1,\cdots,d\}$,
  it follows that $I$ is an abductive explanation for $h_3$ and $\vec x_3$. 
  Moreover, for any proper subset $S$ of $I$ that \emph{includes} the feature $0$, we have
  \begin{align*}
    \epsilon_{h_3,\vec x_3}(S) = 1 - \frac{1}{2^{d + 1 - \size{S}}} \geq \frac{1}{2} > \varepsilon_3
  \end{align*}
  So, in order to solve $\vec P_3$, we need to identify a subset $S \subseteq I$ of size at most $k_3$
  that \emph{excludes} the feature $0$, while satisfying $\epsilon_{h_3,\vec x_3}(S) \leq \varepsilon_3$. 
  But we can see that for any $S \subseteq I \setminus \{0\}$,
  \begin{align*}
    \epsilon_{h_3,\vec x_3}(S) 
      &\!=\! \frac{\size{\{\vec y \!\in\! \{0,1\}^d:\! h_2(\vec y)\! = \!h_2(\vec x_2), \vec y_S\! =\! (\vec x_{2})_S\}}}
                      {\size{\{\vec y \in \{0,1\}^d: \vec y_S\! =\! (\vec x_{2})_S\}}} \\
     &\!=\! 1 - \epsilon_{h_2,\vec x_2}(S) 
  \end{align*}
  Therefore, $\epsilon_{h_3,\vec x_3}(S) \leq \varepsilon_3$ if and only if $\epsilon_{h_2,\vec x_2}(S) \geq \varepsilon_2$.  
  It follows that, $\vec P_2 \preceq_p \vec P_3$, and hence, $\vec P_3$ is $\mathrm{NP}$-hard. 
  \end{proof}

%------------------------------------------------------------------------------
% Supermodular Minimization
%------------------------------------------------------------------------------

\section{Supermodular Minimization}\label{sec:supermodular}

The main idea of this study is to relax the requirement of finding an optimal solution to 
Problem~\ref{def:problem}, and instead settle for a solution that is ``good enough'', 
using supermodular minimization algorithms. In this section, we start 
with some basic notions about supermodularity, and then, we examine some useful properties 
of the mistake function. 

\subsection{Supermodular Functions}

Given a real-valued set function $f: 2^{[d]} \rightarrow \mathbb R$, the quantities
\begin{align*}
    \mathrm{L}_f(i \mid S) &=  f(S \setminus \{i\}) - f(S), \mbox{ and } \\
    \mathrm{G}_f(i \mid S) &= f(S \cup \{i\}) - f(S) 
\end{align*}
are respectively capturing the \emph{marginal loss} of removing an element $i$ from a set $S$, 
and \emph{marginal gain} of adding an element $i$ to a set $S$.
A set function $f$ is \emph{non-increasing} if $\mathrm{L}_f(i \mid S) \geq 0$ for all $S \subseteq [d]$ and $i \in S$, and 
$f$ is \emph{non-decreasing} if $\mathrm{G}_f(i \mid S) \geq 0$ for all 
$S \subseteq [d]$ and $i \in [d] \setminus S$.
A set function $f$ is \emph{supermodular} if it satisfies the diminishing loss condition  
\begin{align*}
  \mathrm{L}_f(i \mid S) \geq \mathrm{L}_f(i \mid T) 
\end{align*}
for all $S \subseteq T \subseteq [d]$ and $i \in S$. 
Dually, $f$ is \emph{submodular} 
if it satisfies the diminishing gain condition
\begin{align*}
  \mathrm{G}_f(i \mid S) \geq \mathrm{G}_f(i \mid T) 
\end{align*}
for all $S \subseteq T \subseteq [d]$ and $i \in [d] \setminus T$. Based on the fact that
$\mathrm{L}_f(i \mid S) = -\mathrm{G}_f(i \mid S \setminus \{i\})$ for all $S \subseteq [d]$ and $i \in S$,  
$f$ is supermodular if and only if $-f$ is submodular.
Finally, $f$ is \emph{modular} if it is both submodular and supermodular.

For a non-negative set function $f$ and a nonempty subset $I \subseteq [d]$, the \emph{curvature} of $f$ over $2^I$
is given by 
\begin{align}
\label{def:curvature}  
c = 1\! -\! \min_{i \in I} \frac{\mathrm{L}_f(i \mid I)}{\mathrm{L}_f(i \mid \{i\})} 
    = 1\! -\! \min_{i \in I} \frac{\mathrm{G}_f(i \mid I\! \setminus\! \{i\})}{\mathrm{G}_f(i \mid \emptyset)} 
\end{align}
Clearly, $c \in [0,1]$ whenever $f$ is non-decreasing and submodular, or non-increasing and supermodular.
Note that the curvature coincides with the notion of ``steepness'' defined in \citep{Ilev.2001}.
When $I = [d]$, $c$ is called the \emph{total curvature} of $f$ \citep{ConfortiC.1984}.
Notably, in the case where $f$ is non-increasing and supermodular, the condition $c < 1$ is sufficient 
for ensuring that the task of minimizing $f$ subject to a cardinality constraint is approximable to within a constant \citep{Ilev.2001,SviridenkoVW.2017}. 

\subsection{Minimizing Explanation Mistakes}

In light of Figure~\ref{fig:diagram}, we can see that the error function 
$\epsilon_{h,\vec x}(\cdot)$ is generally \emph{not} supermodular or submodular, 
and \emph{not} non-increasing or non-decreasing. 
However, if we instead focus on the unnormalized version $\mu_{h,\vec x}(\cdot)$ given in (\ref{def:mistake}), then 
the following properties can be derived.   

\begin{proposition}
    \label{prop:mistake}
    Let $h: \{0,1\}^d \rightarrow \{0,1\}$ be a classifier, $\vec x \in \{0,1\}^d$ be an instance,
    and $I \subseteq [d]$ be any nonempty set of features. Then, $\mu_{h,\vec x}(\cdot)$ is supermodular and non-increasing.
    Furthermore, if $I$ is an abductive explanation for $h$ and $\vec x$, then the curvature $c$ of $\mu_{h,\vec x}(\cdot)$ 
    over $2^I$ satisfies $c < 1$. 
\end{proposition}

\begin{proof}
  Let $f$ be the function $\mu_{h,\vec x}(\cdot)$, and $N$ be the set of instances $\vec y \in \{0,1\}^d$ such that $h(\vec x) \neq h(\vec y)$.
  For any subset $S \subseteq [d]$, let $C(\vec x_S)$ denote the set of instances $\vec y \in \{0,1\}^d$ 
  covered by $\vec x_S$, and for any feature $i \in S$, let $\overline C(\vec x_{S \setminus \{i\}})$ denote the set 
  $C(\vec x_{S \setminus \{i\}}) \setminus C(\vec x_{S})$.  
  
  The fact that $f$ is non-increasing directly follows from the observation that 
  $\mathrm{L}_{f}(i \mid S) = \size{\overline C(\vec x_{S \setminus \{i\}}) \cap N} \geq 0$
  for any $S \subseteq [d]$ and any $i \in S$.
  Now, given any superset $T$ of $S$, we have $\overline C(\vec x_{T \setminus \{i\}}) \subseteq \overline C(\vec x_{S \setminus \{i\}})$.
  It follows that $\overline C(\vec x_{T \setminus \{i\}}) \cap N \subseteq \overline C(\vec x_{S \setminus \{i\}}) \cap N$,
  and hence, $\mathrm{L}_{f}(i \mid T) \leq \mathrm{L}_{f}(i \mid S)$.
  Therefore, $f$ is supermodular.
  
  Finally, $\mathrm{L}_{f}(i \mid I) > 0$ whenever $I$ is a (non-empty) abductive explanation for $h$ and $\vec x$. 
  This, together with the fact that, by supermodularity, $\mathrm{L}_{f}(i \mid \{i\}) \geq \mathrm{L}_{f}(i \mid I)$ 
  for any $i \in I$, implies that $c \in [0,1)$. 
\end{proof}


%------------------------------------------------------------------------------
% Approximation Algorithms 
%------------------------------------------------------------------------------

\section{Approximation Algorithms}\label{sec:algorithms}

After providing an overview of probabilistic explanations and supermodular minimization, 
we now present two greedy approximation algorithms for Problem~\ref{def:problem}. 

\subsection{Greedy Descent}

A natural approach for minimizing a supermodular and non-increasing function $f$ subject to a cardinality constraint 
$\size{S} \leq k$ is to start from the input set $I$ of candidate features, and to iteratively remove from 
the current solution $S$ any feature $i$ that minimizes the marginal loss $\mathrm{L}_{f}(i \mid S)$, 
until the desired size $\size{S} = k$ is reached. As shown by \cite{Ilev.2001}, this greedy method achieves a 
$\frac{e^{p} - 1}{p}$-approximation, where $p = \frac{c}{1 - c}$, and $c$ is the curvature of $f$ over $2^I$. 

In the setting of our study, the error function $\epsilon_{h,\vec x}(\cdot)$ in (\ref{def:decomposition}) 
is a normalized version of $\mu_{h,\vec x}(\cdot)$, which
is supermodular and non-increasing. Furthermore, the normalization factor $2^{d - \size{S}}$ 
is \emph{constant} for all subsets $S$ with the same size. Based on these properties, 
we can combine the above greedy descent approach for $f = \mu_{h,\vec x}(\cdot)$, 
with a level-wise selection method that stores the subsets $S_0,S_{1},\cdots,S_k$ obtained 
for each size $j \in \{0,1,\cdots,k\}$, and that returns from this sequence the best subset $S_j$ 
with respect to $\epsilon_{h,\vec x}(\cdot)$. A formal description is given in Algorithm~\ref{alg:descent}.  

\begin{proposition}
  \label{prop:descent}
  Let $S^*$ be an optimal solution to Problem~\ref{def:problem}, let $c$ be the curvature 
  of $\mu_{h,\vec x}(\cdot)$ over $2^I$, and assume that $I$ is an abductive explanation for $h$ and $\vec x$. Then, the solution $S_{\textsc{gd}}$ 
  returned by Greedy Descent (\textsc{gd}) satisfies:
  \begin{align*}
    \epsilon_{h,\vec x}(S_{\textsc{gd}}) \leq \left(\frac{e^{p} - 1}{p}\right)\epsilon_{h,\vec x}(S^*) \mbox{ where } p = \frac{c}{1 - c} < \infty
  \end{align*}
\end{proposition}
\begin{proof}
The fact that $p < \infty$ follows from Proposition~\ref{prop:mistake}, which guarantees that $c < 1$.
Let $j^*$ be the size of $S^*$, and let $S_{j^*}$ be the solution computed by \textsc{gd} 
at the end of the iteration $j = j^* + 1$ (or the initial set $S_n = I$ if $j^* = n$). Note that $\size{S^*} = \size{S_{j^*}}$. So, 
by application of Corollary 4 in \citep{Ilev.2001}, we must have 
\begin{align*}
  \mu_{h,\vec x}(S_{j^*}) \leq \left(\frac{e^{p} - 1}{p}\right) \mu_{h,\vec x}(S^*)
\end{align*}
Since \textsc{gd} is returning a minimizer of $\epsilon_{h,\vec x}(\cdot)$ over the sequence 
$S_0, \cdots, S_{j^*}, \cdots, S_k$, it follows that
\begin{align*} 
    \epsilon_{h,\vec x}(S_{\textsc{gd}}) 
    &\leq \epsilon_{h,\vec x}(S_{j^*})
    = \frac{\mu_{h,\vec x}(S_{j^*})}{2^{d - j^*}} \\
    &\leq \left(\frac{e^{p} - 1}{p}\right)\frac{\mu_{h,\vec x}(S^*)}{2^{d - j^*}}
    = \left(\frac{e^{p} - 1}{p}\right)\epsilon_{h,\vec x}(S^*)
\end{align*}
\end{proof}

\setcounter{algocf}{0}
\begin{algorithm}[t]
	\SetArgSty{textrm}
	\DontPrintSemicolon
	\caption{Greedy Descent (\textsc{gd})}
	\label{alg:descent}
	\SetKw{Input}{Input:}
  \SetKwFor{For}{For}{do}{endfor}
  \BlankLine
	\Input{classifier~$h$,~instance~$\vec x$,~feature~set~$I$,~integer $k$}\;  
	\BlankLine
  Set $S_n = I$, where $n = \size{I}$\;
  \For{$j = n$ \textbf{downto} $1$}
    {
      Let $i^* \in \Argmin_{i \in S_j} \mu_{h,\vec x}(S_j \setminus \{i\})$\;
      Set $S_{j - 1} = S_{j} \setminus \{i^*\}$\;
    }
  Let $S_{\textsc{gd}} \in \Argmin_{S \in \{S_0,S_1,\cdots,S_k\}} \epsilon_{h,\vec x}(S)$\;
  Return $S_{\textsc{gd}}$\;  
\end{algorithm}

\subsection{Greedy Ascent}

An alternative approach is to consider the objective function $f = - \mu_{h,\vec x}(\cdot)$, 
which is submodular and non-decreasing. Based on the well-known greedy method for 
submodular maximization \citep{NemhauserW.1978}, we could start from $S_0 = \emptyset$, 
and iteratively add to the current solution $S_{j-1}$ any maximizer $i \in I \setminus S_{j-1}$ 
of the marginal gain $\mathrm{G}_{f}(i \mid S_{j-1})$ until $\size{S_j} = k$. 
Unfortunately, such a method would fail here because $f$ is \emph{non-positive}.  
Yet, as observed by \cite{LibertyS.2017}, this issue can be alleviated by slightly increasing the size limit $k$.
More precisely, given a parameter $\gamma \in (0,1)$, the greedy method achieves a 
$\frac{1}{1 - \gamma}$-approximation, whenever it is allowed to improve its solution $S_{j-1}$ until 
$\size{S_j} = k\lceil \ln \left(\nicefrac{f(\emptyset)}{\gamma f(S_{j-1})}\right) \rceil$.   
By coupling this idea with the level-wise selection method suggested above, we get 
a greedy ascent algorithm for minimizing $\epsilon_{h,\vec x}(\cdot)$, detailed in Algorithm~\ref{alg:ascent}.

\begin{proposition}
  \label{prop:ascent}
  Under the conditions of Proposition~\ref{prop:descent}, the solution $S_{\textsc{ga}}$ returned 
  by Greedy Ascent (\textsc{ga}) satisfies:
  \begin{align*}
    \epsilon_{h,\vec x}(S_{\textsc{ga}}) &\leq \left( \frac{1}{1 - c} \right)\epsilon_{h,\vec x}(S^*), \mbox{ and } \\ 
    \size{S_{\textsc{ga}}} 
    &\leq k\left(1 + \biggl\lceil \ln \frac{\mu_{h,\vec x}(\emptyset)}{\mu_{h,\vec x}(S^*)} \biggr\rceil\right) 
    \leq k \biggl\lceil \ln \frac{2e}{c} \biggr\rceil
  \end{align*}
\end{proposition}
\begin{proof}
  The upper bound on $\epsilon_{h,\vec x}(S_{\textsc{ga}})$ can be derived from the following 
  chain of inequalities:
  \begin{align*} 
    \epsilon_{h,\vec x}(S_{\textsc{ga}}) 
    &\leq \epsilon_{h,\vec x}(S_{j})
    = \frac{\mu_{h,\vec x}(S_{j})}{2^{d - j}} 
    \leq \left(\frac{1}{1 - c}\right)\frac{\mu_{h,\vec x}(S^*)}{2^{d - j}} \\
    &\leq \left(\frac{1}{1 - c}\right)\frac{\mu_{h,\vec x}(S^*)}{2^{d - \size{S^*}}}
    = \left(\frac{1}{1 - c}\right)\epsilon_{h,\vec x}(S^*)
\end{align*}
where the first inequality uses the fact that $S_{\textsc{ga}}$ is a minimizer of $\epsilon_{h,\vec x}(\cdot)$
over $\{S_0,\cdots,S_j\}$, the second inequality follows from \citep[Theorem 5]{LibertyS.2017} 
and $c \leq \gamma$, and the last inequality follows from $\size{S^*} \leq j$.

The first upper bound on $\size{S_{\textsc{ga}}}$ simply follows from \citep[Theorem 5]{LibertyS.2017}, 
and the fact that $\gamma \geq \frac{1}{e}$. For the last bound on $\size{S_{\textsc{ga}}}$,
we know that $\frac{1}{c} = \mu_{h,\vec x}(\emptyset) - \min_{i \in I} \mu_{h,\vec x}(\{i\})$, whenever 
$I$ is an abductive explanation. This, together with the fact that 
$\min_{i \in I} \mu_{h,\vec x}(\{i\}) \leq \frac{1}{2} \mu_{h,\vec x}(\emptyset)$, 
yields $\mu_{h,\vec x}(\emptyset) \leq \frac{2}{c}$.
\end{proof}

\begin{algorithm}[t]
	\SetArgSty{textrm}
	\DontPrintSemicolon
	\caption{Greedy Ascent (\textsc{ga})}
	\label{alg:ascent}
	\SetKw{Input}{Input:}
  \SetKwRepeat{Repeat}{Repeat}{Until}  
  \BlankLine
	\Input{classifier~$h$,~instance~$\vec x$,~feature~set~$I$,~integer $k$}\;  
	\BlankLine
    Let $c$ be the curvature of $\mu_{h,\vec x}(\cdot)$ over $2^I$\;
    Set $j = 0$, $S_0 = \emptyset$ and $\gamma = \max \bigl\{\frac{1}{e}, c\bigr\}$\;
    \Repeat{$\displaystyle j = k \biggl \lceil{\ln \left(\frac{\mu_{h,\vec x}(\emptyset)}{\gamma \cdot \mu_{h,\vec x}(S_{j})}\right)}\biggr \rceil$}
    {
      Set $j = j + 1$\;
      Let $i^* \in \Argmin_{i \in I \setminus S_{j-1}} \mu_{h,\vec x}(S_{j-1} \cup \{i\})$\;
      Set $S_{j} = S_{j-1} \cup \{i^*\}$\;
    }
    Let $S_{\textsc{ga}} \in \Argmin_{S \in \{S_0,S_1,\cdots,S_{j}\}} \epsilon_{h,\vec x}(S)$\;
    Return $S_{\textsc{ga}}$\;  
\end{algorithm}


\subsection{Application to Decision Trees}
\label{subsec:application}

The approximation bounds derived in Propositions~\ref{prop:descent} and \ref{prop:ascent} 
hold for \emph{any} (Boolean) hypothesis class. However, in order to ensure that \textsc{gd} and \textsc{ga} are 
computationally efficient, each call to the value oracle $\mu_{h,\vec x}(\cdot)$ should run in polynomial time. 
As emphasized in Section~\ref{subsec:evaluation}, this is the case for decision trees.  
Namely, if the input classifier $h$ of \textsc{gd} is represented by a decision tree $\mathcal T$,
then Proposition~\ref{prop:evaluation}, together with the fact that the number of 
calls to the value oracle is quadratic in $n = \size{I}$, implies that \textsc{gd} runs in 
$\mathcal O(n^3 \size{\mathcal T})$ time. For \textsc{ga}, 
 the number of calls to the value oracle is bounded by $jn + n + 2$, 
where $j$ is the number of iterations of the main loop, and $n + 2$ is the number of calls required to compute $c$. 
So, \textsc{ga} runs in $\mathcal O(k n^2 (1 + \ln \nicefrac{2}{c}) \size{\mathcal T})$ time.  


%------------------------------------------------------------------------------
% Experiments 
%------------------------------------------------------------------------------

\section{Experiments}\label{sec:experiments}

In order to validate the effectiveness of our algorithms, we have considered various instances 
of Problem~\ref{def:problem}, where the input classifier is described by a decision tree. The code 
was written using the \texttt{Python} language. All the experiments have been conducted on a
computer equipped with a 3.1 GHz Intel(R) Core i9-9900 CPU and 64 GiB of RAM.

\subsection{Experimental Setup}

In our experiments, we have considered $B = 50$ datasets, or \emph{benchmarks}, 
from the standard repositories \emph{Kaggle}, \emph{OpenML} and \emph{UCI}. 
Notably, \emph{mnist38} and \emph{mnist49} are subsets of the dataset \emph{mnist}. 
Except for \emph{cnae}, all datasets are binary classification tasks with a number of 
attributes ranging from $10^1$ to $10^5$.
The multi-class classification task \emph{cnae} was transformed into a binary classification task 
by considering the dominant label versus all other labels. 

\begin{table*}[t]
  \centering
  \begin{footnotesize}
    \begin{adjustbox}{width=\linewidth,center}  
    \begin{tabular}{lrrrrrrrrrrrrr}
    \toprule
    \multicolumn{4}{c}{Benchmark} & & \multicolumn{3}{c}{$\epsilon_{h,\vec x}(S)$} & &\multicolumn{3}{c}{$|S|$} & & \multicolumn{1}{c}{Time (s)} \\
    \cmidrule{1-4}\cmidrule{6-8} \cmidrule{10-12} \cmidrule{14-14}
    name & $\mathit{acc}$ & $d$ & \size{$I$} && \textsc{ga} & \textsc{gd} & \textsc{Sat} && \textsc{ga} & \textsc{gd} & \textsc{Sat} && \textsc{Sat} \\
    \midrule
    \emph{meta-data} &87.42&44&5.09 && 0.08 ($\pm$0.11)& 0.08 ($\pm$0.11)& 0.08 ($\pm$0.11) && 3.10&3.10&3.10 && 12.14\\

    \emph{glass} &78.46&31&5.38 && 0.26 ($\pm$0.11)& 0.26 ($\pm$0.11)& 0.26 ($\pm$0.11) && 2.14&2.14&2.14 && 2.36\\

    \emph{student perf.} &91.79&30&5.41 && 0.26 ($\pm$0.11)& 0.26 ($\pm$0.11)& 0.26 ($\pm$0.11) && 2.00&2.00&2.00 && 2.16\\
    
    \emph{primary tumor}&84.31&23&6.23 && 0.09 ($\pm$0.09)& 0.09 ($\pm$0.09)& 0.09 ($\pm$0.08) && 4.22&4.22&4.22 && 3.58\\

    \emph{liver disorders} &75.96&58&6.38 && 0.18 ($\pm$0.09)& 0.18 ($\pm$0.08)& 0.18 ($\pm$0.08) && 4.00&4.00&4.00 && 27.33\\

    \emph{schizophrenia} &80.39&33&6.39 && 0.37 ($\pm$0.24)& 0.37 ($\pm$0.24)& 0.37 ($\pm$0.24) && 1.27&1.27&1.27 && 4.79\\

    \emph{hungarian} &62.92&13&6.65 && 0.12 ($\pm$0.12)& 0.12 ($\pm$0.12)& 0.11 ($\pm$0.10) && 3.58&3.56&3.56 && 1.68\\

    \emph{horse colic} &75.68&40&6.73 && 0.14 ($\pm$0.07)& 0.13 ($\pm$0.07)& 0.13 ($\pm$0.07) && 4.03&4.06&4.06 && 11.56\\
               
    \textcolor{blue}{\emph{indian liver}} &64.57&84&8.21 && 0.10 ($\pm$0.09)& 0.10 ($\pm$0.09)& 0.16 ($\pm$0.12)&& 5.08&4.89&6.12 && 176.28\\

    \textcolor{blue}{\emph{pima indians}} &75.32&97&8.30 && 0.15 ($\pm$0.14)& 0.15 ($\pm$0.14)& 0.16 ($\pm$0.12) && 5.85&5.84&6.58 && 484.6\\

    \textcolor{blue}{\emph{loan eligibility}} &74.31&68&8.47 && 0.19 ($\pm$0.13)& 0.18 ($\pm$0.13)& 0.20 ($\pm$0.14) && 5.60&5.70&6.82&& 42.87\\

    \emph{patient treat.} &66.01&10&8.92 && 0.05 ($\pm$0.09)& 0.03 ($\pm$0.06)& 0.03 ($\pm$0.08) && 5.63&5.94&5.94 && 24.08\\

    \emph{wine} &69.58&11&9.03 && 0.09 ($\pm$0.10)& 0.09 ($\pm$0.09)& 0.09 ($\pm$0.12) && 5.59& 5.64&5.62 && 36.32\\
    
    \textcolor{blue}{\emph{employee attr.}} &82.45&63&10.56 && 0.06 ($\pm$0.09)& 0.06 ($\pm$0.09)& 0.20 ($\pm$0.11) && 6.41&6.39&6.98 && 1017.24\\
    
    \textcolor{blue}{\emph{contraceptive}} &51.36&90&10.84 && 0.06 ($\pm$0.08)& 0.06 ($\pm$0.08)& 0.39 ($\pm$0.17) && 4.27&4.26&5.95 && 1096.07\\
    
    \textcolor{blue}{\emph{compas}} &67.60&40&10.95 && 0.03 ($\pm$0.07)& 0.04 ($\pm$0.08)& 0.05 ($\pm$0.09) && 5.68&5.83&6.78 && 1082.32\\

    \textcolor{blue}{\emph{fetal health}} &91.85&93&11.33 && 0.12 ($\pm$0.06)& 0.12 ($\pm$0.06)& 0.23 ($\pm$0.11) && 5.59&5.59&6.00 && 930.61\\

    \textcolor{magenta}{\emph{dorothea}} &91.88&$10^5$&12.90 && 0.25 ($\pm$0.10)& 0.25 ($\pm$0.10)& $-$ && 6.70&6.70&$-$ && $-$\\
     
    \textcolor{magenta}{\emph{bank market.}} &89.49&882&13.11 && 0.29 ($\pm$0.08)& 0.29 ($\pm$0.07)& $-$ && 6.99&6.99&$-$ && $-$\\

    \textcolor{magenta}{\emph{mnist49}} &95.99&784&15.57 && 0.37 ($\pm$0.14)& 0.37 ($\pm$0.14)& $-$ && 6.97&6.89&$-$ && $-$\\

    \textcolor{magenta}{\emph{spambase}} &92.11&236&16.09 && 0.24 ($\pm$0.11)& 0.23 ($\pm$0.09)& $-$ && 6.87&6.87&$-$ && $-$\\
    
    \textcolor{magenta}{\emph{mnist38}} &96.42&784&17.89 && 0.37 ($\pm$0.13)& 0.38 ($\pm$0.14)& $-$&& 6.93&6.93&$-$ && $-$\\

    \textcolor{magenta}{\emph{cnae}} &92.59&856&19.07 && 0.32 ($\pm$0.25)& 0.32 ($\pm$0.25)& $-$ && 5.97&5.97&$-$ && $-$\\

    \textcolor{magenta}{\emph{gisette}} &94.10&5000&21.42 && 0.32 ($\pm$0.11)& 0.32 ($\pm$0.11)& $-$ && 6.88&6.88&$-$ && $-$\\

    \textcolor{magenta}{\emph{farm ads}} &80.78&$54877$&23.15 && 0.13 ($\pm$0.17)& 0.13 ($\pm$0.17)& $-$&& 6.31&6.31&$-$ && $-$\\
        
    \bottomrule
    \end{tabular}
    \end{adjustbox}
  \end{footnotesize}
  \caption{Experimental results on 25 benchmarks for decision tree explanations, using $k = 7$.}
  \label{tab:results}
\end{table*}


For each benchmark $b \in [B]$, an explanation task consists in a tuple $(\mathcal T,\vec x,I,k)$ described 
as follows. $\mathcal T$ is the decision tree representation of some classifier $h$, which is 
learned from the training part of $b$. In our experiments, we have used a \texttt{Scikit-Learn} implementation of 
the \textsc{Cart} algorithm for generating $\mathcal T$. The predictive accuracy of $h$ is measured on the test 
part of $b$. By interpreting each internal node of 
$\mathcal T$ as a Boolean feature, the instance $\vec x$ to be explained is 
taken from the test part of $b$, and binarized according to the $d$ 
features occurring in $\mathcal T$. The set $I$ is given by the collection of features occurring in the (single) root-to-leaf 
path in $\mathcal T$ that is consistent with the instance $\vec x$. Here, $I$ is often referred to as 
a \emph{path-explanation} \citep{IzzaIM.2022}, or \emph{direct reason} \citep{AudemardBBKLM.2022}. 
As observed in \citep{IzzaIM.2022}, $I$ is not necessarily minimal with respect to set inclusion. 
Finally, we have used $k = 7 \pm 2$ for the size limit. The performance of explanation algorithms on 
a benchmark $b$ is measured by drawing uniformly at random $m$ instances $\vec x$ 
from the test set of $b$, and averaging the resulting error $\epsilon_{h,\vec x}(S)$ 
and size $\size{S}$ of the output $S \subseteq I$. In our experiments, 
$m$ was set to $\min \{s,150\}$, where $s$ is the size of the test set of $b$.
 
To compare the performance of \textsc{gd} and \textsc{ga} with an 
exact solver, we have chosen the SAT-based approach in \citep{ArenasBOS.2022}. Namely,
a SAT encoding was provided for the following task: given as input 
a decision tree $\mathcal T$ for some classifier $h$, an instance $\vec x$, and two parameters $k \leq d$ and 
$\varepsilon \in [0,1)$, return as output ``yes'' if there is a set of features $S$ satisfying both $\size{S} \leq k$ 
and $\epsilon_{h,\vec x}(S) \leq \varepsilon$, and ``no'' otherwise.
In the setting of our experimental setup, $S$ is a subset of the path-explanation $I$ for $\vec x$.
So, the above SAT encoding was extended to the decision version of Problem~\ref{def:problem}, 
by adding the clause $\bigvee \{x_i: (x_I)_i = 1\} \lor \{\overline x_i: (x_I)_i = 0\}$. 
For the original version of Problem~\ref{def:problem}, a binary search over 
the interval $(0,1]$ was performed in order to find a minimizer $S$ of $\epsilon_{h,\vec x}(\cdot)$
with precision of $10^{-3}$, which requires at most 10 calls to the SAT solver. 
We used a \texttt{Pysat} implementation of \textsc{Glucose 4} for the solver,
with a timeout of $30$ minutes per explanation task.\footnote{We mention 
in passing that an SMT-based approach was recently proposed in \citep{IzzaHINCM.2022}, 
but the code was not available at the time of writing this paper.}

\subsection{Experimental Results}

Table~\ref{tab:results} reports an overview of our results on 25 of the 50 benchmarks, for $k = 7$.
The leftmost column gives the name of the dataset $b$. The columns $\mathit{acc}$ and $d$
are respectively giving the accuracy and the number of features of the decision tree.
The rows are sorted according to the average size $\size{I}$ of the path-explanation. 
The fifth, sixth, and seventh columns are reporting the results for the average error 
$\epsilon_{h,\vec x}(S)$ of the explanation $S$ returned by \textsc{ga}, \textsc{gd}, 
and the SAT-based approach, respectively. The next three columns are reporting the average size of $S$
for these algorithms. Finally, the last column gives the average run-times (in seconds) of the 
SAT-based approach. Notably, for the 7 datasets in blue, the SAT solver occasionally reaches the timeout
before the end of binary search, which results in a degradation of precision. 
For the 8 datasets in magenta, the solver could not perform a single run of binary search before reaching the timeout.
We have not reported the run-times of $\textsc{ga}$ and $\textsc{gd}$, because they could always find 
a solution in less than $0.1$ seconds.

In light of these results, we can observe that the performance of greedy algorithms for minimizing 
$\epsilon_{h,\vec x}(\cdot)$ is remarkable, especially in comparison with the performance of  
the SAT-based approach. For the benchmarks where the SAT solver could return 
an optimal solution $S^*$, the differences 
$\epsilon_{h,\vec x}(S_{\textsc{gd}}) - \epsilon_{h,\vec x}(S^*)$ 
and $\epsilon_{h,\vec x}(S_{\textsc{ga}}) - \epsilon_{h,\vec x}(S^*)$ 
are most often negligible. Moreover, for high-dimensional datasets such as 
\emph{dorothea}, \emph{gisette} and \emph{farm ads}, both \textsc{ga} and \textsc{gd} remain stable by 
providing explanations with comparable errors in a few tenths of a millisecond.
Regarding the conciseness of explanations, we can see that $\size{S_{\textsc{gd}}}$ is on average smaller than $\size{S^*}$.
Interestingly, $\size{S_{\textsc{ga}}}$ is on average smaller than the size limit $k = 7$,
which indicates that the upper bound on $\size{S_{\textsc{ga}}}$ in Proposition~\ref{prop:ascent} 
is rarely attained in practice. Finally, \textsc{ga} and \textsc{gd} could efficiently reduce  
path-explanations $I$ which are not always abductive. In other words, both algorithms 
are, in practice, robust enough to handle some explanation tasks for which the curvature 
$c$ of the unnormalized error function is close to or equal to $1$.


%------------------------------------------------------------------------------
% Discussion 
%------------------------------------------------------------------------------

\section{Discussion}\label{sec:discussion}

\paragraph{Related Work.}
Clarifying in a comprehensible way the prediction $h(\vec x)$ made by some classifier 
$h$ on an input data instance $\vec x$ often takes the form of a set $I$ of features which 
in conjunction determine $h(\vec x)$ \citep{Ribeiro0G.2018}. Such an explanation is abductive \citep{IgnatievNM.2019},
or sufficient \citep{DarwicheH.2020}, precisely when $I$ is minimal with respect to inclusion. 
The problem of finding abductive explanations has been a subject of extensive research, 
recently surveyed in 
\citep{MarquesSilvaI.2022}. The hypothesis classes which are tractable 
for computing abductive explanations include, among others, decision trees 
\citep{AudemardBBKLM.2021,HuangIIM.2021,IzzaIM.2022}, Naive Bayes classifiers \citep{MarquesSilvaGCIN.2020},
monotone threshold functions \citep{CooperM.2023}, and Boolean functions compiled into 
deterministic Decomposable Negation Normal Form (dDNNF) \citep{AudemardKM.2020,HuangIICAM.2022}.
Actually, even when the problem of finding an abductive explanation is $\mathrm{NP}$-hard, 
empirical results indicate that it can often be solved in practice using 
SAT-based approaches \citep{IgnatievM.2021,IzzaM.2021,IgnatievISM.2022}.

However, due to cognitive limitations, a major weakness of abductive explanations 
is their uncontrollable size. In order to circumvent this issue, a common approach 
is to seek abductive explanations of minimum size. Unfortunately, the corresponding 
optimization problem is $\mathrm{NP}$-hard for decision trees \citep{BarceloM0S.2020}, 
and $\Sigma_2^p$-hard in general \citep{AudemardBBKLMb.2022}. Furthermore, even 
if shortest abductive explanations could be found in a reasonable amount of time, 
their size remains uncontrollable.

By capturing a natural trade-off between conciseness and precision, 
probabilistic explanations have been a subject of growing 
research in the past two years 
\citep{BlancLT.2021,IzzaINCM.2021,WaldchenMHK.2021,WangKV.2021,ArenasBOS.2022,IzzaHINCM.2022,Waldchen.2022}. 
Recall that a size-$k$ $(1 - \varepsilon)$-probable explanation for a 
classifier $h$ and an instance $\vec x$ is a subset $S \subseteq [d]$ such that $\size{S} \leq k$ 
and $\epsilon_{h,\vec x}(S) \leq \varepsilon$. Finding such explanations is 
$\mathrm{NP}^{\mathrm{PP}}$-hard in general \citep{WaldchenMHK.2021,Waldchen.2022},
and $\mathrm{NP}$-hard for decision trees \citep{ArenasBOS.2022}. In the present study, 
we have shown that this problem remains $\mathrm{NP}$-hard for decision trees, even under 
the assumption that $S$ is a subset of some given abductive explanation $I$.

Heuristic approaches to probabilistic explanations have been
considered in \citep{IzzaINCM.2021,IzzaHINCM.2022}. The optimization task is symmetric to that of Problem~\ref{def:problem}: given a hypothesis $h$, an 
instance $\vec x$, a set of features $I$ and an error parameter $\varepsilon$, the goal is 
to find a $(1 - \varepsilon)$-probable explanation $S \subseteq I$ that minimizes $\size{S}$. For this task, 
the authors have proposed a greedy algorithm that runs in polynomial time, 
when $h$ is described by a decision tree $\mathcal T$, and $I$ is a path-explanation 
for $\vec x$ and $\mathcal T$. However, this algorithm does not provide any 
approximation guarantee with respect to the optimal size.

To the best of our knowledge, approximation approaches to probabilistic explanations 
have only been investigated in \citep{BlancLT.2021}. Again, 
the problem under consideration is to find a $(1 - \varepsilon)$-probable explanation $S$ 
that minimizes $\size{S}$. Based on some results on implicit learning, the authors 
gave a PAC-style polynomial-time algorithm that takes as input a classifier $h$, 
an instance $\vec x$, a confidence parameter $\delta$, and a precision 
parameter $\varepsilon$, and that returns as output a set $S \subseteq [d]$
with the following guarantees: (i) $\size{S}$ is polynomial in 
$d, \nicefrac{1}{\delta}$ and $\nicefrac{1}{\varepsilon}$, and (ii) if $\vec x$ is 
drawn uniformly at random over $\{0,1\}^d$, then $\epsilon_{h,\vec x}(S) \leq \varepsilon$ 
with probability at least $(1 - \delta)$. However, this algorithm is mainly 
of theoretical interest, since $\size{S}$ is in 
$\mathcal O\left( (\nicefrac{1}{\delta})^9  (\nicefrac{1}{\varepsilon})^{12} \right)$ 
and, more importantly, the instances to be explained in practical applications are rarely 
picked at random according to the uniform distribution. 

\paragraph{Perspectives.}
In our study, probabilistic explanations have been examined through the prism 
of supermodular minimization. Inspired by results in \citep{Ilev.2001,LibertyS.2017},
we have proposed two greedy approximation algorithms for 
minimizing explanation errors subject to a cardinality constraint, whose 
performance essentially depends on the curvature $c$ of the unnormalized error function 
$\mu_{h,\vec x}(\cdot)$. Importantly, our approximation results hold for any (Boolean)
hypothesis class, and hence, our greedy algorithms are computationally efficient whenever 
$\mu_{h,\vec x}(\cdot)$ can be evaluated in polynomial time. Beyond decision trees, which 
have been examined in this paper, (ordered) binary decision diagrams \citep{HuHS22} and dDNNF representations 
\citep{HuangIICAM.2022} are examples of classifiers satisfying this condition.  

This work leaves open several questions. Notably, (i) what is the optimal approximation 
factor for minimizing the error of probabilistic reasons under a cardinality constraint?
A partial answer might come from \cite{SviridenkoVW.2017}, 
who gave a near-optimal algorithm for minimizing a non-increasing supermodular function subject to a matroid constraint.
But this method is mainly of theoretical interest, as its computational 
complexity is prohibitive. So, (ii) can we find alternative, near-optimal approximation algorithms 
which are computationally efficient? Finally, (iii) using sampling methods,
can we extend approximation algorithms to hypothesis classes for which the problem of evaluating $\mu_{h,\vec x}(\cdot)$ 
is intractable?  


\begin{acknowledgements}
Many thanks to the reviewers for their comments and suggestions. 
This work has benefited from the support of the AI Chair EXPEKCTATION (ANR-19-CHIA-0005-01) 
of the French National Research Agency. It was also partially supported by TAILOR, 
a project funded by EU Horizon 2020 research and innovation programme under GA No 952215.  
\end{acknowledgements}

%------------------------------------------------------------------------------
% References
%------------------------------------------------------------------------------

\bibliography{bounia_332}
\end{document}







