\documentclass[accepted]{uai2025}
\usepackage{helvet}  % DO NOT CHANGE THIS
\usepackage{courier}  % DO NOT CHANGE THIS
\usepackage[hyphens]{url}  % DO NOT CHANGE THIS
\usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\DeclareCaptionStyle{ruled}{labelfont=normalfont,labelsep=colon,strut=off} % DO NOT CHANGE THIS
\frenchspacing  % DO NOT CHANGE THIS
\setlength{\pdfpagewidth}{8.5in}  % DO NOT CHANGE THIS
\setlength{\pdfpageheight}{11in}  % DO NOT CHANGE THIS
%
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{algpseudocode}

\usepackage{newfloat}
\usepackage{listings}
% Bibliography setup: natbib in numeric mode, loaded exactly once
% (a second, option-less \usepackage{natbib} is a no-op and was removed).
\usepackage[numbers]{natbib}
    \bibliographystyle{plainnat}
    % Typeset the bibliography heading as an unnumbered subsubsection.
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{subcaption}

% Graphics
\usepackage{tikz}
\usetikzlibrary{decorations.markings}
\usetikzlibrary{arrows}
\usetikzlibrary{shapes}
\usetikzlibrary{positioning}
\usepackage{pgfplots}
% Specific Packages
\usepackage{booktabs}

\usepackage{amsthm}
\usepackage{mathtools} 
\usepackage{bm}
\usepackage{bbold}
\usepackage{eurosym}
\usepackage{adjustbox}
\usepackage{amsmath} % For mathematical notation

% Named colours used by the document.
% The explicit load of the color package is disabled on the next line.
% NOTE(review): \definecolor is presumably provided through tikz/pgfplots
% (which load xcolor) -- confirm before removing those packages.
%\usepackage{color}
% Two light grays in the `gray' model (0 = black, 1 = white).
\definecolor{vlgray}{gray}{0.92}
\definecolor{lgray}{gray}{0.88}
% A darker neutral gray given as equal RGB components.
\definecolor{grey1}{rgb}{0.4,0.4,0.4}

% Identity macro: \Id{X} expands to X unchanged.
\newcommand{\Id}[1]{#1}

% ---------------------------------------------------------------------------
% Math notation macros.
% \DeclareMathOperator comes from amsmath (loaded via mathtools above) and
% \bm from the bm package.  Short macros use the starred \newcommand* so that
% a runaway argument is reported at the offending call site.
% ---------------------------------------------------------------------------

% Operators whose sub/superscripts are set as limits in display style.
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\Argmax}{Argmax}
\DeclareMathOperator*{\Argmin}{Argmin}

% \Binom{n}: binomial-style symbol with [n] on top and 2 below.
\newcommand*{\Binom}[1]{\binom{[#1]}{2}}

% Vectors are typeset in bold instead of with an arrow accent.
\renewcommand*{\vec}[1]{\bm{#1}}

% Named functions applied to one argument: Var(.), Lit(.), Sol(.), Opt(.), Val(.).
\newcommand*{\Var}[1]{\mathit{Var}(#1)}
\newcommand*{\Lit}[1]{\mathit{Lit}(#1)}
\newcommand*{\Sol}[1]{\mathit{Sol}(#1)}
\newcommand*{\Opt}[1]{\mathit{Opt}(#1)}
\newcommand*{\Val}[1]{\mathit{Val}(#1)}

% Angle-bracket notations: sequence <a> and pair/inner product <a, b>.
\newcommand*{\seq}[1]{\langle #1 \rangle}
\newcommand*{\inner}[2]{\langle #1, #2 \rangle}

% Norms: auto-sized, \big-sized, and fixed-size variants with subscripts
% a, a^*, b, b^*.
\newcommand*{\norm}[1]{\left\|#1\right\|}
\newcommand*{\bignorm}[1]{\bigl\|#1\bigr\|}
\newcommand*{\norma}[1]{\|#1\|_{\mathrm{a}}}
\newcommand*{\normas}[1]{\|#1\|_{\mathrm{a^*}}}
\newcommand*{\normb}[1]{\|#1\|_{\mathrm{b}}}
\newcommand*{\normbs}[1]{\|#1\|_{\mathrm{b^*}}}

% Sign function, declared as a real math operator so that it gets upright
% type AND operator spacing (a plain \mathrm{sgn} gets no spacing).
\DeclareMathOperator{\sgn}{sgn}

% \size{S}: |S|.
\newcommand*{\size}[1]{|#1|}

% Bold Greek shortcuts.
\newcommand*{\vecphi}{\bm{\phi}}
\newcommand*{\vecPhi}{\bm{\Phi}}
\newcommand*{\vecbeta}{\bm{\beta}}
\newcommand*{\vectheta}{\bm{\theta}}
\newcommand*{\vecTheta}{\bm{\Theta}}

% Blackboard-bold P.
\newcommand*{\PP}{\mathbb{P}}

% KR
% Typewriter-font abbreviations for knowledge-representation languages,
% classifier families and explanation types.  Each macro takes no argument
% and expands to a braced {\tt ...} label; the file uses them in both text
% and math mode.
% NOTE(review): \def overwrites an existing macro of the same name without
% any warning -- confirm that none of these names clash with the class or
% with a loaded package.
\def\nnf{{\tt NNF}}
\def\cnf{{\tt CNF}}
\def\dnf{{\tt DNF}}
\def\Dnnf{{\tt DNNF}}
\def\dt{{\tt DT}}
\def\rf{{\tt RF}}
% NOTE(review): TeX control words consist of letters only, so the next line
% does not create a macro called "\d4": it redefines \d (LaTeX's dot-under
% accent) with the delimiter "4".  Afterwards "\d4" prints D4, but any other
% use of \d (e.g. \d{a} in a bibliography entry) no longer matches the
% definition and raises an error.  Consider a letters-only name if \d4 is
% not relied upon elsewhere.
\def\d4{{\tt D4}}
% Labels for the explanation types (majoritary reasons, path variants,
% minimum-size variants) used throughout the paper.
\def\MINMAJ{{\tt minMAJ}}
\def\MAJ{{\tt MAJ}}
\def\PMAJ{{\tt PMAJ}}
\def\RMINMAJ{{\tt RminMAJ}}
\def\RMAJ{{\tt RMAJ}}
\def\RmwMAJ{{\tt RmwMAJ}}
\def\PMINMAJ{{\tt PminMAJ}}
\def\PMINPI{{\tt PminPI}}
\def\MINPI{{\tt minPI}}
\def\PPI{{\tt Path-PI}}

% ---------------------------------------------------------------------------
% Theorem-like environments (amsthm).
% Each kind is declared twice: a short name that owns the counter and a long
% name that shares it, so both spellings number consecutively.
% ---------------------------------------------------------------------------

% Propositions: prop / proposition.
\newtheorem{prop}{Proposition}
\newtheorem{proposition}[prop]{Proposition}

% Properties: pro / property.
\newtheorem{pro}{Property}
\newtheorem{property}[pro]{Property}

% Examples: exmp / example.
\newtheorem{exmp}{Example}
\newtheorem{example}[exmp]{Example}

% Lemmas: lem / lemma.
\newtheorem{lem}{Lemma}
\newtheorem{lemma}[lem]{Lemma}

% Numbered proofs: pre / prof.  The printed heading was misspelled "Prof";
% the environment names are unchanged so existing uses keep working.
\newtheorem{pre}{Proof}
\newtheorem{prof}[pre]{Proof}

% Problems: prob / problem.
\newtheorem{prob}{Problem}
\newtheorem{problem}[prob]{Problem}

% Theorems: thm / theorem.
\newtheorem{thm}{Theorem}
\newtheorem{theorem}[thm]{Theorem}

% Remarks: rem / remark.
\newtheorem{rem}{Remark}
\newtheorem{remark}[rem]{Remark}

% Definitions: defn / definition.
\newtheorem{defn}{Definition}
\newtheorem{definition}[defn]{Definition}

\title{Using Submodular Optimization to Approximate Minimum-Size Abductive Path
Explanations for Tree-Based Models}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Louenas Bounia}{}
% Add affiliations after the authors
\affil[1]{%
    LIPN-UMR CNRS 7030.\\
    Université Sorbonne Paris Nord\\
    Villetaneuse, France
}
  
  \begin{document}
\maketitle

\begin{abstract}
One of the key challenges of Explainable Artificial Intelligence (XAI) is providing concise and understandable explanations for classification model predictions. An abductive explanation for a given instance is a minimal set of features that justify the prediction. These minimal explanations are valuable for their interpretability, as they eliminate redundant or irrelevant information. However, computing these explanations is difficult, even for simpler classifiers like decision trees. Finding a minimum-size abductive explanation in decision trees is an {\sc NP}-complete problem, and this complexity extends to random forests for minimum-size majoritary reasons. In this work, we focus on finding minimal sets of features along the paths leading to the decision, called \textit{path-abductive explanations}. We show that the problem of finding \textit{minimum-size path-abductive explanations} in decision trees and \textit{minimum-size path-majoritary reasons} in random forests is also {\sc NP}-complete. To address this, we reformulate the problem as a submodular optimization task and propose a greedy algorithm with optimality guarantees. Our experiments demonstrate that this algorithm produces near-optimal explanations efficiently and offers a strong alternative for difficult instances, where exact methods based on {\sc SAT} encodings are computationally expensive. This approach is especially useful in resource-limited environments where modern {\sc SAT} solvers are not feasible.
\end{abstract}

\section{Introduction}\label{sec:intro}
The supervised classification problem involves deducing a model capable of predicting labels from annotated data. Common classifiers include decision trees \cite{Quinlan86}, random forests \cite{Breiman01}, {\sc XGBoost} \cite{XGBoost}, support vector machines \cite{SVM}, and neural networks, which are widely used across fields like text and image classification, customer analysis, and medical diagnosis. However, with increasing use in critical sectors like healthcare and finance, the ability to explain model decisions is vital for transparency, trust, and regulatory compliance \cite{Molnar19}.

Formal explanations play a central role in Explainable Artificial Intelligence (XAI), as they provide mathematically validated justifications \cite{TrustworthyJoa}, which makes them particularly suitable for sensitive applications, such as the medical, financial, or legal domains. Unlike post-hoc agnostic methods such as {\sf LIME} \cite{Lime16}, {\sf SHAP} \cite{Lundberg17}, {\sf Anchors} \cite{Anchor18}, or counterfactual explanations \cite{Dhurandhar2018}, which rely on local perturbations or game theory without considering the internal structure of the model, formal explanations are directly tied to the behavior of the studied model. This structural link grants them crucial properties of faithfulness, consistency, and robustness, often absent in agnostic methods. The latter can, indeed, generate identical explanations for opposite predictions \cite{IgnatievNM19}, lack rigorous theoretical foundations \cite{JoaoGame}, or be sensitive to input perturbations \cite{alvarezmelis2018}, undermining their reliability in critical contexts. Conversely, for a Boolean classifier $h$, a formal abductive explanation is characterized by a prime implicant ({\sf PI}), which corresponds to a minimal subset of features $I$ such that the restriction $x_I$ of the input $\vec{x}$ is sufficient to guarantee the output $h(\vec{x})$ \cite{IgnatievNM19, DarwicheH20}. Although finding such an explanation may be an {\sf NP}-hard problem \cite{Cooper23, AudemardKR21}, efficient solutions have been proposed for certain classes of models, notably decision trees and random forests \cite{Audemardetal22, Audemard22ijcai, IzzaRF}. Finally, the approach proposed in this work relies on formal approximations, which preserve the theoretical guarantees inherent to formal explanations, unlike the empirical approximations used in agnostic approaches such as {\sf SHAP} or {\sf LIME}.

Conciseness is as important as validity in ensuring explanation comprehensibility. Human cognitive limits \cite{Miller56} justify the need for smaller explanations. However, finding minimum-size explanations is challenging, even for tree-based models. Computing a minimum-size abductive explanation for decision trees is {\sf NP}-hard \cite{dke, Barcelo2020ModelIT}, and for random forests, computing a {\sf PI}-explanation is {\sf DP}-complete \cite{IzzaRF}, with minimum-size abductive explanations being $\Sigma_2^p$-complete \cite{Audemardetal22}. Majoritary reasons, introduced for random forests \cite{Audemardetal22}, are implicants of the majority of trees in the forest, but finding one of minimum size is also {\sf NP}-hard. Constraint optimization and modern {\sf SAT} solvers have been applied to compute efficient explanations, such as the {\sf MUS} method \cite{IzzaRF} for PI-explanations and the {\sc MaxSAT} solver for minimum-size majoritary reasons \cite{Audemardetal22}.

The high computational complexity of these exact methods can become prohibitive, particularly for hard-to-explain instances or high-dimensional inputs, where computational time increases significantly. This issue is compounded in resource-constrained environments, where hardware and time limitations further restrict computation. To address this, we focus on approximating minimum-size explanations using submodularity, applied efficiently to decision trees and random forests. We aim to approximate minimum-size sufficient reasons (PI-explanations) for decision trees and minimum-size majoritary reasons ($\MINMAJ$) for random forests. For tree-based models, we focus on path-restricted explanations, which reflect the model’s internal decision-making process. Our work proposes an efficient approximation method for minimum-size path-abductive explanations, which aligns with the internal workings of decision trees and ensures concise and relevant explanations.

\textbf{Contributions and Main Motivation.} In this work, we focus on approximating minimum-size explanations through the lens of \textit{submodularity}, with an efficient application to \textit{decision trees} and \textit{random forests}. More specifically, we focus on \textit{minimum-size path-abductive explanations}, which are based on decision paths (\textit{path-explanations}) and are aligned with the internal workings of tree-based models. These explanations eliminate the redundant features that path-explanations often contain \cite{IzzaDT, IzzaDT1}, while remaining consistent with the model's reasoning.

\noindent \textbf{Our contributions include:} First, we reformulate the problem of computing \textit{minimum-size abductive explanations} as a submodular optimization task, applicable to a Boolean classifier $h$ and a data instance $\vec{x}$. Second, we extend this reformulation to the computation of \textit{minimum-size path-majoritary reasons} (denoted $\PMINMAJ$) for random forests $F$. We demonstrate that this problem remains \textsf{NP}-complete, even for a single tree ($F = \{T\}$), where majoritary reasons coincide with \textsf{PI}-explanations. Finally, we propose an efficient \textit{greedy algorithm} with theoretical optimality guarantees, including an approximation bound on the size of the explanations.

\noindent \textbf{Our main motivations are:} 
We aim to provide an efficient alternative when computing minimum-size reasons becomes challenging, particularly for complex instances or high-dimensional inputs, where exact methods based on \textsf{SAT} solvers are computationally expensive. Additionally, we focus on delivering explanations in \textit{resource-constrained environments}, where the use of \textsf{SAT} solvers or powerful machines is not feasible. Furthermore, we emphasize the importance of \textit{path-explanations}, as the local decision-making process of tree-based models relies on these paths, offering concise justifications that align with the internal reasoning of the model.

\noindent In summary, this work proposes a practical and theoretically grounded solution for approximating minimum-size abductive explanations, addressing the limitations of existing methods in demanding contexts. \textit{Path-explanations} play a central role in this approach, providing concise and interpretable justifications that are aligned with the internal reasoning of the model.

\section{Preliminaries} \label{sec:Preliminaries}
\textbf{Classification problems.} We assume that the reader is familiar with the basic concepts of machine learning, such as supervised learning, binary classification, random forests, and the principle of majority voting.

\textbf{Notations.} Let $n$ be an integer, and let $[n]$ denote the set $\{1, \ldots, n\}$. We denote by $\mathcal{F}_n$ the class of all Boolean functions mapping $\{0,1\}^n$ to $\{0,1\}$, and $X_n = \{x_1, \ldots, x_n\}$ refers to the set of Boolean variables. An assignment $\vec{x} \in \{0,1\}^n$ is called an \emph{instance}. A \emph{literal} $\ell$ is either a variable $x_i$ or its negation $\overline{x}_i$. A \emph{term} $t$ is a conjunction of literals\footnote{In this work, we treat a term as a set of literals for simplicity.}, and a \emph{clause} $c$ is a disjunction of literals. A \dnf\ formula is a disjunction of terms, and a \cnf\ formula is a conjunction of clauses. A formula $f$ is \emph{consistent} if and only if it has at least one model (i.e., an assignment that satisfies it). Given an instance $z \in \{0,1\}^n$, the corresponding term $t_z$ is defined as follows: $t_{z} = \bigwedge_{i=1}^n x_i^{z_i} = \{x_1^{z_1}, \ldots, x_n^{z_n}\}, \quad \text{where } x_i^0 = \overline{x}_i \text{ and } x_i^1 = x_i.$

An \emph{implicant} of a Boolean function $f$ is a term $t$ such that $t$ implies $f$ (i.e., every assignment satisfying $t$ also satisfies $f$). A \emph{prime implicant} of $f$ is an implicant $t$ of $f$ such that no proper subset of $t$ is an implicant of $f$. A \emph{partial instance} is a vector $\vec{z} \in \{0, 1, *\}^n$, where $z_i = *$ indicates that the $i$-th feature of $\vec{z}$ is undefined. An instance $\vec{x}$ is \emph{covered} by $\vec{z}$ if $x_i = z_i$ for all features $i \in [n]$ such that $z_i \neq *$. For a subset $S \subseteq [n]$ of features, the restriction of $\vec{x}$ to $S$, denoted $x_S$, is the partial instance in $\{0, 1, *\}^n$ such that: $(x_S)_i = x_i$ if $i \in S$, and $*$ otherwise. Any instance $\vec y \in \{0, 1\}^n$ is covered by $x_S$ if and only if $y_S = x_S$. The term $t_{x_S}$ associated with the partial instance $x_S$ is defined as:

$t_{x_S} = \bigcup_{i=1}^{n} \left( \{x_i : (x_S)_i = 1\} \cup \{\overline{x}_i : (x_S)_i = 0\} \right).$

\subsection{Decision Tree and Random Forest}
\noindent \textbf{\emph{A binary decision tree}} on $X_n$ is a binary tree $T$, where each internal node is labeled with one of the $n$ Boolean input variables from $X_n$, and each leaf is labeled with either $0$ or $1$. Each variable is assumed to appear at most once on any path from the root to a leaf (read-once property). The value $T(x) \in \{0, 1\}$ of $T$ for an input instance $\vec{x}$ is determined by the label of the leaf reached from the root node.

\noindent \textbf{A \emph{random forest}} on \(X_n\) is a set \(F = \{T_1,\cdots,T_m\}\), where each \(T_i\) \((i \in [m])\) is a decision tree on \(X_n\), and the value \(F(\vec{x})\) is given by
\begin{align*}
  F(\vec{x}) = 
    \begin{cases}
      1 & \text{if } \frac{1}{m} \sum_{i=1}^{m} T_i(\vec{x}) > \frac{1}{2} \\
      0 & \text{otherwise.}
    \end{cases}
\end{align*}

The \emph{size} of $F$ is given by $\size{F} = \sum_{i=1}^{m} \size{T_i}$, where $\size{T_i}$ is the number of nodes present in $T_i$. The class of decision trees on $X_n$ is denoted by $\dt_n$, and the class of random forests with at most $m$ decision trees (for $m \geq 1$) on $\dt_n$ is denoted by $\rf_{n,m}$. Finally, $\rf_n = \bigcup_{m \geq 1} \rf_{n,m}$ and $\rf = \bigcup_{n \geq 1} \rf_n$. It is well known that any decision tree $T \in \dt_n$ can be transformed in linear time into an equivalent $\dnf$ (or an equivalent $\cnf$). This $\dnf$ is an orthogonal $\dnf$ (see \cite{bounia2024enhancing} for more detail). However, when moving to random forests, the situation is quite different. Any formula in $\cnf$ or $\dnf$ can be converted in linear time into an equivalent random forest, but there is no polynomial space conversion from a random forest to $\cnf$ or $\dnf$ \cite{Audemardetal22}.

\subsection{Abductive Explanations}\label{sec:sec_abductive}
\textbf{Abductive Explanations and Decision Trees.}  An abductive explanation\footnote{Unlike \cite{IgnatievNM19}, we do not require abductive explanations to be minimal with respect to inclusion.} for an instance $\vec{x}$ is a subset of features $S$ such that the restriction of $\vec{x}$ to $S$ is sufficient to obtain the same prediction. A \textit{sufficient reason} (denoted as {\sf PI}-explanation) is a minimal abductive explanation with respect to inclusion, while a \textit{minimum-size sufficient reason} (denoted $\MINPI$) is an abductive explanation containing the smallest number of literals.

Decision trees are naturally interpretable, as each instance $\vec{x}$ can be explained by a unique direct path from the root to a decision leaf, called a \textit{direct reason} (or \textit{path-explanation}), denoted $P^h_{\vec{x}}$. However, these direct reasons may contain redundant features \cite{IzzaDT}, justifying the use of more concise explanations, such as \textit{sufficient reasons} and \textit{minimum-size sufficient reasons}. Although sufficient reasons can be computed in polynomial time for decision trees, finding a minimum-size sufficient reason is an {\sf NP}-hard problem \cite{Barcelo2020ModelIT}.

\begin{definition}[Path-Sufficient Reason ($\PPI$)]
Let $h$ be a classifier represented by a tree $T \in \dt_n$ and $\vec{x} \in \{0, 1\}^n$ an instance. A $\PPI$ for $\vec{x}$ given $h$ is a set of features $S$ such that the associated term $t_{x_S}$ is a sufficient reason for $\vec{x}$ given $h$ and $t_{x_S} \subseteq P^h_{\vec{x}}$. A \textbf{Minimum-Size Path-Sufficient Reason} ($\PMINPI$) is a $\PPI$ of minimum-size.
\end{definition}

It is evident that it is always possible to derive a sufficient reason $t$ ({\sf PI}-explanation) from path explanation $P^h_{\vec{x}}$. However, a $\PMINPI$ generally does not coincide with a $\MINPI$.

\begin{remark}\label{prop:coincidePI}
When $h$ is a classifier represented by a decision tree $T \in \dt_n$, it is important to note that the $\PMINPI$ reason generally does not coincide with $\MINPI$, although all $\PMINPI$ and $\PPI$ reasons are \textsf{PI}-explanations.
\end{remark}

Furthermore, for an instance $\vec{x}$ and a classifier $h$ represented by a tree $T$, the number of $\PPI$ can be exponential in the size of the input.

\begin{proposition}\label{numberDT-PI}
There exists a decision tree $T \in \dt_n$ of depth $\log_2(n + 1)$ such that, for any instance $\vec{x} \in \{0, 1\}^n$, the number of $\PPI$ for $\vec{x}$ given $T$ is at least $\left\lfloor \left(\frac{3}{2}\right)^{\frac{n+1}{2}} \right\rfloor$.
\end{proposition}

Due to the large number of $\PPI$, finding a $\PMINPI$ is not always straightforward.

\begin{proposition}\label{minhard}
Let $h$ be a classifier represented by a decision tree $T \in \dt_n$ and $\vec{x} \in \{0, 1\}^n$ an instance. Computing a \textbf{Minimum-size path-sufficient reason} ($\PMINPI$) for $\vec{x}$ given $h$ is an {\sf NP}-hard problem.
\end{proposition}

Despite this result, it is possible to compute a $\PMINPI$ in many practical cases. To achieve this, we use a slightly modified version of the encoding proposed in \cite{dke}, which relies on \textsc{Partial MaxSAT} solvers. However, these encodings require significant memory resources and powerful machines, especially for handling high-dimensional or challenging instances.

\textbf{Abductive Explanations and Random Forests.} The notions of \textit{PI-explanation} (or sufficient reason) and $\MINPI$ are general and applicable to any classifier $h$, including when $h$ is a random forest $F \in \rf_{n, m}$. However, as mentioned in Section~\ref{sec:intro}, computing $\MINPI$ and \textit{PI-explanations} for a random forest remains computationally intractable. In this work, we focus on a type of explanation better suited to the internal workings of random forests, introduced by \cite{Audemardetal22}: \textit{direct reasons} and, more specifically, \textit{majoritary reasons ($\MAJ$)}. A \textit{direct reason} for a random forest is defined as the conjunction of the direct reasons (path-explanations) of the trees that vote for the majority class.

\begin{definition}\label{def:direct}
Let $F = \{T_1, \ldots, T_m\}$ be a random forest in $\rf_{n,m}$, and $\vec x \in \{0, 1\}^n$ be an instance. The \textbf{direct reason} for $\vec x$ given by $F$ is the term $P^F_x$ defined by $P^F_x = \bigwedge_{T_i \in F^x} P^{T_i}_x$ where $F^x = \{T_i \in F \mid T_i(\vec x) = F(\vec x)\}.$
By construction, $P^F_x$ can be computed in time $O(n\cdot|F|)$.
\end{definition}

However, as with decision trees, $P^F_x$ often contains redundant features \cite{Audemardetal22}. We therefore focus on a stronger version of abductive explanations than $P^F_x$: \textit{majoritary reasons}. A majoritary reason, as defined in \cite{Audemardetal22}, is an implicant $t$ of the majority of trees in $F$, where the removal of any single feature invalidates the majority condition.

\begin{definition}\label{def:majority_reason}
Let $F = \{T_1, \ldots, T_m\}$ be a random forest in $\rf_{n,m}$ and $\vec{x} \in \{0, 1\}^n$ an instance. A \emph{majoritary reason ($\MAJ$)} for $\vec{x}$ given by $F$ is a term $t$ covering $\vec{x}$ such that $t$ is an implicant of at least $\left\lfloor \frac{m}{2} \right\rfloor + 1$ decision trees $T_i$, and for every literal $l \in t$, the term $t \setminus \{l\}$ does not satisfy this condition.

A \emph{Path-majoritary reason ($\PMAJ$)} for $\vec{x}$ given by $F$ is a $\MAJ$ such that $t \subseteq P^F_x$. A \emph{minimum-size majoritary reason ($\MINMAJ$)} is a $\MAJ$ with the smallest number of literals, and a \emph{minimum-size Path-majoritary reason ($\PMINMAJ$)} is a $\PMAJ$ with the smallest number of literals.
\end{definition}

By analogy with Proposition~\ref{numberDT-PI}, and considering that a decision tree is a special case of a random forest where $F = \{T\}$, it follows that the number of $\PMAJ$ reasons can also be exponential. Moreover, in this special case, the $\MINMAJ$ reasons coincide with the $\MINPI$ reasons. Note that all $\PMAJ$ and $\PMINMAJ$ reasons are $\MAJ$ reasons, but they are not necessarily $\MINMAJ$ reasons. While deriving a $\MAJ$ or $\PMAJ$ reason is feasible in linear time, finding their minimum-size versions ($\MINMAJ$ and $\PMINMAJ$) is computationally more complex. Finding a $\MINMAJ$ reason has been shown to be an {\sc NP}-complete problem \cite{Audemardetal22}, and the following proposition shows that computing a $\PMINMAJ$ reason is also hard.

\begin{proposition}\label{prop:hardpminmaj}
Let $F \in \rf_{n, m}$, $\vec x \in \{0, 1\}^n$, and $k \in \mathbb{N}$. Deciding whether there exists a reason $\PMINMAJ$ $t$ for $\vec x$ given $F$, such that $t$ contains at most $k$ features, is an {\sc NP}-complete problem.
\end{proposition}

Proposition \ref{prop:hardpminmaj} illustrates the difficulty of deriving explanations in a random forest, particularly when the size of \( F \) or the dimension of \( \vec{x} \) is large. To overcome this complexity, we explore efficient approximations with few resources. Finally, we recall abductive explanations of minimum size in the context of the error function.

\textbf{Error function}. We now define a central concept for the rest of this work, the \textit{explanation error function} $\epsilon_{h, \vec x}(S)$ for a classifier $h$ and an instance $\vec x$, which can be interpreted as the probability of making an {\it explanation error} using a subset $S$ of features. Given a classifier $h$ and an instance $\vec x$ for which the prediction $h(\vec x)$ must be explained, let $\epsilon_{h, \vec x}: 2^{[n]} \rightarrow \mathbb R$ be the \textit{explanation error function} \citep{bounia2023approximating, bounia2023these} defined by: $\epsilon_{h,\vec x}(S) = \frac{\mu_{h,\vec x}(S)}{2^{n - \size S}}$ where $\mu_{h,\vec x}(S) = \size{\{\vec y \in \{0,1\}^n: h(\vec y) \neq h(\vec x), \vec y_S = \vec x_S\}}$. As noted earlier, $\epsilon_{h,\vec x}(S)$ can be interpreted as the probability of making an {\it explanation error}, where $\mu_{h,\vec x}(S)$ represents the number of errors induced by the choice of $S$. For a feature subset $S$, $t_{x_S}$ is an abductive explanation for $\vec{x}$, given $h$, if $\epsilon_{h,\vec x}(S) = 0$. Moreover, $t_{x_S}$ is a {\sf PI}-explanation if $\epsilon_{h,\vec x}(S) = 0$ and $\epsilon_{h,\vec x}(S') > 0$ for any proper subset $S'$ of $S$. $t_{x_S}$ is $\MINPI$ if it is a {\sf PI}-explanation that contains a minimum number of features. Note that when $h$ is represented by a decision tree $T$, the function $\mu_{h, \vec x}$ can be simply rewritten as follows:

$$
\mu_{h,\vec{x}}(S) =
\begin{cases}
\sum\limits_{t \in \dnf(T) \mid t_{x_S}} 2^{n - |t|} & \text{if } h(\vec{x}) = 0 \\
2^{n-|S|} - \sum\limits_{t \in \dnf(T) \mid t_{x_S}} 2^{n - |t|} & \text{if } h(\vec{x}) = 1
\end{cases}
$$

The result shows that the evaluation of $\epsilon_{h,\vec{x}}(S)$ can be achieved in time $O(|S| \cdot |T|)$ when $h$ is represented by a decision tree $T$ \cite{bounia2023approximating, Bouniaipmu}, which is not always the case in general. Indeed, in the general case, the problem of evaluating $\epsilon_{h,\vec{x}}(S)$ is \textbf{\#P-hard} (or $\#$-{\sf SAT}) \citep{BooleanF}. We now formulate the problem of finding a $\MINPI$ reason for $\vec x$ given $h$.

\section{Problem formulation}
\textbf{Main idea.} A term $t_{x_S}$, associated with a feature subset $S \subseteq V = [n]$, constitutes an abductive explanation for $\vec{x}$ given $h$ if and only if $\epsilon_{h, \vec{x}}(S) = 0$, i.e. $\mu_{h, \vec{x}}(S) = 0$. Thus, a minimum-size abductive explanation corresponds to the smallest set $S$ (in cardinality) satisfying $\mu_{h, \vec{x}}(S) = 0$.

\subsection{Approximation of a minimum-size abductive explanation}
The problem of finding a minimum-size abductive explanation (or a $\MINPI$) for an instance $\vec{x}$, given a classifier $h$, can be formulated as an optimization problem. The objective is to select a minimum-size subset $S$ of features satisfying a lower-bound constraint $\alpha \geq 0$ on a function $g_{h, \vec{x}}(S)$, which depends on the classifier $h$ and the instance $\vec{x}$.

\begin{problem}\label{Problem1}
Let $h$ be a classifier, $\vec{x}$ an instance, and $\alpha \geq 0$ a constant bound. Let $g_{h, \vec{x}}$ be a non-negative set function depending on $h$ and $\vec{x}$. The problem consists in finding a subset of features $S \subseteq V$ that solves:

\[
\begin{aligned}
\min_{S \subseteq V} \quad & |S|  \\
\text{s.t.} \quad & g_{h, \vec{x}}(S) \geq \alpha
\end{aligned}
\]
\end{problem}

\begin{proposition}\label{prop:P1minPI}
Let $h : \{0, 1\}^n \rightarrow \{0, 1\}$ be a classifier, $\vec{x} \in \{0, 1\}^n$ be an instance, and $S^* \subseteq V$ be a subset of features. For $V=[n]$, $g_{h, x}(S) = \mu_{h, x}(\emptyset) - \mu_{h, x}(S)$ and $\alpha = \mu_{h, x}(\emptyset)$, $S^*$ is an optimal solution to Problem~\ref{Problem1} if and only if $t_{x_{S^*}}$ is a $\MINPI$ reason for $\vec{x}$ given $h$.
\end{proposition}

\textbf{Application to decision trees.} When a classifier $h$ is represented by a decision tree, we restrict ourselves to the features present in the explanation based on the path $P^h_x$ (denoted $V_x^{\text{path}}$\footnote{In the rest of the article, $V_x^{\text{path}}$ denotes the set of features appearing in the direct paths leading to the decision for $\vec{x}$, whose classification must be explained.}) to generate a minimum-size explanation $\PMINPI$. To do this, it suffices to define $V = V_x^{\text{path}}$.

\begin{proposition}\label{prop:PDT1minPI}
Let $h$ be a classifier represented by a decision tree $T \in \dt_n$ and an instance $\vec{x}$ to be explained. For $V = V_x^{\text{path}}$, $g_{h, x}(S) = \mu_{h, x}(\emptyset) - \mu_{h, x}(S)$ and $\alpha = \mu_{h, x}(\emptyset)$, $S^*$ is an optimal solution to the problem \ref{Problem1} if and only if $t_{x_{S^*}}$ is a $\PMINPI$ reason for $\vec{x}$ given $h$.
\end{proposition}

In the case of decision trees, the evaluation of the error function $\mu_{h, x}(\cdot)$ can be done in polynomial time (see \cite{IzzaDT}), and more precisely in linear time \cite{bounia2024enhancing}. This evaluation is also feasible in polynomial time for linear classifiers \cite{Barcelo2020ModelIT}, \textsf{d-DNNF} classifiers \cite{huang2022tractable}, and decision diagrams \cite{hu2022optimizing}. However, optimization problems such as Problem~\ref{Problem1} are generally \textsf{NP}-hard in these cases (see \cite[Chapter~III]{nemhauser1988integer}).

This is consistent with Propositions~\ref{prop:P1minPI} and~\ref{prop:PDT1minPI}, as well as with the \textsf{NP}-hardness of computing the minimum-size explanations $\MINPI$ and $\PMINPI$ when $h$ is a decision tree. In general, the evaluation of $\mu_{h, x}$ is \textsf{\#P}-hard \citep{BooleanF}, making its exact computation intractable. For a random forest, this problem is equivalent to a \textsf{\#SAT} problem, and the approximation or exact computation of $\MINPI$ (of \textsf{DP}-complete complexity) remains out of reach. We therefore focus on another type of abductive explanation, majoritary reasons, and we will introduce a new error function adapted to random forests, efficiently computable in linear time.

\subsection{Approximation of minimum-size majoritary reasons}
To circumvent the problem of evaluating the error function $\epsilon_{h, x}$ when $h$ is represented by a random forest $F \in \rf_{n, m}$, we focus on a subset of trees in $F$, namely those that vote for the majority class. This means that we consider the set $F^x = \{T_i \in F, i\in [m]~|~T_i(\vec{x}) = F(\vec{x})\}$. Inspired by the fact that a majoritary reason is an implicant of more than half of the trees in the forest and that the number of errors must be zero when considering an implicant of the majority of the trees (i.e., a $\MAJ$-reason), we define the function $\epsilon_{F^x}(S) = \frac{\mu_{F^x}(S)}{\#F^x \cdot 2^{n-|S|}}$ with:

$$
\mu_{F^x}(S) = \left[\sum\limits_{T_i \in F^x} \mu_{x, T_i}(S) \right] \mathbb{I}\left(\sum\limits_{T_i \in F^x} \mathbb{I}_{\{\mu_{x, T_i}(S)=0\}} \leq \frac{m}{2}\right)
$$

where $\#F^x$ denotes the number of trees in $F^x$, and $\mathbb{I}$ is the indicator function. The function $\epsilon_{F^x}$ can be interpreted as the average probability of making an explanation error by selecting a subset $S$ of features. Note that when $F = \{T\}$, $\mu_{F^x}$ coincides with $\mu_{T, x}$.

\textbf{Evaluation of $\mu_{F^x}$.}
For any $T_i \in F$, the function $\mu_{x, T_i}(S)$ can be computed in $O(|S| \cdot |T_i|)$ \cite{bounia2023approximating, Bouniaipmu}. The evaluation of $\mu_{F^x}(S)$ then consists in computing $\mu_{x, T_i}(S)$ for each $T_i \in F^x$. Since $|F^x|$ represents the sum of the sizes of the trees that compose it, the total cost is $O(|S| \cdot |F^x|)$, with $|F^x| = \sum\limits_{T_i \in F^x} |T_i|$.

\textbf{Approximation of a minimum-size majoritary reason.}
When a classifier $h$ is a random forest and we seek to explain the prediction of a given instance $\vec{x}$, the problem \ref{Problem1} can be reformulated so that its optimal solution corresponds to a reason $\MINMAJ$ for $\vec{x}$ given $h$.

\noindent \textbf{Formally, this problem can be adapted as follows:}
\begin{itemize}
\item Let $h$ be a classifier represented by a random forest $F \in \rf_{n, m}$.
\item Let $\vec{x}$ be an instance whose prediction is to be explained.
\item Let $g_{h, x}(S) = \mu_{F^x}(\emptyset)-\mu_{F^x}(S)$ and $\alpha = \mu_{F^x}(\emptyset)$.
\end{itemize}

With these parameter adjustments in the formulation of the problem~\ref{Problem1}, the latter becomes a version adapted to the context of majority voting when the classifier $h$ is a random forest $F$.

\begin{proposition} \label{prop:P1FminPI}
Let $h$ be a classifier represented by a random forest $F \in \rf_{n, m}$ and $\vec{x} \in \{0, 1\}^n$ be an instance. If we set: $V = [n]$, $\alpha = \mu_{F^x}(\emptyset)$ and $g_{h, x}(S) = \mu_{F^x}(\emptyset) - \mu_{F^x}(S)$, then a set $S^* \subseteq V$ is an optimal solution to the problem~\ref{Problem1} if and only if $t_{x_{S^*}}$ is a $\MINMAJ$ reason for $\vec{x}$ given $h$. And if $V = V_x^{path}$ then $S^*$ is an optimal solution to the problem~\ref{Problem1} if and only if $t_{x_{S^*}}$ is a $\PMINMAJ$ reason for $\vec{x}$ given $h$.
\end{proposition}

\section{Approximation Algorithms}
The main objective of this study is to relax the optimality constraint for the \textit{Problem \ref{Problem1}} (as well as its version adapted to the random forest context) by prioritizing obtaining a \textit{sufficiently good} solution in terms of quality. To do so, we exploit submodular optimization techniques and a greedy algorithm, which have the advantage of being less resource-intensive than exact approaches based on \textsf{SAT} encodings or constraint optimization. This section begins with a reminder of the fundamental concepts of submodularity, followed by an analysis of the essential properties of the error functions $\mu_{h, x}$ and $\mu_{F^x}$. Finally, we propose a greedy algorithm to obtain an approximate solution, accompanied by a study of its theoretical guarantees and experimental performances.

\subsection{Supermodular functions}
Let $f : 2^{[n]} \to \mathbb{R}$ be a real-valued set function. We say that $f$ is non-decreasing if $f(S \cup \{i\}) \geq f(S) \quad \text{for all } S \subseteq [n] \text{ and } i \in [n] \setminus S$, and non-increasing if $f(S \cup \{i\}) \leq f(S) \quad \text{for all } S \subseteq [n] \text{ and } i \in [n] \setminus S$.

\noindent $f$ is supermodular if it satisfies the following condition for all subsets $A, B$ of $[n]$: $f(A \cup B) + f(A \cap B) \geq f(A) + f(B).$ On the other hand, $f$ is submodular if, for all subsets $A$ and $B$ of $[n]$, the following condition is satisfied: $f(A \cup B) + f(A \cap B) \leq f(A) + f(B).$

A function $f$ is supermodular if and only if $-f$ is submodular. Moreover, $f$ is modular if it is both submodular and supermodular.

In general, the error function \( \epsilon_{h,x}(\cdot) \) is neither supermodular nor submodular, and it is neither non-increasing nor non-decreasing \citep{bounia2023approximating}. However, by considering the non-normalized version \( \mu_{h,x}(\cdot) \), useful properties can be derived. For any classifier \( h \) and an instance \( x \in \{0, 1\}^n \), the function \( \mu_{h,x} \) is non-negative, supermodular, and non-increasing \cite{bounia2023approximating}. Consequently, since \( \mu_{F^x} \) is a linear combination of supermodular, non-negative, and non-increasing functions, it inherits these properties: \( \mu_{F^x} \) remains supermodular, non-negative, and non-increasing.

\begin{proposition}\label{supermodular}
Let $h : \{0, 1\}^n \to \{0, 1\}$ be a classifier, and let $\vec{x} \in \{0, 1\}^n$ be an instance. Then, the function $g_{h, x}(S) = \mu_{h, x}(\emptyset) - \mu_{h, x}(S)$ is submodular, non-decreasing, and non-negative, with $g_{h, x}(\emptyset) = 0$.
\end{proposition}

When $h$ is represented by a random forest $F \in \rf_{n, m}$ and $g_{h, x}(S) = \mu_{F^x}(\emptyset) - \mu_{F^x}(S)$, Proposition \ref{supermodular} shows that $g_{h, x}$ is a non-negative, submodular, non-decreasing function satisfying $g_{h, x}(\emptyset) = 0$. Therefore, Problem \ref{Problem1} can be reformulated as a submodular optimization problem. Although such problems are often {\sf NP}-hard (cf. Chapter {\sf III} of \citep{nemhauser1988integer}), the use of a greedy algorithm provides an approximate solution with performance guarantees close to optimal.

\subsection{Greedy algorithm}
A natural approach to minimize a supermodular and non-increasing function $f$ under the strong constraint of minimizing $|S|$ consists of formalizing the problem as a leader selection problem for $f$, of minimum-size, reaching an error bound $\alpha$. Greedy algorithms can then be used to compute an approximate solution. As shown in \cite{Leader, ErrorLeader}, this greedy method benefits from mathematical guarantees on solution quality. In our study, the function $\mu_{h, x}(\cdot)$ is supermodular and non-increasing. However, the approach in \cite{Leader} builds on the work of \cite{nemhauser1988integer} and relies on the fact that $g_{h, x}(\emptyset) = 0$, a detail that \cite{Leader}, being rather generic, does not emphasize. Therefore, supermodular minimization algorithms cannot be applied directly to $\mu_{h, x}$, as most of them rely on the assumption $\mu_{h, x}(\emptyset) = 0$, which does not hold in our case ($\mu_{h, x}(\emptyset) \ne 0$). The same applies to $\mu_{F^x}$.

To circumvent this limitation, we are interested in a modified version of $\mu_{h, x}$, defined by the function $g_{h, x}(\cdot)$. This function is submodular, non-negative, and non-decreasing, and it satisfies $g_{h, x}(\emptyset) = 0$. These properties allow the use of the greedy Algorithm~\ref{alg:alg1}, described as follows:

\begin{algorithm}[ht!]
\caption{Greedy Approximation Algorithm} \label{alg:alg1}
\begin{algorithmic}
\State \textbf{Input:} A submodular function $g$, termination bound $\alpha$
\State \textbf{Output:} A set of features $S$
\State $S \gets \emptyset$
\State $\text{error} \gets 0$
\State $V$ \Comment{A set of features, by default $V \gets [n]$}
\While{$\text{error} < \alpha$}
\State $e^* \gets \underset{e \in V \setminus S}{\operatorname{argmax}}~ \bigl(g(S \cup \{e\}) - g(S)\bigr)$
\If{$g(S \cup \{e^*\})-g(S) \leq 0$}
 \State \Return $S$
 \Else
 \State $S \gets S \cup \{e^*\}$
 \State $\text{error} \gets g(S)$
 \EndIf
\EndWhile
\State \Return $S$
\end{algorithmic}
\end{algorithm}

\begin{theorem}\label{theorem1}
Let $h : \{0, 1\}^n \rightarrow \{0, 1\}$ be a classifier and $\vec{x} \in \{0, 1\}^n$ an instance. For $g(S) = \mu_{h, x}(\emptyset)-\mu_{h, x}(S)$, let $S^*$ be an optimal solution of Problem~\ref{Problem1}, of size $k^* = |S^*|$, and let $S$ be the set returned by Algorithm~\ref{alg:alg1}, of size $k = |S|$. Then,
\[
\frac{|S|}{|S^*|} = \frac{k}{k^*} \leq 1 + \ln\left(\frac{\mu_{\max}}{\mu_{h, x}(S_{k-1}) }\right),
\]
where $\mu_{\max} = \max_{i \in V} \mu_{h, x}(\{i\})$ and $S_{k-1}$ denotes the set built by Algorithm~\ref{alg:alg1} after $k-1$ iterations.

\end{theorem}

Theorem~\ref{theorem1} establishes an approximation guarantee for the greedy algorithm, by comparing the size of the set~$S$ to that of the optimal set~$S^*$. This bound remains valid for the adaptive version of the problem~\ref{Problem1} applied to random forests, i.e. when~$h$ is represented by a random forest~$F$. Moreover, by constructing $S_{k-1}$ via the algorithm~\ref{alg:alg1}, we have $\mu_{h, x}(S_{k-1}) \neq 0$ ($\mu_{h, x}$ is non-increasing), which guarantees the validity of the approximation bounds. In the worst case, the algorithm~\ref{alg:alg1} runs in time~$O(n^3 \cdot |h|)$ (i.e. $O(n^3 \cdot |T|)$ for decision trees and $O(n^3 \cdot |F|)$ for random forests).

\textbf{Discussion on the approximation bound.}  The approximation bound of Theorem~\ref{theorem1} depends on the value of $\mu_{h, x}(S_{k-1})$, which varies depending on the instance considered. However, we believe that it is possible to obtain fixed and more precise bounds by using other greedy algorithms. Our experimental results support this intuition: by comparing the numerical values of the bounds described in Theorem~\ref{theorem1} with $\log(n)$ (see Table~\ref{tab:bounds}), we observe that the average value of the bound is typically of the form $\gamma \cdot \log(n)$, with $\gamma$ being a non-negative constant. This suggests that the bounds could be reduced to a form proportional to $\log(n)$, without dependence on $\mu_{h, x}(S_{k-1})$. However, this remains a conjecture, as no theoretical result yet rigorously supports the bound being in $\mathcal{O}(\log(n))$, although we have strong reasons to believe so. We think that such fixed and more precise bounds could be obtained by drawing inspiration from the work of~\cite{Iyer2013SubmodularOW, Wolsey1982}, by reformulating Problem~\ref{Problem1} as a bicriteria submodular optimization problem with submodular cover and submodular knapsack constraints.


\begin{table}[ht!]
    \centering
    \begin{tabular}{lcc}
        \toprule
        \textbf{Dataset} & $\log(n)$ & \textbf{Bound} \\
        \midrule
        compas & 3.78 & 5.84 \\
        titanic & 4.43 & 5.86 \\
        yeast & 3.30 & 9.19 \\
        malware & 3.43 & 4.86 \\
        gisette & 4.88 & 10.31 \\
        tae & 3.40 & 4.26 \\
        spambase & 5.37 & 10.70 \\
        mnist38 & 5.32 & 12.16 \\
        letter & 4.43 & 6.62 \\
        meta & 3.61 & 4.42 \\
        \bottomrule
    \end{tabular}
    
\caption{Additional experimental results on several datasets when $h$ is represented by a decision tree $T$. $\log(n)$ denotes the logarithm of the number of binary features $n$, and \textit{Bound} is the average value of the approximation bound from Theorem~\ref{theorem1}, computed over at most $m = 250$ instances.} \label{tab:bounds}
\end{table}

\paragraph{Improvement of the output from algorithm~\ref{alg:alg1}.} The output of algorithm~\ref{alg:alg1} is not necessarily a subset-minimal abductive explanation (sufficient or majoritary reason) for $\vec{x}$ given $h$, i.e. an explanation that is minimal with respect to set inclusion. To refine this result, we extract a less redundant explanation from the solution $S$ returned by algorithm~\ref{alg:alg1}, using a simple greedy procedure. Algorithm~\ref{alg:alg2}\footnote{Recall that algorithm~\ref{alg:alg2} runs in time $O(|S|\cdot|h|)$.} takes as input the set $S$ and iteratively removes elements that are not needed for the explanation to remain valid. Specifically, it discards any element $\ell$ such that $S \setminus \{\ell\}$ is still a valid explanation, i.e. $\mu_{h, \vec{x}}(S \setminus \{\ell\}) = 0$. The final output is a subset-minimal abductive explanation for $\vec{x}$ given $h$.



\begin{algorithm}[ht!]
\caption{Improving Solution Parsimony} \label{alg:alg2}
\begin{algorithmic}
\State \textbf{Input:} a classifier $h$, instance $\vec{x} \in \{0, 1\}^n$, a set $S$
\State \textbf{Output:} a subset-minimal abductive explanation
\State $I \gets S$ \Comment{$S$ is the output of Algorithm~\ref{alg:alg1}}
\For{$\ell \in I$}
\If{$\mu_{h, x}(S \setminus \{\ell\}) = 0$} \Comment{$S \setminus \{\ell\}$ is still a valid explanation}
\State $S \gets S \setminus \{\ell\}$
\EndIf
\EndFor
\State \Return $S$
\end{algorithmic}
\end{algorithm}

Algorithm~\ref{alg:alg2} iteratively traverses the elements of $S$, eliminating those that do not affect the validity of the explanation. The output of Algorithm~\ref{alg:alg2} is a subset-minimal explanation, i.e. an explanation that is minimal with respect to set inclusion.

\section{Experiments}
In this section, we evaluate the performance of our approach by comparing the solutions returned by our greedy algorithm with exact methods based on a \textsc{Partial MaxSAT} solver. Our goal is to measure the efficiency of our algorithm in approximating the optimal solution to Problem~\ref{Problem1}. We demonstrate that our method is a high-performing alternative, particularly for challenging instances where exact approaches become inefficient due to their high computational cost. Finally, we discuss the relevance of our approach in resource-constrained environments and explain why it represents a more suitable solution than existing exact methods.

\subsection{Experimental Protocol}
We conducted experiments on various instances of Problem~\ref{Problem1}. Since, when $F = \{T\}$, the concepts $\MINPI$, $\PMINPI$, $\PPI$, and $P^T_x$ coincide with $\MINMAJ$, $\PMINMAJ$, $\PMAJ$, and $P^F_x$, respectively, we focus on the case where $h$ is represented by a random forest $F \in \rf_{n, m}$. The experiments were performed using Python code executed on a machine equipped with an Intel(R) Core i9-9900 processor clocked at $3.1$ GHz and $64$ GiB of RAM.

% George Nemhause  (stray text fragment; commented out so that it is not typeset)
We studied a set of $B = 52$ datasets from well-known sources such as Kaggle (\url{www.kaggle.com}), OpenML (\url{www.openml.org}), and UCI (\url{archive.ics.uci.edu/ml/}). Categorical features were encoded as integers, while numerical features were binarized during the training of random forests. All datasets used are related to binary classification tasks.

\textbf{Methodology.} For each instance $\vec{x}$ in the test set of a dataset $b$, an explanation task is defined by the pair $(F_b, \vec{x})$, where $F_b$ denotes the random forest representing a classifier $h$, trained on the training set of $b$ using the \textit{Scikit-Learn} library \citep{scikit-learn}. The training of $F_b$ was performed with default hyperparameters, except for the \texttt{n\_estimators} parameter, which controls the number of trees in the forest. This parameter was adjusted to maintain good accuracy while avoiding an explosion in the size of the forest and of the encodings. A time limit of $60$ minutes per instance was defined.

To evaluate the performance of the algorithm~\ref{alg:alg1} in approximating a solution to Problem~\ref{Problem1}, we randomly select $m = \min(q, 250)$ instances $\vec{x}$ from the test set of $b$, where $q$ is the size of this set. And due to space constraints, we limit ourselves to comparing the average sizes of the approximate solutions to those of the explanations ($\PMINMAJ$).

\noindent \textbf{Comparison with exact methods.} To compare the performance of the algorithm~\ref{alg:alg1} with an exact solution, we used an approach based on a \textsc{Partial MaxSAT} solver described in \citep{Audemardetal22}. Concretely, given a random forest $F$ associated with a classifier $h$ and an instance $\vec{x}$, the \textit{hard} clauses ($C_{\text{hard}}$) of the encoding correspond to the $\cnf$ encoding of the forest, while the \textit{soft} clauses encode the literals of the instance $\vec{x}$. The optimal solution of this \textsc{Partial MaxSAT} instance corresponds to a $\MINMAJ$ reason for $\vec{x}$ given $h$ \citep{Audemardetal22} (i.e. the optimal solution of Problem~\ref{Problem1} with $V = [n]$ and $\alpha = \mu_{F^x}(\emptyset)$).

In the case of minimum-size majoritary reasons restricted to paths ($\PMINMAJ$), the above-mentioned encoding has been extended by adding the clause: $\bigvee \{x_i : (x_I)_i = 1\} \vee \bigvee \{x_i : (x_I)_i = 0\}~~~~\text{where}~~~~ I = V_x^{\text{path}}$ to the \textit{hard} clauses ($C_{\text{hard}}$). Thus, the optimal solution of the extended \textsc{Partial MaxSAT} problem corresponds to a $\PMINMAJ$ for $\vec{x}$ given $h$ (optimal solution of Problem~\ref{Problem1} with $V = V_x^{\text{path}}$, $g(S)=\mu_{F^x}(\emptyset)-\mu_{F^x}(S),~\alpha = \mu_{F^x}(\emptyset)$).

\noindent \textbf{Analysis of Hard Instances.} We focus here on instances for which the optimal resolution of Problem~\ref{Problem1} becomes difficult for the \textsc{Partial MaxSAT} solver. These situations arise when the size of the random forest $F$ is large, or when the input instance's dimension is large, leading to an explosion in the size of the Boolean circuit encoding ($\cnf$ formula). However, even in datasets with lower dimensionality, complex instances may arise. This difficulty can also stem from the random and complex structure of the considered instances, making convergence to an optimal solution more difficult for the \textsc{Partial MaxSAT} solver, thus posing computational challenges. Due to space constraints, our analysis is limited to the two datasets \textit{Placement} and \textit{Cars}.

\begin{table*}[ht!]
  \centering
  \begin{scriptsize}
  \adjustbox{max width=\textwidth}{
  \begin{tabular}{rrrrrrrrrrrrrrr}
    \toprule
    \multicolumn{3}{c}{dataset} && \multicolumn{3}{c}{random forest} && \multicolumn{4}{c}{$|\text{Path-Reason}|$} && \multicolumn{2}{c}{Times-Reason} \\
    \cmidrule{1-3}\cmidrule{5-7} \cmidrule{9-12} \cmidrule{14-15}
    name & \#F & \#I && \%A & $\#T$ & $|F|$ && $|P^F_x|$ & $S^*$ & $S_{\text{algo1}}$ & $S_{\text{improve}}$ && $S^*$ & $S_{\text{algo1}}$ \\
    \midrule
tic-tac-toe &9&958 && 100.0&53&10265 && 9.00 ($\pm$ 0.00) & 5.75 ($\pm$ 1.02) & 5.79 ($\pm$ 1.03) & 5.75 ($\pm$ 1.07) && 0.0640 & 0.0595 \\

monk &16&601 && 66.85&33&9047 && 12.26 ($\pm$ 0.96) & 8.31 ($\pm$ 1.61) & 8.89 ($\pm$ 2.16) & 8.42 ($\pm$ 2.09) && 0.0672 & 0.0686 \\

titanic &498&623 && 79.68&23&3631 && 61.98 ($\pm$ 15.08) & 27.59 ($\pm$ 9.13) & 30.17 ($\pm$ 10.82) & 28.16 ($\pm$ 10.82) && 0.8162 & 0.8206 \\

biomed &267&209 && 90.48&23&777 && 73.54 ($\pm$ 15.09) & 32.08 ($\pm$ 8.52) & 38.75 ($\pm$ 10.64) & 36.05 ($\pm$ 10.04) && 0.8134 & 0.4040 \\

vote &16&434 && 94.66&25&1321 && 15.56 ($\pm$ 0.57) & 5.87 ($\pm$ 1.23) & 5.97 ($\pm$ 1.30) & 5.90 ($\pm$ 1.23) && 0.0361 & 0.0135 \\

compas &63&6172 && 65.77&31&28303 && 21.75 ($\pm$ 6.31) & 11.14 ($\pm$ 4.09) & 12.15 ($\pm$ 4.86) & 11.43 ($\pm$ 4.28) && 0.9147 & 0.7103 \\

vehicle &272&846 && 98.43&17&839 && 53.90 ($\pm$ 8.63) & 21.46 ($\pm$ 6.08) & 24.94 ($\pm$ 6.97) & 24.94 ($\pm$ 6.97) && 0.1162 & 0.1037 \\

heart &400&303 && 85.71&33&2447 && 64.53 ($\pm$ 12.32) & 27.98 ($\pm$ 8.50) & 31.21 ($\pm$ 9.54) & 30.19 ($\pm$ 9.1) && 9.9494 & 0.4648 \\

hepatitis &172&142 && 86.05&35&931 && 51.16 ($\pm$ 5.96) & 18.70 ($\pm$ 7.12) & 21.30 ($\pm$ 6.69) & 21.30 ($\pm$ 6.69) && 0.5841 & 0.1043 \\

horse &394&299 && 84.44&29&1771 && 80.21 ($\pm$ 15.04) & 37.99 ($\pm$ 10.14) & 42.98 ($\pm$ 11.93) & 42.98 ($\pm$ 11.93) && 11.4389 & 0.7641 \\

student.por &142&649 && 91.79&23&1861 && 48.44 ($\pm$ 3.97) & 18.35 ($\pm$ 5.02) & 20.25 ($\pm$ 5.42) & 20.21 ($\pm$ 5.36) && 1.0457 & 0.1747 \\

haberman &154&306 && 69.57&31&3329 && 56.39 ($\pm$ 8.28) & 28.50 ($\pm$ 7.31) & 31.29 ($\pm$ 7.18) & 30.76 ($\pm$ 7.65) && 4.1101 & 0.5890 \\

employee &72&4653 && 84.67&19&23063 && 32.82 ($\pm$ 6.61) & 15.03 ($\pm$ 4.99) & 17.85 ($\pm$ 7.41) & 16.92 ($\pm$ 6.77) && 0.1571 & 0.1790 \\

\bottomrule
\end{tabular}}
\end{scriptsize}
\caption{Statistics on the approximation of \(\PMINMAJ\) reasons when $h$ is a random forest.} \label{tab:stats-RF}
\end{table*}

\subsection{Experimental Results}
Table~\ref{tab:stats-RF} presents a sample of our results for $13$ datasets. The columns include the dataset name, the number of binary features ($\#F$), the number of instances ($\#I$), the accuracy of the forest $F_b$ ($\%A$), the number of trees ($\#T$), and the size of the forest ($|F|$). The column $|\text{Path-Reason}|$ indicates the average size of the explanations computed for the $m$ selected instances. The columns $S^*$, $S_{\text{algo1}}$, and $S_{\text{improve}}$ respectively present the average sizes of the $\PMINMAJ$ reasons, the output of Algorithm~\ref{alg:alg1}, and its improvement obtained with Algorithm~\ref{alg:alg2}. Similarly, the column Times-Reason shows the average time required to derive $\PMINMAJ$ (sub-column $S^*$) and its approximation (sub-column $S_{\text{algo1}}$).

For the column $|\text{Path-Reason}|$, we observe that the average size of the approximate solutions to Problem~\ref{Problem1} (with $V = V_x^{\text{path}}$) generated by Algorithm~\ref{alg:alg1} is close to that of the optimal solutions ($\PMINMAJ$). For most of the studied datasets, the average error $\bigl||S_{\text{algo1}}| - |S^*|\bigr|$ is on the order of $2$, demonstrating the accuracy of the approximate solutions. This error slightly decreases with the improved solutions $S_{\text{improve}}$, reaching a precision close to $1.2$. This error can be as low as $10^{-3}$ for some datasets, such as \textit{monk}, \textit{vote}, \textit{tic-tac-toe}, \textit{compas}, and \textit{heart}. Moreover, the average size of the outputs of Algorithm~\ref{alg:alg1} is significantly smaller than that of $\MAJ$ and $\PMAJ$ (see the supplementary material in \url{https://github.com/Lounesbo}). The column Times-Reason shows the average time required to find $\PMINMAJ$ and its approximation. For most of the datasets in the table, the two computation times are comparable. However, this is not always the case, as shown by our study on \textit{Placement} and \textit{Cars}. The results in Table~\ref{tab:stats-RF} demonstrate that the algorithm can compute an efficient approximation solution.

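\begin{figure}[htbp]
  \centering
  \includegraphics[width=\linewidth]{placement_cars.png}
  \caption{Number of instances left unsolved by the \textsc{Partial MaxSAT} solver within each time limit (Placement in orange, Cars in blue).}
  \label{fig:placement}
\end{figure}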

Table~\ref{tab:placement} presents experimental results for $2$ datasets. The columns $\#I$, $\#F$, $\%A$, and $|F|$ respectively indicate the number of instances, the number of binary features, the accuracy of the forest, and its size. The column $\text{Path-Times}$ shows the average computation times for the $m = 65$ selected instances. The sub-columns $S^*$ and $S_{\text{alg1}}$ reflect the average computation times for the exact and approximate solutions. We note that the average computation time of the exact solvers often exceeds $10$ minutes (as already shown in the experimental part of the work by \citep{Audemardetal22}), while Algorithm~\ref{alg:alg1} generates a solution in less than $5$ seconds.

\begin{table}[ht!]
  \centering
  \begin{scriptsize}
  \adjustbox{max width=\textwidth}{
  \begin{tabular}{rrrrrrrr}
    \toprule
    \multicolumn{1}{c}{Dataset} & \multicolumn{1}{c}{\#I} & \multicolumn{1}{c}{\#F} & \multicolumn{1}{c}{\%A} & \multicolumn{1}{c}{$\#T$} & \multicolumn{1}{c}{$|F|$} & 
    \multicolumn{2}{c}{Path-Times} \\
    \cmidrule(lr){7-8}
    & & & & & & $S^*$ & $S_{\text{alg1}}$ \\
    \midrule
    \textbf{Placement} & 215 & 371 & 95.38 & 47 & 1947 & 239.74 & 0.61 \\
    \textbf{Cars} & 406 & 611 & 91.8 & 53 & 2685 & 199.55 & 1.35 \\
    \bottomrule
  \end{tabular}}
  \end{scriptsize}
  \caption{Experimental results on Placement and Cars}
  \label{tab:placement}
\end{table}

Figure~\ref{fig:placement} illustrates the number of instances for which the \textsc{Partial MaxSAT} solvers failed to find a $\PMINMAJ$ reason within the allotted time limits of $3$, $5$, $10$, $15$, $20$, $30$, and $50$ minutes (Placement in orange, Cars in blue). These results show that exact methods can take a significant amount of computation time, while the greedy algorithm requires negligible time. Although these results highlight the slowness of solvers in producing results, Figure~\ref{fig:placement} shows that some instances deemed \textit{difficult} remain unresolved even after $50$ minutes. For example, for the \textit{Placement} dataset, out of $65$ instances in the test set, $14$ remain unsolved after $3$ minutes, $12$ after $5$ minutes, $9$ after $10$ minutes, and $4$ after $20$ minutes, with $1$ remaining after $30$ minutes. Similar results are observed for the \textit{Cars} dataset. This phenomenon typically occurs when the forest consists of complex, large, and deep trees, and/or when the dataset is high-dimensional. These results highlight the utility of our greedy algorithms.

\section{Conclusion}
This work addressed the challenge of generating minimum-size abductive explanations for classification models, focusing on decision trees and random forests. By formulating the problem as a submodular optimization problem, we leveraged structural properties that enable high-quality approximate solutions. We showed that computing minimum-size abductive explanations for these classifiers is an \textsf{NP}-complete problem, even when restricted to the features of direct paths in decision trees or random forest trees, highlighting the complexity of providing concise explanations. To address this, we developed efficient greedy algorithms with theoretical approximation guarantees, producing near-optimal explanations in reasonable time. Our experiments demonstrated that the greedy algorithm is as effective as exact methods, and sometimes more computationally efficient, generating intelligible and relevant explanations, as shown in our case study on the \textit{Placement} benchmark. This makes our approach a viable alternative, particularly in resource-limited environments where modern solvers are costly. Our method, based on submodular optimization, is well-suited for hard-to-explain instances, offering a robust alternative to exact solvers. Future work could explore new formulations of the problem, develop more sophisticated algorithms, and extend our approach to other classification models, including neural networks, to broaden its applicability in diverse contexts.

\bibliography{references}


\end{document}
