%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% UAI 2025: Probabilistic Explanations for Black-Box Regression
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Packages
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\documentclass[accepted]{uai2025} 
% after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        %% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{amsthm}
\usepackage{xfrac}
\usepackage{amssymb}
\usepackage{bm}
\usepackage{bbm}
\usepackage{caption}
\usepackage{subcaption}
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage[ruled,noend,vlined]{algorithm2e} % Algorithms
\usepackage{tikz,tikz-3dplot} % nice language for creating drawings and diagrams

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Theorem Styles
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\newtheoremstyle{example}% name
  {9pt}%      Space above, empty = `usual value'
  {9pt}%      Space below
  {}%         Body font
  {}%         Indent amount (empty = no indent, \parindent = para indent)
  {\bfseries}% Thm head font
  {.}%        Punctuation after thm head
  { }%         Space after thm head: \newline = linebreak
  {}%         Thm head spec
\theoremstyle{example}
\newtheorem{example}{Example}
\newtheoremstyle{theorem}% name
  {9pt}%      Space above, empty = `usual value'
  {9pt}%      Space below
  {}%         Body font
  {}%         Indent amount (empty = no indent, \parindent = para indent)
  {\bfseries}% Thm head font
  {.}%        Punctuation after thm head
  { }% Space after thm head: \newline = linebreak
  {}%         Thm head spec
\theoremstyle{theorem}
\newtheorem{theorem}{Theorem}
\newtheorem{conjecture}{Conjecture}
\newtheorem{corollary}{Corollary}
\newtheorem{lemma}{Lemma}
\newtheorem{problem}{Problem}
\newtheorem{proposition}{Proposition}
\newtheorem{definition}{Definition}
\newtheorem{remark}{Remark}
\newtheorem{note}{Note}
\newtheorem{open}{Open question}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Commands
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% Functions
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\Argmax}{Argmax}
\DeclareMathOperator*{\Argmin}{Argmin}

% Sets
\newcommand{\B}{\mathbb{B}}
\newcommand{\N}{\mathbb{N}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\Rp}{\mathbb{R}_+}
\newcommand{\Rpp}{\mathbb{R}_{++}}
\renewcommand{\S}{\mathbb{S}}
\newcommand{\Q}{\mathbb{Q}}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\Zp}{\mathbb{Z}_+}
\newcommand{\Zpp}{\mathbb{Z}_{++}}
\renewcommand{\emptyset}{\varnothing}

% Vectors and matrices
\renewcommand{\vec}[1]{\bm{#1}}

% Operations
\newcommand{\Binom}[1]{\binom{[#1]}{2}}
\newcommand{\defeq}{\stackrel{\text{def}}{=}} 
\newcommand{\Var}[1]{\mathit{Var}(#1)}
\newcommand{\seq}[1]{\langle #1 \rangle}
\newcommand{\inner}[2]{\langle #1, #2 \rangle}
\newcommand{\norm}[1]{\left\|#1\right\|}
\newcommand{\sgn}{\mathrm{sgn}}
\newcommand{\size}[1]{|#1|}
\newcommand{\support}[1]{\mathrm{supp}(#1)}
\newcommand{\rank}[1]{\mathrm{rank}(#1)}

% Classes & Oracles
\newcommand{\NP}{$\textsf{NP}$}
\newcommand{\PP}{$\textsf{PP}$}
\newcommand{\NPPP}{$\textsf{NP}^\textsf{PP}$}
\newcommand{\EX}{$\textsc{ex}$}

% Explanations
\newcommand{\Precision}[1]{\mathsf{P}_{f, \vec{x}, \mathcal{D}}(#1)}
\newcommand{\Fidelity}[1]{\mathsf{F}_{f, \vec{x}, \mathcal{D}}(#1)}
\newcommand{\EmpFidelity}[1]{\widehat{\mathsf{F}}_{f, \vec{x}, m}(#1)}

% Formatting
\newcommand{\longpage}{\enlargethispage{\baselineskip}}
\newcommand{\shortpage}{\enlargethispage{-\baselineskip}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Graphical commands 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\tdplotsetmaincoords{60}{120}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Title
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\title{Probabilistic Explanations for Regression Models}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<frederic.koriche@cril.fr>?Subject=Your UAI 2025 paper}{Frederic Koriche}{}}
\author[1]{Jean-Marie Lagniez}
\author[1]{Chi Tran}
% Add affiliations after the authors
\affil[1]{%
    Univ.~Artois, CNRS\\ 
    Centre de Recherche en Informatique de Lens (CRIL)\\ 
    France
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Abstract
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  
\begin{document}
\maketitle

\begin{abstract}
    Formal explainability is an emerging field that aims to provide mathematically guaranteed explanations for the predictions made by machine learning models. 
    Recent work in this area focuses on computing ``probabilistic explanations'' for the predictions made by classifiers based on specific data instances. 
    The goal of this paper is to extend the concept of probabilistic explanations to the regression setting, treating the target regressor as a black box function. 
    The class of probabilistic explanations consists of linear functions that meet a sparsity constraint, alongside a hyperplane constraint defined for the data instance being explained. 
    While minimizing the precision error of such explanations is generally \NPPP-hard, we demonstrate that it can be approximated by substituting the precision measure with a fidelity measure. 
    Optimal explanations based on this fidelity objective can be effectively approached using Mixed Integer Programming (MIP). 
    Moreover, we show that for certain distributions used to define the precision measure, explanations with approximation guarantees can be computed in polynomial time using a variant of Iterative Hard Thresholding (IHT).
    Experiments conducted on various datasets indicate that both the MIP and IHT approaches outperform the state-of-the-art LIME and MAPLE explainers.
\end{abstract}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Introduction
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Introduction}
\label{sec.intro}

As machine learning models increasingly impact critical decisions in areas such as criminal justice, medical diagnosis, and social scoring, the significance of ethics, fairness, and safety in these models has become more apparent than ever. 
In response to this need, Explainable Artificial Intelligence (XAI) has developed a range of explanation techniques that help users understand these models without requiring in-depth knowledge of their inner workings \citep{Miller.AI.2022,Molnar.Book.2022}. 
Recently, the field of \emph{formal explainability} has emerged as a promising subdiscipline, concentrating on providing explanations with mathematical guarantees concerning quality, size, and semantics 
\citep{Ignatiev.IJCAI.2020, MarquesSilva.AAAI.2022}. 
The aim of formal explainability is to establish theoretical foundations for explaining predictions made by machine learning models, so as to calibrate trust and confidence in their capabilities.

A well-studied problem in formal explainability is to identify a rule that explains \emph{why} a given data instance \(\vec{x}\) is classified as \(f(\vec{x})\) by a classifier \(f\). 
This rule can be described as a subset \(S\) of features, such that any change in the values of features outside \(S\) does not affect the outcome \(f(\vec{x})\). 
Since the restriction of \(\vec{x}\) to \(S\), denoted by \(\vec{x}_S\), contains enough information to determine \(f(\vec{x})\), 
the feature subset \(S\) is often referred to as a (weak) \emph{abductive explanation} \citep{Cooper.AI.2023}, 
also called \emph{sufficient reason} \citep{Darwiche.ECAI.2020}. 
However, despite the appealing soundness of abductive explanations, their size often exceeds the cognitive limits of human users. 
As suggested by \cite{Miller.PR.1956}, our ability to reason about multiple features is typically limited to seven, plus or minus two elements. 
This limitation has been reinforced by numerous cognitive science experiments \citep[see, e.g.,][]{Saaty.MCM.2003}, and empirical research in XAI indicates that explanations should be concise \citep{Lage.HCOMP.2019}.

Therefore, achieving a balance between precision and conciseness is crucial when generating explanations for predictive models. 
The concept of \emph{probabilistic explanations} \citep{Waeldchen.JAIR.2021,Izza.JAR.2023} embodies this balance. 
In this context, the \emph{precision error} of a feature set $S$ is the 
probability that $f$ separates a random instance $\vec z$ from $\vec x$, when the restrictions of 
$\vec z$ and $\vec x$ to $S$ are indistinguishable. The precision error is evaluated according to a predefined distribution, 
such as the uniform distribution over all data instances, or some neighborhood distribution centered at $\vec x$. 
Based on this measure, the computation of probabilistic explanations can be framed as a constrained stochastic optimization problem. 
For example, if we aim to find an explanation with the lowest precision error under a user-supplied size limit $k$, the task is to
\begin{equation}
\label{pb:classification}
    \tag{P1}    
    \begin{aligned}
        \text{minimize} \quad & \mathbb P_{\vec z}[f(\vec z) \neq f(\vec x) \mid \vec z_S = \vec x_S] \\
        \text{subject to} \quad & |S| \leq k 
    \end{aligned}
\end{equation}

To the best of our knowledge, probabilistic explanations have mostly been studied within the context of classification. 
However, considering the variety of available regression models, a logical question arises: \emph{how can probabilistic explanations be extended to the regression setting?}

This paper addresses the above question without making assumptions about the structure of the regression model $ f $. 
For instance, $ f $ could be represented by a tree ensemble, a support vector machine, or a deep neural network. 
In our algorithms, $ f $ is treated as a black-box function.
% with only query access to the output value of supplied data instances.

When explaining the prediction \(f(\vec{x})\) made by \(f\) for a data instance \(\vec{x}\), 
feature subsets alone are often insufficient to describe the relationship between input features and continuous output values. 
Therefore, this study considers explanations in the form of \emph{linear models} \(\vec w\) that satisfy the hyperplane condition \(\vec{w} \cdot \vec{x} = f(\vec{x}) \). 
Such a constraint ensures that any explanation \(\vec w\) is consistent with \(f\) at \(\vec{x}\).
The conciseness or \emph{sparsity} of \(\vec w\) is measured by the number of its nonzero coefficients, denoted as \(\|\vec{w}\|_0\).

To quantify how ``sufficient'' a sparse linear model is in determining a regression model with adequate precision, 
we replace the conditional zero-one loss function in (\ref{pb:classification}) with a conditional absolute loss function.
Thus, the precision error of an explanation $\vec w$ for the value $f(\vec x)$ of some data instance $\vec x$
is defined by the conditional expected loss of $\size{f(\vec{z}) - f(\vec{x})} $ given that $\vec{w} \cdot \vec{z} = \vec{w} \cdot \vec{x}$. 
Again, the precision is evaluated according to some predefined distribution over the instance space. 
With these notions in hand, the problem examined in this paper is to
\begin{equation}
    \label{pb:precision}
    \tag{P2}
    \begin{aligned}
        \text{minimize} \quad & \mathbb E_{\vec z}[\size{f(\vec z) - f(\vec x)} \mid \vec{w} \cdot \vec{z} = \vec{w} \cdot \vec{x}] \\
         \text{subject to} \quad & \vec{w} \cdot \vec{x} = f(\vec{x})  \mbox{ and } \norm{\vec w}_0 \leq k   
    \end{aligned}
\end{equation}
In Section \ref{sec:complexity}, we show that when $f$ is represented by a neural network, (\ref{pb:precision}) is hard for \NPPP, 
a complexity class that is beyond the capabilities of modern solvers. However, this hardness result does not preclude 
the existence of algorithms that offer \emph{additive} approximation guarantees on the conciseness and the precision of optimal explanations.

In Section \ref{sec:pp}, we show that the precision error of feasible solutions in (\ref{pb:precision}) is upper-bounded by their \emph{fidelity error}, 
a measure often used in model-agnostic explainability \citep{Li.ICLR.2021}. By replacing 
the objective in (\ref{pb:precision}) with the empirical fidelity error, the corresponding problem becomes: 
\begin{equation}
    \label{pb:fidelity}
    \tag{P3}
    \begin{aligned}
        \text{minimize} \quad & \frac{1}{m} \sum_{i=1}^m (\vec w\!\cdot\!\vec z_i  - f(\vec z_i))^2  \\
         \text{subject to} \quad & \vec{w} \cdot \vec{x} = f(\vec{x})  \mbox{ and } \norm{\vec w}_0 \leq k   
    \end{aligned}
\end{equation}
This formulation, which involves a non-conditional expected loss function as the objective, is a variant of the well-studied \emph{sparse regression} problem \citep{Natarajan.SJC.1995}.
While this problem remains \NP-hard, it can be approached using \emph{Mixed Integer Programming} (MIP) 
with a polynomial number of queries to $ f $. 
The corresponding explanations are $ k $-sparse, and with high probability, 
their precision error is at most $ \sqrt{\gamma^*} + o(1) $, where $ \gamma^* $ is the optimal value of (\ref{pb:fidelity}).

In Section \ref{sec:np}, we present a variant of the \emph{Iterative Hard Thresholding} (IHT) algorithm \citep{Blumensath.ACHA.2009,Garg.ICML.2009}
that computes approximate solutions to (\ref{pb:fidelity}) in polynomial time. 
For the uniform distribution, these explanations are $ k $-sparse, and with high probability, their precision error is at most $ 7\sqrt{\gamma^*} + o(1) $.

From an empirical standpoint, we compare in Section \ref{sec:experiments} the MIP and IHT approaches with the popular LIME \citep{Ribeiro.KDD.2016} and MAPLE \citep{Plumb.NeurIPS.2018} explainers. 
Through experiments on various datasets, we demonstrate that both MIP and IHT approaches outperform these state-of-the-art explainers in terms of fidelity while using a reasonable amount of time for the MIP solver.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Related Work
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Related Work}
\label{sec:related_work}

Probabilistic explanations have gained increasing attention in the field of formal explainability due to their flexibility. 
As outlined in (\ref{pb:classification}), we can set a sparsity level \( k \) and request a feature subset \( S \) of size at most \( k \) that minimizes precision error \citep{Koriche.ECML.2024}. 
Alternatively, we can fix a precision level \( \epsilon \) and ask for a smallest feature subset \( S \) with an error of at most \( 1 - \epsilon \) \citep{Izza.JAR.2023}. 
However, this flexibility comes at a cost: \cite{Waeldchen.JAIR.2021} demonstrated that deciding whether there exists a \( k \)-sparse \( \epsilon \)-precise explanation 
$S$ for the prediction \( f(\vec x) \) made by a neural network \( f \) on a data instance \( \vec x \) is an \NPPP-hard problem. 
Additionally, they showed that minimizing the size of an \( \epsilon \)-precise explanation is \NP-hard to approximate within a factor of \( d^{1 - \delta} \) for any $\delta > 0$, 
where  \( d \) is the dimension of \( \vec x \). 

For these reasons, the tractability and approximability of probabilistic explanations have been explored for simpler classifiers, 
such as decision trees \citep{Arenas.NeurIPS.2022, Bounia.UAI.2023} and linear threshold functions \citep{Subercaseaux.AAAI.2025}. 
In cases where \( f \) is a black-box classifier, \cite{Blanc.NeurIPS.2021} demonstrated that if the instance \( \vec{x} \) being explained is drawn from a uniform distribution, 
then with high probability, an \( \epsilon \)-precise explanation \( S \) of size \( k' \) can be derived from the path \( T(\vec{x}) \) of a depth-\( k' \) decision tree \( T \) 
with fidelity error \( \mathbb{P}_{\vec{z}}[T(\vec{z}) \neq f(\vec{z})] \leq \epsilon \). When \( k' \) is polynomial in the ``average certificate complexity'' of \( f \), 
the decision tree \( T \) can be implicitly learned in polynomial time. While our approach for the regression setting shares some similarities with their findings, 
we do not assume that \( \vec{x} \) is selected uniformly at random.

In a broader context, various model-agnostic methods have been proposed to extrapolate linear explanations from the neighborhood of data instances 
\citep{Ribeiro.KDD.2016, Plumb.NeurIPS.2018, Agarwal.ICML.2021, Zhao.UAI.2021}. 
A common goal is to minimize the unconstrained objective
\begin{align*}
    %\label{eq:model_agnostic_objetive}
    \frac{1}{m} \sum_{i=1}^m \phi_{\vec x}(\vec z_i) (\vec w\!\cdot\!\vec z_i  - f(\vec z_i))^2 + \psi(\vec w)
\end{align*}

Here, \(\{(\vec z_i,f(\vec z_i))\}_{i=1}^m\) is a set of labeled samples generated from some neighborhood distribution around \(\vec x\),
\(\phi_{\vec x}(\vec z_i)\) assesses the importance of \(\vec z_i\), and \(\psi(\vec w)\) penalizes the complexity of \(\vec w\). 
For example, in the LIME method \citep{Ribeiro.KDD.2016}, \(\phi_{\vec x}(\vec z_i)\) is a normalized distance between \(\vec z_i\) and \(\vec x\), 
while in the MAPLE method \citep{Plumb.NeurIPS.2018}, \(\phi_{\vec x}(\vec z_i)\) measures the average number of times \(\vec z_i\) ends up in the same leaf as \(\vec x\) in a random forest trained from \(f\).
Despite their popularity, these heuristic methods do not always provide theoretical guarantees regarding the consistency, fidelity, or sparsity of extrapolated explanations. 
This contrasts with our MIP and IHT approaches, which aim to solve (\ref{pb:fidelity}), incorporating consistency and sparsity as constraints, and defining fidelity as the objective.

\paragraph{Notation.}
Plain letters represent functions and scalars, while boldface letters represent vectors and matrices. 
The all-ones vector is denoted as \( \vec{1} \) and the all-zeros vector as \( \vec{0} \). 
For a positive integer \( d \), we use \( [d] \) to denote the set \( \{1, \ldots, d\} \).
Additionally, we use \( \vec{1}_S \) to denote the indicator vector in $\{0,1\}^d$ of a subset \( S \subseteq [d] \), and we use \( \mathbbm{1}[E] \) to denote the indicator function in $\{0,1\}$ of an event \( E \subseteq \{0,1\}^{d} \). 
The support set of a vector \( \vec{w} \in \mathbb{R}^d \), denoted as \( \text{support}(\vec{w}) \), is the set of coordinates \( j \in [d] \) for which \( w_j \neq 0 \).
The scalar product of two vectors \( \vec{v} \) and \( \vec{w} \) is denoted as \( \vec{v} \cdot \vec{w} \), and the coordinate-wise (or Hadamard) product is denoted as \( \vec{v} \odot \vec{w} \). 
For a scalar \( p \in [0, \infty] \), the \( L_p \) norm of \( \vec{w} \) is denoted as \( \| \vec{w} \|_p \). 
The limit cases are \( \| \vec{w} \|_0 = \size{(\text{support}(\vec{w}))} \) and \( \| \vec{w} \|_{\infty} = \max_{j=1}^{d} |w_j| \).
For a scalar \( r \geq 0 \), the \( L_p \) ball of radius \( r \) is defined as 
\[
\mathcal{B}_p(r) = \{ \vec{w} \in \mathbb{R}^d : \| \vec{w} \|_p \leq r \}
\]
For a vector \( \vec{u} \in \mathbb{R}^d \) and a scalar \( r \in \mathbb{R} \), the hyperplane at \( (\vec{u}, r) \) is defined as 
\[
\mathcal{H}(\vec{u}, r) = \{ \vec{w} \in \mathbb{R}^d : \vec{u} \cdot \vec{w} = r \}
\]
Finally, the Euclidean projection of a vector \( \vec{w} \in \mathbb{R}^d \) onto a set \( \mathcal{U} \subseteq \mathbb{R}^d \) is given by 
\[
\Pi_{\mathcal{U}}(\vec{w}) = \arg\min_{\vec{u} \in \mathcal{U}} \| \vec{w} - \vec{u} \|_2.
\]

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Problem Formulation
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Problem Formulation}
\label{sec:formulation}

In this study, we consider explanation tasks where data instances are defined over a set of \emph{interpretable literals}.
For instance, consider a bank customer wanting to understand why her loan application received a score of \(-\tfrac{1}{4}\), which is below the acceptance threshold. 
Interpretable literals such as \([\text{Income} \geq 70 \text{K\$}]\), \([\text{Debt-To-Income (DTI) ratio} \leq 35\%]\), and \([\text{Proof of Address} = \text{Yes}]\) could be utilized. 
A clear and concise explanation could be provided using an if-then rule with weighted features, such as
\[
\tfrac{1}{2} [\text{Income} \geq 70 \text{K\$}] - \tfrac{3}{4} [\text{DTI ratio} > 35\%] \rightarrow \text{Score} = -\tfrac{1}{4}
\]
More formally, let \([d]\) denote the set of interpretable literals. 
By treating these literals as binary features, the regression models explored in this study 
are pseudo-Boolean functions of the form \(f: \{\pm 1\}^d \rightarrow [-1,+1]\).\footnote{Our theoretical results can easily be extended to co-domains $[-c,+c]$, provided that $c$ is constant.} 
Here, any input to \(f\) is a data instance \(\vec{x} \in \{\pm 1\}^d\), where \(x_j\) indicates whether the \(j\)th literal occurs positively or negatively in \(\vec{x}\).


\begin{figure}[t]
    \centering
    \begin{tikzpicture}[scale=0.6,>=stealth]
        \draw[gray,densely dotted] (-4,-4) grid (4,4);
        \node[below] at (-4,0) {\smaller $-1$};
        \node[below] at (-2,0) {\smaller $-\tfrac{1}{2}$};
        \node[below] at (2,0) {\smaller $\tfrac{1}{2}$};
        \node[below] at (4,0) {\smaller $1$};
        \node[left] at (0,-4) {\smaller $-1$};
        \node[left] at (0,-2) {\smaller $-\tfrac{1}{2}$};
        \node[left] at (0,2) {\smaller $\tfrac{1}{2}$};
        \node[left] at (0,4) {\smaller $1$};
        \node[below left] at (0,0) {\smaller $0$};
        \draw[red,<->] (0,-4.5) -- (0,4.5);
        \draw[red,<->] (-4.5,0) -- (4.5,0);
        \draw[fill = green,thin,opacity=0.1] (-4,0) -- (0,4) -- (4,0) -- (0,-4) -- cycle;
        \draw[blue,thick,opacity=0.5,<->] (-2.25,4.25) -- (4.25,-2.25);
        \draw[green,opacity=0.8]  (2,0) -- (0,2);    
        \fill[red,opacity=0.8]  (2,0) circle (3pt);    
        \fill[red,opacity=0.8]  (0,2) circle (3pt);    
        \fill[black,opacity=1.0]  (4,4) circle (3pt);
        \coordinate[label={[black]below left:{\smaller $\vec x$}}] (X) at (4,4);
        \coordinate[label={[red]above:{\smaller $(w_1,0)$}}] (W1) at (4,0);
        \coordinate[label={[red]right:{\smaller $(0,w_2)$}}] (W2) at (0,4);
        \coordinate[label={[blue]left:{\smaller $\mathcal H(\vec x,f(\vec x))$}}] (H) at (4,-2);
    \end{tikzpicture}
    \caption{A geometric illustration of $1$-sparse linear explanations $\vec w$, where $\vec x = (1,1)$ and $f(\vec x) = \frac{1}{2}$. 
    The hyperplane $\mathcal H(\vec x, f(\vec x))$ is shown in blue, while the $L_0$ ball $\mathcal B_0(1)$ is depicted in red. 
    The intersection of these two elements is represented by two red points. 
    The convex hull of this intersection forms the green segment, 
    with the lozenge highlighting the $L_1$ ball $\mathcal B_1(1)$.}  
    \label{fig:explanations}
\end{figure}

A \emph{linear explanation} for \(f(\vec{x})\) is a vector $\vec w \in \R^{d}$ that satisfies the equation \(\vec{w} \cdot \vec{x} = f(\vec{x}) \).
As illustrated in the previous example, such an explanation can be interpreted as an if-then rule over weighted literals: the head corresponds to \(f(\vec{x}) \), 
and the body consists of pairs \((j, w_j)\) for which \(x_j w_j \neq 0\). An explanation $\vec w$ is \emph{$k$-sparse} if $\norm{\vec w}_0 \leq k$.
As illustrated in Figure~\ref{fig:explanations}, the set of $k$-sparse explanations for $f(\vec x)$ is formed by the intersection of two objects: the hyperplane $\mathcal H(\vec x, f(\vec x))$ and the \(L_0\) ball $\mathcal B_0(k)$.
While the former is convex, the latter is not. 

The quality of probabilistic explanations is assessed in relation to a probability distribution \(\mathcal{D}\) over \(\{\pm 1\}^d\). 
For instance, \(\mathcal{D}\) could represent the uniform distribution \(\mathcal{U}\) across \(\{\pm 1\}^d\) or, more restrictively, 
a neighborhood distribution surrounding the instance that is being explained.
The \emph{precision error} of a vector \(\vec{w} \in \mathbb{R}^d\) with respect to a model \(f\), a data instance \(\vec{x}\), and a distribution \(\mathcal{D}\) is defined as follows:
\begin{align}
\label{eq:precision}
\Precision{\vec w} = \mathbb{E}_{\vec{z} \sim \mathcal{D}} \left[ |f(\vec{z}) - f(\vec{x})| \mid \vec{w} \cdot \vec{z} = \vec{w} \cdot \vec{x} \right]
\end{align}
In other words, the precision error of $\vec w$ measures the discrepancy between \(f(\vec{z})\) and \(f(\vec{x})\) for random instances \(\vec{z}\) 
that are aligned in the same direction as \(\vec{x}\) in relation to \(\vec{w}\).

With these concepts in mind, the decision version of (\ref{pb:precision}) is referred to as the \textsc{sparse linear explanation} (\textsc{sle}) problem, and formulated as follows:
\begin{description}
    \item [Instance:] A regression model $f: \{\pm 1\}^d \rightarrow [-1,+1]$, a data instance $\vec x \in \{\pm 1\}^d$, a probability distribution $\mathcal D$ over $\{\pm 1\}^d$, a sparsity level $k \geq 1$, 
    and a precision parameter $\epsilon > 0$. 
    \item [Question:] Does there exist a linear explanation $\vec w \in \R^d$ for $f(\vec x)$ such that $\norm{\vec w}_0 \leq k$ and $\mathsf{P}_{f, \vec{x}, \mathcal{D}}(\vec{w}) \leq \epsilon$?
\end{description}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Problem Complexity
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Problem Complexity}
\label{sec:complexity}

To establish the computational hardness of our problem, we need a representation of \( f \) that allows us to evaluate its description length. 
To this end, we consider the class \( \mathcal{N} \) of feedforward neural networks, 
which have weights and biases in \([-1,+1]\) and activation functions in \(\{\sigma_{\textsc{Lin}}, \sigma_{\textsc{ReLU}} \}\), 
where \(\sigma_{\textsc{Lin}}(u) = u\) and \(\sigma_{\textsc{ReLU}}(u) = \max\{0, u\}\). 
The description length of \( f \) is determined by the number of gates in its representation.
Additionally, we assume that the distribution \(\mathcal{D}\) has a closed-form expression, 
allowing us to evaluate the probability \(\mathcal{D}(\vec{z})\) of any \(\vec{z}\) in polynomial time relative to the input dimension.

\begin{theorem}
    \label{thm:computational_complexity}
    For the representation class $\mathcal N$, the \textsc{sparse linear explanation} problem is \NPPP-hard.
\end{theorem}
   
\begin{proof}
    Consider the decision version of (\ref{pb:classification}), referred to as \textsc{sparse subset explanation} 
    (\textsc{sse}).  
    An instance of this problem is a tuple $I_{\textsc{sse}} = (f,\vec x,k,\epsilon)$ such that $f: \{\pm 1\}^d \rightarrow \{\pm 1\}$
    is a Boolean function represented by a Boolean circuit, $\vec x \in \{\pm 1\}^d$ is a data instance, and $k$ and $\epsilon$ are parameters in $\N$ and $(0,1]$, respectively.
    The goal is to decide whether there exists a subset $S \subseteq [d]$ of size at most $k$, such that  $\mathsf{Q}_{f, \vec{x}}(S) \leq \epsilon$, where 
    \[
    \mathsf{Q}_{f, \vec{x}}(S) = \mathbb P_{\vec z \sim \mathcal U}\left[f(\vec z) \neq f(\vec x) \mid \vec z \odot \vec 1_S = \vec x \odot \vec 1_S \right]
    \]
    Using $\delta = 1 - \epsilon$, any such subset $S$ is called a \emph{$\delta$-relevant subset} in \citep{Waeldchen.JAIR.2021,Izza.JAR.2023}.
    
    From an instance $I_{\textsc{sse}} = (f,\vec x,k,\epsilon)$, we build an instance $I_{\textsc{sle}} = (f',\vec x',\mathcal D, k',\epsilon')$
    of our problem, defined as follows. Let $k' = k + 1$, let $\epsilon' = \epsilon$ and let $\vec x' = (\vec x,1)$.
    In addition, let $\mathcal D$ be the distribution over $\{\pm 1\}^{d+1}$ defined as $\mathcal D(\vec z, 1) = \mathcal U(\vec z)$ and      
    $\mathcal D(\vec z, -1) = 0$ for any $\vec z \in \{\pm 1\}^d$.
    Finally, let $f': \{\pm 1\}^{d+1} \rightarrow [-1,+1]$ be the function: 
    \[
    f'(\vec z,-1) = f'(\vec z,1) = \tfrac{1}{2} \size{f(\vec x) - f(\vec z)}, \mbox{ for all } \vec z \in \{\pm 1\}^d
    \]
    As shown in \citep{Waeldchen.JAIR.2021}, any Boolean circuit can be efficiently transformed into an equivalent neural network with integer weights and biases in \(\{-1, 0, +1\}\), 
    and activation functions in \(\{\sigma_{\textsc{Lin}}, \sigma_{\textsc{ReLU}}\}\). 
    Consequently, a representation in \(\mathcal{N}\) for \(f'\) can be constructed in polynomial time from the neural representation of \(f\), by simply adding the following units to its output:
    \[
    \tfrac{1}{2}\sigma_{\textsc{Lin}}(\sigma_{\textsc{ReLU}}(f(\vec{x}) - f(\vec{z})), \sigma_{\textsc{ReLU}}(f(\vec{z}) - f(\vec{x})))
    \]    
    For a subset $S$, let $\vec w = (\vec 1_S \odot \vec x, -\size{S})$ denote the corresponding linear function. 
    Since $(\vec 1_S \odot \vec x)\cdot \vec x = \size{S}$, we know that $\vec w$ is a $k'$-sparse explanation for $f'(\vec x')$ whenever $\size{S} \leq k$.
    Furthermore, for any $\vec z \in \{\pm 1\}^d$, we have $\vec w \cdot (\vec z,1) = \vec w \cdot (\vec x,1)$ if and only if  
    $(\vec 1_S \odot \vec x) \cdot \vec z = \size{S}$, which is equivalent to $\vec 1_S \odot \vec x = \vec 1_S \odot \vec z$.
    This, together with the fact that $\size{f'(\vec z,1) - f'(\vec x,1)} = \tfrac{1}{2} \size{f(\vec z) - f(\vec x)} = \mathbbm 1[f(\vec z) \neq f(\vec x)]$
    implies that  $\mathsf{P}_{f', \vec{x}', \mathcal{D}}(\vec{w}) = \mathsf{Q}_{f, \vec{x}}(S)$.    
    Therefore, $S$ is a solution to $I_{\textsc{sse}}$ if and only if $\vec w$ is a solution to $I_{\textsc{sle}}$.
    Since \textsc{sse} is \NPPP-hard \citep[Theorem 2.4]{Waeldchen.JAIR.2021}, it follows that \textsc{sle} is \NPPP-hard.
\end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Dealing with PP-Hardness
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Dealing with PP-Hardness}
\label{sec:pp}

Theorem~\ref{thm:computational_complexity} reveals that the problem of finding $k$-sparse linear explanations with a precision error of at most $\epsilon$ involves two independent sources of complexity. 
The first source, related to the \NP-hardness of the problem, arises from the challenge of exploring all candidate support sets \( S \subseteq [d] \) of size at most \( k \) and determining 
whether there exists an \(\epsilon\)-precise linear explanation \( \vec w \) with support \( S \). The second source of complexity comes from the inherent difficulty in checking whether 
the precision error of \( \vec{w} \) is indeed at most \( \epsilon \), which is itself a \PP-hard problem. 

In this section, we focus on the second source of complexity. 
The idea is to replace the precision error with the \emph{fidelity error}, which serves as a surrogate function:
\begin{align}
    \label{eq:fidelity}
    \Fidelity{\vec w} = \mathbb{E}_{\vec{z} \sim \mathcal{D}} \left[ (\vec w\! \cdot\! \vec z  - f(\vec{z}))^2 \right]
\end{align}

\begin{lemma}
    \label{lem:fidelity}
    Let $f: \{\pm 1\}^d \rightarrow [-1,+1]$ be a regression model, let $\vec x \in \{\pm 1\}^d$ be a data instance, and let $\mathcal D$ be a probability distribution over $\{\pm 1\}^d$.
    Then, the precision error of any linear explanation $\vec w$ for $f(\vec x)$ satisfies
    \begin{align*}
        \Precision{\vec{w}} \leq \sqrt{\Fidelity{\vec w}} 
    \end{align*}    
\end{lemma}
\begin{proof}
    By sublinearity of the absolute loss function,
\begin{align*}
    \Precision{\vec{w}} 
    & \leq \mathbb{E}_{\vec{z} \sim \mathcal{D}} \left[ \size{f(\vec{z}) - \vec w\!\cdot\!\vec z} \mid \vec{w} \cdot \vec{z} = \vec{w} \cdot \vec{x}\right] \\ 
    & + \mathbb{E}_{\vec{z} \sim \mathcal{D}} \left[ \size{\vec w\!\cdot\!\vec z - \vec w\!\cdot\!\vec x} \mid \vec{w} \cdot \vec{z} = \vec{w} \cdot \vec{x}\right] \\
    & + \mathbb{E}_{\vec{z} \sim \mathcal{D}} \left[ \size{\vec w\!\cdot\!\vec x - f(\vec x)} \mid \vec{w} \cdot \vec{z} = \vec{w} \cdot \vec{x}\right] 
\end{align*}
Note that the second term in the above inequality vanishes. Since $\vec{w}$ satisfies the hyperplane condition $\vec w \cdot \vec{x} = f(\vec{x})$, the third term also disappears.
Using the fact that the expectation in the first term is independent of the condition $\vec{w} \cdot \vec{z} = \vec{w} \cdot \vec{x}$, it follows from Jensen's Inequality that 
\begin{align*}
    \Precision{\vec{w}} 
    \leq \mathbb{E}_{\vec{z} \sim \mathcal{D}} \left[ \size{f(\vec{z}) - \vec w\!\cdot\!\vec z} \right] 
    \leq \sqrt{\Fidelity{\vec w}} 
\end{align*}
\end{proof}
Importantly, (\ref{eq:fidelity}) involves an \emph{unconditional} expectation, which is approximable via sampling. 
In doing so, let $\{(\vec z_i,f(\vec z_i))\}_{i=1}^m$ be a sample set where 
each $\vec z_i$ is drawn independently at random according to $\mathcal D$, and its value $f(\vec z_i)$ is obtained through query access to $f$. 
The corresponding \emph{empirical fidelity error} is given by
\begin{align}
    \label{eq:emp_fidelity}
    \EmpFidelity{\vec w} = \frac{1}{m} \sum_{i=1}^m (\vec w\!\cdot\!\vec z_i  - f(\vec z_i))^2
\end{align} 
Based on this objective function, (\ref{pb:fidelity}) is a variant of the well-studied problem known as \emph{($L_0$) sparse regression}, also referred to as \emph{best subset selection}, 
which dates back at least to \citep{Beale.Biom.1967,Hocking.Tech.1967}. 
While the sparse regression problem is non-convex and \NP-hard \citep{Natarajan.SJC.1995}, the inspiring work by \citet{Bertsimas.AOS.2016} 
has explored various Mixed Integer Programming (MIP) formulations. 
Using modern branch-and-cut solvers, the authors have empirically shown that provably optimal solutions for high-dimensional instances can often be found in a few hours. 
The next formulation is a variation of their parameter-free approach utilizing \emph{Specially Ordered Sets} (SOS) \citep{Bertsimas.Book.2005}:
\begin{equation}
    \label{pb:mip}
    \tag{MIP}
    \begin{aligned}
        \text{minimize}     \quad & \frac{1}{m} \sum_{i=1}^m (\vec w\!\cdot\!\vec z_i  - f(\vec z_i))^2  \\
        \text{subject to}   \quad & \vec{w} \cdot \vec{x} = f(\vec{x})  \\
                            \quad & \vec 1 \cdot \vec u \leq k \\
                            \quad & \norm{(w_j,1 - u_j)}_0 \leq 1, \mbox{ for all } j \in [d] \\
                            \quad & u_j \in \{0,1\} \mbox{ for all } j \in [d] \\
                            \quad & w_j \in [-1,+1] \mbox{ for all } j \in [d]
    \end{aligned}
\end{equation}

The last constraint is used to ensure that the set of $k$-sparse explanations is bounded.
The following result shows that if the solver for (\ref{pb:mip}) is supplied a number of samples \( m \) that is quadratic in \( k \) and logarithmic in \( d \), then with high probability, 
the precision error of any returned solution is upper-bounded by the root of its empirical fidelity.

\begin{theorem}
    \label{thm:sample_complexity}
    Let $f: \{\pm 1\}^d \rightarrow [-1,+1]$ be a regression model, $\vec x \in \{\pm 1\}^d$ be a data instance, $\mathcal D$ be a probability distribution over $\{\pm 1\}^d$, 
    and $k \geq 1$ be a sparsity level. Then, for any $k$-sparse explanation $\vec w \in [-1,+1]^d$ for $f(\vec x)$, any $\delta \in (0,1]$, and any $\varepsilon \in (0,1]$, if 
    \begin{align*}
    m \geq \frac{1}{\varepsilon^4} \left(32 \ln(2d) + 8 \ln(\tfrac{2}{\delta})\right) (k + 1)^2
    \end{align*}
    then with probability at least $1 - \delta$ over the choice of an i.i.d. sample set of size $m$,
    \begin{align*}
        \Precision{\vec{w}} \leq \sqrt{\EmpFidelity{\vec w}} + \varepsilon
    \end{align*}    
\end{theorem}
\begin{proof}
    Let \( \mathcal{W} \) be the hypothesis class consisting of all vectors \( \vec w \in [-1, +1]^{d} \) such that \( \vec{w} \cdot \vec{x} = f(\vec{x})  \) and \( \|\vec{w}\|_0 \leq k \). 
    Using here the fact that \( \|\vec{w}\|_1 \leq \|\vec{w}\|_0 \), we know that $\mathcal{W}$ is included in $\mathcal B_1(k)$.
    Additionally, let $\ell_f$ denote the loss function defined as \( \ell_f(\vec w, \vec{z}) = |\vec{w} \cdot \vec{z}  - f(\vec{z})| \). 
    By construction, \( \ell_f(\vec w, \vec{z}) \) is 1-Lipschitz and upper-bounded by \( k + 1 \) for all \( \vec{z} \in \{\pm 1\}^d \).
    Therefore, by application of Theorem 26.15 in \citep{ShalevShwartz.Book.2014} (see also Corollary 4 in \citep{Kakade.NeruIPS.2008}), we have
    \begin{align*}
        \Fidelity{\vec w} 
        &\leq \EmpFidelity{\vec w} \\
        &\quad + 2(k+1)\sqrt{\frac{8\ln(2d) + 2\ln(\tfrac{2}{\delta})}{m}}  
    \end{align*}
    By substituting the lower bound on $m$ defined as above, and applying Lemma~\ref{lem:fidelity}, the result follows.
\end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Dealing with NP-hardness
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Dealing with NP-hardness}
\label{sec:np}

In light of Theorem~\ref{thm:sample_complexity}, we would like to find an \emph{optimal} solution to (\ref{pb:mip}), 
striving for the best possible empirical fidelity. However, since the sparse regression problem is \NP-hard,  
we need to make some additional assumptions for achieving polynomial time efficiency. 
In this section, we focus on the \emph{Restricted Isometry Property} (RIP) 
\citep{Candes.TIT.2009}, a condition that is often recommended to overcome this computational challenge.

A matrix \(\vec Z \in \mathbb{R}^{m \times d}\) is said to satisfy the RIP of order \(k\) with constant \(\beta_k \in (0,1)\) if, 
for all vectors \(\vec w \in \mathcal{B}_0(k)\), the following inequality holds:
\[
(1 - \beta_k) \|\vec w\|_2^2 \leq \tfrac{1}{m} \|\vec Z\vec w\|_2^2 \leq (1 + \beta_k) \|\vec w\|_2^2
\]
This condition is equivalent to requiring that the Gram matrix of $\vec Z$, restricted to the columns in $\support{\vec w}$, is positive definite 
with its eigenvalues confined to the interval \([1 - \beta_k, 1 + \beta_k]\).
Let \(\mathcal{D}\) be a probability distribution over \(\mathbb{R}^d\) such that for any \(\vec w \in \mathbb{R}^d\) and any \(\varepsilon \in (0,1)\), the following concentration inequality holds:
\begin{align}
    \label{eq:concentration}
    \mathbb{P}_{\vec Z \sim \mathcal{D}^m} \left[ \left| \tfrac{1}{m} \|\vec Z\vec w\|_2^2 - \|\vec w\|_2^2 \right| > \varepsilon \right] \leq 2 e^{-\Omega(m)}
\end{align}
As shown in \citep{Baraniuk.CA.2008}, if $\mathcal D$ satisfies such a concentration inequality, then 
the RIP of order $k \leq \tfrac{d}{2}$ with constant $\beta_k$ holds with probability at least $1 - 2e^{-\Omega(m)}$ for matrices $\vec Z$ drawn over $\mathcal D^m$, 
whenever $m = \Omega\bigl(\tfrac{k}{\beta_k^2} \ln \tfrac{d}{\beta_k}\bigr)$.   

%Of particular interest is the uniform distribution $\mathcal D$ over $\{\pm 1\}^d$, which adheres to the above concentration inequality \citep{Achlioptas.PODS.2001}.  
In the context of this study, we are interested in discrete distributions over $\{\pm 1\}^d$ satisfying (\ref{eq:concentration}).
Under this assumption, our algorithm for computing $k$-sparse explanations of high fidelity is a variant of the \emph{Iterative Hard Thresholding} (IHT) method \citep{Blumensath.ACHA.2009,Garg.ICML.2009,Jain.NeurIPS.2014}. 
Instead of projecting onto the ball $\mathcal B_0(k)$, it projects onto the intersection of this ball and 
the hyperplane $\mathcal H(\vec x,f(\vec x))$, ensuring that the solution serves as an explanation for $f(\vec x)$.  

As detailed in Algorithm~\ref{alg:iht}, our version of IHT takes the following inputs: 
a data instance \(\vec x \in \{\pm 1\}^d\) and its predicted value \(f(\vec x) \in [-1, +1]\), along with a sparsity level \(k \geq 1\). 
Additionally, the algorithm requires a sample set \(\{(\vec z_i, f(\vec z_i))\}_{i=1}^m\), which is compactly represented as a pair \((\vec Z, \vec y)\), where 
\(\vec Z \in \{\pm 1\}^{m \times d}\) is the matrix of samples \(\vec z_i\), and \(\vec y \in [-1, +1]^m\) is the vector of the corresponding labels \(f(\vec z_i)\).
The algorithm performs gradient descent (with a step size of 1), followed by a projection onto the set of \(k\)-sparse explanations. 
The following result ensures that each iteration of the algorithm operates in low polynomial time.

\begin{algorithm}[t]
	\SetArgSty{textrm}
	\DontPrintSemicolon
	\caption{Iterative Hard Thresholding (IHT)}
	\label{alg:iht}
 	\SetKw{Input}{Input:}
    \BlankLine
    \Input{query $(\vec x, f(\vec x))$, sparsity level $k$, data $(\vec Z,\vec y)$}\;  
    \BlankLine
    $\vec w_0 = \vec 0$\; 
    \For{$t = 1,2,\ldots$}
    {
        $\vec v_t  = \vec w_{t-1} - \tfrac{1}{m} \vec Z^{\intercal}(\vec Z\vec w_{t-1} - \vec y)$\;
        $\vec w_t   = \Pi_{\mathcal H(\vec x,f(\vec x)) \cap \mathcal B_0(k)}(\vec v_t)$\;
    }
    \BlankLine
\end{algorithm}

\begin{lemma}
    \label{lem:projection}
    For a model $f$, an instance $\vec x$, a sparsity level $k$, and a vector $\vec w$, 
    the projection of $\vec w$ onto $\mathcal H(\vec x, f(\vec x)) \cap \mathcal B_0(k)$ 
    can be computed in $\mathcal O(d\log_2(d) + k^2)$ time.
\end{lemma}
\begin{proof}
    As outlined in Algorithm~\ref{alg:projection}, the idea is to split \(\vec w\) into two components: 
    one that depends on the hyperplane constraint and another that does not. 
    Specifically, let \(\vec w = \vec w_H + \vec w_B\), where \(\vec w_H\) is the projection of \(\vec w\) onto the support set \(S\) of \(\vec x\), 
    and \(\vec w_B\) is the projection of \(\vec w\) onto the complement of \(S\). 
    Since all indices in \(\vec w_B\) are free variables in the equation \(\vec w \cdot \vec x = y\), we can directly project \(\vec w_B\) onto \(\mathcal B_0(k)\).
    The solution \(\vec w^*_B\) can be obtained in \(\mathcal O(d\log_2(d))\) time using the \emph{Hard Thresholding} (\textsc{ht}) operator, 
    which sets all but the \(k\) largest (in magnitude) elements of \(\vec w_B\) to zero. 
    
    Now, let \(\vec u_H = \vec w_H \odot \vec x\) and let \(y = f(\vec x)\).
    In addition, let $\mathcal W$ and $\mathcal U$ denote the intersections of the ball $\mathcal B_0(k)$ with the hyperplanes 
    $\mathcal H(\vec x, y)$ and $\mathcal H(\vec 1_{S}, y)$, respectively.    
    Since \(\vec u_H \cdot \vec 1_{S} = y\) if and only if \(\vec w_H \cdot \vec x = y\), it follows that 
    $\vec u' \in \mathcal U$ if and only if $\vec w' = (\vec u' \odot \vec x) \in \mathcal W$.   
    This, together with the fact that $\norm{\vec u' - \vec u_H}_2 = \norm{\vec w' - \vec w_H}_2$, implies that 
    $\Pi_{\mathcal W}(\vec w_H) = \left(\Pi_{\mathcal U}(\vec u_H)\right) \odot \vec x$.
    Let \(\vec v_H\) be the projection of \(\vec u_H\) onto \(\mathcal U\). 
    By setting  \(\vec w^*_H = (\vec v_H \odot \vec x)\), the projection of \(\vec w\) onto \(\mathcal W\) 
    is therefore $\vec w^* = \vec w^*_H + \vec w^*_B$.  
    
    Finally, since \(\mathcal H(\vec 1_{S}, y)\) is a diagonal hyperplane, 
    the runtime complexity for deriving $\vec w^*_H$ follows from the fact that \(\vec v_H\) can be obtained in \(\mathcal O(d\log_2(d) + k^2)\) 
    time using the \emph{Greedy Selector and Hyperplane Projector} (\textsc{gshp}) operation 
    \citep{Kyrillidis.ICML.2013}.
\end{proof}

\begin{algorithm}[t]
	\SetArgSty{textrm}
	\DontPrintSemicolon
	\caption{Projection onto $k$-Sparse Explanations}
	\label{alg:projection}
 	\SetKw{Input}{Input:}
    \BlankLine
    \Input{query $(\vec x, y)$, sparsity level $k$, vector $\vec w$}\;  
    \BlankLine
    $\vec w_H = \vec w \odot \vec 1_{\mid \support{\vec x}}$ and $\vec w_B = \vec w \odot \vec 1_{\mid [d] \setminus \support{\vec x}}$\;
    $\vec w^*_B = \textsc{ht}(\vec w_B,k)$\;
    $\vec w^*_H  = \textsc{gshp}(\vec w_H \odot \vec x, k, y) \odot \vec x$\;
    return $\vec w^*_H + \vec w^*_B$\;
    \BlankLine
\end{algorithm}

With this lemma in hand, the main result of this section can be formally stated in the following theorem.

\begin{theorem}
    \label{thm:iht}
    Let $f: \{\pm 1\}^d \rightarrow [-1,+1]$ be a regression model, $\vec x \in \{\pm 1\}^d$ be a data instance, $k \in [1, \tfrac{d}{6}]$ be a sparsity level, 
    and $\mathcal D$ be a probability distribution over $\{\pm 1\}^d$ satisfying the concentration inequality (\ref{eq:concentration}). 
    Suppose that the IHT algorithm is run on a sample set $\{(\vec z_i,f(\vec z_i))\}_{i=1}^m$ drawn from $\mathcal D$ and labeled by $f$ such that 
    $m = \Omega\bigl(\tfrac{k}{\alpha^2} \ln \tfrac{d}{\alpha}\bigr)$ with $\alpha < 1/(32 \sqrt 3)$. Then, for any $k$-sparse explanation $\vec w$ for $f(\vec x)$,
    after 
    \[
    t \geq \log_2 \left\lceil \frac{\norm{\vec w}_2}{\EmpFidelity{\vec w}} \right\rceil
    \]
    iterations, the returned vector $\vec w_t$ is a $k$-sparse explanation for $f(\vec x)$ satisfying, with probability at least $1 - 2e^{-\Omega(m)}$, 
    \begin{align*}
    \sqrt{\EmpFidelity{\vec w_t}} \leq 7 \sqrt{\EmpFidelity{\vec w}} 
    \end{align*}  
\end{theorem}
\begin{proof}
    Let \( \vec{Z} \in \{\pm 1\}^{m \times d} \) be the matrix of samples \( (\vec{z}_1, \ldots, \vec{z}_m) \) and let \( \vec{y} \in [-1,+1]^m \) 
    be the vector of corresponding values \( (f(\vec{z}_1), \ldots, f(\vec{z}_m)) \). 
    By applying Lemma 5.1 from \citep{Baraniuk.CA.2008} and using the bound on \( m \), we know that with a probability of at least $1 - 2e^{-\Omega(m)}$, 
    the matrix \( \vec{Z} \) satisfies the RIP of order \( 3k \) with a constant \( \beta_{3k} < \frac{1}{32} \).
    By integrating this result with Theorem 5 from \citep{Blumensath.ACHA.2009}, we can conclude that at iteration \( t \), defined as above, 
    the solution $\vec w_t$ computed by IHT satisfies, with probability at least $1 - 2e^{-\Omega(m)}$,
    \begin{align*}
    \|\vec{w}_t - \vec{w}\|_2 \leq 6 \|\vec{e}\|_2, \text{ where } \vec{e} = \tfrac{1}{m} \vec{Z}\vec{w} - \vec{y}
    \end{align*}
    Leveraging this result, and applying the triangle inequality along with the fact that \( \tfrac{1}{m} \|\vec{Z}\vec{u}\|_2 \leq \|\vec{u}\|_2 \), 
    we obtain
    \begin{align*}
    \sqrt{\EmpFidelity{\vec w_t}} 
    &= \tfrac{1}{m} \|\vec{Z}\vec{w}_t + \vec{Z}\vec{w} - \vec{Z}\vec{w} - \vec{y}\|_2 \\
    &\leq \tfrac{1}{m} \|\vec{Z}(\vec{w}_t - \vec{w})\|_2 + \tfrac{1}{m} \|\vec{Z}\vec{w} - \vec{y}\|_2 \\
    &\leq \|\vec{w}_t - \vec{w}\|_2 + \|\vec{e}\|_2 \\
    &\leq 7\sqrt{\EmpFidelity{\vec w}}
    \end{align*}
\end{proof}

As shown in \citep{Achlioptas.PODS.2001}, the uniform distribution $\mathcal{U}$ over $\{\pm 1\}^d$ satisfies the concentration inequality (\ref{eq:concentration}). 
Thus, by combining Theorems \ref{thm:sample_complexity} and \ref{thm:iht}, we know that using a polynomial number of samples drawn uniformly at random, the IHT algorithm is guaranteed to find, with high probability, 
a $k$-sparse explanation $\vec{w}_t$ that achieves 
\[
\Precision{\vec w_t} \leq 7 \sqrt{\EmpFidelity{\vec w^*}} + o(1),
\] 
where $\vec{w}^*$ is the optimal solution to (\ref{pb:mip}). Additionally, by integrating Lemma~\ref{lem:projection} and the fact that $\norm{\vec w^*}_{\infty} \leq 1$, 
we can conclude that the solution $\vec{w}_t$ can be computed in polynomial time with respect to $d$, $k$, and $\log_2 \lceil 1/ \EmpFidelity{\vec w^*} \rceil$.

At first glance, this result may seem surprising because, as indicated in Theorem \ref{thm:computational_complexity}, finding $k$-sparse linear explanations with minimal precision is \NPPP-hard. 
However, it is important to keep in mind that the fidelity measure does not always provide a tight upper bound on the precision measure. 
In fact, since $\EmpFidelity{\vec w^*}$ here assesses the capability of $\vec w^*$ to fit the regression model $f$ over the uniform distribution, it can be quite large.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Experiments
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Experiments}
\label{sec:experiments}

In order to validate the effectiveness of our methods, we have considered various explanation tasks for regression models.
The code was written using the \texttt{Python} language. 
Our experiments have been conducted on a Quad-core Intel XEON X5550 with 32GB of memory. 

\subsection{Experimental Setup}

We conducted experiments using $18$ tabular datasets, sourced from the standard repository, \texttt{OpenML}.\footnote{Some statistics on the datasets used in our experiments can be found in Table~\ref{tab:statistics} of Appendix~\ref{sec:appendix}.} 
All datasets focus on regression tasks and include both numerical and categorical attributes. 
To convert these raw attributes into interpretable binary features, we applied a standard \( K \)-bins discretization method, creating 4 bins for each attribute. 
For our experimental purposes, the 18 datasets were divided into two groups: 12 medium-dimensional benchmarks with an average of 415 binary features, and 6 low-dimensional benchmarks with an average of 20 binary features.

For each benchmark, an explanation task is defined by a tuple \( (f, \vec{x}, \sigma, k) \), where \( f \) is a black-box regressor implemented using a neural network. 
In our experiments, we utilized the \texttt{Scikit-Learn} implementation of the multi-layer perceptron regressor with default parameters. 
As usual, we trained \( f \) on the training set of the benchmark and evaluated its accuracy on the test set. 
Each data instance \( \vec{x} \) that we aimed to explain was randomly selected from the test set using a uniform distribution. 
Since the performance of state-of-the-art model-agnostic explainers is evaluated according to neighborhood distributions around \( \vec{x} \), 
we employed the following parameterized distribution:
\[
\mathcal{D}_{\vec{x}, \sigma}(\vec{z}) = \frac{1}{Z_{\sigma}} e^{-\sigma \|\vec{x} - \vec{z}\|_1} \quad \text{where} \quad Z_{\sigma} = \sum_{j = 0}^{d} \binom{d}{j} e^{-2\sigma j}
\]
Here, \( \sigma \geq 0 \) serves as a spread parameter. Note that \( \mathcal{D}_{\vec{x}, 0} \) corresponds to the uniform distribution. 
We also considered \( k \in \{1, \ldots, 10\} \) to explore different levels of sparsity.

The performance of explainers for each explanation task was measured using the root mean squared error \((\EmpFidelity{\vec{w}})^{\frac{1}{2}} \) of the generated explanation \( \vec{w} \). 
This metric was calculated using \( m = 1000 \) labeled samples \( (\vec{z}_i, f(\vec{z}_i)) \), where each \( \vec{z}_i \) was generated according to the distribution \( \mathcal{D}_{\vec{x}, \sigma} \). 
For low-dimensional benchmarks, we also calculated the precision error \( \Precision{\vec{w}} \) of the generated explanation \( \vec{w} \) by enumerating all data instances \( \vec{z} \in \{\pm 1\}^d \). 
Both metrics were averaged over \( 20 \) random instances \( \vec{x} \).

To implement the MIP approach specified in the formulation (\ref{pb:mip}), we used the \texttt{Gurobi}
solver (version 11.0), running on a single thread with a timeout of \( 60 \) seconds. 
Our MIP and IHT approaches were compared with three methods. The first, referred to as CVX, is the convex relaxation of (\ref{pb:mip}), 
obtained by replacing the constraint \( \norm{\vec{w}}_0 \leq k \) with \( \norm{\vec{w}}_1 \leq k \). 
The last two methods are the state-of-the-art LIME \citep{Ribeiro.KDD.2016} and MAPLE \citep{Plumb.NeurIPS.2018}, both implemented with default parameters.

\setlength{\tabcolsep}{6pt}
\begin{table*}[t]
    \begin{center}
    \begin{adjustbox}{width=\linewidth,center}  
    \begin{tabular}{lr r rrrrr}
        \toprule
        \multicolumn{2}{c}{Benchmark} & & \multicolumn{5}{c}{$\sqrt{\EmpFidelity{\vec w}}$} \\
        \cmidrule{1-2} \cmidrule{4-8} 
        Name & ID &&        \textsc{CVX} &      \textsc{IHT} & \textsc{LIME} & \textsc{MAPLE} & \textsc{MIP} \\
        \midrule
        \small{Airfoil Self Noise} & \small{44957} && $0.040\, (\pm 0.01)$ & \color{blue} $0.055\, (\pm 0.02)$ & \color{blue} $0.321\, (\pm 0.04)$ & \color{blue} $0.218\, (\pm 0.02)$ & \color{blue} $\mathbf{0.049\, (\pm 0.01)}$ \\		
        \small{Auto MPG} & \small{42372} && $0.031\, (\pm 0.00)$ & \color{blue} $0.069\, (\pm 0.02)$ & \color{blue} $0.338\, (\pm 0.07)$ & \color{blue} $0.122\, (\pm 0.05)$ & \color{blue} $\mathbf{0.039\, (\pm 0.01)}$\\ 
        \small{Bike Sharing} & \small{44142} && $0.040\, (\pm 0.00)$ & \color{blue} $0.080\, (\pm 0.01)$ & \color{blue} $0.183\, (\pm 0.05)$ & \color{blue} $0.121\, (\pm 0.03)$ & \color{blue} $\mathbf{0.048\, (\pm 0.01)}$\\ 
        \small{Liver Disorders} & \small{8} && $0.059\, (\pm 0.02)$ & \color{blue} $0.091\, (\pm 0.02)$ & \color{blue} $0.209\, (\pm 0.04)$ & \color{blue} $0.147\, (\pm 0.08)$ & \color{blue} $\mathbf{0.068\, (\pm 0.02)}$ \\        
        \small{Machine CPU} & \small{230} && $0.039\, (\pm 0.01)$ & \color{blue} $0.128\, (\pm 0.02)$ & \color{blue} $0.312\, (\pm 0.08)$ & \color{blue} $0.190\, (\pm 0.06)$ & \color{blue} $\mathbf{0.055\, (\pm 0.01)}$\\        
        \small{Medical Charges} & \small{44146} && $0.040\, (\pm 0.00)$ & \color{blue} $\mathbf{0.049\, (\pm 0.01)}$ & \color{blue} $0.408\, (\pm 0.01)$ & \color{blue} $0.204\, (\pm 0.01)$ & \color{blue} $\mathbf{0.049\, (\pm 0.00)}$\\
        \midrule
        \small{Ailerons} & \small{44137} &&  $0.050\, (\pm 0.01)$ & \color{blue}$0.201\, (\pm 0.02)$ & \color{blue}$0.647\, (\pm 0.05)$ & $0.113\, (\pm 0.02)$ & \color{blue}$\mathbf{0.085\, (\pm 0.02)}$ \\
        \small{Auto Imports} & \small{9} && $0.067\, (\pm 0.01)$ & \color{blue} $0.232\, (\pm 0.03)$ & \color{blue} $0.528\, (\pm 0.06)$ & $0.148\, (\pm 0.04)$ & \color{blue} $\mathbf{0.107\, (\pm 0.01)}$\\
        \small{DNA Methylation} & \small{46139} && $0.121\, (\pm 0.02)$ & \color{blue} $\mathbf{0.192\, (\pm 0.04)}$ & \color{blue} $0.582\, (\pm 0.08)$ & $0.168\, (\pm 0.04)$ & \color{blue} $\mathbf{0.191\, (\pm 0.01)}$\\
        \small{Geographical OM} & \small{44965} && $0.148\, (\pm 0.02)$ & \color{blue} $0.259\, (\pm 0.04)$ & \color{blue} $0.662\, (\pm 0.07)$ & $0.174\, (\pm 0.01)$ & \color{blue} $\mathbf{0.202\, (\pm 0.02)}$\\
        \small{Moneyball} & \small{41021} && $0.039\, (\pm 0.00)$ & \color{blue} $0.192\, (\pm 0.02)$ & \color{blue} $0.483\, (\pm 0.05)$ & $0.120\, (\pm 0.03)$ & \color{blue} $\mathbf{0.071\, (\pm 0.01)}$\\ 
        \small{NCI 60 Thioguanine} & \small{46132} && $0.062\, (\pm 0.01)$ & \color{blue} $0.235\, (\pm 0.07)$ & \color{blue} $0.534\, (\pm 0.10)$ & $0.108\, (\pm 0.02)$ & \color{blue} $\mathbf{0.132\, (\pm 0.06)}$ \\ 
        \small{Online News} & \small{42724} && $0.010\, (\pm 0.00)$ & \color{blue} $0.051\, (\pm 0.00)$ & \color{blue} $0.069\, (\pm 0.01)$ & $0.046\, (\pm 0.01)$ & \color{blue} $\mathbf{0.028\, (\pm 0.01)}$ \\ 
        \small{Pollution} & \small{542} && $0.045\, (\pm 0.01)$ & \color{blue} $0.171\, (\pm 0.05)$ & \color{blue} $0.478\, (\pm 0.06)$ & $0.133\, (\pm 0.06)$ & \color{blue} $\mathbf{0.082\, (\pm 0.02)}$ \\
        \small{RTE Consumption} & \small{46337} && $0.033\, (\pm 0.00)$ & \color{blue} $0.102\, (\pm 0.01)$ & \color{blue} $0.273\, (\pm 0.10)$ & $0.114\, (\pm 0.04)$ & \color{blue} $\mathbf{0.057\, (\pm 0.01)}$\\
        \small{Student Performance} & \small{42352} && $0.074\, (\pm 0.01)$ & \color{blue} $0.143\, (\pm 0.02)$ & \color{blue} $0.454\, (\pm 0.03)$ & $0.169\, (\pm 0.04)$ & \color{blue} $\mathbf{0.105\, (\pm 0.01)}$ \\
        \small{Wave Energy} & \small{44975} && $0.017\, (\pm 0.00)$ & \color{blue} $\mathbf{0.080\, (\pm 0.03)}$ & \color{blue} $0.301\, (\pm 0.03)$ & $0.128\, (\pm 0.02)$ & \color{blue} $0.091\, (\pm 0.01)$ \\
        \small{Wisconsin} & \small{191} && $0.075\, (\pm 0.01)$ & \color{blue} $0.135\, (\pm 0.02)$ & \color{blue} $0.275\, (\pm 0.08)$ & $0.201\, (\pm 0.05)$ & \color{blue} $\mathbf{0.111\, (\pm 0.02)}$\\
        \bottomrule
    \end{tabular}
    \end{adjustbox}
    \end{center}
  \caption{Experimental results on 6 low-dimensional benchmarks (upper rows) and 12 medium-dimensional benchmarks (lower rows), using \(\sigma = 1\), and \(k = 7\). 
  Entries highlighted in blue indicate that all generated explanations were \(k\)-sparse.}
  \label{tab:results}
\end{table*}

\subsection{Experimental Results}

An overview of our experimental results on 18 benchmarks, specifically for \( \sigma = 1 \) and \( k = 7 \), is presented in Table~\ref{tab:results}. 
The first six rows report results for the low-dimensional benchmarks, while the last twelve rows cover the medium-dimensional benchmarks. 
The first two columns of the table include the name of each dataset along with its corresponding \texttt{OpenML} identifier. 
The last five columns display the average root mean squared errors for the explanations generated by different competitors. 
Entries highlighted in blue indicate that the sparsity of all inferred explanations is at most \( k \), 
while entries highlighted in black indicate that the sparsity of the explanations significantly exceeds \( k \).

From these results, we can confidently conclude that both the IHT and MIP approaches outperform LIME across all benchmarks. 
As CVX operates within the convex hull of \( k \)-sparse explanations (with \( \norm{\vec{w}}_\infty \leq 1 \)), 
it serves as a lower bound for the fidelity of solutions to (\ref{pb:mip}). 
However, because CVX tends to produce dense solutions, it cannot be effectively considered as an explainer. 
Additionally, we can observe that for all medium-dimensional benchmarks, the explanations generated by MAPLE are dense. 
Furthermore, on low-dimensional benchmarks, MAPLE consistently performs worse than both IHT and MIP.

Table~\ref{tab:precisions} presents the average precision errors of IHT, LIME, and MIP across six low-dimensional benchmarks. 
By comparing these results with the average root mean square errors shown in Table~\ref{tab:results}, 
we can see that empirical fidelity serves as a good indicator of an explainer's performance regarding precision errors. 
Notably, both IHT and MIP outperform LIME in terms of precision.

\begin{table}[t]
    \begin{center}
    \begin{adjustbox}{width=\linewidth,center}  
    \begin{tabular}{l r rrr}
        \toprule
        Benchmark & \multicolumn{3}{c}{$\Precision{\vec{w}}$} \\
        \midrule
        Name &  \textsc{IHT} & \textsc{LIME} & \textsc{MIP} \\
        \midrule
        \footnotesize{Airfoil S. N.} & $0.045\, (\pm 0.03)$ & $0.092\, (\pm 0.07)$ & $0.042\, (\pm 0.01)$\\  
        \footnotesize{Auto MPG} &  $0.028\, (\pm 0.02)$ & $0.063\, (\pm 0.02)$ & $0.019\, (\pm 0.01)$\\  
        \footnotesize{Bike Sharing} &  $0.067\, (\pm 0.03)$ & $0.101\, (\pm 0.08)$ & $0.041\, (\pm 0.01)$\\ 
        \footnotesize{Liver Disorders} & $0.024\, (\pm 0.03)$ & $0.071\, (\pm 0.05)$ & $0.010\, (\pm 0.02)$\\  
        \footnotesize{Machine CPU} &  $0.088\, (\pm 0.02)$ & $0.124\, (\pm 0.09)$ & $0.035\, (\pm 0.01)$\\  
        \footnotesize{Medical Charges} & $0.012\, (\pm 0.01)$ & $0.228\, (\pm 0.04)$ & $0.012\, (\pm 0.01)$\\  
        \bottomrule
    \end{tabular}
    \end{adjustbox}
    \end{center}
  \caption{Average precision errors of IHT, LIME, and MIP across the 6 low-dimensional benchmarks.}
  \label{tab:precisions}
\end{table}

\begin{figure*}[t!]
    \centering
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-DNAMethylation.pdf}
        \phantomsubcaption
        \label{fig:sparsity:a}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-StudentPerformance.pdf}
        \phantomsubcaption
        \label{fig:sparsity:b}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-WaveEnergy.pdf}
        \phantomsubcaption
        \label{fig:sparsity:c}
    \end{subfigure}                   
    \caption{Comparison of root mean squared errors (y-axis) with increasing sparsity level (x-axis).}
    \label{fig:sparsity}
\end{figure*}

\begin{figure*}[t!]
    \centering
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-DNAMethylation.pdf}
        \phantomsubcaption
        \label{fig:spread:a}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-StudentPerformance.pdf}
        \phantomsubcaption
        \label{fig:spread:b}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-WaveEnergy.pdf}
        \phantomsubcaption
        \label{fig:spread:c}
    \end{subfigure}                   
    \caption{Comparison of root mean squared errors (y-axis) with increasing spread (x-axis).}
    \label{fig:spread}
\end{figure*}

In Figure~\ref{fig:sparsity}, we present the performance of IHT, LIME, and MIP across varying levels of sparsity \( k \), ranging from 1 to 10, on three benchmarks: 
\emph{DNA Methylation} (which affects cancer drug response), \emph{Student Performance}, and \emph{Wave Energy}. 
The bar plots reveal that the performance of both IHT and MIP remains stable or even improves as \( k \) increases, while LIME exhibits significantly less stability.

Additionally, in Figure~\ref{fig:spread}, we report the performance of the three explainers as the spread \( \sigma \) increases from 0.1 to 1.0, using the same benchmarks. 
In contrast to LIME, both IHT and MIP show robustness to variations in the distribution.

The runtimes of the explainers are outlined in Appendix~\ref{sec:appendix}. 
In summary, the CVX and LIME methods are the fastest, each taking only a few milliseconds per benchmark. 
The IHT and MAPLE methods have comparable speeds, generally requiring a few seconds per benchmark. 
For the MIP approach, the \texttt{Gurobi} solver can find an optimal solution within a few seconds for low-dimensional benchmarks. 
However, it consistently reaches the one-minute timeout for medium-dimensional benchmarks. 
In these cases, we have found that \texttt{Gurobi} can identify near-optimal solutions in just a few seconds, but verifying their optimality through lower bounds may take several minutes.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Discussion
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Conclusions}
\label{sec:conclusions}

In this paper, we have demonstrated that deriving sparse and precise explanations for regression models is \NPPP-hard. 
To tackle this computational challenge, we established that the precision of these explanations is upper-bounded by their fidelity. 
We can address this surrogate objective using Mixed Integer Programming, and under certain assumptions about the underlying distribution, 
we can achieve polynomial time efficiency through Iterative Hard Thresholding. 
Our comparative experiments on real-world regression tasks support these theoretical findings.

Though this study focused on minimizing the precision \(\Precision{\vec w}\) of linear explanations while maintaining a desired level of sparsity \(\norm{\vec w}_0 \leq k\), 
a promising direction for future research is to explore the reverse problem: minimizing the sparsity of linear explanations \(\norm{\vec w}_0\) while ensuring 
that the desired precision \(\Precision{\vec w} \leq \epsilon\) is maintained. 
This latter problem is also challenging, as verifying such a probabilistic constraint is \PP-hard.

\paragraph{Acknowledgements.}
Many thanks to the reviewers for their comments and suggestions. 
This work has benefited from the support of the AI Chair EXPEKCTATION (ANR-19-CHIA-0005-01) 
of the French National Research Agency. It was also partially supported by TAILOR, 
a project funded by EU Horizon 2020 research and innovation programme under GA No 952215.  

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% References
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\bibliography{uai2025-paper350-cr}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Appendix
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\newpage
\onecolumn
\appendix 

\title{Probabilistic Explanations for Regression Models\\(Supplementary Material)}
\maketitle

\section{Additional Theoretical Background}

Our main results in Sections~\ref{sec:pp} and \ref{sec:np} are based on Theorem 26.15 in \citep{ShalevShwartz.Book.2014} and Theorem 5 in \citep{Blumensath.ACHA.2009}, which are presented below.

Given a space of data instances $\mathcal X \subseteq \R^d$, a space of labels $\mathcal Y \subseteq \R$, and a hypothesis class of linear functions $\mathcal H \subseteq \R^d$,
let $\ell: \mathcal H \times \mathcal X \times \mathcal Y \rightarrow \R$ be a loss function of the form 
\begin{align}
\label{eq.LipschitzLoss}
\ell(\vec w, \vec x, y) = \phi(\vec w \cdot \vec x, y),
\end{align}   
where $\phi: \R \times \mathcal Y \rightarrow \R$ is $\rho$-Lipschitz in its first argument. In other words, for every $y \in \mathcal Y$, the scalar function $a \mapsto \phi(a, y)$ is $\rho$-Lipschitz.
As a notable example, the absolute loss function given by $\ell(\vec w, \vec x, y) = |\vec w \cdot \vec x - f(\vec x)|$ can be written as in Equation~\ref{eq.LipschitzLoss}
using $\phi(a, y) = |a - y|$, which is $1$-Lipschitz for all $y \in \R$. 

\begin{theorem}[\citep{ShalevShwartz.Book.2014}]
Let $\mathcal D$ be a distribution over $\mathcal X \times \mathcal Y$ such that $\norm{\vec x}_{\infty} \leq r$ with probability $1$.
Additionally, let $\mathcal H =  \mathcal{B}_0(b)$ and let $\ell: \mathcal H \times \mathcal X \times \mathcal Y \rightarrow \R$ be a loss function 
of the form given in Equation~\ref{eq.LipschitzLoss}, such that $\phi$ is $\rho$-Lipschitz in its first argument, and such that $\max_{a \in [-br,+br]} |\phi(a,y)| \leq c$.
Then, for any $\delta \in (0,1)$, with probability of at least $1 - \delta$ over the choice of an i.i.d.\ sample set $\vec Z = \{(\vec x_i,y_i)\}_{i = 1}^m$, 
\begin{align*}
   \forall \vec w \in \mathcal H,\;\;\;\;& \mathbb{E}_{(\vec x, y) \sim \mathcal D} [\ell(\vec w, \vec x, y)] \leq \frac{1}{m} \sum_{i=1}^m \ell(\vec w, \vec x_i, y_i) 
   + 2\rho br \sqrt{\frac{2 \log_2(2d)}{m}} + c \sqrt{\frac{2 \ln(2 / \delta)}{m}}.  
\end{align*}
\end{theorem}

Recall that a matrix \(\vec X \in \mathcal X^m \) satisfies the RIP of order \(s\) with constant \(\beta_s \in (0,1)\) if, 
for any vector \(\vec w \in \mathcal{B}_0(s)\), the following inequality holds:
\[
(1 - \beta_s) \|\vec w\|_2^2 \leq \tfrac{1}{m} \|\vec X\vec w\|_2^2 \leq (1 + \beta_s) \|\vec w\|_2^2.
\]

\begin{theorem}[\citep{Blumensath.ACHA.2009}]
Consider a noisy observation $\vec y  = \vec X\vec w + \vec e$ where $\vec w \in \mathcal{B}_0(k)$. 
If $\vec X$ has the RIP of order $s = 3k$ with constant $\beta_s < 1/\sqrt{32}$, then after at most 
\begin{align*}
t = \left\lceil \log_2 \left( \frac{\norm{\vec w}_2}{\norm{\vec e}_2} \right) \right\rceil
\end{align*}
iterations, the solution $\vec w_t$ returned by the IHT algorithm estimates $\vec w$ with accuracy 
\begin{align*}
\|\vec{w}_t - \vec{w}\|_2 \leq 6 \|\vec{e}\|_2.
\end{align*}
\end{theorem}

\section{Additional Experimental Results}
\label{sec:appendix}

Table~\ref{tab:statistics} presents statistics for the 18 benchmarks used in our experiments. 
The first two columns list the names of the datasets along with their corresponding \textsc{OpenML} identifiers. 
The next four columns provide information on the number of categorical attributes, the number of numeric attributes, the count of binarized literals, and the total number of instances. 
Finally, the last column displays the accuracy of the regression model \( f \), measured as the mean squared error and obtained through 10-fold cross-validation.

\setlength{\tabcolsep}{6pt}
\begin{table}[h]
    \begin{center}
    %\begin{adjustbox}{width=\linewidth,center}  
    \begin{tabular}{lr r rrrr r r}
        \toprule
        \multicolumn{2}{c}{Benchmark} & & \multicolumn{4}{c}{Qualities} & & Accuracy \\
        \cmidrule{1-2} \cmidrule{4-7} \cmidrule{9-9} 
        Name & ID & & \textsc{\#cat} & \textsc{\#num} & \textsc{\#bin} & \textsc{\#inst} & & (\textsc{mse}) \\
        \midrule
        \small{Airfoil Self Noise} & \small{44957} && 0 & 5 & 20 & 1503 && $0.104$ \\
        \small{Auto MPG} & \small{42372} && 0 & 5 & 20 & 392 && $0.054$ \\ 
        \small{Bike Sharing (Demand)} & \small{44142} && 0 & 6 & 24 & 17379 && $0.090$ \\ 
        \small{Liver Disorders} & \small{8} && 0 & 5 & 20 & 345 && $0.104$  \\
        \small{Machine CPU} & \small{230} && 0 & 6 & 24 & 209 && $0.029$ \\
        \small{Medical Charges} & \small{44146} && 0 & 3 & 12 & 163065 && $0.106$ \\
        \midrule		
        \small{Ailerons} & \small{44137} && 0 & 33 & 132 & 13750 && $0.037$ \\
        \small{Auto Imports} & \small{9} && 11 & 14 & 120 & 205 && $0.027$ \\
        \small{DNA Methylation} & \small{46139} && 0 & 808 & 3232 & 475 && $0.028$\\
        \small{Geographical OM} & \small{44965} && 0 & 116 & 464 & 1059 && $0.019$ \\
        \small{Moneyball} & \small{41021} && 9 & 5 & 96 & 1232 && $0.029$ \\ 
        \small{NCI 60 Thioguanine} & \small{46132} && 0 & 48 & 192 & 60 && $0.021$ \\ 
        \small{Online News (Popularity)} & \small{42724} && 0 & 59 & 208 & 39644 && $0.004$ \\ 
        \small{Pollution} & \small{542} && 0 & 15 & 60 & 60 && $0.025$ \\ 
        \small{RTE Consumption} & \small{46337} && 0 & 15 & 56 & 105168 && $0.036$\\
        \small{Student Performance} & \small{42352} && 0 & 32 & 103 & 395 && $0.020$ \\
        \small{Wave Energy} & \small{44975} && 0 & 48 & 192 & 72000 && $0.024$\\
        \small{Wisconsin} & \small{191} && 0 & 32 & 128 & 194 && $0.027$ \\
        \bottomrule
    \end{tabular}
    %\end{adjustbox}
    \end{center}
  \caption{Some statistics about the $18$ benchmarks.}
  \label{tab:statistics}
\end{table}

Table~\ref{tab:runtimes} provides the runtimes in seconds for all explainers. 
As indicated in the paper, the MIP approach, which uses the \texttt{Gurobi} solver, 
is capable of finding an optimal solution within a few seconds for low-dimensional benchmarks. 
However, it experiences a one-minute timeout (highlighted in red) when applied to medium-dimensional benchmarks.

\begin{table}[h]
    \begin{center}
    \begin{tabular}{lr r rrrrr}
        \toprule
        \multicolumn{2}{c}{Benchmark} & & \multicolumn{5}{c}{Time (s)} \\
        \cmidrule{1-2} \cmidrule{4-8} 
        Name & ID &&        \textsc{CVX} &      \textsc{IHT} & \textsc{LIME} & \textsc{MAPLE} & \textsc{MIP} \\
        \midrule
        \small{Airfoil Self Noise} & \small{44957} && $0.005$ & $0.238$ & $0.015$ & $0.177$ & $1.178$ \\		
        \small{Auto MPG} & \small{42372} && $0.005$ & $0.241$ & $0.020$ & $0.185$ & $1.125$\\ 
        \small{Bike Sharing} & \small{44142} && $0.004$ & $0.292$ & $0.015$ & $0.210$ & $6.192$\\ 
        \small{Liver Disorders} & \small{8} && $0.004$ & $0.239$ & $0.015$ & $0.175$ & $1.295$ \\        
        \small{Machine CPU} & \small{230} && $0.004$ & $0.303$ & $0.020$ & $0.212$ & $2.920$\\        
        \small{Medical Charges} & \small{44146} && $0.004$ & $0.145$ & $0.011$ & $0.125$ & $0.601$\\
        \midrule
        \small{Ailerons} & \small{44137} &&  $0.018$ & $2.110$ & $0.056$ & $1.412$ & \color{red} $60.00$ \\        
        \small{Auto Imports} & \small{9} && $0.015$ & $1.494$ & $0.043$ & $0.985$ & \color{red} $60.00$\\
        \small{DNA Methylation} & \small{46139} && $0.315$ & $8.298$ & $0.249$ & $5.292$ & \color{red} $60.01$\\
        \small{Geographical OM} & \small{44965} && $0.270$ & $7.055$ & $0.171$ & $4.142$ & \color{red} $60.02$\\
        \small{Moneyball} & \small{41021} && $0.009$ & $0.817$ & $0.038$ & $0.637$ & \color{red} $60.01$\\ 
        \small{NCI 60 Thioguanine} & \small{46132} && $0.028$ & $2.790$ & $0.084$ & $1.853$ & \color{red} $60.01$ \\ 
        \small{Online News} & \small{42724} && $0.026$ & $3.150$ & $0.093$ & $1.292$ & \color{red} $60.00$ \\ 
        \small{Pollution} & \small{542} && $0.009$ & $0.993$ & $0.030$ & $0.623$ & \color{red} $60.00$ \\
        \small{RTE Consumption} & \small{46337} && $0.008$ & $0.814$ & $0.024$ & $0.581$ & \color{red} $60.01$\\
        \small{Student Performance} & \small{42352} && $0.020$ & $1.817$ & $0.044$ & $1.208$ & \color{red} $60.00$ \\
        \small{Wave Energy} & \small{44975} && $0.028$ & $2.395$ & $0.066$ & $1.905$ & \color{red} $60.00$ \\
        \small{Wisconsin} & \small{191} && $0.020$ & $1.628$ & $0.074$ & $0.967$ & \color{red} $60.01$\\
        \bottomrule
    \end{tabular}
    \end{center}
  \caption{Average runtimes for all explainers across the 18 benchmarks.}
  \label{tab:runtimes}
\end{table}

Figures~\ref{fig:sparsity:1} and \ref{fig:sparsity:2} show bar plots illustrating the increasing sparsity for all datasets, 
while Figures~\ref{fig:spread:1} and \ref{fig:spread:2} present bar plots depicting the increasing spread for all datasets. 
Lastly, the plots in Figure~\ref{fig:walltimes} illustrate the evolution of the solution maintained by the \texttt{Gurobi} solver as the time budget increases. 
It is evident that a near-optimal solution is typically found within just a few seconds, with the majority of the time budget dedicated to certifying its optimality through lower bounds.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{figure*}[t!]
    \centering
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-Airfoil.pdf}
        \phantomsubcaption
        \label{fig:sparsity:a1}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-AutoMPG.pdf}
        \phantomsubcaption
        \label{fig:sparsity:a2}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-BikeSharing.pdf}
        \phantomsubcaption
        \label{fig:sparsity:a3}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-LiverDisorders.pdf}
        \phantomsubcaption
        \label{fig:sparsity:b1}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-MachineCPU.pdf}
        \phantomsubcaption
        \label{fig:sparsity:b2}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-MedicalCharges.pdf}
        \phantomsubcaption
        \label{fig:sparsity:b3}
    \end{subfigure}                   
    \caption{Comparison of root mean squared errors (y-axis) with increasing sparsity level (x-axis): Low-dimensional benchmarks.}
    \label{fig:sparsity:1}
\end{figure*}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{figure*}[t!]
    \centering
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-Ailerons.pdf}
        \phantomsubcaption
        \label{fig:sparsity:c1}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.33\textwidth}
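        % NOTE(review): the benchmark tables list Auto Imports (not Auto MPG) among the medium-dimensional datasets, and Auto MPG already appears in the low-dimensional sparsity figure; this path probably should be fig/size-k-AutoImports.pdf -- TODO confirm the file name before changing.
        \includegraphics[width=\textwidth]{fig/size-k-AutoMPG.pdf}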
        \phantomsubcaption
        \label{fig:sparsity:c2}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-DNAMethylation.pdf}
        \phantomsubcaption
        \label{fig:sparsity:c3}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-GeographicalOM.pdf}
        \phantomsubcaption
        \label{fig:sparsity:d1}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-Moneyball.pdf}
        \phantomsubcaption
        \label{fig:sparsity:d2}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-NCI60Thioguanine.pdf}
        \phantomsubcaption
        \label{fig:sparsity:d3}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-OnlineNews.pdf}
        \phantomsubcaption
        \label{fig:sparsity:e1}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-Pollution.pdf}
        \phantomsubcaption
        \label{fig:sparsity:e2}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-RTEConsumption.pdf}
        \phantomsubcaption
        \label{fig:sparsity:e3}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-StudentPerformance.pdf}
        \phantomsubcaption
        \label{fig:sparsity:f1}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-WaveEnergy.pdf}
        \phantomsubcaption
        \label{fig:sparsity:f2}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/size-k-Wisconsin.pdf}
        \phantomsubcaption
        \label{fig:sparsity:f3}
    \end{subfigure}                   
    \caption{Comparison of root mean squared errors (y-axis) with increasing sparsity level (x-axis): Medium-dimensional benchmarks.}
    \label{fig:sparsity:2}
\end{figure*}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{figure*}[t!]
    \centering
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-Airfoil.pdf}
        \phantomsubcaption
        \label{fig:spread:a1}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-AutoMPG.pdf}
        \phantomsubcaption
        \label{fig:spread:a2}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-BikeSharing.pdf}
        \phantomsubcaption
        \label{fig:spread:a3}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-LiverDisorders.pdf}
        \phantomsubcaption
        \label{fig:spread:b1}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-MachineCPU.pdf}
        \phantomsubcaption
        \label{fig:spread:b2}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-MedicalCharges.pdf}
        \phantomsubcaption
        \label{fig:spread:b3}
    \end{subfigure}                   
    \caption{Comparison of root mean squared errors (y-axis) with increasing spread (x-axis): Low-dimensional benchmarks.}
    \label{fig:spread:1}
\end{figure*}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{figure*}[t!]
    \centering
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-Ailerons.pdf}
        \phantomsubcaption
        \label{fig:spread:c1}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.33\textwidth}
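        % NOTE(review): the benchmark tables list Auto Imports (not Auto MPG) among the medium-dimensional datasets, and Auto MPG already appears in the low-dimensional spread figure; this path probably should be fig/spread-AutoImports.pdf -- TODO confirm the file name before changing.
        \includegraphics[width=\textwidth]{fig/spread-AutoMPG.pdf}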
        \phantomsubcaption
        \label{fig:spread:c2}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-DNAMethylation.pdf}
        \phantomsubcaption
        \label{fig:spread:c3}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-GeographicalOM.pdf}
        \phantomsubcaption
        \label{fig:spread:d1}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-Moneyball.pdf}
        \phantomsubcaption
        \label{fig:spread:d2}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-NCI60Thioguanine.pdf}
        \phantomsubcaption
        \label{fig:spread:d3}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-OnlineNews.pdf}
        \phantomsubcaption
        \label{fig:spread:e1}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-Pollution.pdf}
        \phantomsubcaption
        \label{fig:spread:e2}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-RTEConsumption.pdf}
        \phantomsubcaption
        \label{fig:spread:e3}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-StudentPerformance.pdf}
        \phantomsubcaption
        \label{fig:spread:f1}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-WaveEnergy.pdf}
        \phantomsubcaption
        \label{fig:spread:f2}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\textwidth]{fig/spread-Wisconsin.pdf}
        \phantomsubcaption
        \label{fig:spread:f3}
    \end{subfigure}                   
    \caption{Comparison of root mean squared errors (y-axis) with increasing spread (x-axis): Medium-dimensional benchmarks.}
    \label{fig:spread:2}
\end{figure*}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{figure*}[t!]
    \centering
    \begin{subfigure}{0.49\textwidth}
        \includegraphics[width=\textwidth]{fig/walltimes-Ailerons.pdf}
        \phantomsubcaption
        \label{fig:walltimes:a1}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.49\textwidth}
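        % NOTE(review): every other panel in this figure is a medium-dimensional benchmark, whereas Auto MPG is listed as low-dimensional in the benchmark tables; this path probably should be fig/walltimes-AutoImports.pdf -- TODO confirm the file name before changing.
        \includegraphics[width=\textwidth]{fig/walltimes-AutoMPG.pdf}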
        \phantomsubcaption
        \label{fig:walltimes:a2}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.49\textwidth}
        \includegraphics[width=\textwidth]{fig/walltimes-Moneyball.pdf}
        \phantomsubcaption
        \label{fig:walltimes:b1}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.49\textwidth}
        \includegraphics[width=\textwidth]{fig/walltimes-NCI60Thioguanine.pdf}
        \phantomsubcaption
        \label{fig:walltimes:b2}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.49\textwidth}
        \includegraphics[width=\textwidth]{fig/walltimes-StudentPerformance.pdf}
        \phantomsubcaption
        \label{fig:walltimes:c1}
    \end{subfigure}                   
    \hfill
    \begin{subfigure}{0.49\textwidth}
        \includegraphics[width=\textwidth]{fig/walltimes-WaveEnergy.pdf}
        \phantomsubcaption
        \label{fig:walltimes:c2}
    \end{subfigure}                   
    \caption{Evolution of the explanations computed by \texttt{Gurobi} for 10 data instances $\vec x$, using $k = 7$ and $\sigma = 1$.}
    \label{fig:walltimes}
\end{figure*}


\end{document}

