\documentclass[accepted]{uai2022} % for initial submission
% \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage[hyphens]{url} 
\usepackage{graphicx}
\usepackage{amsfonts}
\usepackage{amsmath}
\usepackage{cite}
\usepackage{multirow}
\usepackage[tableposition=above]{caption}
\captionsetup[table]{skip=10pt}
\usepackage{subcaption}
\usepackage[switch]{lineno}  
% \usepackage[margin=0.8in]{geometry}
% \usepackage[round]{natbib}

\renewcommand{\cite}{\citep}

\captionsetup[subfigure]{labelfont=bf,textfont=normalfont,singlelinecheck=off,justification=centering}
\newtheorem{theorem}{Theorem}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{CounteRGAN: Generating Counterfactuals for Real-Time Recourse and Interpretability using Residual GANs}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%

% Add authors
\author[1]{\href{mailto:<nemird@amazon.com>?Subject=Your UAI 2022 paper}{Daniel~Nemirovsky\thanks{Work done while at Hired.}}{}}
\author[2]{Nicolas~Thiebaut}
\author[3]{Ye~Xu$^*$}
\author[3]{Abhishek~Gupta$^*$}
\affil[1]{
  Amazon\\
  Seattle, WA, U.S.A.\\
  nemird@amazon.com
}
\affil[2]{
  Hired\\
  New York, NY, U.S.A.\\
  nicolas.thiebaut@hired.com
}
\affil[3]{
    Meta\\
    Menlo Park, CA, U.S.A.\\
    \{yexu, abigupta\}@fb.com
 }

\begin{document}
\maketitle

\begin{abstract}
The importance of model interpretability, fairness, and recourse for end users has grown as machine learning models have become increasingly popular in areas including criminal justice, finance, healthcare, and job marketplaces. This work presents a novel method of addressing these issues by producing \textit{meaningful counterfactuals} that are aimed at providing recourse to users and highlighting potential model biases. A meaningful counterfactual is a reasonable alternative scenario that illustrates how input data perturbations can influence the model's output. The \textit{CounteRGAN} method generates meaningful counterfactuals for a target classifier by utilizing a novel \textit{Residual Generative Adversarial Network (RGAN)}. We compare our method against leading state-of-the-art approaches on image and tabular datasets over a variety of performance metrics. The results indicate a significant improvement over existing techniques in combined metric performance, with a latency reduction of 2 to 7 orders of magnitude which enables providing real-time recourse to users. The code for reproducibility can be found here: \url{https://github.com/gan-counterfactuals/countergan}.
\end{abstract}

\section{Introduction \label{sec:intro}}
\noindent 

% \renewcommand{\thefootnote}{\fnsymbol{footnote}}
% \footnotetext[1]{Work done while at Hired.}
% \addtocounter{footnote}{-1}
% \renewcommand{\thefootnote}{\arabic{footnote}}

\begin{figure}[t]
\centering
% AAAI version
% \includegraphics[width=0.9\columnwidth]{figures/countergan_arch.png}
% arXiv version
\includegraphics[width=0.9\columnwidth]{figures/countergan_arch.png}
\caption{The CounteRGAN method applied to an example from MNIST. Three neural nets are employed: a residual generator, a discriminator that distinguishes the real data, and a target classifier. The loss function of the generator uses both the classifier and discriminator output. In the example, residuals are being produced by the generator, which, when added to the input, create realistic images of a ``4''.}
\label{fig:countergan_arch}
\end{figure}


A growing number of domains use machine learning (ML) predictive models on a daily basis, such as criminal justice for predicting recidivism \cite{Tollenaar2013-no}, healthcare for diagnosing patients \cite{Miotto2018-ah}, job marketplaces for hiring candidates \cite{Faliagka2012-pg}, and finance for loan approvals \cite{Addo2018-gl}. The pervasiveness of this technology has resulted in a growing need for model interpretability as well as discussions regarding ``the right to explanation'' in the legal and machine learning communities \cite{Wachter2017-vk, Selbst2017-zd, Goodman2016-co}. Consequently, considerable resources have been allocated not only to improve prediction explainability, but also to provide recourse to users to enhance fairness and opportunity.


A number of leading explainability methods  \cite{Ribeiro2016-fa, Lundberg2017-ly, Sundararajan2017-cq, Selvaraju_2019, Chattopadhay_2018} have shown promise in shedding light on the opaque logic and feature influences behind a prediction model. By answering \textit{why} a model predicted the result it did, explainability methods are useful for validating training and identifying possible racial, social, and gender biases. Alternatively, recourse aims to provide the user with actionable feedback by showing \textit{how} a prediction can be altered or reversed. By recommending certain changes to the input data, recourse can inform a user how to improve their chances of receiving a better diagnosis, getting a loan, or getting hired.

Alternative hypothetical scenarios that rely on perturbations to the original input values are known as \textit{counterfactuals}. The impacts on a prediction from the changes suggested by counterfactuals can be useful for model interpretability as well as for providing recourse to users. If, for instance, one's gender or race are suggested to be changed to alter the prediction result, then predictor biases could be suspected. In contrast, recourse relies on providing interpretable feedback users can act upon and which helps them change the prediction result in their favor. Counterfactuals should be meaningful in order to provide recourse. \textit{Meaningful counterfactuals} must be \textit{realistic}, \textit{computationally efficient}, and able to provide  \textit{actionable} feedback to the user that would help them achieve the \textit{desired prediction} outcome.

Unlike computing efficiency and prediction outcomes, which can be quantified by examining latency and predictor score, realism and actionability are fundamentally more subjective. A counterfactual is said to be realistic when it closely resembles or ``fits in'' with the known distribution of data. For example, a house with a negative number of bedrooms is evidently unrealistic. A less obvious example, however, would be of a house with a seemingly extreme layout but where realism is dependent upon the location and society.\footnote{One of the authors recalls the wonderment of seeing the tall and narrow Dutch houses neatly packed into picturesque rows lining idyllic canals. Consider how surreal such homes would appear in the Andean mountain villages or vice versa.} By contrast, actionability pertains to whether proposed changes are interpretable and reasonable for a user to take action. For instance, increasing one's body mass index, learning a new programming language, or reducing outstanding debt are all actionable changes (although some are more difficult than others). The proximity and sparsity of a counterfactual can serve as an intuitive but imperfect proxy for actionability since they indicate the magnitude and number of potential perturbations in the counterfactual.
A realistic counterfactual doesn't necessarily result in actionable change. As an example, it is not reasonable to reduce one's age or education even if it results in a counterfactual which is very similar to a realistic individual. Moreover, depending on the use case, actionable changes may result in unrealistic counterfactuals. For instance, manipulating pixels and text or fixing features to specific values can confuse the target classifier in a manner similar to adversarial attacks which exploit seldom used regions of a classifier's decision boundary.

Existing recourse methods \cite{Wachter2017-jr, Van_Looveren2019-hr, Mothilal2020-ge} employ variations of regularized gradient descent to perform the counterfactual search. As a result, these are severely constrained by latency since a counterfactual search is required for every unique input data point. A lack of counterfactual realism also affects algorithms that do not explicitly consider realism constraints \cite{Wachter2017-jr} or conflate realism with actionability \cite{Mothilal2020-ge}. Latency constraints and the distinction between realism and actionability are key to framing the counterfactual search problem as a natural fit for Generative Adversarial Networks (GANs). GANs \cite{Goodfellow2014-wf} are a class of ML models capable of producing strikingly realistic synthetic data with low and fixed latencies. These models formulate the training of two artificial neural networks, a generator and a discriminator, as an adversarial game. While the discriminator is trained to recognize realistic data, the generator is trained to synthesize data that can fool the discriminator. Effectively trained generators produce realistic data requiring a single forward-pass through the neural network.



% by formulating the training of two artificial neural networks, a generator and a discriminator, as an adversarial game. 

% The discriminator is trained to distinguish realistic data while the generator aims to synthesize data that is able to fool the discriminator. An effectively trained generator will be able to produce realistic data requiring only a forward-pass through the neural network. 

% These models formulate the training of two artificial neural networks, a generator and a discriminator, as an adversarial game. The discriminator is trained to distinguish realistic data while the generator aims to synthesize data that is able to fool the discriminator. An effectively trained generator will be able to produce realistic data requiring only a forward-pass through the neural network. 


In this work, we formalize a \textit{Residual GAN (RGAN)} architecture, useful for generating perturbations directly and alleviating mode collapse. The latter issue can occur during training when the generator model begins to consistently produce similar or identical outputs regardless of the inputs. The RGAN is used in conjunction with a fixed target classifier to generate meaningful counterfactuals that are suitable for providing recourse to users and improving model interpretability and fairness. This new approach, referred to as CounteRGAN, produces counterfactuals that meet or exceed the predictive gain and actionability of two state-of-the-art methods while improving realism and decreasing latency by 2 to 7 orders of magnitude. Figure~\ref{fig:countergan_arch} provides a clarifying illustration of the CounteRGAN architecture applied to an example from MNIST.
The proposed technique can be used to provide real-time recourse to users of ML predictors employed in a wide range of industries. Our goal is to facilitate improved opportunities, transparency, and fairness associated with ML prediction. The main contributions of this work include:


\begin{itemize}
\item The application of GANs to produce meaningful counterfactuals that can provide real-time recourse to users as well as improved model interpretability and fairness. 

\item Formalizing a novel \textit{Residual Generative Adversarial Network (RGAN)} that trains the generator to produce residuals that are intuitive to the notion of perturbations used in counterfactual search. This model is also shown to alleviate mode collapse.
% , which when added to the original input, result in a synthetic data point. Applied to counterfactual generation, this model helps to reduce mode collapse and allows for fine-grained control over the suggested perturbations.

\item The \textit{CounteRGAN} method which applies an RGAN model in conjunction with a target classifier to produce meaningful counterfactuals 2 to 7 orders of magnitude faster than existing methods. A second variant is also introduced for when the target classifier's gradients or architecture are unknown (e.g., a black-box model).

% A proof of convergence is provided in the supplementary material.


% \item A CounteRGAN loss variant for when the existing classifier’s gradients or architecture is unknown (e.g., a black-box model). A proof of convergence is also provided.

% \item An evaluation comparing two state-of-the-art counterfactual methods with a standard GAN approach and two CounteRGAN variants. Experimental results on the MNIST image dataset and a popular diabetes tabular dataset show significant improvements regarding prediction gain, realism, and diversity as well as over 50x to 90,000x latency improvements.
\end{itemize}

% The following section outlines related work pertaining to counterfactuals and GANs. This is followed by a description of the Residual GAN model and how it is applied for counterfactual generation using the CounteRGAN technique. We then validate our proposed methods against two current state-of-the-art techniques using experiments on three datasets from separate domains. Lastly, we discuss our conclusions and directions for future work.


% Biases of the classifier has learned can be highlighted by counterfactuals -> counterfactuals for model interpretability



% We observe, however, that simply applying existing GANs to directly synthesize counterfactuals may result in mode collapse, where the generator always seems to output the same or similar counterfactuals irrelevant of the input. Even though they may appear realistic, the counterfactuals will lack sparsity and the requirement of actionability. Therefore, 

% To alleviate mode collapse and al we introduce a Residual GAN (RGAN) model where the generator outputs residuals, which are intuitive to the notion of perturbations. 

% A full synthetic data point is produced when the residuals are added to the generator’s initial input. We propose a CounteRGAN technique that utilizes an RGAN with a target classifier to produce counterfactuals. Our results show that this method is capable of generating counterfactuals that meet or exceed two state-of-the-art methods with regards to desired classification results and actionability while significantly improving realism and vastly outperforming them in terms of latency. 

% Moreover, in the context of counterfactual generation, it may be beneficial to provide fine-grained control, such as regularization, directly over the perturbations as opposed to the full counterfactual.





% \section{Related Work\label{sec:related_work}}


\section{Related Work}
\label{sec:related_work}
Influential and relevant previous work comes from both the counterfactual and GAN domains.
\subsection{Counterfactuals\label{sec:counterfactuals}} 
% \paragraph{Counterfactuals}
\noindent Borrowing from philosophy and causality \cite{Lewis1973-al, pearl_causality, karimi2020algorithmic}, counterfactuals were introduced as explanations for ML predictors by \citet{Wachter2017-jr}. The authors formulated counterfactual search as a minimization problem with an added regularization term to enforce feature perturbation sparsity. Given an original data point $x$ and an ML classifier $C$, the counterfactual $x_\text{cf}$ is produced using iterations of gradient descent to increase the classifier's prediction $C_t\left(x_\text{cf}\right)$ for a given target class $t$. This approach is useful for producing counterfactuals of the desired class but tends to be slow and results may be unrealistic.

% This is the first of two approaches we evaluate our method against and is hereafter referred to as Vanilla Gradient Descent (VGD). 

Several approaches have targeted increasing counterfactual realism. These include a graph-based density approach \cite{Poyiadzi2019-rr} and applying an autoencoder reconstruction error term to constrict the counterfactual from straying too far from the observed feature space \cite{Dhurandhar2018-hk, joshi2019, pawel2020}. An alternative approach \cite{Mothilal2020-ge} focuses on producing multiple diverse counterfactuals for each query instance such that the user can select the most relevant. A novel technique was also proposed that utilizes class prototypes \cite{kim_2016} to guide the counterfactual search toward high-density regions of the feature space \cite{Van_Looveren2019-hr}.
% The same work outlines a novel method for handling categorical variables via pairwise distance measures and multi-dimensional scaling.
% to map the variables into a one-dimensional space where meaningful perturbations can be applied before projecting the variable back to the closest category. 
While the aforementioned methods are limited to differentiable classifiers, a heuristic search involving ``growing spheres'' is used \cite{Laugel2017-tp} to produce sparse counterfactuals for non-differentiable or black-box models. This method, however, addresses neither realism nor latency concerns. All of the approaches mentioned above suffer from high computational latencies. The proposed CounteRGAN method, however, is able to produce meaningful counterfactuals within real-time latency constraints for both differentiable and non-differentiable models.


Counterfactuals are also produced in adversarial perturbation techniques \cite{Goodfellow2014-yo}. For example, modifying a single pixel in an image of a horse can fool a classifier into predicting it is an image of a frog \cite{Su2017-pk}. In general, these methods are aimed at confusing a target classifier without necessarily providing meaningful recourse to users, a task that requires balancing desired prediction with realism and actionability.

% Apart from being limited to differentiable models, gradient descent-based approaches suffer two additional disadvantages. First, the design and relative weighting of all of the additional terms added to the counterfactual objective function, for example, to enforce sparsity, diversity, and realism, can be complex and unstable. This can lead to divergences during training if not carefully tuned. Second, the computational latency the search algorithms require is prohibitive to real-time applications and general adoption. Unlike prior work, the proposed CounteRGAN method is able to produce realistic and diverse counterfactuals of a desired within real-time latency constraints.



% While adversarial perturbation and image translation \cite{Isola2016-kf, Zhu2017-mz, Kim2017-fd, Royer2017-de, Zhu2017-sf, Almahairi2018-zi} are ML methods capable of generating counterfactuals, neither is as of yet, adequate at producing counterfactuals for meaningful recourse. Adversarial methods favor confusing classifiers with numerous subtle perturbations that result in unrealistic counterfactuals with poor actionability. Image translation methods are domain-specific and often target realism instead of reverting decisions of existing classifiers and providing actionable feedback to users


\subsection{Generative Adversarial Nets (GANs)}
The introduction of GANs \cite{Goodfellow2014-wf} marked a milestone in the field of generative models. The elegance of a GAN lies in its formulation of training as an adversarial minimax game between two differentiable models able to approximate probability distributions utilizing backpropagation and gradient descent. Interest in GANs has since intensified and several novel approaches have been proposed towards improving training \cite{Salimans2016-wx, Arjovsky2017-rx} and architecture \cite{Radford2015-qx, Denton2015-ka, Zhang2016-qc}. Providing additional input such as label information to condition GANs, for example, to generate specific MNIST digits, has been previously proposed \cite{Mirza2014-kt, Odena2017-jl}. GANs have also been applied to problems that share intuitive notions with counterfactuals such as representation learning \cite{Chen2016-tt, Tran2017-vc}, image-to-image translation \cite{Isola2016-kf, Zhu2017-mz, Zhu2017-sf}, style transfer \cite{Huang2017-ii, Karras2020-uk}, and illumination \cite{Wang2017-fj, Zhang2019-kr}. The use of GANs with residual images has been proposed for attribute manipulation in images \cite{Shen2016-ge}. These methods are domain-specific and often target realism instead of reverting decisions of existing classifiers and providing actionable feedback to users. An unrelated but similarly termed ``Residual GAN'' \cite{Tavakolian2019-kq} uses a deep residual convolutional network as a generator to magnify subtle facial variations. In contrast, we define and use a Residual GAN, where the generator is trained to synthesize residuals directly. Unlike prior work, and to the best of our knowledge, we are the first to apply GANs towards the generation of meaningful counterfactuals for recourse.



% We also define and use a residual GAN, where the generator is trained to synthesize residuals, as a way to overcome the mode collapse observed when using standard GANs  

% to produce counterfactuals.




% we define a residual GAN, where the generator is trained to synthesize residuals and apply it towards generating meaningful counterfactuals given a target classifier in order to produce actionable and realistic counterfactuals, given input from the original data distribution. To the best of our knowledge, we are the first to apply GANs to the generation of meaningful counterfactuals.



\section{GAN-based Counterfactual Generation}
\label{sec:countergan}
\noindent 
To overcome the mode collapse and actionability limitations of applying standard GANs to counterfactual generation, we formalize the Residual GAN (RGAN) as a special case of GAN. 
The CounteRGAN, by contrast, is the proposed technique that couples an RGAN with a target classifier to synthesize meaningful counterfactuals.

\subsection{Residual GAN (RGAN)\label{sec:rgan}}
\noindent 
Similar to how conditional GANs \cite{Mirza2014-kt}, though initially motivated by image synthesis, have been generalized to be applicable to several domains, we also introduce a generalized RGAN formulation, whose original motivation stemmed from generating counterfactuals, but could also be applied to other domains including image synthesis and photo editing \cite{Zhang2019-kr}. The generalized RGAN is a special instance of a GAN where the generator generates residuals rather than a complete synthetic data point. As in standard GANs, a discriminator $D$ and a generator $G$ are trained in a minimax game framework where the generator seeks to minimize and the discriminator aims to maximize the following value function:

%% AAAI version
\begin{equation}
\label{eq_rgan_generalized}
\begin{aligned}
    \mathcal{V}_{\mathrm{RGAN}}(D, G)=&\mathbb{E}_{x \sim p_{\mathrm {data}}}\log D(x) \\ 
    &+ \mathbb{E}_{z \sim p_{\mathrm {z}}}\log \left(1-D(z+G(z))\right),
\end{aligned}
\end{equation}
% arXiv version
% \begin{equation}
% \label{eq_rgan_generalized}
%     \mathcal{V}_{\mathrm{RGAN}}(D, G)=\mathbb{E}_{x \sim p_{\mathrm {data}}}\log D(x)
%     + \mathbb{E}_{z \sim p_{\mathrm {z}}}\log \left(1-D(z+G(z))\right),
% \end{equation}
where the generator's input $z\in \mathbb Z$ is a latent variable sampled from a probability distribution $p_{\mathrm {z}}$. The input to the RGAN discriminator is $z+G(z)$, as opposed to the standard GAN which utilizes $G(z)$ directly.

% The contrast between the GAN formulation and that of the generalized RGAN can be seen by comparing their respective value functions, described in Equations \ref{eq_gan} and \ref{eq_rgan_generalized} below. In both cases, the discriminator's objective is to maximize the value function and the objective of the generator is to minimize it.\footnote{To ease convergence when the generator produces obvious unrealistic samples, it is common to replace the generator's minimization objective with the maximization of the second term of these equations.}
% \begin{equation}
% \label{eq_gan}
% \begin{aligned}
%     \mathcal{V}_{\mathrm{GAN}}(D, G)=&\mathbb{E}_{x \sim p_{\mathrm {data}}}\log D(x) \\ 
%     &+ \mathbb{E}_{z \sim p_{\mathrm {z}}}\log \left(1-D(G(z))\right),
% \end{aligned}
% \end{equation}

% \begin{equation}
% \label{eq_rgan_generalized}
% \begin{aligned}
%     \mathcal{V}_{\mathrm{GAN}}(D, G)=&\mathbb{E}_{x \sim p_{\mathrm {data}}}\log D(x) \\ 
%     &+ \mathbb{E}_{z \sim p_{\mathrm {z}}}\log \left(1-D(z+G(z))\right),
% \end{aligned}
% \end{equation}
% Where $D$ is the discriminator model, $G$ is the generator model, and $z$ is the input to the generator taken from a probability function from a latent space $p_{\mathrm {z}}$. 


The generalized RGAN formulation restricts the dimensionality of the latent (input) space to be the same as the data feature (output) space ($\mathbb Z=\mathbb{X}$)\footnote{This constraint could be overcome by utilizing an autoencoder. The synthesized data point $z+G(x)\in \mathbb{Z}$ can then be decoded to a new data point in the same space as the input data, such that $\mathrm{decoder}(z+G(x))\in \mathbb{X}$.} and forces the generator to learn contingent relationships between its input and output. This constraint enables fine-grained regularization directly on the residuals\footnote{Note that the activation function for the generator's output layer constrains the residuals and therefore their impact on the final synthesized output. Thus, depending on the scenario, it is recommended to use a symmetric activation function (e.g., linear, tanh) capable of outputting positive and negative values within the same order of magnitude as the input features.} and helps to alleviate mode collapse caused when the GAN generates similar output regardless of its input, which it learns to ignore.
% The RGAN formulation is highly relevant in the context of counterfactual search, which was the principal motivation for its introduction.


% Having been motivated by counterfactual search, the RGAN formulation can be reframe the generalized RGAN formulation specifically for counterfactual search as done in the CounteRGAN method described below.



% (Equation \ref{eq_rgan_specific}).

% \begin{equation}
% \label{eq_rgan_specific}
% \begin{aligned}
%     \mathcal{V}_{\mathrm{RGAN}}(D, G)=&\mathbb{E}_{x \sim p_{\mathrm {target}}}\log D(x) \\ 
%     &+ \mathbb{E}_{x \sim p_{\mathrm {data}}}\log \left(1-D(x+G(x))\right),
% \end{aligned}
% \end{equation}
% Where $p_{\mathrm{target}}=p_\mathrm{data}(\cdot|t)$ is the probability distribution of the data of a target class $t$, and $p_{\mathrm{data}}$ is the probability distribution of the entire data. In other words, in utilizing this formulation, the generator is trained to produce residuals given input from real data, that when added to the original input result in counterfactuals. The discriminator is trained separately using a subset of the real data as well as generated data to distinguish realistic data.

% Note that if both the generator and discriminator use input from the same probability distribution (i.e., $p_{\mathrm{target}}=p_\mathrm{data}(\cdot|t)$), then the generator is simply tasked with learning the identity function, unless an auxiliary classifier.


% Therefore, the inputs for the discriminator and generator should be sampled from dissimilar probability distributions.

% he generator learn mappings to and from the same feature space $\mathbb X$ (i.e., from data points to their respective counterfactuals). Moreover, since counterfactuals are based on perturbations to the original data points, the notion of having the generator produce residuals instead of an outright counterfactual is intuitive and has led to better results in our experiments. Since the residuals can be initialized to act as an identify function, this formulation provides a natural starting point for counterfactual generation while forcing the generator to take its inputs into account which helps to reduce mode collapse and improve diversity.

% While the GAN utilizes input from a (usually random) latent space $\mathbb Z$, the RGAN \begin{equation}
% \label{eq_rgan_specific}
% \begin{aligned}
%     \mathcal{V}_{\mathrm{RGAN}}(D, G)=&\mathbb{E}_{x \sim p_{\mathrm {target}}}\log D(x) \\ 
%     &+ \mathbb{E}_{x \sim p_{\mathrm {input}}}\log \left(1-D(x+G(x))\right),
% \end{aligned}
% \end{equation}

% In contrast to the GAN formulation, however, the RGAN 



% While the GAN draws its inputs from a (usually random) latent space $\mathbb Z$, counterfactual search requires that the generator learn mappings to and from the same feature space $\mathbb X$ (i.e., from data points to their respective counterfactuals). Moreover, since counterfactuals are based on perturbations to the original data points, the notion of having the generator produce residuals instead of an outright counterfactual is intuitive and has led to better results in our experiments. Since the residuals can be initialized to act as an identify function, this formulation provides a natural starting point for counterfactual generation while forcing the generator to take its inputs into account which helps to reduce mode collapse and improve diversity. 


\subsection{CounteRGAN}
\noindent 
The proposed counterfactual search method, termed CounteRGAN, utilizes an RGAN and a fixed target classifier $C$ to produce meaningful counterfactuals for providing recourse to users and improved interpretability. The method is capable of producing counterfactuals that are of the desired target class, realistic, actionable, and require low computational latency. Below we present two variants of the CounteRGAN value function for when the classifier's gradients are and are not known. The search process seeks to maximize the value function with respect to the discriminator $D$ and minimize it with respect to the generator $G$.

% As a result, paving the way towards providing real-time meaningful recourse. The RGAN can be used to produce counterfactuals specific to a fixed and previously trained classifier $C$. 



% \begin{figure}[t]
% \centering
% \includegraphics[width=0.9\columnwidth]{figures/countergan_arch.pdf}
% \caption{The architecture of the CounteRGAN. Given an input data point, the generator synthesizes residuals, which when added to the input produce a counterfactual. A discriminator and classifier models are used to ensure the counterfactual is both realistic and of the desired target class.}
% \label{fig:countergan_arch}
% \end{figure}


If the classifier is known and differentiable, then the following CounteRGAN value function can be used:
%% AAAI version
\begin{multline}
\label{eq_countergan_diff}
 \mathcal{V}_{\mathrm{CounteRGAN}}(G, D)=\mathcal{V}_{\mathrm{RGAN}}(G, D) +\mathcal{V}_{\mathrm{CF}}(G, C, t) \\
 + \mathrm{Reg}(G(x)),
\end{multline}

% arXiv version
% \begin{equation*}
% \label{eq_countergan_diff}
%  \mathcal{V}_\text{CounterRGAN}(G, D)=\mathcal{V}_{\mathrm{RGAN}}(G, D) +\mathcal{V}_{\mathrm{CF}}(G, C, t) \\
%  + \mathrm{Reg}(G(x)),
% \end{equation*}
% \begin{equation}
%  (G, D)=\arg \min _{G} \max _{D}\left[\mathcal{V}_{\mathrm{RGAN}}(G, D)+\mathcal{V}_{\mathrm{CF}}(G, C, t)\right]   
% \end{equation}
% \begin{multline}
%  (G, D)=\arg \min _{G} \max _{D}\left[\mathcal{V}_{\mathrm{RGAN}}(G, D) \\ +\mathcal{V}_{\mathrm{CF}}(G, C, t)\right]  
% \end{multline}
where $t$ is the target class. The first term ($\mathcal{V}_{\mathrm{RGAN}}$) is a specialized RGAN value function that reads:
%% AAAI version
\begin{equation}
\label{eq_rgan_specific}
\begin{aligned}
    \mathcal{V}_{\mathrm{RGAN}}&(D, G)=\mathbb{E}_{x \sim p_{\mathrm {data}}}\log D(x) \\ 
    &+ \mathbb{E}_{x \sim p_{\mathrm {data}}}\log \left(1-D(x+G(x))\right),
\end{aligned}
\end{equation}

%% arXiv version
% \begin{equation}
% \label{eq_rgan_specific}
%     \mathcal{V}_{\mathrm{RGAN}}(D, G)=\mathbb{E}_{x \sim p_{\mathrm {data}}}\log D(x) 
%     + \mathbb{E}_{x \sim p_{\mathrm {data}}}\log \left(1-D(x+G(x))\right),
% \end{equation}
where both the generator $G$ and discriminator $D$ use input samples $x$ from the same probability distribution $p_{\mathrm {data}}$. In isolation, this formulation would result in the generator learning to systematically output null residuals since the inputs are already realistic data. However, since the generator is also required to account for the classifier's loss term $\mathcal{V}_{\mathrm{CF}}$, this formulation helps to enforce counterfactual realism.

The second term ($\mathcal{V}_{\mathrm{CF}}$) drives the counterfactual toward the desired class $t$ and reads:

\begin{equation}
\mathcal{V}_{\mathrm{CF}}(G, C, t)=\mathbb{E}_{x \sim p_{\mathrm{data}}} \log \left(1-C_t(x+G(x))\right),
\end{equation}
where $C_t$ is the classifier's prediction function for the desired class.

The last term of the CounteRGAN value function, $\mathrm{Reg}(G(x))$, can be any weighted combination of L1 and L2 regularization terms and helps to control the sparsity and amplitude of the residuals (i.e., feature perturbations) which serves as a proxy for counterfactual actionability. 



% During CounteRGAN training, the discriminator still learns to discriminate between real and synthetic samples, while the generator seeks to balance the realism (maximization of $\mathbb{E}_{x \sim p_{\mathrm {input}}}\log D(x+G(x))$) and counterfactual (maximization of $\mathbb{E}_{x \sim p_{\mathrm{input}}}\log \left(C_t(x+G(x))\right)$) constraints. As a result, the generator learns to produce residuals that, when added to the input, produce realistic counterfactuals that are classified by $C$ to be as close to 1 for the desired class.


While most existing counterfactual search methods target differentiable models, the target classifiers used in production settings may often be non-differentiable or unknown (black-box).\footnote{For example, while a bank employee may have access to a loan classifier's architecture, the same cannot necessarily be said about the customer or a third-party service.} To account for such scenarios, we introduce a second CounteRGAN value function termed CounteRGAN-bb for black-box models. Instead of computing a classifier's gradients, this variant weighs the first term of the RGAN value function by the classifier's prediction score $C_t(x_i)$ such that the corresponding value function reads
% AAAI version
\begin{equation}
\begin{aligned}
\label{eq_countergan_nondiff}
% \begin{aligned}
    &\mathcal{V}_{\text{CounteRGAN-bb}}(D, G)=\frac{\sum_i C_t(x_i) \log D(x_i)}{\sum_i C_t(x_i)} \\ 
    &\quad\qquad+ \frac{1}{N} \sum_i \log \left(1-D(x_i+G(x_i))\right) + \mathrm{Reg}(G, \left\{x_i\right\}),
\end{aligned}
\end{equation}
% % arXiv version
% \begin{equation}
% \label{eq_countergan_nondiff}
%     \mathcal{V}_{\mathrm{CounteRGAN-wt}}(D, G)=\frac{\sum_i C_t(x_i) \log D(x_i)}{\sum_i C_t(x_i)} 
% + \frac{1}{N} \sum_i \log \left(1-D(x_i+G(x_i))\right) + \mathrm{Reg}(G, \left\{x_i\right\}),
% \end{equation}
where samples $x_i$ are drawn from the entire data distribution and $\mathrm{Reg}(G, \left\{x_i\right\})$ is analogous to the regularization term introduced previously, with $\alpha$ and $\beta$ weighting the L1 and L2 terms:
\[ \mathrm{Reg}\left(G, \left\{x_i\right\}\right) = \alpha \sum_i \Vert G(x_i)\Vert_1 + \beta \sum_i \Vert G(x_i)\Vert_2^2. \]

% where samples $x_i$ are drawn from the entire data distribution. 
% \begin{proof}
The specific form of this value function is motivated by the resulting convergence properties, formalized by Theorem 1 below. The proof of convergence is provided in the supplementary material. 
% Given sufficient capacity and training time, the generator's output converges to a probability distribution $p_{C_t}$ that balances realism with the desired classification. 
% It is defined by 

% \begin{equation*}
%     p_{C_t}(x) = \mathcal N_t \; C_t(x) \; p_\mathrm{data}(x),
% \end{equation*}

% where $\mathcal N_t= \left(\int C_t(x) \; p_\mathrm{data}(x) \mathrm{d}x\right)^{-1}$ is a normalization constant. 
% To prove the convergence to $p_{C_t}$, we note that the value function in equation \ref{eq_countergan_nondiff} (omitting the regularization term) is the weighted sample estimate of 

% % AAAI version
% \begin{equation}
% \begin{aligned}
%     &\mathcal{V}_{\mathrm{CounteRGAN-wt}}(D, G)=\mathbb{E}_{x \sim p_{C_t}} \log D(x) \\ 
%     &\quad\qquad+ \mathbb{E}_{x \sim p_{\mathrm{data}}} \log \left(1-D(x+G(x))\right).
% \end{aligned}
% \end{equation}


% % arXiv version
% \begin{equation}
%     \mathcal{V}_{\mathrm{CounteRGAN-wt}}(D, G)=\mathbb{E}_{x \sim p_{C_t}} \log D(x)
% + \mathbb{E}_{x \sim p_{\mathrm{data}}} \log \left(1-D(x+G(x))\right).
% \end{equation}


% The latter formulation is equivalent to the value function of the RGAN value function (equation \ref{eq_rgan_generalized}), but with $p_{C_t}$ instead of $p_\mathrm{data}$. The regular GAN was proven to converge to $p_\mathrm{data}$ under generic conditions in \cite{Goodfellow2014-wf} and the proof holds for RGANs as described below.

% Hence, the CounteRGAN-wt converges to $p_{C_t}$ under the same conditions on the training setup. See the supplementary material for an elaborated proof.

% We introduced a variant of the CounteRGAN named \textit{CounteRGAN-wt} that, conversely to the first formulation, is compatible with unknown or non-differentiable target classifiers. Without the regularization term (omitted in the following), the corresponding value function reads
% % AAAI version
% \begin{equation}
% \begin{aligned}
% \label{appendix:eq_countergan_nondiff}
% % \begin{aligned}
%     &\mathcal{V}_{\mathrm{CounteRGAN-wt}}(D, G)=\frac{\sum_i C_t(x_i) \log D(x_i)}{\sum_i C_t(x_i)} \\ 
%     &\quad\qquad+ \frac{1}{N} \sum_i \log \left(1-D(x_i+G(x_i))\right),
% \end{aligned}
% \end{equation}
% where $D$, $G$, and $C_t$ denote the discriminator, generator, and target classifier respectively. $C_t(x_i)$ is the target classifier's score given target class $t$ and data point $x_i$ which is drawn from the available dataset.


% \subsubsection{Proof of convergence}
\begin{theorem} %[Convergence of CounteRGAN-wt to $p_{C_t}$] 
If the discriminator is systematically allowed to reach its optimum, and the generator has sufficient capacity, then the minimax optimization of the value function from Equation~\ref{eq_countergan_nondiff} converges to the Nash equilibrium. The distribution $p_{g_+}$ of the full generator output $x + G(x)$ converges to the distribution $p_{C_t}$ defined by

\begin{equation}
p_{C_t}(x) = \mathcal N_t \; C_t(x) \; p_\mathrm{data}(x),
\end{equation}
\newline
\noindent where $\mathcal N_t$ is a normalization constant.\footnote{Explicitly, $\mathcal N_t= \left(\int C_t(x) \; p_\mathrm{data}(x) \mathrm{d}x\right)^{-1}$, but it does not need to be computed for our purposes.} 
\end{theorem}


% \begin{proof}
% We first introduce the full generator output function $G_+(x) = x + G(x)$, and note that the value function defined by equation \ref{eq_countergan_nondiff} can be written as 
% \begin{equation}
% \label{eq_countergan_nondiff_expectations}
% \begin{aligned}
%     &\mathcal{V}_{\mathrm{CounteRGAN-bb}}(D, G)=\mathbb{E}_{x \sim p_{C_t}} \log D(x) \\ 
%     &\qquad\qquad+ \mathbb{E}_{x \sim p_{g_+}} \log \left(1-D(x)\right),
% \end{aligned}
% \end{equation}
% since the first term on the r.h.s. of Equation \ref{eq_countergan_nondiff} is a weighted sampling estimate of $\mathbb{E}_{x \sim p_{C_t}} \log D(x)$, and for the second term, the equality $\mathbb{E}_{x \sim p_{g_+}} \log \left(1-D(x)\right)=\mathbb{E}_{x \sim p_{\mathrm{data}}} \log \left(1-D(G_+(x))\right)
% $ is a consequence of the Radon–Nikodym theorem. 


% % $$\int  p_\mathrm{data}(x) \log D(G_+(x)) \mathrm{d}x = \int p_\mathrm{g_+}(x) \log D(x)\mathrm{d}x$$

% % $$  \sum_i  \frac{N_i}{N} \log D(G_+(x_i))$$
% %In the following we note $p_{C_t}$ and $p_{g_+}$ as $p$ and $q$ respectively to make notations shorter.

% From the expression of the value function in equation \ref{eq_countergan_nondiff_expectations}, Proposition 1 of Goodfellow et al. \cite{Goodfellow2014-wf} implies that for any generator $G$ the optimal discriminator is 
% \begin{equation}
% D^*(x) = \frac{p_{C_t}(x)}{p_{g_+}(x)+p_{C_t}(x)}.
% \end{equation}

% The value function for an ideal discriminator thus reads:
% \begin{equation}
% \begin{aligned}
%     &\mathcal{V}^*(G) = \mathcal{V}(D^*, G)= \mathbb E_{x\sim p_{C_t}} \log \frac{p_{C_t}(x)}{p_{g_+}(x)+p_{C_t}(x)} \\
%     &\qquad\qquad + \mathbb E_{x\sim p_{g_+}} \log \frac{p_{g_+}(x)}{p_{g_+}(x)+p_{C_t}(x)}.
% \end{aligned}
% \end{equation}

% To find the distribution $p_{g_+}^*$ that minimizes $\mathcal{V}^*$ under the probability normalization constraint, $\int p_{g_+}(x) \mathrm{d}x = 1 $, we introduce a Lagrange multiplier $\mu$. We then compute the functional derivative of $\mathcal{V}^*$ with respect to $p_{g_+}$ using the shortened notation for $p = p_{C_t}(x)$ and $q = p_{g_+}(x)$ in the following equation
% \begin{equation}
% \begin{aligned}
%     \frac{\delta \mathcal{V}^*}{\delta q} & = \frac{\partial}{\partial q}\left[p\log\left(\frac{p}{p+q}\right) + q\log\left(\frac{q}{p+q} \right) + \mu q\right] \\
%     & = \log\left(\frac{q}{p+q}\right) +\mu.
% \end{aligned}
% \end{equation}

% The optimum of $\mathcal{V}^*$ is attained for 
% \begin{equation}
%     \frac{\delta V}{\delta p_{g_+}^*}(x) = 0 \quad \Longleftrightarrow \quad p_{g_+}^*(x) = \frac{p_{C_t}(x)}{\exp(\mu) - 1},
% \end{equation}
% from which the normalization constraint leads to
% \begin{equation}
% \int \frac{p_{C_t}(x)}{\exp(\mu)- 1}\mathrm d x=1 \quad \Longleftrightarrow \quad \exp(\mu)=2,
% \end{equation}
% such that 
% \begin{equation}
% p_{g_+}^*(x) = p_{C_t}(x)
% \end{equation}
% for all $x$. Hence $\mathcal V^*$ has a unique optimum\footnote{The optimum is a minimum here since $\mathcal V^*$ is a convex functional of $p_{g_+}$, as can be seen from the form of the second functional derivative $\frac{\delta^2 V}{(\delta p_{g_+}^*)^2}(x) = \frac{p_{C_t}(x)}{p_{g_+}(x)(p_{g_+}(x)+p_{C_t}(x))}$, which is always positive.} that is reached when 
% \begin{equation}
%  p_{g_+}^* = p_{C_t}.
% \end{equation}

% The fact that $p_{g_+}$ converges to the optimum when using the alternating gradient updates follows from Proposition 2 in \cite{Goodfellow2014-wf}.
% \end{proof}

% \footnote{Namely, the discriminator and the generator have enough capacity and, after each update to the generator, the discriminator is updated until it reaches its optimum.}

% The proof relies on the observation that the first term of the value function in equation \ref{eq_countergan_nondiff} is a weighted sampling estimate of $\mathbb{E}_{x \sim p_{C_t}} \log D(x)$, which is the same as the first term of the regular GAN value function, but with $p_{C_t}$ instead of $p_\mathrm{data}$. Since the regular GAN converges to $p_\mathrm{data}$ under generic conditions (\cite{Goodfellow2014-wf}), the CounteRGAN-wt converges to $p_{C_t}$ under the same conditions (see suplementary material for a detailed proof).


% \begin{equation}
% \begin{aligned}
%     \mathcal{V}_{\mathrm{CounteRGAN}}^*(D, G)=&\mathbb{E}_{x \sim p_{c}} \log D(x) \\ 
%     &+ \mathbb{E}_{x \sim p_{\mathrm{data}}} \log \left(1-D(x+G(x))\right),
% \end{aligned}
% \end{equation}



%The specific form of this value function is motivated by the convergence properties of the generator's output to maximize the classifier prediction while still retaining realism and actionability. The proof of convergence is provided in the supplementary material.




% We introduce a second formulation of the CounteRGAN that does not require the classifier $C$ to be differentiable, conversely to the first approach and to most existing methods for counterfactual search. The corresponding value function is similar to the value function of the RGAN (equation \ref{eq_rgan_specific}), but uses samples:

% Keep for the appendix convergence proof
% \begin{equation}
% \label{eq_countergan_nondiff}
% \begin{aligned}
%     \mathcal{V}_{\mathrm{CounteRGAN}}^*(D, G)=&\mathbb{E}_{x \sim p_{c}} \log D(x) \\ 
%     &+ \mathbb{E}_{x \sim p_{\mathrm{data}}} \log \left(1-D(x+G(x))\right),
% \end{aligned}
% \end{equation}
% where $p_c$ denotes the likelihood of belonging to the data distribution and being classified as $t$ by the classifier:  $$p_c(x) = \mathcal N_t C_t(x) p_\mathrm{data}(x)$$
% with $\mathcal N_t$ being a normalization constant. 

% This formulation of the value function only affects the first term of training of the discriminator compared to regular GANs. Intuitively, drawing real samples from $p_c$ teaches the discriminator to distinguish real samples as $t$, thereby teaching the generator to synthetize samples that are realistic and likely to be classified as $t$. A proof that with this formulation the generator's output distribution converges to $p_c$ given sufficient capacity and training time, is provided in the appendix \ref{sec:appendix:convergence_proof}. 

% In practice, we use weighted sampling to compute the expectation under $p_c$ using the available samples from $p_\mathrm{data}$:
% \begin{equation}
% \begin{aligned}
%     \mathcal{V}_{\mathrm{CounteRGAN}}^*(D, G)=&\frac{\sum_i C_t(x_i) \log D(x_i)}{\sum_i C_t(x_i)} \\ 
%     &+ \frac{1}{N} \sum_i \log \left(1-D(x_i+G(x_i))\right),
% \end{aligned}
% \end{equation}
% where samples $x_i$ are drawn from the entire dataset.

Using either value function variant, the CounteRGAN discriminator learns to discriminate between real and synthetic data points, while the generator aims to balance the desired classification with realism and sparsity (actionability) constraints. As a result, the generator learns to produce residuals that, when added to the input, yield realistic and sparse counterfactuals whose classifier score $C_t$ for the desired class is as close to 1 as possible. Once trained, the generator can produce counterfactuals quickly via a single forward pass through the neural network.

% recursive counterfactual generation





% \begin{figure*}[htb!]
%     \small
%      \centering
%      \begin{subfigure}[t]{0.16\textwidth}
%         %  \centering
%   \includegraphics[width=1.0\columnwidth]{figures/toy_dataset.png} 
%          \caption{Dataset.}
%          \label{fig:toy_dataset_distr}
%      \end{subfigure}
%      \begin{subfigure}[t]{0.16\textwidth}
%         %  \centering
%   \includegraphics[width=1.0\columnwidth]{figures/toy_dataset_classifier.png} 
%          \caption{Decision boundary of trained classifier.}
%          \label{fig:toy_dataset_clf}
%      \end{subfigure}
%     %  \hfill
%      \begin{subfigure}[t]{0.16\textwidth}
%         %  \centering
%   \includegraphics[width=1.0\columnwidth]{figures/toy_dataset_classifier_and_samples.png} 
%          \caption{Data points for counterfactuals search.}
%          \label{fig:toy_dataset_samples}
%      \end{subfigure}
%      \begin{subfigure}[t]{0.167\textwidth}
%         %  \centering
%   \includegraphics[width=0.94\columnwidth]{figures/toy_dataset_vanilla.png} 
%          \caption{Regularized gradient descent (RGD).}
%          \label{fig:toy_dataset_vgd}
%      \end{subfigure}
%     %  \hfill
%      \begin{subfigure}[t]{0.165\textwidth}
%         %  \centering
% \includegraphics[width=0.96\columnwidth]{figures/toy_dataset_regular_gan.png}
%          \caption{Standard GAN.}
%          \label{fig:toy_dataset_regular_gan}
%      \end{subfigure}
%     %  \hfill
%      \begin{subfigure}[t]{0.165\textwidth}
%         %  \centering
% \includegraphics[width=0.96\columnwidth]{figures/toy_dataset_countergan.png}
%          \caption{CounterGAN.}
%          \label{fig:toy_dataset_countergan}
%      \end{subfigure}
%         \caption{Comparing how three different counterfactual search techniques are able to achieve their objectives while producing significantly different counterfactuals on a synthetic and binary class dataset.}
%         \label{fig:toy_dataset}
% \end{figure*}

% \begin{figure*}[htb!]
%      \centering
%      \begin{subfigure}[t]{0.33\textwidth}
%          \centering
%   \includegraphics[width=0.6\columnwidth]{figures/toy_dataset.png} 
%          \caption{Original distribution of data points.}
%          \label{fig:toy_dataset_distr}
%      \end{subfigure}
%      \begin{subfigure}[t]{0.33\textwidth}
%          \centering
%   \includegraphics[width=0.6\columnwidth]{figures/toy_dataset_classifier.png} 
%          \caption{Decision boundary of trained classifier.}
%          \label{fig:toy_dataset_clf}
%      \end{subfigure}
%      \hfill
%      \begin{subfigure}[t]{0.33\textwidth}
%          \centering
%   \includegraphics[width=0.6\columnwidth]{figures/toy_dataset_classifier_and_samples.png} 
%          \caption{Data points for counterfactuals search.}
%          \label{fig:toy_dataset_samples}
%      \end{subfigure}
%      \begin{subfigure}[t]{0.33\textwidth}
%          \centering
%   \includegraphics[width=0.6\columnwidth]{figures/toy_dataset_vanilla.png} 
%          \caption{Regularized gradient descent (RGD).}
%          \label{fig:toy_dataset_vgd}
%      \end{subfigure}
%      \hfill
%      \begin{subfigure}[t]{0.33\textwidth}
%          \centering
% \includegraphics[width=0.6\columnwidth]{figures/toy_dataset_regular_gan.png}
%          \caption{Standard GAN.}
%          \label{fig:toy_dataset_regular_gan}
%      \end{subfigure}
%      \hfill
%      \begin{subfigure}[t]{0.33\textwidth}
%          \centering
% \includegraphics[width=0.6\columnwidth]{figures/toy_dataset_countergan.png}
%          \caption{CounterGAN.}
%          \label{fig:toy_dataset_countergan}
%      \end{subfigure}
%         \caption{Comparing how three different counterfactual search techniques are able to achieve their objectives while producing significantly different counterfactuals on a synthetic and binary class dataset.}
%         \label{fig:toy_dataset}
% \end{figure*}

% \subsection{Synthetic dataset example}

% Figure \ref{fig:toy_dataset} provides an example of counterfactual search using a synthetic dataset meant to illustrate the challenges faced by counterfactual generation methods. The data points shown in (a) can be interpreted as the known populations from two different societies (red/blue). An ML classifier has been trained to predict the type of society a person belongs to based on their weight (x-axis) and height (y-axis). The solid white line in (b) represents the classifier's decision boundary such that all predictions for points falling within the red shaded region are classified as persons belonging to the red society and vice-versa. The five selected orange points in (c) represent persons from the red society we seek to provide counterfactuals for. These counterfactuals should provide meaningful recourse regarding how to turn themselves into realistic looking persons of the blue society, as predicted by the classifier. The counterfactuals generated by an existing method (d) produce the correct classification result (blue) but the suggested changes would mean that the transformed individuals would not look like the rest of the known populace of the blue society (lack of realism). Using a standard GAN, the counterfactuals always result in the same or similar looking persons of the blue society. While these results are more realistic than those obtained with the previous method, the suggested changes may be harder to apply to some original persons than others (i.e., lower sparsity) and hence less actionable. The proposed CounteRGAN method (f) results in counterfactuals that are of the desired classification (blue) and are most realistic and actionable than those obtained with previous methods. Red society members seeking to imperceptibly infiltrate the blue society would benefit the most from the meaningful recourse provided by this method.


\section{Experiments}
\label{sec:experiments}
\begin{figure*}[hbt!]
     \centering
     \begin{subfigure}[t]{0.19\textwidth}
         \centering
         \caption{RGD}
   \includegraphics[width=0.75\columnwidth]{figures/mnist_vanilla.png} 
         \label{fig:mnist_vgd}
     \end{subfigure}
     \hfill
     \begin{subfigure}[t]{0.19\textwidth}
         \centering
         \caption{CSGP}
\includegraphics[width=0.75\columnwidth]{figures/mnist_prototypes.png}
         \label{fig:mnist_csgp}
     \end{subfigure}
     \hfill
     \begin{subfigure}[t]{0.19\textwidth}
         \centering
         \caption{Standard GAN}
\includegraphics[width=0.75\columnwidth]{figures/mnist_reg_gan.png}
         \label{fig:mnist_gan}
     \end{subfigure}
          \hfill
     \begin{subfigure}[t]{0.19\textwidth}
         \centering
         \caption{CounteRGAN}
\includegraphics[width=0.75\columnwidth]{figures/mnist_countergan.png}
         \label{fig:mnist_countergan}
     \end{subfigure}
          \hfill
     \begin{subfigure}[t]{0.19\textwidth}
         \centering
         \caption{CounteRGAN-bb}
\includegraphics[width=0.75\columnwidth]{figures/mnist_weighted_rgan.png}
         \label{fig:mnist_countergan_star}
     \end{subfigure}
        \caption{Comparison of counterfactual examples produced by different methods on MNIST. Given two separate digit images (7 and 9), each method is tasked with producing counterfactuals that the classifier will predict as a ``4''. The first row shows the original input image. The second row highlights the perturbations that the counterfactual produces (residuals in the case of CounteRGAN). Negative perturbation values are black, positive values are white, and null or zero values are gray. The third row shows the final counterfactual obtained by adding the perturbations to the input. The fourth and final row displays the autoencoder reconstruction error with brighter points representing less realism. Existing methods (a) and (b) result in less realistic counterfactuals. Method (c) lacks realism as well as actionability due to mode collapse. The CounteRGAN methods (d) and (e) (black-box) result in the most realistic counterfactuals.}
        \label{fig:mnist_examples}
\end{figure*}


% \renewcommand{\arraystretch}{1.8}
% \begin{table*}[hbt!]
% \small
% \centering
% \begin{tabular}{l|c|c|c|c|c}
%              & RGD           & CSGP          & GAN & CounteRGAN    & CounteRGAN-wt \\
% \hline
% $\uparrow$ Prediction gain & \textbf{0.83 $\pm$ 0.01} & 0.43 $\pm$ 0.00 & 0.69 $\pm$ 0.01 & 0.80 $\pm$ 0.01 & \textbf{0.85 $\pm$ 0.01} \\
% $\downarrow$  Realism    & 4.56 $\pm$ 0.01   & 4.58 $\pm$ 0.01   & 4.50 $\pm$ 0.00             & \textbf{3.95 $\pm$ 0.01}   & 4.37 $\pm$ 0.01 \\
% $\downarrow$  Actionability & \textbf{20.63 $\pm$ 0.41} & 54.24 $\pm$ 0.60 & 151.98 $\pm$ 0.43          & 79.47 $\pm$ 0.47 & 72.99 $\pm$ 0.52 \\
% $\downarrow$  Latency (ms)  & 4,129.57 $\pm$ 3.33     & 5,359.58 $\pm$ 2.72    & \textbf{13.05 $\pm$ 0.04} & 13.33 $\pm$ 0.04          &  13.52 $\pm$ 0.04 \\
% $\downarrow$  Batch latency (seconds) & 4,129,570    & 5,359,580    & \textbf{45} & \textbf{45} &  \textbf{45}
% \end{tabular}
% \caption{MNIST test data results (mean and 95 \% confidence interval). The arrows indicate whether larger $\uparrow$ or lower $\downarrow$ values are better, and the best results are in bold. The realism metric typically ranges from 3.89 (mean reconstruction error on the test set) to 11.99 (reconstruction error random uniform noise $[0, 1]$). Computations are performed using 1000 samples from the test set.}
% \label{table:mnist_metrics}
% \end{table*}


% \renewcommand{\arraystretch}{1.8}
% \begin{table*}[hbt!]
% \small
% \centering

% \begin{tabular}{l|c|c|c|c|c}
% \toprule
% {} &                    RGD &                    CSGP &                 GAN &          CounterGAN &     CounterRGAN$^*$ \\
% \midrule
% \hline
% $\uparrow$ Prediction gain &      \textbf{0.202 $\pm$ 0.038} &       0.148 $\pm$ 0.023 &   \textbf{0.186 $\pm$ 0.021} &   \textbf{0.192 $\pm$ 0.022} &   \textbf{0.184 $\pm$ 0.022} \\
% $\downarrow$  Realism      &      0.070 $\pm$ 0.022 &       \textbf{0.050 $\pm$ 0.015} &   \textbf{0.040 $\pm$ 0.008} &   \textbf{0.035 $\pm$ 0.006} &   0.061 $\pm$ 0.013 \\
% $\downarrow$  Sparsity     &      1.930 $\pm$ 0.323 &       \textbf{1.124 $\pm$ 0.184} &   2.134 $\pm$ 0.072 &   2.157 $\pm$ 0.084 &   2.895 $\pm$ 0.099 \\
% $\downarrow$  Latency (ms) &  1,228.049 $\pm$ 0.015 &  17,489.839 $\pm$ 0.160 &  \textbf{39.521 $\pm$ 0.001} &  41.588 $\pm$ 0.004 &  41.538 $\pm$ 0.004 \\
% $\downarrow$  Batch latency (seconds) & 189.11 & 2,693.31 & 0.011 & 0.012 & 0.012   
% \bottomrule
% \end{tabular}




% \caption{Diabetes test data results (averages and standard deviations). For each metric, the arrow indicates whether larger $\uparrow$ or lower $\downarrow$ values are better, and the best model is in bold. The realism metric typically ranges from 0.050  (average reconstruction error of the autoencoder on the test set) to 2.759 (reconstruction error of noise uniformly sampled from $[-10, 10]$). Computations are performed using all 154 samples from the test set.}
% \label{table:diabetes_metrics}
% \end{table*}


% \renewcommand{\arraystretch}{1.8}
% \begin{table*}[hbt!]
% \small
% \centering
% \begin{tabular}{l|c|c|c|c|c}
% {} &                    RGD &                        CSGP &                          GAN &                  CounterGAN &     CounterRGAN-wt \\
% \hline
% $\uparrow$ Prediction gain &      0.15 $\pm$ 0.01 &           0.13 $\pm$ 0.02 &            0.15 $\pm$ 0.03 &  \textbf{0.33 $\pm$ 0.04} &   0.16 $\pm$ 0.02 \\
% $\downarrow$  Realism      &      2.20 $\pm$ 0.24 &           2.03 $\pm$ 0.11 &            3.33 $\pm$ 0.11 &  \textbf{1.79 $\pm$ 0.11} &   2.13 $\pm$ 0.12 \\
% $\downarrow$  Actionability  &      1.64 $\pm$ 0.20 &  \textbf{1.14 $\pm$ 0.19} &            9.46 $\pm$ 0.53 &           6.91 $\pm$ 0.43 &   2.97 $\pm$ 0.12  \\
% $\downarrow$  Latency (ms) &  1,195.91 $\pm$ 5.65 &       3,211.67 $\pm$ 11.65 &  1.68 $\pm$ 0.06 &          \textbf{1.51 $\pm$ 0.03} &  1.82 $\pm$ 0.12 \\
% $\downarrow$  Batch latency (seconds) & 204.58 & 483.88 & 0.26 & \textbf{0.23} & 0.32
% \end{tabular}
% \caption{Diabetes test data results (mean and 95 \% confidence interval). The arrows indicate whether larger $\uparrow$ or lower $\downarrow$ values are better, and the best results are in bold. The realism metric typically ranges from 1.844  (mean reconstruction error on the test set) to 2.443 (reconstruction error on random Gaussian noise). Computations are performed using the entire test set (154 samples).}
% \label{table:diabetes_metrics}
% \end{table*}

% We utilize a dataset from one technique and the other dataset from the other. plus the same model architectures to ensure a fair evaluation.
% This section presents the methodology and results of the three experiments used to evaluate our proposed CounteRGAN method. 
\noindent We compare the proposed CounteRGAN approach against two state-of-the-art counterfactual search methods \cite{Wachter2017-jr, Van_Looveren2019-hr}. As far as possible, our experiments mirror the experimental setups used in those proposals including the evaluation datasets and model architectures. 
With the exception of \cite{Wachter2017-jr} which is a foundational work and conventional baseline, other counterfactual search methods mentioned in the Related Work are not included either because they do not address realism \cite{Mothilal2020-ge, Laugel2017-tp} or because their latency is prohibitive for real-time applications \cite{Poyiadzi2019-rr}.
% To account for possible real world scenarios, we perform our experiments assuming the target classifier is a white-box model, where the classifier's gradients are known to the counterfactual search methods, as well as a black-box model where the methods do not have access to the gradients.
% To ensure fairness, the datasets and architectures used are taken directly from the two methods we compare against \cite{Wachter2017-jr, Van_Looveren2019-hr}. 
The first experiment is conducted using the MNIST handwritten digit dataset \cite{lecun-mnisthandwrittendigit-2010}, which lends itself to visually illustrating each method's approach. The second experiment makes use of the COMPAS recidivism dataset \cite{Compas} to highlight how meaningful counterfactuals can be helpful for improving model interpretability and fairness. A third experiment, presented in the supplementary material, uses an Indian diabetes dataset \cite{diabetes_dataset} and helps to demonstrate that the CounteRGAN is also effective on tabular data when some of the features are immutable. 

\paragraph{Methods}
Given an input data point $x_i$, all methods described below aim to produce a counterfactual $x^\text{cf}_i$ that a target classifier $C$ will predict as the desired class.

\begin{itemize}
    \item \textit{Regularized Gradient Descent (RGD)}: a gradient descent based counterfactual search \cite{Wachter2017-jr} that minimizes the sums of the squared differences between the desired outcome and the counterfactual. A regularization term is used to enforce sparsity.\footnote{For this method and the next, we use the implementations (including gradient approximating versions for black-box models) provided by \url{https://github.com/SeldonIO/alibi}.}
    \item \textit{Counterfactual Search Guided by Prototypes (CSGP)}: this method \cite{Van_Looveren2019-hr} extends RGD by using class prototypes to push the counterfactual towards a more realistic data point of the desired class. The value function is modified to include a distance measure from the counterfactual to the class prototype in latent space ($L_\text{proto}$).
    \item \textit{Standard GAN (GAN)}: This method applies a standard GAN \cite{Goodfellow2014-wf}, in conjunction with the target classifier $C$. The generator is modified to use real data points as input (as opposed to random latent variables) and synthesize complete counterfactuals.
    \item \textit{CounteRGAN}: The proposed method from section \ref{sec:countergan} that uses the specialized RGAN together with the target classifier $C$. The value function from Equation \ref{eq_countergan_diff} is used when $C$ is a white-box model (i.e., known gradients) and Equation \ref{eq_countergan_nondiff} is used when $C$ is a black-box model (i.e., unknown or undefined gradients).
    % \item \textit{CounteRGAN-wt}: Similar to the method above except that this variant uses the weighted value function (see Equation \ref{eq_countergan_nondiff}) which is useful for non-differentiable or unknown (black-box) models.
\end{itemize}


\renewcommand{\arraystretch}{1.4}
\begin{table}
\centering
\small
% \scriptsize
\begin{tabular}{ c | c  }
Metric & Formula \\
 \hline
 Counterfactual prediction gain & $\mathbb{E}\left[ C(x^{\text{cf}}_i) - C(x_i)\right]$ \\  
 Realism & $\mathbb{E}\left[  \left\Vert\text{AE}\left( x^{\text{cf}}_i \right) - x^{\text{cf}}_i\right\Vert_2^2\right]$ \\ 
 Actionability (Sparsity \& proximity) & $\mathbb{E}\left[ \left\Vert x^{\text{cf}}_i - x_i\right\Vert_1 \right]$\\
 Latency & $\mathbb{E}\left[\delta t_i \right]$\\
\end{tabular}
\medskip
\caption{Evaluation metrics summary. $C$ is the target classifier and $x_i$ denotes the data point for which a counterfactual ($x_i^\text{cf}$) is sought. An autoencoder ($\text{AE}$) is used to reconstruct $x_i^\text{cf}$. Expectations are computed over the test sets.}
\label{table:eval_metrics}
\end{table}

\paragraph{Evaluation metrics}
% \renewcommand{\arraystretch}{1.8}
% \begin{table}
% \centering
% \begin{tabular}{ c | c  }
% Metric & Formula \\
%  \hline
%  Counterfactual prediction gain & $\mathbb{E}\left[ C(x^{\text{cf}}_i) - C(x_i)\right]$ \\  
%  Realism & $\mathbb{E}\left[  \left\Vert\text{AE}\left( x^{\text{cf}}_i \right) - x^{\text{cf}}_i\right\Vert_2^2\right]$ \\ 
%  Actionability (Sparsity \& proximity) & $\mathbb{E}\left[ \left\Vert x^{\text{cf}}_i - x_i\right\Vert_1 \right]$\\
%  Latency & $\mathbb{E}\left[\delta t_i \right]$\\
% \end{tabular}
% \caption{Evaluation metrics summary. Where $C$ and $x_i$ denote the classifier and original data point for which a counterfactual ($x_i^\text{cf}$) is sought. The autoencoder whose reconstruction error serves as a proxy for realism is denoted as $\text{AE}$. The expectations are computed over the test datasets.}
% \label{table:eval_metrics}
% \end{table}



\renewcommand{\arraystretch}{1.6}
\begin{table*}[hbt!]
% \tiny
\scriptsize
\centering
\begin{tabular}{l||c|c|c|c||c|c|c}
 \multirow{2}{*}{} &           \multicolumn{4}{c||}{White-box classifier} & \multicolumn{3}{c}{Black-box classifier} \\ 
&                    RGD &                        CSGP &                          GAN &                  CounteRGAN &    RGD & CSGP & CounteRGAN \\
\hline
$\uparrow$ Prediction gain & \textbf{0.83 $\pm$ 0.01} & 0.43 $\pm$ 0.00 & 0.69 $\pm$ 0.01 & 0.80 $\pm$ 0.01 & 0.45 $\pm$ 0.01 & 0.41 $\pm$ 0.00 & \textbf{0.85 $\pm$ 0.01} \\
$\downarrow$  Realism    & 4.56 $\pm$ 0.01   & 4.58 $\pm$ 0.01   & 4.50 $\pm$ 0.00             & \textbf{3.95 $\pm$ 0.01} & 3.94 $\pm$ 0.01 &  \textbf{3.58 $\pm$ 0.01} & 4.37 $\pm$ 0.01 \\
$\downarrow$  Actionability & \textbf{20.63 $\pm$ 0.41} & 54.24 $\pm$ 0.60 & 151.98 $\pm$ 0.43          & 79.47 $\pm$ 0.47 & \textbf{31.86 $\pm$ 0.61} & 48.79 $\pm$ 1.69 & 72.99 $\pm$ 0.52 \\
$\downarrow$  Latency (ms)  & 4,129.57 $\pm$ 3.33     & 5,359.58 $\pm$ 2.72    & \textbf{13.05 $\pm$ 0.04} & 13.33 $\pm$ 0.04     & 8,464.10 $\pm$ 42.54 &   30,235.47 $\pm$ 553.47  &  \textbf{13.52 $\pm$ 0.04} \\
$\downarrow$  Batch latency (s) & 4,129,570    & 5,359,580    & \textbf{45} & \textbf{45} & 84,641,012 & 302,354,681 &  \textbf{45} \\
\end{tabular}
 \medskip
\caption{MNIST test data results (mean and 95\% confidence interval). The arrows indicate whether larger $\uparrow$ or lower $\downarrow$ values are better, and the best results are in bold. The realism metric typically ranges from 3.89 (mean reconstruction error on the test set) to 11.99 (reconstruction error of random uniform noise in $[0, 1]$). Computations are performed using the entire test set (10,000 samples).}
\label{table:mnist_metrics}
\end{table*}

To evaluate the relative performance of the methods, we identify four desirable properties of counterfactual generation and propose the corresponding metrics detailed below (see Table \ref{table:eval_metrics} for a summary). These metrics are based on those found in related work and we have also added established measures of realism and actionability. All metric results from the experiments, except for batch latency, are based on averages of individually computed counterfactuals using the test data. Batch latency is the total computation time necessary to produce counterfactuals for an entire batch. Each table presents the results of the methods assuming that the target classifier's gradients are known (white-box model) or unknown (black-box model).
% Additional experimental setup details, including model architectures and training parameters are elaborated in the supplementary material.
\begin{itemize}
    \item \textit{Prediction gain}: the difference between the classifier's prediction on the counterfactual ($C_t(x^{\text{cf}}_i)$) and the input data point ($C_t(x_i)$), for the target class $t$. Since the maximum score classifier $C$ can predict is 1, the range for prediction gain is $[0,1]$ with higher gain indicating more improvement.
    \item \textit{Realism}: a measure of how well a counterfactual ``fits in'' with the known data distribution. We adopt a strategy inspired by \cite{Van_Looveren2019-hr, Dhurandhar2018-hk}, in which we train a denoising autoencoder $\text{AE}\left( \cdot\right)$ on the training set and use the L2 norm of the reconstruction error as a measure of realism. A lower value represents higher realism.
    \item \textit{Actionability (sparsity \& proximity)}: Sparsity and proximity are commonly used, though imperfect, proxies for true actionability \cite{Mothilal2020-ge}, which is inherently difficult to quantify. We follow existing methods that measure actionability as a function of the number and magnitude of perturbations present in the counterfactual ($x^{\text{cf}}_i$) relative to the input data point ($x_i$) using the L1 norm. Lower values indicate fewer changes and therefore a higher degree of actionable feedback. Weighting the sparsity penalty according to the degree of feature mutability could be promising for future work. In the supplemental materials, an experiment using the Pima Indians Diabetes dataset \cite{diabetes_dataset} is provided as an example.
    \item \textit{Latency}: the computational latency needed to generate counterfactuals. Individual counterfactual computations can impact real-time applicability. Batch results are useful to highlight scalability limitations, since large numbers of counterfactuals may need to be generated without real-time constraints but within practical latency and cost budgets. Lower values are better, and sub-second latencies are necessary for real-time applicability.

\end{itemize}

% \footnote{Note that we don't use an equivalent of the popular Inception Score \cite{Salimans2016-wx} for realism evaluation. This is because the Inception Score is partly a measure of diversity, but it does not take into account intra-class diversity (see \cite{Barratt2018-sg}) and the latter is of utmost relevance for counterfactual generation.}
% \begin{figure*}[hbt!]
%      \centering
%      \begin{subfigure}[t]{0.19\textwidth}
%          \centering
%          \caption{RGD}
%   \includegraphics[width=0.9\columnwidth]{figures/mnist_vanilla.png} 
%          \label{fig:mnist_vgd}
%      \end{subfigure}
%      \hfill
%      \begin{subfigure}[t]{0.19\textwidth}
%          \centering
%          \caption{CSGP}
% \includegraphics[width=0.9\columnwidth]{figures/mnist_prototypes.png}
%          \label{fig:mnist_csgp}
%      \end{subfigure}
%      \hfill
%      \begin{subfigure}[t]{0.19\textwidth}
%          \centering
%          \caption{Standard GAN}
% \includegraphics[width=0.9\columnwidth]{figures/mnist_reg_gan.png}
%          \label{fig:mnist_gan}
%      \end{subfigure}
%           \hfill
%      \begin{subfigure}[t]{0.19\textwidth}
%          \centering
%          \caption{CounterGAN}
% \includegraphics[width=0.9\columnwidth]{figures/mnist_countergan.png}
%          \label{fig:mnist_countergan}
%      \end{subfigure}
%           \hfill
%      \begin{subfigure}[t]{0.19\textwidth}
%          \centering
%          \caption{CounterGAN-wt}
% \includegraphics[width=0.9\columnwidth]{figures/mnist_weighted_rgan.png}
%          \label{fig:mnist_countergan_star}
%      \end{subfigure}
%         \caption{Comparison of counterfactual examples produced by different methods on MNIST. Given two separate digit images (7 and 9), each method is tasked with producing counterfactuals that the classifier will predict as a "4". The first row shows the original input data point. The second row highlights the perturbations that the counterfactual produces (residuals in the case of CounteRGAN). Perturbation values correspond to color (black is negative, white is positive, and grey is zero). The third row shows the final counterfactual produced after adding the input with the perturbations. The fourth and final row displays the autoencode reconstruction error (brighter points can be viewed as lacking realism). Existing methods (a) and (b) result in unrealistic counterfactuals. Method (c) lacks realism as well as actionability due to mode collapse. The CounteRGAN methods (d) and (e) result in the most realistic counterfactuals while still being diverse.}
%         \label{fig:mnist_examples}
% \end{figure*}



% \vspace{-0.28cm}
\subsection{Experiment using MNIST image dataset}
MNIST consists of 70,000 images of handwritten digits ($28 \times 28$ black and white pixels, which we normalize to have values between 0 and 1) with equal numbers of samples for each digit class. The images are split for training and testing with 60,000 and 10,000 samples, respectively, both of which are balanced in terms of labels.

A convolutional neural network (CNN) is used as the target classifier, which is trained to correctly classify the digits (98.6\% accuracy on the test set). In addition to the classifier, we train a denoising convolutional autoencoder that is used to gauge counterfactual realism. Each method is tasked with generating counterfactuals that the classifier should predict as a ``4'' digit. All results are based on the averages from generating counterfactuals for all of the 10,000 samples from the test set.
% Additional architectural and training details are elaborated in the supplementary material.


% \setlength{\tabcolsep}{20pt}
% \renewcommand{\arraystretch}{1.8}
% \begin{table*}[hbt!]
% \small
% \centering
% \begin{tabular}{l|c|c|c|c|c}
%              & RGD           & CSGP          & GAN & CounteRGAN    & CounteRGAN-wt \\
% \hline
% $\uparrow$ Prediction gain & \textbf{0.83 $\pm$ 0.01} & 0.43 $\pm$ 0.00 & 0.69 $\pm$ 0.01 & 0.80 $\pm$ 0.01 & \textbf{0.85 $\pm$ 0.01} \\
% $\downarrow$  Realism    & 4.56 $\pm$ 0.01   & 4.58 $\pm$ 0.01   & 4.50 $\pm$ 0.00             & \textbf{3.95 $\pm$ 0.01}   & 4.37 $\pm$ 0.01 \\
% $\downarrow$  Sparsity  & \textbf{20.63 $\pm$ 0.41} & 54.24 $\pm$ 0.60 & 151.98 $\pm$ 0.43          & 79.47 $\pm$ 0.47 & 72.99 $\pm$ 0.52 \\
% $\downarrow$  Latency (ms)  & 4,129.57 $\pm$ 3.33     & 5,359.58 $\pm$ 2.72    & \textbf{13.05 $\pm$ 0.04} & 13.33 $\pm$ 0.04          &  13.52 $\pm$ 0.04 \\
% $\downarrow$  Batch latency (seconds) & 4,129,570    & 5,359,580    & \textbf{45} & \textbf{45} &  \textbf{45}
% \end{tabular}
% \caption{MNIST test data results (mean and 95 \% confidence interval). For each metric, the arrow indicates whether larger $\uparrow$ or lower $\downarrow$ values are better, and the best model is in bold. The realism metric typically ranges from 3.89 (average reconstruction error of the autoencoder on the test set) to 11.99 (reconstruction error of noise uniformly sampled from $[0, 1]$). Computations are performed using 1000 samples from the test set.}
% \label{table:mnist_metrics}
% \end{table*}



Examples of counterfactuals for two digits are shown in Figure~\ref{fig:mnist_examples}. All methods succeed in producing counterfactuals that the classifier labels as ``4'', with predicted probabilities ranging from 0.55 to 1. In RGD (Figure~\ref{fig:mnist_vgd}), counterfactuals resemble adversarial attacks in that they are composed of subtle perturbations that lead to the desired classification, but are highly unrealistic. The CSGP algorithm (Figure~\ref{fig:mnist_csgp}) seems to perform better visually, affecting relevant pixels to turn the digits into the desired ``4'' but still lacks realism. The counterfactual search with a regular GAN (Figure~\ref{fig:mnist_gan}) saliently exhibits mode collapse. Without the residual formulation, the generator simply learns to generate the same image regardless of the input. The two CounteRGAN formulations (Figures~\ref{fig:mnist_countergan} and~\ref{fig:mnist_countergan_star}) output visually convincing counterfactuals, as corroborated by the large classifier scores (0.97 to 1) and low autoencoder reconstruction errors.

The complete metrics results for the MNIST dataset are presented in Table~\ref{table:mnist_metrics}. While all methods largely increase the prediction of the target class, CSGP is noticeably less impactful. The RGD method outputs sparser counterfactuals at the significant cost of realism. The two CounteRGAN variants, by contrast, generate the most realistic counterfactuals with high actionability and prediction gain. Notably, the GAN and proposed CounteRGAN approaches also achieve $>$300x and $>$600x latency improvements over existing methods when generating single counterfactuals on white-box and black-box classifiers, respectively. On a batch of the full 10,000 samples from the test set, the GAN-based methods achieve an impressive 5 to 7 orders of magnitude improvement.

% Although we have not included additional image datasets in our experiments, the same methodology can be applied to large-scale image datasets such as ImageNet \cite{deng2009imagenet} or CelebA \cite{liu2015faceattributes} with no modification. While the CounteRGAN setup and training principles would remain the same, the extension to larger datasets would require more complex generator and discriminator architectures, and a more extensive optimization of the training parameters. Because this work focuses on actionable feedback rather than the generation of realistic images, we leave applications of the CounteRGAN to large-scale datasets for future work and turn our attention to tabular data in the following. 


% This is due to the generator only needing a forward pass through the neural network to produce a counterfactual instead of an recomputing the counterfactual search for every data point. As the 


% \renewcommand{\arraystretch}{1.6}
% \begin{table*}[hbt!]
% \scriptsize
% \centering
% \begin{tabular}{l||c|c|c|c||c|c|c}
%  \multirow{2}{*}{} &           \multicolumn{4}{c||}{White-box classifier} & \multicolumn{3}{c}{Black-box classifier} \\ 
% &                    RGD &                        CSGP &                          GAN &                  CounteRGAN &    RGD & CSGP & CounteRGAN \\
% \hline
% $\uparrow$ Prediction gain &      0.15 $\pm$ 0.01 &           0.13 $\pm$ 0.02 &            0.15 $\pm$ 0.03 &  \textbf{0.33 $\pm$ 0.04} &  \textbf{0.17 $\pm$ 0.00} & 0.13 $\pm$ 0.00 & \textbf{0.16 $\pm$ 0.02} \\
% $\downarrow$  Realism      &      2.20 $\pm$ 0.24 &           2.03 $\pm$ 0.11 &            3.33 $\pm$ 0.11 &  \textbf{1.79 $\pm$ 0.11} &  2.22 $\pm$ 0.01  & \textbf{1.98 $\pm$ 0.01} & 2.13 $\pm$ 0.12 \\
% $\downarrow$  Actionability  &      1.64 $\pm$ 0.20 &  \textbf{1.14 $\pm$ 0.19} &            9.46 $\pm$ 0.53 &           6.91 $\pm$ 0.43 & 1.75 $\pm$ 0.02  & \textbf{1.29 $\pm$ 0.02} & 2.97 $\pm$ 0.12  \\
% $\downarrow$  Latency (ms) &  1,195.91 $\pm$ 5.65 &       3,211.67 $\pm$ 11.65 &  1.68 $\pm$ 0.06 &          \textbf{1.51 $\pm$ 0.03} & 2,525.99 $\pm$ 1.23  & 15,921 $\pm$ 23.66 &  \textbf{1.82 $\pm$ 0.12} \\
% $\downarrow$  Batch latency (seconds) & 204.58 & 483.88 & 0.26 & \textbf{0.23} & 453.45  & 2,228.23 & \textbf{0.32}
% \end{tabular}
% \medskip
% \caption{Diabetes test data results (mean and 95\% confidence interval). The arrows indicate whether larger $\uparrow$ or lower $\downarrow$ values are better, and the best results are in bold. The realism metric typically ranges from 1.844  (mean reconstruction error on the test set) to 2.443 (reconstruction error on random Gaussian noise). Computations are performed using the entire test set (154 samples).}
% \label{table:diabetes_metrics}
% \end{table*}

% \vspace{-0.28cm}
% \subsection{Second experiment: Pima Indians Diabetes dataset}
% Following the experiments in Wachter et al. \cite{Wachter2017-jr}, we utilize the Pima Indians Diabetes dataset \cite{diabetes_dataset}. It is composed of low dimensional tabular data and helps to validate the CounteRGAN's versatility and its applicability to diverse use cases. The dataset contains 8 features describing the relevant characteristics of patients useful for predicting diabetes. The target label is positive if the patient has diabetes (268 examples) and negative otherwise (500 examples). We use stratified (label balanced) sampling with 80\% of the dataset being assigned to the train set and the remaining 20\% for the test set. The classifier is the same as the neural network architecture used in \cite{Wachter2017-jr} and achieves an accuracy of 74.68\% on the test set.
% % \footnote{Note that this is relatively low compared to the 65.10 \% accuracy achieved using a random classifier as a baseline.}. 
% % More details on the model architectures and parameters used for the counterfactual search methods can be found in the supplementary material.

% For this experiment we introduce the important concept of \textit{mutable} and \textit{immutable features}. For most practical applications of counterfactual search, certain features may be hard or impossible to change and can be considered immutable. Though features typically vary in their degree of mutability, for the purposes of this experiment we consider features as either mutable or immutable. For the Pima Indians Diabetes dataset, we consider \textit{Pregnancies}, \textit{Age}, and \textit{Diabetes Pedigree Function} features to be immutable. We use \textit{Glucose}, \textit{Insulin}, \textit{Body Mass Index}, \textit{Tricept Skin Fold Thickness}, and \textit{Blood Pressure} as mutable features. In practice, we apply counterfactual search with no modifications, then simply cancel the perturbations applied to immutable features.


% % \renewcommand{\arraystretch}{1.8}
% % \begin{table*}[hbt!]
% % \small
% % \centering

% % \begin{tabular}{l|c|c|c|c|c}
% % \toprule
% % {} &                    RGD &                    CSGP &                 GAN &          CounterGAN &     CounterRGAN$^*$ \\
% % \midrule
% % \hline
% % $\uparrow$ Prediction gain &      \textbf{0.202 $\pm$ 0.038} &       0.148 $\pm$ 0.023 &   \textbf{0.186 $\pm$ 0.021} &   \textbf{0.192 $\pm$ 0.022} &   \textbf{0.184 $\pm$ 0.022} \\
% % $\downarrow$  Realism      &      0.070 $\pm$ 0.022 &       \textbf{0.050 $\pm$ 0.015} &   \textbf{0.040 $\pm$ 0.008} &   \textbf{0.035 $\pm$ 0.006} &   0.061 $\pm$ 0.013 \\
% % $\downarrow$  Sparsity     &      1.930 $\pm$ 0.323 &       \textbf{1.124 $\pm$ 0.184} &   2.134 $\pm$ 0.072 &   2.157 $\pm$ 0.084 &   2.895 $\pm$ 0.099 \\
% % $\downarrow$  Latency (ms) &  1,228.049 $\pm$ 0.015 &  17,489.839 $\pm$ 0.160 &  \textbf{39.521 $\pm$ 0.001} &  41.588 $\pm$ 0.004 &  41.538 $\pm$ 0.004 \\
% % $\downarrow$  Batch latency (seconds) & 189.11 & 2,693.31 & 0.011 & 0.012 & 0.012   
% % \bottomrule
% % \end{tabular}

% % \caption{Diabetes test data results (averages and standard deviations). For each metric, the arrow indicates whether larger $\uparrow$ or lower $\downarrow$ values are better, and the best model is in bold. The realism metric typically ranges from 0.050  (average reconstruction error of the autoencoder on the test set) to 2.759 (reconstruction error of noise uniformly sampled from $[-10, 10]$). Computations are performed using all 154 samples from the test set.}
% % \label{table:diabetes_metrics}
% % \end{table*}

% Table \ref{table:diabetes_metrics} summarizes our findings for this experiment. On this dataset, all methods appear equally capable of improving classifier prediction gain. The CounteRGAN generates more realistic instances, and the CSGP outputs the sparsest counterfactuals. Even on this low-dimensional dataset, the CounteRGAN is able to meet or exceed the evaluation metrics of counterfactuals produced by existing methods while heavily outperforming them in terms of latency. This includes >1,000x to >2,000x improvements for individual counterfactuals on white-box and black-box models respectively and from 3 to 4 orders of magnitude for batch generation of all counterfactuals.  

% The evaluation results validate that the proposed CounteRGAN method is capable of overcoming the main limitations of existing methods, namely the lack of realism and high latency. It also provides similar or better prediction gain and actionability on high dimensional images and a low-dimensional tabular dataset. The impressive latency improvements are pivotal with regard to real-time applicability and scalability. This is due to the generator only needing a forward-pass through the neural network as opposed to performing a new counterfactual search for every data point, as required by existing methods. 

% % The realism and latency improvements are expected to be accentuated as the dimensionality of the dataset increases, which complicates individual counterfactual searches. 

% % An important observation is that just as a better discriminator can help the generator learn to produce more realistic data, so too can a better target classifier lead to better and more actionable counterfactuals.
\renewcommand{\arraystretch}{1.6}
\begin{table*}[hbt!]
\scriptsize
\centering
\begin{tabular}{l||c|c|c|c||c|c|c}
 \multirow{2}{*}{} &           \multicolumn{4}{c||}{White-box classifier} & \multicolumn{3}{c}{Black-box classifier} \\ 
&                    RGD &                        CSGP &                          GAN &                  CounteRGAN &    RGD & CSGP & CounteRGAN \\
\hline
$\uparrow$ Prediction gain    &     \textbf{0.38 $\pm$ 0.01} &     0.06 $\pm$ 0.01 &  0.29 $\pm$ 0.01 &  0.07 $\pm$ 0.01 & \textbf{0.38 $\pm$ 0.01} & 0.06 $\pm$ 0.01 & 0.12 $\pm$ 0.01 \\
$\downarrow$ Realism & 1.60 $\pm$ 0.08 & 0.78 $\pm$ 0.06 & \textbf{0.57 $\pm$ 0.00} & 0.85 $\pm$ 0.09 & 1.60 $\pm$ 0.08 & \textbf{0.77 $\pm$ 0.06} & 0.93 $\pm$ 0.09  \\
% ↓ Realism & 1.60 ± 0.08 & 0.78 ± 0.06 & \textbf{0.57 ± 0.00} & 0.85 ± 0.09 & 1.60 ± 0.08 & \texbf{0.77 ± 0.06} & 0.93 ± 0.09 \\
$\downarrow$ Sparsity           &     2.07 $\pm$ 0.05 &     \textbf{0.53 $\pm$ 0.08} &  7.32 $\pm$ 0.16 &  0.85 $\pm$ 0.05 & 2.07 $\pm$ 0.05 & \textbf{0.50 $\pm$ 0.08} & 1.48 $\pm$ 0.08 \\
$\downarrow$ Latency (ms)       &  1,704.62 $\pm$ 2.12 &  3,312.14 $\pm$ 5.46 &  \textbf{1.39 $\pm$ 0.01} &  1.43 $\pm$ 0.01 & 3,005.13 $\pm$ 2.35 & 9,894.08 $\pm$ 51.75 &  \textbf{1.42 $\pm$ 0.12} \\
$\downarrow$ Batch latency (s) &       2,459.76 &       4,779.42 &       \textbf{2.00} &       2.06 &  4,336.40 & 14,277.15 &     \textbf{2.04} \\

\end{tabular}
\medskip
\caption{COMPAS test data results (mean and 95\% confidence interval). The arrows indicate whether larger $\uparrow$ or lower $\downarrow$ values are better, and the best results are in bold. The realism metric typically ranges from 0.87 (mean reconstruction error on the test set) to 5.43 (reconstruction error of random uniform noise in $[0, 1]$).}
\label{table:compas_metrics}
\end{table*}

% \vspace{-0.28cm}
\subsection{Experiment using COMPAS recidivism dataset}
% In this third experiment we explore the COMPAS recidivism dataset \cite{Compas}. While we have focused on the actionability of suggested counterfactuals until now, 


% This experiment focuses on the use of meaningful counterfactuals for model interpretation and fairness. 

% When predictive models are applied highly impactful areas such as criminal justice, they can have long lasting and life altering consequences for individuals and should 

Predictive models can have life-changing effects on individuals in certain situations. In the United States, for example, recidivism prediction models such as the COMPAS score \cite{Compas} are used to guide sentencing for crimes in several states and major cities. As this experiment demonstrates, meaningful counterfactuals can be used to improve model interpretability and fairness by exposing harmful biases such as racial and gender biases.

\renewcommand{\arraystretch}{1.2}
\begin{table*}
% \small
\scriptsize
\centering
\begin{tabular}{l||c||c|c|c|c||c|c|c}
 \multirow{2}{*}{} &    \multirow{2}{*}{Initial values}    &    \multicolumn{4}{c||}{White-box classifier} & \multicolumn{3}{c}{Black-box classifier} \\ 
&           &          RGD &                        CSGP &                          GAN &                  CounteRGAN &    RGD & CSGP & CounteRGAN \\
\hline
age                                         &             24 &     - &   +1 &         +23 &         +6 &     - &  +2 &       +12 \\
priors\_count                                &              3 &    -9 &   -1 &          -4 &         -2 &   -9 & -1    &      -1 \\
days\_b\_screening\_arrest                     &             -1 &    -1 &    - &          -3 &          - &      -1  & - &   -12 \\
sex\_Male                                    &              1 &     - &    - &          -1 &          - &   - & - &          - \\
c\_charge\_degree\_M                           &              0 &     - &    - &          +1 &          - &       - & - &      - \\
c\_charge\_desc\_Pos Cannabis W/Intent Sel/Del &              1 &     - &    - &          -1 &          - & - & - &           -1 \\
c\_charge\_desc\_Possession of Cocaine         &              0 &     - &    - &           - &          - &        - & - &    +1 \\
race\_Caucasian                              &              0 &     - &    - &          +1 &          - &   - & - &         +1 \\
\hline
Classifier Prediction (prob of not recidivating)                      &           0.36 &  0.99 &  0.50 &        0.87 &       0.71 &    0.99 & 0.52 &      0.54 \\
\end{tabular}
\medskip
\caption{Comparison of counterfactual examples produced by different methods given a sample data point from the COMPAS recidivism dataset. Some of the counterfactuals suggest changing the race and gender, indicating potentially unfair biases.}
\label{table:compas_example}
\end{table*}


The COMPAS dataset consists of 7,214 arrests logged in Broward County, Florida, and contains 29 features describing the demographics and criminal history of the defendants. The binary target label is positive if the defendant did not re-offend within two years after the arrest (55\% of the data) and negative if they did (45\% of the data). Numerical features are standardized and categorical variables are one-hot-encoded. The one-hot-encoded features are then perturbed in the same fashion as the numerical features and then rounded to the closest binary value for the final residuals.\footnote{An alternative approach would be to handle categorical features using pairwise distance measures and multi-dimensional scaling \cite{Van_Looveren2019-hr}.}
% Attempting to address interpretability and fairness within this scenario, we do not distinguish between mutable and immutable features in this experiment although we demonstrate such a case in the supplementary materials. 
We randomly assign 80\% of samples to the train set and the remaining 20\% to the test set. A neural network with three hidden layers is trained and reaches an accuracy of 69.72\% on the test set. A threshold of 0.5 is chosen for determining whether an individual will recidivate ($<$0.5) or not ($\ge$0.5).

Table~\ref{table:compas_metrics} presents the results for the counterfactual search methods on the COMPAS test set. Similar to the previous experiment, the RGD approach tends to produce unrealistic counterfactuals with large increases to the classifier's prediction. Conversely, CSGP typically leads to small increases of the classifier score but outputs sparser and more realistic perturbations. The regular GAN method achieves decent gains in prediction score and realism but suffers greatly with respect to sparsity and hence actionability. The CounteRGAN methods proposed in this work are more satisfying than RGD in terms of realism and sparsity. They also achieve increases of the classifier prediction similar to those of CSGP and produce counterfactuals $>$1,000x faster than RGD and CSGP.


Specific examples are relevant for investigating what, if any, biases a classifier has learned. Table \ref{table:compas_example} presents one such data point from the test set. 
% It compares the original feature values with those from counterfactuals produced by every method included in our benchmark. 
Each method is able to generate a counterfactual that successfully reverses the prediction, although they propose very different perturbations to the features. 
% RGD suggests an unrealistic change that corresponds to a negative number of prior offenses. CSGP is able to barely flip the prediction (score $\ge$0.5) with minimal and realistic changes. Though general conclusions should be based on subsequent analysis of complete datasets, counterfactuals such as these can help to probe a classifier's decision boundary in the vicinity of individual data points. By illuminating regions in the feature space where the classifier predicts non-recidivism, they can serve a pertinent role in understanding the impact and relation certain feature value changes will have on the final prediction, thereby adding to a model's interpretability. 
Interestingly, the counterfactuals produced by the GAN method (white-box classifier) and the CounteRGAN method (black-box classifier) find that changing the race to ``Caucasian'' instead of ``Black'' contributes to reversing the prediction. In addition, the GAN counterfactual also suggests changing the gender from ``Male'' to ``Female''. These insights signal that the recidivism predictor likely holds unfair biases. By extension, these biases can also be manifest in the COMPAS dataset. This is not necessarily certain, however, since it may have been by chance that the training subset was unbalanced and the model simply picked up on these spurious biases. Though general conclusions should be based on subsequent analysis of complete datasets, counterfactuals such as these can help to probe a classifier's decision boundary in the vicinity of individual data points. Insights such as these illustrate the potential counterfactuals have in helping to audit the fairness of ML systems, which should be of paramount relevance to all practitioners.


\section{Discussion}
 \label{sec:discussion}
 \noindent The proposed CounteRGAN approach applies a novel Residual GAN (RGAN) together with a fixed target classifier to produce realistic and actionable counterfactuals that achieve favorable prediction increases at low fixed latencies. CounteRGAN's separate value functions allow it to be effective even when the target classifier is non-differentiable or a black-box model. In experiments on two diverse datasets, the CounteRGAN produces counterfactuals between 2 and 7 orders of magnitude faster than two state-of-the-art methods. The drop from seconds to milliseconds opens up the possibility of real-time applications. Overall, the resulting counterfactuals are more realistic than those of competing methods and generally match or exceed them in prediction gain and actionability. This approach has also shown promise for probing a classifier's decision boundaries and highlighting potentially unfair biases in use cases such as criminal justice that can have significant stakes for users. Meaningful counterfactuals, such as those produced using the CounteRGAN method, can provide real-time recourse to users and help improve model interpretability and fairness. Together, these form the critical foundations for building effective, scalable, and trustworthy ML systems. 


Several promising areas outside the scope of this work are left for future research. These include investigating additional techniques to quantify and ensure actionability, addressing partially mutable features, applying the RGAN and CounteRGAN to additional domains, and experimenting with iteratively improving the counterfactuals by creating a feedback loop to the generator.

 
 
 
% \input{intro}
% \input{related_work}
% \input{countergan}
% \input{experiments}
% \input{discussion}
% \newpage
% \input{ethical_statement}

% \newpage
% \setlength{\itemindent}{-\leftmargin}
% \makeatletter\renewcommand{\@biblabel}[1]{}\makeatother

% \bibliographystyle{plainnat}
\bibliography{countergan}

\end{document}
\endinput
