\documentclass[accepted]{uai2023} % for initial submission

\usepackage{booktabs}
\usepackage{tikz} 
\usepackage[american]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib}
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools}
\usepackage{booktabs}
\usepackage{tikz}

\usepackage{algorithm2e}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{bbm}
\usepackage{bm}
\usepackage{array}
\usepackage[graphicx]{realboxes}
\usepackage{xr}
\usepackage{multirow}

% using instructions on Overleaf's website to use xr package for cross-referencing. See https://www.overleaf.com/learn/how-to/Cross_referencing_with_the_xr_package_in_Overleaf

% \addFileDependency{<file>}: announce <file> (name including extension) as a
% build dependency. It echoes the name to the log/terminal, appends it to the
% \listfiles list, and prints "No file <file>." when the file does not exist.
% \makeatletter is required because the body calls the internal \@addtofilelist.
\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
\typeout{(#1)}% latexmk will find this if $recorder=0
% however, in that case, it will ignore #1 if it is a .aux or
% .pdf file etc and it exists! If it doesn't exist, it will appear
% in the list of dependents regardless)
%
% Write the following if you want it to appear in \listfiles
% --- although not really necessary and latexmk doesn't use this
%
\@addtofilelist{#1}
%
% latexmk will find this message if #1 doesn't exist (yet)
\IfFileExists{#1}{}{\typeout{No file #1.}}
}\makeatother

% \myexternaldocument{<basename>}: load the labels of <basename> through xr's
% \externaldocument and register both <basename>.tex and <basename>.aux as
% file dependencies, so the build tool knows about them.
\newcommand*{\myexternaldocument}[1]{%
  \externaldocument{#1}%
  \addFileDependency{#1.tex}%
  \addFileDependency{#1.aux}%
}

% Make the labels of the external document lanners_407 available to \ref
% (presumably the main paper -- confirm).
\myexternaldocument{lanners_407}

%% Self-defined macros
% Bold upright symbols for a single covariate vector and for the covariates.
\newcommand*{\x}{\mathbf{x}}
\newcommand*{\X}{\mathbf{X}}
% Upright multi-letter label.
\newcommand*{\MG}{\mathrm{MG}}
% Named operators are declared through amsmath so that they are always set
% upright with operator spacing. (\textrm inside math inherits the shape of
% the surrounding text, e.g. italic inside theorem statements.)
\DeclareMathOperator{\tr}{tr}
\DeclareMathOperator{\supp}{supp}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Theorem-like environments; all of them share the `theorem' counter, which
% is numbered within sections.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
% manualtheorem{<number>}: a theorem whose printed number is supplied by the
% caller (e.g. a \ref to the original statement) instead of a counter.
% Switch back to the plain style first: otherwise the environment would be
% declared under the `remark' style that is still active at this point and
% restated theorems would not look like the other theorems.
\theoremstyle{plain}
\newtheorem{manualtheoreminner}{Theorem}
\newenvironment{manualtheorem}[1]{%
  \renewcommand\themanualtheoreminner{#1}%
  \manualtheoreminner
}{\endmanualtheoreminner}


% Document metadata consumed by \maketitle.
\title{Variable Importance Matching for Causal Inference\\(Supplementary material)}

% The optional argument of each \author is the index of the matching \affil
% entry below (authblk-style syntax, presumably provided by the document class).
\author[1]{Quinn Lanners}
\author[2]{Harsh Parikh}
\author[3]{Alexander Volfovsky}
\author[2]{Cynthia Rudin}
\author[1]{David Page}

% Affiliations, keyed by the indices used in the \author entries above.
\affil[1]{%
    Dept. of Biostatistics\\
    Duke University\\
    Durham, NC, USA.
}
\affil[2]{%
    Dept. of Computer Science\\
    Duke University\\
    Durham, NC, USA.
}
\affil[3]{%
    Dept. of Statistical Science\\
    Duke University\\
    Durham, NC, USA.
  }
  \begin{document}
  
\onecolumn 
\maketitle

\appendix
\section{Proofs for Theorems in Section 5}

\begin{manualtheorem}
{\ref{thm:motivation}}[Closeness in $\X$ implies closeness in $Y$] Consider a $p$-dimensional covariate space where for $t' \in \{0,1\}$, $f^{(t')}(\X_i) = \mathbb{E}[Y_i | \X = \X_i, T = t' ] = \X_i\bm{\beta}^{(t')}$. Construct $\mathcal{M}\in\mathbb{R}^{p\times p}$ where for all $l,r \in \{1,...,p\}$ $\mathcal{M}_{l,l} = |\beta^{(t')}_l|$ and for $l\neq r$ $\mathcal{M}_{l,r} = 0$. Then, $\forall i, j$, we have that $d_{\mathcal{M}}(\X_i, \X_j) \geq \left|f^{(t')}(\X_i) - f^{(t')}(\X_j) \right|$.
\end{manualtheorem}
\textbf{Proof for Theorem~\ref{thm:motivation}}. 
\begin{align*}
    d_{\mathcal{M}}(\X_i, \X_j) = \sum\limits_{l=1}^p \mathcal{M}_{l,l}|X_{i,l} - X_{j,l}| = \sum_{l=1}^p |\beta^{(t')}_l||X_{i,l} - X_{j,l}| &\geq \left|\sum_{l=1}^p \beta^{(t')}_l(X_{i,l} - X_{j,l})\right| \\&= \left|f^{(t')}(\X_i) - f^{(t')}(\X_j) \right|.
\end{align*}
QED


\begin{manualtheorem}{\ref{thm:supp}}[Optimality of $\mathcal{M}$]
    Using the setup of Theorem~\ref{thm:motivation}, let $\supp(\X) = \mathbb{R}^p$.
    Consider an arbitrary diagonal Mahalanobis distance matrix $\widetilde{\mathcal{M}}\in\mathbb{R}^{p\times p}$ where $\|\widetilde{\mathcal{M}}\|_1 = \|\bm{\beta}^{(t')}\|_1$ and $\widetilde{\mathcal{M}}_{l,l} > 0$ when $|\beta^{(t')}_l| > 0$.
    For some $\epsilon \geq 0$ and $\X_1\in\mathbb{R}^p$, define $S_{\widetilde{\mathcal{M}}, \epsilon}(\X_1) := \{\X_2 : \X_2\in\mathbb{R}^p, d_{\widetilde{\mathcal{M}}}(\X_1, \X_2) = \epsilon\}$. Then, 
   \begin{equation*}
        \sup\limits_{\X_2\in S_{\mathcal{M}, \epsilon}(\X_1)}|f^{(t')}(\X_1)- f^{(t')}(\X_2)| 
        \leq
        \sup\limits_{\X_3\in S_{\widetilde{\mathcal{M}}, \epsilon}(\X_1)}|f^{(t')}(\X_1)- f^{(t')}(\X_3)|.
    \end{equation*}
\end{manualtheorem}

In what follows, we recall that a diagonal Mahalanobis distance matrix, $\widetilde{\mathcal{M}}$, is: 
\begin{itemize}
\item  diagonal: for all $l,r \in \{1,...,p\}$, $l\neq r$, $\widetilde{\mathcal{M}}_{l,r} = 0$.
\item non-negative entries: for all $l \in \{1,...,p\}$, $\widetilde{\mathcal{M}}_{l,l} \geq 0$.
\end{itemize}

To prove this result, we first prove the following two lemmas.

\textbf{Lemma 1}\label{lemma-1} (Maximum Absolute Difference in Expected Outcomes under $\mathcal{M}$). Consider a $p$-dimensional covariate space where $\supp(\X) = \mathbb{R}^p$ and for $t' \in \{0,1\}$, $f^{(t')}(\X_i) = \mathbb{E}[Y_i | \X = \X_i, T = t' ] = \X_i\bm{\beta}^{(t')}$. Define $\mathcal{L} := \{l: \left|\beta^{(t')}_l\right| > 0\}$. Construct any diagonal Mahalanobis distance matrix, $\widetilde{\mathcal{M}}$, where $\|\widetilde{\mathcal{M}}\|_1 = \|\bm{\beta}^{(t')}\|_1$ and $\widetilde{\mathcal{M}}_{l,l} > 0$ when $|\beta^{(t')}_l| > 0$. Then, for some $\epsilon\geq 0$ and $\X_1\in\mathbb{R}^p$, let $S_{\widetilde{\mathcal{M}}, \epsilon}(\X_1)$ be as defined in Theorem~\ref{thm:supp}.
We can conclude that 
\begin{equation*}
    \sup\limits_{\X_3\in S_{\widetilde{\mathcal{M}}, \epsilon}(\X_1)}|f^{(t')}(\X_1)- f^{(t')}(\X_3)| = \epsilon \max_{l\in\mathcal{L}}\left\{\frac{|\beta^{(t')}_l|}{\widetilde{\mathcal{M}}_{l,l}}\right\}.
\end{equation*}

\textbf{Proof of Lemma 1}.
\begin{align*}
    \sup\limits_{\X_3\in S_{\widetilde{\mathcal{M}}, \epsilon}(\X_1)}|f^{(t')}(\X_1)- f^{(t')}(\X_3)| &= \sup\limits_{\X_3\in S_{\widetilde{\mathcal{M}},\epsilon}(\X_1)}\left|\sum\limits_{l\in\mathcal{L}} \beta^{(t')}_l(X_{1,l} - X_{3,l})\right|.
\end{align*}
Note that since $\supp(\X) = \mathbb{R}^p$, with probability strictly greater than zero there exists an $\X_1$ and $\X_3$ such that $d_{\widetilde{\mathcal{M}}}(\X_1, \X_3) = \epsilon$ and for all $l\in\mathcal{L}$, $X_{1,l} > X_{3,l}$ when $\beta^{(t')}_l > 0$ and $X_{1,l} < X_{3,l}$ when $\beta^{(t')}_l < 0$. Then,
\begin{align*}
\sup\limits_{\X_3\in S_{\widetilde{\mathcal{M}},\epsilon}(\X_1)}\left|\sum\limits_{l\in\mathcal{L}} \beta^{(t')}_l(X_{1,l} - X_{3,l})\right|
    &= \sup\limits_{\X_3\in S_{\widetilde{\mathcal{M}},\epsilon}(\X_1)}\left\{ \sum\limits_{l\in\mathcal{L}} \left|\beta^{(t')}_l(X_{1,l} - X_{3,l})\right|\right\} \\
    &= \sup\limits_{\X_3\in S_{\widetilde{\mathcal{M}},\epsilon}(\X_1)}\left\{ \sum\limits_{l\in\mathcal{L}} \frac{|\beta^{(t')}_l|}{\widetilde{\mathcal{M}}_{l,l}}\widetilde{\mathcal{M}}_{l,l}\left|X_{1,l} - X_{3,l}\right|\right\}.
\end{align*}
Note that $\left\{\sum\limits_{l\in\mathcal{L}} \frac{|\beta^{(t')}_l|}{\widetilde{\mathcal{M}}_{l,l}}\widetilde{\mathcal{M}}_{l,l}\left|X_{1,l} - X_{3,l}\right| : \X_3\in S_{\widetilde{\mathcal{M}},\epsilon}(\X_1)\right\}$ is maximized at $ \epsilon\max_{l\in\mathcal{L}}\left\{\frac{|\beta^{(t')}_l|}{\widetilde{\mathcal{M}}_{l,l}}\right\}$. It is known that if the maximum value of a set is in the set, the supremum of that set equals the maximum value of that set. Therefore, we conclude that,
\begin{align*}
    \sup\limits_{\X_3\in S_{\widetilde{\mathcal{M}},\epsilon}(\X_1)}\left\{ \sum\limits_{l\in\mathcal{L}} \frac{|\beta^{(t')}_l|}{\widetilde{\mathcal{M}}_{l,l}}\widetilde{\mathcal{M}}_{l,l}\left|X_{1,l} - X_{3,l}\right|\right\}
    &= \epsilon\max_{l\in\mathcal{L}}\left\{\frac{|\beta^{(t')}_l|}{\widetilde{\mathcal{M}}_{l,l}}\right\}.
\end{align*}
QED

\textbf{Lemma 2}\label{lemma-2} Under the same setup as Lemma \hyperref[lemma-1]{1}, $\max_{l\in\mathcal{L}}\left\{\frac{|\beta^{(t')}_l|}{\widetilde{\mathcal{M}}_{l,l}}\right\} \geq 1$.

\textbf{Proof of Lemma 2}.
First note that $\sum\limits_{l\in\mathcal{L}} \widetilde{\mathcal{M}}_{l,l} \leq \sum\limits_{l=1}^p \widetilde{\mathcal{M}}_{l,l} = \sum\limits_{l=1}^p |\beta^{(t')}_l| = \sum\limits_{l\in\mathcal{L}} |\beta^{(t')}_l|$. There are two possible cases. In case one, $\forall l\in\mathcal{L}$, $\widetilde{\mathcal{M}}_{l,l} = \mathcal{M}_{l,l} = |\beta^{(t')}_l|$. Then $\max_{l\in\mathcal{L}} \frac{|\beta^{(t')}_l|}{\widetilde{\mathcal{M}}_{l,l}} = 1$. In case two, there exists $l\in\mathcal{L}$ for which $\widetilde{\mathcal{M}}_{l,l} \neq |\beta^{(t')}_l|$. But then there must exist an $l'\in\mathcal{L}$ for which $\widetilde{\mathcal{M}}_{l',l'} < |\beta^{(t')}_{l'}| \implies \max_{l\in\mathcal{L}} \frac{|\beta^{(t')}_l|}{\widetilde{\mathcal{M}}_{l,l}} > 1$.
QED

\textbf{Proof of Theorem~\ref{thm:supp}}.
First note that $\mathcal{M}$ is a diagonal Mahalanobis distance matrix, $\|\mathcal{M}\|_1 = \|\bm{\beta}^{(t')}\|_1$, and $\mathcal{M}_{l,l} > 0$ when $|\beta^{(t')}_l| > 0$. The proof of the theorem 
%Theorem~\ref{thm:supp} 
then follows directly from Lemma \hyperref[lemma-1]{1} and Lemma \hyperref[lemma-2]{2}.
\begin{equation*}
    \begin{split}
        \sup\limits_{\X_2\in S_{\mathcal{M}, \epsilon}(\X_1)}|f^{(t')}(\X_1)- f^{(t')}(\X_2)|
        &= \epsilon \max_{l\in\mathcal{L}}\left\{\frac{|\beta^{(t')}_l|}{\mathcal{M}_{l,l}}\right\} \\
        &= \epsilon \max_{l\in\mathcal{L}}\left\{\frac{|\beta^{(t')}_l|}{|\beta^{(t')}_l|}\right\} \\
        &= \epsilon \\
        &\leq \epsilon \max_{l\in\mathcal{L}}\left\{\frac{|\beta^{(t')}_l|}{\widetilde{\mathcal{M}}_{l,l}}\right\} \\
        &= \sup\limits_{\X_3\in S_{\widetilde{\mathcal{M}}, \epsilon}(\X_1)}|f^{(t')}(\X_1)- f^{(t')}(\X_3)|.
    \end{split}
\end{equation*}
Where $\epsilon \leq \epsilon \max_{l\in\mathcal{L}}\left\{\frac{|\beta^{(t')}_l|}{\widetilde{\mathcal{M}}_{l,l}}\right\}$ because of Lemma \hyperref[lemma-2]{2}.
QED


\begin{manualtheorem}{\ref{thm:consistency}}[Consistency of LCM]
     For $t'\in\{0,1\}$, let $f^{(t')}(\X_i) = \mathbb{E}[Y_i | \X = \X_i, T = t' ] $. Let $f^{(t')}$ be Lipschitz continuous and,
     \begin{equation*}
        \supp\left(f^{(t')}\right) :=  \left\{j : 
             \textrm{importance of }\X_{\cdot, j}\textrm{ in } f^{(t')}\textrm{ is } > 0\right\}.  
     \end{equation*}
     Denote $d_{\mathcal{M}^*}$ as the distance metric learned by LCM in Section~\ref{sec: method} and let $\Gamma\left(\mathcal{M}^*\right) = \{j : \mathcal{M}^*_{j,j} > 0\}$. LCM is consistent for CATE estimation if $\supp\left(f^{(0)}\right) \cup \supp\left(f^{(1)}\right) \subseteq \Gamma\left(\mathcal{M}^*\right)$.
\end{manualtheorem}     
\textbf{Proof of Theorem \ref{thm:consistency}}. 
First, let us introduce the concept of a smooth distance metric (defined in \cite{malts}). 
\begin{definition}[Smooth Distance Metric] \label{def-1}
    $d: \X \times \X \rightarrow \mathbb{R}^+$ is a smooth distance metric if there exists a monotonically increasing bounded function $\delta_d(\cdot)$ with zero intercepts, such that $\forall i, j \in \mathcal{S}$ if $T_i = T_j = t'$ and $d(\X_i, \X_j) \leq a$ then $\left|\mathbb{E}\left[Y_i(t') | \X_i \right] - \mathbb{E}\left[Y_j(t') | \X_j \right]\right|\leq \delta_{d}(a).$
\end{definition}
Theorem 1 in \citet{malts} shows that matching with a smooth distance metric guarantees consistency of CATE estimates. 

Recovering the correct support for the potential outcome functions implies that restricting to only variables in the recovered support, the potential outcomes are independent of the covariates: 
$(Y(1),Y(0)) \perp \X\mid \{\X_{\cdot,j}\}_{j\in \supp( f^{(0)} ) \cup \supp( f^{(1)} )} $. Also, note that if $\{\X_{i,j}\}_{j\in \supp( f^{(0)} ) \cup \supp( f^{(1)} )}$ is close to $\{\X_{k,j}\}_{j\in \supp( f^{(0)} ) \cup \supp( f^{(1)} )}$ then $f^{(0)}(\X_i)$ is close to $f^{(0)}(\X_k)$ and $f^{(1)}(\X_i)$ is close to $f^{(1)}(\X_k)$ by the definition of support and the Lipschitz continuity assumption. Thus, if $ \supp( f^{(0)} ) \cup \supp( f^{(1)} )\subseteq \Gamma(\mathcal{M}^*)$ then $d_{\mathcal{M}^*}$ is a smooth distance metric. This guarantees the consistency of our estimates. QED

\paragraph{Consistency of LASSO.} Much work has been done on the consistency of LASSO for feature selection 
\citep{Zhang2016a}.  
The ability of LASSO to recover the correct support even in the case of non-linear targets makes it more robust to model misspecification. LASSO is consistent for support recovery if $f(\X_i, t') = \mathbb{E}[Y_i | \X = \X_i, T = t' ]$ satisfies one of the following conditions:
    \begin{enumerate}[label=(\roman*)]
        \item $f(\X_i, t') = \X_i\bm{\beta^{(t')}}$
        \item $f(\X_i, t') = g\left(\X_i\bm{\beta^{(t')}}\right)$ where $\beta^{(t')}_k \neq 0$ for $k\in\{1,\dots,r\}$, for some $r\leq p$, and, if $r < p$, $\beta^{(t')}_k = 0$ for $k\in\{r+1, \dots, p\}$, and the following conditions are met:
        \begin{enumerate}
            \item \textbf{Cov}($\X, \X$) is invertible.
            \item The eigenvalues of $\Sigma_{r,r} =$ \textbf{Cov}($\X_{1:r}, \X_{1:r}$) are such that $0 < c_1 \leq \Lambda\left(\Sigma_{r,r}\right) \leq c_2 < \infty$. Where $\Lambda\left(\Sigma_{r,r}\right)$ are the eigenvalues of $\Sigma_{r,r}$.
            \item $E\left[Y(t')^4\right] < \infty$
            \item $g$ is differentiable almost everywhere and for $t\sim\mathcal{N}(0,1)$, $E(|g(t)|) < \infty$ and $E(|g'(t)|) < \infty$.
            \item For all $i$, $E\left[X_i^TX_i\left|g\left(\X_i\bm{\beta^{(t')}}\right)\right|^2\right] < \infty$.
        \end{enumerate}
    \end{enumerate}

\section{Method Implementation for Experiments}
In this section we outline how we implemented each method used in our experiments. To calculate CATE estimates for all samples, we employed the same $\eta$-fold cross-fitting strategy for each method. In particular, we train models to estimate the $\widehat{Y}_i(t') = f^{(t')}(\X_i)$ for $t'\in\{0,1\}$ using $S_{n,tr}$ and perform estimation on $S_{n,est}$. The only method that we did not use cross-fitting for was GenMatch, which does not use the outcome to learn its distance metric and thus does not require a training set. All references to scikit-learn refer to the Python machine learning package from \cite{scikit-learn}.
\begin{itemize}
    \item \textbf{LASSO Coefficient Matching}: We implemented the method described in this paper in Python. We use scikit-learn's \texttt{LassoCV} to learn $d_{\mathcal{M^*}}$ and \texttt{NearestNeighbors} with \texttt{metric='manhattan'} to perform nearest neighbor matching.

    \item \textbf{Linear and Nonparametric Prognostic Score Matching}: We follow the notion of a prognostic score outlined in \cite{Hansen2008}. In particular, we employ a \textit{double} prognostic score matching method where we model both the control and treatment space separately as $\widehat{Y}_i(t') = f^{(t')}(\X_i)$ for $t'\in\{0,1\}$. For linear PGM we use scikit-learn's \texttt{LassoCV} as our prognostic score models and for nonparametric PGM we use \texttt{GradientBoostingRegressor} for our prognostic score models. We then match with replacement on $[f^{(0)}(\X_i), f^{(1)}(\X_i)]$ using scikit-learn's \texttt{NearestNeighbors} with \texttt{metric='euclidean'} to perform nearest neighbor matching. We estimated CATEs with the same mean estimator as LCM.

    \item \textbf{MALTS Matching}: We use the method developed in \citet{malts} that was implemented in Python \citep{git_malts}. We use the package's \texttt{mean} CATE estimator with \texttt{smooth\_cate=False}.     

    \item \textbf{MatchIt}: We use MatchIt's implementation of GenMatch \citep{ho2007matching}. We kept the default setting of \texttt{ratio=1}, which set $K=1$ for matching. But we matched with replacement to be in line with LCM and the other matching methods we compared with.
    \item \textbf{Linear and Nonparametric TLearner}: We use the EconML TLearner implementation from \cite{econml}. For Linear TLearner we use scikit-learn's \texttt{LassoCV} for our models and for Nonparametric TLearner we use scikit-learn's \texttt{GradientBoostingRegressor} for our models.

    \item \textbf{AHB}: We use the method developed in \citet{morucci2020adaptive} that was implemented in R \citep{git_ahb}. We use the package's \texttt{AHB\_fast\_match} implementation with the default settings.

    \item \textbf{BART T-Learner}: We use the dbarts R package from \cite{dbart}. We train a BART model on $S_{n,tr}$ to model $\widehat{Y}_i(t') = f^{(t')}(\X_i)$ for $t'\in\{0,1\}$. We then estimate CATEs for each $j\in S_{n,est}$ as $f^{(1)}(\X_j) - f^{(0)}(\X_j)$.

    \item \textbf{Linear DoubleML}: We use the \texttt{econml.dml.DML} class in the econml Python package from \cite{econml}. We fit a model on $S_{n,tr}$ setting \texttt{model\_y=WeightedLassoCV}, \texttt{model\_t=LogisticRegressionCV}, and \texttt{model\_final=LassoCV}. We then estimate CATEs for each $j\in S_{n,est}$ using the \texttt{.effect()} method.

    \item \textbf{Causal Forest DoubleML}: We use the \texttt{econml.dml.CausalForestDML} class in the econml Python package from \cite{econml}. We fit a model on $S_{n,tr}$ setting \texttt{model\_y=WeightedLassoCV} and \texttt{model\_t=LogisticRegressionCV}. We then estimate CATEs for each $j\in S_{n,est}$ using the \texttt{.effect()} method.    

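    % NOTE(review): the grf R package is cited below with \cite{econml}, the same key used for the EconML Python package in the items above. This should probably point to the grf reference instead -- confirm the correct bib key.
    \item \textbf{Causal Forest}: We use the implementation of causal forest from the grf R package from \cite{econml}. We fit a model on $S_{n,tr}$ with the default package settings. We then used the fitted model to estimate CATEs for each $j\in S_{n,est}$.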

\end{itemize}

\section{Experimental Details for Section~\ref{sec: results} and Section~\ref{sec: extensions}}
In this section, we describe the data generating processes used and provide further details regarding the setup of each experiment conducted in this paper. The source code necessary to reproduce all of the experiments in this paper is located in the GitHub repository: \url{https://github.com/almost-matching-exactly/variable_imp_matching}.

\subsection{Data Generation Processes}\label{sec:dgps}
Here we outline the data generation processes (DGPs) not fully outlined in the main text.

\textbf{Sine and Exponential DGPs}. \textit{Used in Sections~\ref{sec:results-nonlinear} and \ref{sec: metalearner}}. We generate the covariates and treatment assignments for the Sine and Exponential DGPs in a similar manner.
For both, we generate data as follows:
\begin{align*}
    X_{i,1},\dots, X_{i,p} \overset{iid}{\sim} \text{Uniform}(-\alpha, \beta) \\
    \epsilon_{i,y} \overset{iid}{\sim} \mathcal{N}(0, \sigma^2), \epsilon_{i,t} \overset{iid}{\sim} \mathcal{N}(0, 1) \\
    T_i = \mathbbm{1}\Bigg[\text{expit}\Big(X_{i,1} + X_{i,2} + \epsilon_{i,t}\Big) > 0.5\Bigg]  \\
    Y_i = T_i Y_i(1) + (1-T_i) Y_i(0) + \epsilon_{i, y},
\end{align*}
where expit is the logistic sigmoid: \(\text{expit}(x) = \frac{1}{1 + e^{-x}}\).

For \textbf{Sine} we set 
\(\alpha=\beta=\pi\), \(\sigma^2=0.1\) and calculate the potential outcomes as
\begin{equation*}
    Y_i(0) = \sin(X_{i,1}),\; Y_i(1) = \sin(X_{i,1}) - \sin(X_{i,2}).
\end{equation*}
For \textbf{Exponential} we set \(\alpha=\beta=3\), \(\sigma^2=1\) and calculate the potential outcomes as 
\begin{equation*}
    Y_i(0) = 2e^{X_{i,1}} - \sum_{j=2}^3 e^{X_{i,j}},\; Y_i(1) = 2e^{X_{i,1}} - \sum_{j=2}^3 e^{X_{i,j}} + e^{X_{i,4}}.
\end{equation*}

\textbf{Quadratic DGP}. \textit{Used in Sections~\ref{sec:results-scalability} and \ref{sec:lcm-aug-pgm}}. 
This quadratic data generation process is also described in \cite{malts}. This DGP includes both linear and quadratic terms. For each sample, let $\X_{i}$ be a $p$-dimensional vector where the first $k\leq p$ covariates are relevant and $\kappa \leq k$ is the number of covariates relevant to determining the treatment choice. The DGP is outlined below.

\begin{equation*}
    \begin{gathered}
        X_{i,1},\dots,X_{i,p} \overset{iid}{\sim} \mathcal{N}(1, 1.5), \  \epsilon_{i,y}, \epsilon_{i,t} \overset{iid}{\sim} \mathcal{N}(0,1), \
        \ s_1,\dots,s_{|k|} \overset{iid}{\sim} \text{Uniform}\{-1,1\} \\
        \alpha_j|s_j \overset{iid}{\sim} \mathcal{N}(10s_j,9), \ \beta_1,\dots,\beta_{|k|} \overset{iid}{\sim} \mathcal{N}(1,0.25)        
    \end{gathered}
\end{equation*}

\begin{equation*}
    Y_i(0) = \sum_{j\leq k} \alpha_jX_{i,j}    
\end{equation*}
\begin{equation*}
    Y_i(1) = \sum_{j\leq k} \alpha_jX_{i,j} + \sum_{j\leq k}\beta_jX_{i,j} + \sum_{j\leq k} \sum_{j'\leq k} X_{i,j}X_{i,j'}
\end{equation*}
\begin{equation*}
    T_i = \mathbbm{1}\Bigg[\text{expit}\Big(\sum_{j\leq \kappa} X_{i,j} - \kappa +  \epsilon_{i,t}\Big)  > 0.5 \Bigg]
\end{equation*}
\begin{equation*}
    Y_i = T_iY_i(1) + (1-T_i) Y_i(0) + \epsilon_{i,y}
\end{equation*}
where $\text{expit}(x) = \frac{1}{1+e^{-x}}$.

\textbf{Basic Quadratic DGP}. \textit{Used in Section~\ref{sec:feature-imp}}. This DGP is a quadratic DGP centered at zero. We generate each sample as shown.
\begin{equation*}
    X_{i,1},\dots, X_{i,10} \overset{iid}{\sim} \mathcal{N}(0, 2.5), \ \epsilon_{i,y} \overset{iid}{\sim} \mathcal{N}(0, 1), \ T_i \sim\text{Bernoulli}(0.5)
\end{equation*}
\begin{equation*}
    Y_i(0) = X_{i,1}^2, \ Y_i(1) = X_{i,1}^2+10
\end{equation*}
\begin{equation*}
    Y_i = T_i Y_i(1) + (1-T_i)Y_i(0) + \epsilon_{i, y}
\end{equation*}

\subsection{Experimental Details}\label{sec:add-exp-details}
In Table~\ref{tab:exp-details} we provide details on the experiments shown in this paper. We include additional notes for selected experiments below:

\begin{itemize}
    \item Section~\ref{sec:results-auditability}: Accuracy and
Auditability: We included the school id as a categorical covariate in our dataset. After preprocessing the categorical covariates, we had 6 continuous covariates and 98 binary covariates that we used as input to each model. We used only two splits due to the small occurrence rate of many of the categorical values. We repeated the cross-fitting process 50 times to smooth out treatment effect estimates for each method. All of the results in this section are for the combined 50 iterations.
    \item Section~\ref{sec:results-scalability}: Scalability: The matchit package only performs k:1 matching, so we kept K=1 for GenMatch (which is the default value). Reported runtimes were measured on a Slurm cluster with VMware, where each VM was an Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz. For measuring runtime, we ran each method 20 times on each dataset size. We report the average runtime for each method on each dataset. The variability across the 20 runs was negligible so we omitted bars showing the standard deviation from the final plot. Each individual runtime measurement was run on a separate Slurm job that was allocated a single core with 16GB RAM.
    \item Section~\ref{sec:lcm-aug-pgm}: LCM-Augmented-PGM: For ease of implementation, we did not perform cross-fitting for this experiment. Rather, we just used half of the samples (2500) for training and the other half of the samples (2500) for estimation.
\end{itemize}

\begin{table}[]
\caption{Details of Experiments in Sections \ref{sec: results} and \ref{sec: extensions}. The \textit{Additional Notes} column indicates if further details for that experiment are included in Section~\ref{sec:add-exp-details}.}
\centering
\begin{tabular}{|l|l|l|l|l|l|l|}
\hline
\textbf{Section}                                                                                             & \textbf{Dataset}                                                             & \textbf{\# Samples} & \textbf{\# Covariates} & \textbf{K}                                                                   & \textbf{$\eta$} & \textbf{Additional Notes} \\ \hline
\begin{tabular}[c]{@{}l@{}}\ref{sec:results-auditability}: Accuracy and\\ Auditability\end{tabular}  & \begin{tabular}[c]{@{}l@{}}ACIC 2018 Learning\\ Mindset Dataset\end{tabular} & 10,000              & 10                     & 10                                                                           & 2            & Y                         \\ \hline
\multirow{2}{*}{\begin{tabular}[c]{@{}l@{}}\ref{sec:results-nonlinear}: Nonlinear \\ Outcome\end{tabular}} & Sine                                                                         & 5000                & 100                    & 10                                                                           & 10           &                           \\ \cline{2-7} 
                                                                                                             & Exponential                                                                  & 5000                & 100                    & 10                                                                           & 10           &                           \\ \hline
\ref{sec:results-scalability}: Scalability                                                                   & Linear + Quadratic                                                           & Varies              & Varies                 & \begin{tabular}[c]{@{}l@{}}10 (1 for \\ GenMatch\\ - see notes)\end{tabular} & 2            & Y                         \\ \hline
\begin{tabular}[c]{@{}l@{}}\ref{sec: metalearner}: Metalearner \\ LCM\end{tabular}                    & Sine                                                                         & 500                 & 10                     & 10                                                                           & 5            &                           \\ \hline
\begin{tabular}[c]{@{}l@{}}\ref{sec:feature-imp}: Feature\\ Importance Matching\end{tabular}         & Basic Quadratic                                                              & 500                 & 10                     & 10                                                                           & 5            &                           \\ \hline
\begin{tabular}[c]{@{}l@{}}\ref{sec:lcm-aug-pgm}: LCM-\\ Augmented-PGM\end{tabular}                  &   Linear + Quadratic                                                                           &        5000             &             20           &        \begin{tabular}[c]{@{}l@{}}25 using PGM\\ followed by\\ 5 using LCM\end{tabular}                                                                     &   N/A       &  Y \\ \hline
\end{tabular}
\label{tab:exp-details}
\end{table}


\section{Additional Experimental Results}
In this section, we include additional experimental results using LCM. We first discuss further findings from experiments in Section~\ref{sec: results} and Section~\ref{sec: extensions}. We then show results of additional experiments comparing LCM to non-matching methods and matching methods with equal weights after feature selection.

\textbf{Section~\ref{sec:results-auditability}: Accuracy and Auditability}. Figure~\ref{fig:schools-mg-full} in this document is an expanded plot of Figure~\ref{fig:schools-mg-diff}(a) in the main text. The supplementary material's Figure~\ref{fig:schools-mg-full} includes S3, X1, and all other effect modifiers X2, C1=1, C1=13, and C1=14. As mentioned in the caption of Figure~\ref{fig:schools-mg-diff}(a) in the main text, $\text{S3}$ indicates the self-reported prior achievements of students and $\text{X1}$ indicates school-level average mindset score of the students. X2 is a school-level continuous covariate that measures the school's achievement level and C1 is a categorical covariate for race/ethnicity. We measure closeness in continuous covariates using the same mean absolute difference metric used in Figure~\ref{fig:schools-mg-diff}(a) in the main text. Whereas, we measure closeness in categorical covariates as the percent of samples in a match group that do not have the same label as the query unit (\% Mismatch). LCM matches much more tightly on all of the continuous covariates. For categorical covariates, while LCM matches tighter than PGM methods, it struggles compared to continuous covariates. We theorize this is due to the low occurrence rate of these features. In particular, C1=1 in 9.5\%, C1=13 in 1.8\% and C1=14 in 6.2\% of samples. Therefore, it is difficult to find matches that have the same C1 value and are also similar in all of the other important covariates. LCM sometimes prioritizes matching almost-exactly on other covariates at the expense of these rare categorical covariates.

\begin{figure}
\centering
\includegraphics[width=0.6\linewidth]{figures/all_mg_full.png}
  \caption{Closeness in important covariates for matched groups produced by LCM, linear PGM, and nonparametric (NP) PGM. Smaller values imply better and tighter matches.}
  \label{fig:schools-mg-full}
\end{figure}


\cite{Carvalho2019} also states that although XC (Urbanicity) is not an effect modifier it is strongly related to X1 (student's fixed mindsets - summarized at the school level) and X2 (school achievement level) which are true effect modifiers. Because of this, seven of the eight methods that are summarized in \cite{Carvalho2019} identified XC as an effect modifier. \cite{Carvalho2019} further shows that, in this dataset, marginally the true CATEs for XC=3 are much lower than other values of XC. We show in Figure~\ref{fig:schools-xc} that LCM also identifies this trend in XC.

% NOTE(review): the sample count below (50,000) differs from the 10,000 samples listed for this dataset in Table~\ref{tab:exp-details} -- confirm which figure is correct.
For Section~\ref{sec:results-auditability}, we did not compare to other almost-matching-exactly methods (i.e. MALTS, AHB, GenMatch) due to the large size of the dataset. The ACIC 2018 Learning Mindset Dataset has 50,000 samples and $>100$ covariates after encoding the categorical features. Results from Section~\ref{sec:results-scalability} highlight how intractable it would be to run other AME methods on a dataset of this size.

\begin{figure}
\centering
\includegraphics[width=0.6\linewidth]{figures/cate_by_xc.png}
  \caption{Marginal CATE estimates produced by LCM, Linear PGM, and Nonparametric PGM for the categorical school-level covariate of urbanicity (XC).}
  \label{fig:schools-xc}
\end{figure}

\textbf{Section~\ref{sec:results-nonlinear}: Nonlinear Outcomes}. Figure~\ref{fig:nonlinear-highdim} shows CATE estimation accuracy for the same experiment in Section~\ref{sec:results-nonlinear} with the number of covariates increased to 500 for both the \textbf{Sine} and \textbf{Exponential} datasets. Given that we used 10 splits for this experiment, the training set in each fold had 500 samples. Note that LCM's accuracy does not suffer in this extremely high-dimensional setting where the number of samples equals the number of covariates. These results further highlight the ability of LCM to scale to very high-dimensional data even in the case of nonlinear outcome functions.

\begin{figure}
\centering
\includegraphics[width=0.6\linewidth]{figures/lcm_vs_lin_pgm_500_covs.png}
    \caption{Comparing LCM’s and Linear PGM’s performances for high-dimensional nonlinear synthetically generated datasets \textbf{Sine} and \textbf{Exponential}.}
  \label{fig:nonlinear-highdim}
\end{figure}

\textbf{Section~\ref{sec: metalearner}: Metalearner LCM}. For the Metalearner LCM, here we show the effect of learning unique distance metrics for calculating control vs treated KNNs. We measure the distance between a query unit's covariate values and the values of its ten nearest neighbors of each treatment type. In particular, we calculate the mean absolute difference between a query unit's value and the values of its ten nearest neighbors. As explained in Section~\ref{sec: metalearner}, X1 is a relevant covariate to the outcome under both treatment regimes, whereas X2 is only relevant to the outcome under treatment. X3 is unimportant in both settings and is shown as a reference point. Figure~\ref{fig:metalearner-mg-diff} shows that while LCM's nearest neighbors are equally close on X1 and X2 in both treatment spaces, Metalearner LCM considers X2 as unimportant when calculating KNNs that are in the control group. This highlights how Metalearner LCM is able to adapt to outcome spaces that are different under different treatment regimes.

\begin{figure}
\centering
\includegraphics[width=0.5\linewidth]{figures/metalearner_barplot_mg_avg_diff.png}
  \caption{Measure of how tight the KNN groups are for LCM versus Metalearner LCM under different treatment regimes.}
  \label{fig:metalearner-mg-diff}
\end{figure}

\textbf{LCM vs Machine Learning Methods}. Previous almost-matching-exactly literature has established that AME methods perform as well as (and often better than) machine learning methods like BART, causal forest, and double machine learning for estimating CATEs \citep{malts, morucci2020adaptive, wang2017flame}. For this reason, this paper focuses on comparing LCM to matching methods and particularly other AME methods. However, here we include an experiment comparing the CATE estimation accuracy of LCM to various machine learning methods on a high-dimensional nonlinear dataset.

We use the Quadratic DGP with 25 relevant covariates, 2 of which are relevant to the treatment choice, and 125 irrelevant covariates. We generate 2500 samples and set $\eta=5$. We run LCM with two configurations. \textit{LCM Mean} is run with $K=10$ and uses a mean estimator inside the match groups. \textit{LCM Linear} is run with $K=40$ and uses linear regression as the estimator inside the match groups. We compare to the following state-of-the-art machine learning methods: double machine learning (DML), causal forest, and BART TLearner. Figure~\ref{fig:lcm-vs-ml} shows that LCM Mean performs on par with the machine learning methods on this dataset, further highlighting the accuracy of our method. LCM Linear improves upon LCM Mean, showing that we can achieve better accuracy with more sophisticated estimators if we are willing to increase the size of the match groups.

\begin{figure}
\centering
\includegraphics[width=0.6\linewidth]{figures/lcm_vs_ml.png}
    \caption{Estimated CATE absolute error relative to the true ATE for LCM Mean, LCM Linear, and state-of-the-art machine learning methods. DML stands for double machine learning.}
  \label{fig:lcm-vs-ml}
\end{figure}

\textbf{LCM vs Feature Selection}. Here we show CATE estimation accuracy of LCM compared to matching equally on the covariates after feature selection. To compare with LCM, we estimate CATEs using feature selection by simply following the same steps as LCM but replacing the $\mathcal{M}^*$ with an $\mathcal{M}\in\mathbb{R}^{p\times p}$ such that $\mathcal{M}_{l,l} = 1$ when $\mathcal{M}^*_{l,l} > 0$ and $\mathcal{M}_{l,l} = 0$ when $\mathcal{M}^*_{l,l} = 0$. We refer to this method as \textit{LASSO FS}. We also compare to an \textit{Oracle} feature selector in which we assume that we know which covariates are important and match equally only on the important covariates.

We run our analysis on three of the data generation processes used earlier in this paper. Namely, we run on the \textbf{Sine}, \textbf{Exponential}, and \textbf{Quadratic} DGPs described in Section~\ref{sec:dgps}. We generate 5000 samples and 100 covariates for each DGP and have two important covariates for \textbf{Sine}, four important covariates for \textbf{Exponential}, and five important covariates for \textbf{Quadratic}. All tests set $\eta=5$ and $K=10$. Figure~\ref{fig:lcm-vs-fs} shows that LCM outperforms LASSO feature selection and performs on par with an Oracle feature selector. This highlights how using the relative weights of feature importance values in a distance metric, and thus matching more tightly on covariates that contribute more heavily to the outcome, ultimately leads to more accurate CATE estimates.

\begin{figure}
\centering
\includegraphics[width=0.9\linewidth]{figures/lcm_vs_fs.png}
  \caption{Estimated CATE absolute error relative to the true ATE for LCM and matching equally on covariates after LASSO and Oracle feature selection.}
  \label{fig:lcm-vs-fs}
\end{figure}


\bibliography{lanners_407}

\end{document}
