%\documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
%\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{url}


%\usepackage[hypertexnames=false, colorlinks=true, allcolors=blue]{hyperref} hyperref is loaded twice (see .cls file)
% \renewcommand\backrefxxx[3]{%
%   \hyperlink{page.#1}{$\Lsh$ p.#1}%
% }


\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{booktabs}
\usepackage{hyperref}
\usepackage{algorithm}
\usepackage{algorithmic}
\urlstyle{same}

%%% added packages
\usepackage{amssymb} 
\usepackage{natbib}
\usepackage{multirow}
\usepackage{diagbox}
\usepackage{nccmath}
\usepackage{ascmac}
\usepackage{here}

% Theorem-like environments. No \theoremstyle is selected here and no
% shared-counter argument is given, so each environment is numbered with
% its own independent counter.
\newtheorem{theorem}{Theorem}
\newtheorem{proposition}{Proposition}
\newtheorem{lemma}{Lemma}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}
\newtheorem{example}{Example}
% Intentionally disabled: amsthm (loaded above) already provides "proof".
%\newtheorem{proof}{Proof}

% --- Independence relations ------------------------------------------------
% \indep / \dep: (conditional) independence symbol and its negation, drawn as
% two overlapping \perp glyphs and wrapped in \mathop.
\newcommand{\indep}{\mathop{\perp\!\!\!\!\perp}}
\newcommand{\dep}{\mathop{\not\perp\!\!\!\!\perp}}

% --- Genuine operators (operator spacing is intended) ----------------------
\DeclareMathOperator{\E}{\mathbb{E}}      % expectation
\DeclareMathOperator{\V}{\mathrm{Var}}    % variance
\DeclareMathOperator{\Cov}{\mathrm{Cov}}  % covariance
\DeclareMathOperator{\pr}{\mathrm{P}}     % probability / distribution
\DeclareMathOperator{\I}{\mathbf{I}}      % indicator function
\DeclareMathOperator{\s}{S}               % upright S
\DeclareMathOperator{\sh}{\hat{S}}        % upright S with a hat

% --- Plain symbols ----------------------------------------------------------
% These are ordinary symbols, not operators. Defining them with \newcommand
% avoids operator spacing next to other atoms and keeps the feature index
% \si in math italic inside the subscripts of \hzsi / \hosi, matching its
% appearance in X_{\si}.
\newcommand{\R}{\mathbb{R}}               % set of real numbers
\newcommand{\si}{m}                       % index of the feature under test
\newcommand{\hz}{\mathcal{H}_{0}}
\newcommand{\ho}{\mathcal{H}_{1}}
\newcommand{\hzsi}{\mathcal{H}_{0, \si}}
\newcommand{\hosi}{\mathcal{H}_{1, \si}}

\usepackage{bm}

% --- Bold Greek symbols (set with \bm) ---------------------------------------
\DeclareMathOperator{\bimu}{\bm{\mu}}
\DeclareMathOperator{\biSigma}{\bm{\Sigma}}
\DeclareMathOperator{\bieta}{\bm{\eta}}

% --- Bold-italic Latin letters ------------------------------------------------
\newcommand*{\biA}{\textbf{\textit{A}}}
\newcommand*{\bib}{\textbf{\textit{b}}}
\newcommand*{\biz}{\textbf{\textit{z}}}
\newcommand*{\bic}{\textbf{\textit{c}}}
\newcommand*{\bie}{\textbf{\textit{e}}}
\newcommand*{\biX}{\textbf{\textit{X}}}
\newcommand*{\bix}{\textbf{\textit{x}}}

% --- Potential outcomes Y^0, Y^1, Y^a and their primed copies ----------------
% \Yz/\yz and \Yo/\yo are deliberate aliases with identical expansions.
\newcommand*{\Yz}{Y^{0}}
\newcommand*{\Yo}{Y^{1}}
\newcommand*{\Ya}{Y^{a}}
\newcommand*{\yz}{Y^{0}}
\newcommand*{\yzp}{{Y^0}^{\prime}}
\newcommand*{\yo}{Y^{1}}
\newcommand*{\yop}{{Y^1}^{\prime}}

% --- Weights w^0, w^1, w^a and omega^{0,x}, omega^{1,x}, omega^{a,x} ---------
\newcommand*{\wpz}{w^{0}}
\newcommand*{\wpo}{w^{1}}
\newcommand*{\wpa}{w^{a}}
\newcommand*{\xn}{x}
\newcommand*{\ozx}{{\omega}^{0, \xn}}
\newcommand*{\oox}{{\omega}^{1, \xn}}
\newcommand*{\oax}{{\omega}^{a, \xn}}

% --- Miscellaneous symbols ---------------------------------------------------
\newcommand*{\xsid}{x_{\si}^{\star}}
\newcommand*{\nf}{d}
\newcommand*{\nrff}{r}


% Shortcuts that expand to a \Cref cross-reference to a section or
% subsection of this supplementary document.
% Section-level targets:
\newcommand*{\acontra}{\Cref{asec:contra}}
\newcommand*{\acounter}{\Cref{asec:counter}}
\newcommand*{\akernel}{\Cref{asec:kernel}}
% Subsection-level targets:
\newcommand*{\aprop}{\Cref{asubsec:prop1}}
\newcommand*{\ath}{\Cref{asubsec:thm1}}
\newcommand*{\acounterexp}{\Cref{asubsec:counterexp}}
\newcommand*{\aneuronexp}{\Cref{asubsec:neuronexp}}

%\newcommand{\acontra}{Appendix A}
%\newcommand{\acounter}{Appendix B}
%\newcommand{\akernel}{Appendix C}
%\newcommand{\aprop}{Appendix D.1}
%\newcommand{\ath}{Appendix D.2}
%\newcommand{\acounterexp}{Appendix E.1}
%\newcommand{\aneuronexp}{Appendix E.2}

\usepackage{txfonts}

\usepackage{cleveref}
% Names cleveref prints for the custom theorem-like environments:
% \crefname{<type>}{<singular>}{<plural>}. The plural form is used when
% several labels of the same type are referenced in one \cref/\Cref call.
\crefname{assumption}{assumption}{assumptions}
\crefname{proposition}{proposition}{propositions}
\crefname{example}{example}{examples}

% \widecheck{<math>}: a wide "check" accent built by mirroring \widehat.
%
% \widecheck hands its argument to \mathpalette, which calls \@widecheck
% with the current math style as #1 and the content as #2. \@widecheck:
%   1. typesets the content into box 0;
%   2. typesets into box 2 a \widehat placed over two invisible rules, one
%      with the height of box 0 and one with the width of box 0;
%   3. sets the depth of box 2 to minus the height of box 0;
%   4. computes \@tempdima = (height of box 0 + 2 * height of box 2) / 3;
%   5. rebuilds box 2 as its own vertical mirror image
%      (\scalebox{1}[-1]), lowered and then raised by \@tempdima;
%   6. overlays box 2 on box 0 with \ooalign.
% \scalebox is a graphics command (graphicx is loaded above).
\makeatletter
\DeclareRobustCommand\widecheck[1]{{\mathpalette\@widecheck{#1}}}
\def\@widecheck#1#2{%
   \setbox\z@\hbox{\m@th$#1#2$}%
   \setbox\tw@\hbox{\m@th$#1%
      \widehat{%
         \vrule\@width\z@\@height\ht\z@
         \vrule\@height\z@\@width\wd\z@}$}%
   \dp\tw@-\ht\z@
   \@tempdima\ht\z@ \advance\@tempdima2\ht\tw@ \divide\@tempdima\thr@@
   \setbox\tw@\hbox{%
      \raise\@tempdima\hbox{\scalebox{1}[-1]{\lower\@tempdima\box\tw@}}}%
   {\ooalign{\box\tw@ \cr \box\z@}}}
\makeatother

% Hyperlink appearance: colored link text, one color for every link type.
% NOTE(review): `Navy` must be a color name known to the color package in
% use; it is presumably made available by the document class -- confirm.
\hypersetup{
    colorlinks=true,
    allcolors=Navy 
}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)


\title{Feature Selection for Discovering Distributional Treatment Effect Modifiers}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{Yoichi Chikahara}
\author[2]{Makoto Yamada}
\author[2]{Hisashi Kashima}
% Add affiliations after the authors
\affil[1]{%
    NTT Communication Science Laboratories, Kyoto, Japan
}
\affil[2]{%
    Kyoto University, Kyoto, Japan
}
  
  \begin{document}


\appendix
\setcounter{theorem}{0}
\setcounter{proposition}{0}
\setcounter{table}{0}
\setcounter{equation}{0}
\renewcommand{\thetable}{A.\arabic{table}}
\renewcommand{\theequation}{A.\arabic{equation}}

\title{Feature Selection for Discovering Distributional Treatment Effect Modifiers (Supplementary material)}

\onecolumn

\maketitle


\section{Relationship between Marginal and Joint Distributions} \label{asec:contra}

To confirm that our feature importance measure is reasonable, we consider the following two relationships:
\begin{screen}
    \begin{itemize}
        \item \emph{If the discrepancy between marginal potential outcome distributions $\pr(\Yz \mid X_{\si})$ and $\pr(\Yo \mid X_{\si})$ varies with feature $X_{\si}$'s values, then joint distribution $\pr(\Yz, \Yo \mid X_{\si})$ also changes depending on $X_{\si}$'s values}.
        \item \emph{If joint distribution $\pr(\Yz, \Yo \mid X_{\si})$ changes depending on feature $X_{\si}$'s values, then some functionals of the joint distribution depend on $X_{\si}$'s values}.
    \end{itemize}
\end{screen}
Since the second relationship is obvious, in this section, we show that the first relationship holds. For simplicity, we consider binary feature $X_{\si} \in \{0, 1\}$; however, the following discussion also holds for discrete-valued and continuous-valued $X_{\si}$. 

To prove the first relationship, it is sufficient to show that its contrapositive holds: If $\pr(\Yz, \Yo \mid X_{\si} = 0) = \pr(\Yz, \Yo \mid X_{\si} = 1)$, then the discrepancy between $\pr(\Yz \mid X_{\si} = 0)$ and $\pr(\Yo \mid X_{\si} = 0)$ equals the one between $\pr(\Yz \mid X_{\si} = 1)$ and $\pr(\Yo \mid X_{\si} = 1)$. We can easily prove this contrapositive. From the equality of the joint distributions, marginalizing out the other potential outcome yields $\pr(\Yz \mid X_{\si} = 0) = \pr(\Yz \mid X_{\si} = 1)$ and $\pr(\Yo \mid X_{\si} = 0) = \pr(\Yo \mid X_{\si} = 1)$. These equalities imply that the discrepancy between $\pr(\Yz \mid X_{\si} = 0)$ and $\pr(\Yo \mid X_{\si} = 0)$ equals the one between $\pr(\Yz \mid X_{\si} = 1)$ and $\pr(\Yo \mid X_{\si} = 1)$. Thus, we have proved the first relationship.

\section{Counterexamples} \label{asec:counter}
 
As described in Section 3.1, there are several counterexamples where our method cannot find the features related to the functionals of the joint distribution of potential outcomes. 

Let $\Yz$ and $\Yo$ be the potential outcomes and $X \in \{0, 1\}$ be a binary feature. Suppose that the discrepancy between marginal distributions $\pr(\Yz \mid X)$ and $\pr(\Yo \mid X)$ is measured as the MMD \citep{gretton2012kernel}. Then we can represent such counterexamples as the cases where the following relations hold:
\begin{align*}
    &\pr(\Yz, \Yo \mid X=0) \neq  \pr(\Yz, \Yo \mid X=1) \\
    &\mathrm{MMD}^2( \pr(\Yz \mid X = 0), \pr(\Yo \mid X = 0)) = \mathrm{MMD}^2( \pr(\Yz \mid X = 1), \pr(\Yo \mid X = 1)).
\end{align*}

Letting the potential outcomes be $\Yz, \Yo \in \{-1, 0, 1\} \subset \R$, we give in \Cref{UAI2022-table_ex_2} an example of joint probability tables that satisfy the above relations. In this example, the MMD between marginal distributions remains unchanged:
\begin{align*}
    &\mathrm{MMD}^2( \pr(\Yz \mid X = 0), \pr(\Yo \mid X = 0)) = \mathrm{MMD}^2( \pr(\Yz \mid X = 1), \pr(\Yo \mid X = 1)) = 0.
\end{align*}
By contrast, the joint distribution changes depending on $X$'s values, as illustrated in \Cref{UAI2022-table_ex_2}. As a result, although the average treatment effect does not change, the treatment effect variance and the covariance between potential outcomes vary as follows:
\begin{align*}
    &\E[\Yo - \Yz \mid X=0] = \E[\Yo - \Yz \mid X=1] = 0\\
    &\Cov[\Yz, \Yo \mid X=0] = 1; \quad \Cov[\Yz, \Yo \mid X=1] = -1\\
    &\V[\Yo - \Yz \mid X=0] = 0; \quad \V[\Yo - \Yz \mid X=1] = 4.
\end{align*}

In this example, since we cannot detect any change in the MMD between marginal distributions, our method fails to find that feature $X$ is related to treatment effect heterogeneity. Note, however, that the existing mean-based approaches would also fail because the average treatment effect remains unchanged. 

Addressing such counterexamples is extremely difficult. It requires us to estimate the functionals of the joint potential outcome distribution; however, inferring such a joint distribution is impossible, as described in Section 3.1. One possible solution is to utilize several techniques for estimating the lower and upper bounds on these functionals by making additional assumptions \citep{chen2016inference,russell2021sharp,shingaki2021identification}. Establishing a feature selection framework that utilizes such lower and upper bounds remains our future work.


\begin{table}[t]
    \caption{Joint probability tables of potential outcomes. Nonzero probabilities are shown in bold. Total expresses marginal potential outcome probabilities.}
    \centering 
        \scalebox{1.}{
        \begin{tabular}{c|ccc|c}
            \toprule 
            \multicolumn{5}{c}{$\pr(\Yz, \Yo \mid X = 0)$} \\ \midrule
        \diagbox{$\Yz$}{$\Yo$}& -1        & 0    & 1        & Total \\ \midrule 
                            -1& {\bf 0.5} & 0    & 0        & {\bf 0.5} \\
                             0&        0  & 0    & 0        & 0 \\
                             1&        0  & 0    & {\bf 0.5}& {\bf 0.5} \\ \midrule
                         Total& {\bf 0.5} & 0    & {\bf 0.5}& {\bf 1.0} \\
            \bottomrule
        \end{tabular}
        } 
        \scalebox{1.}{
        \begin{tabular}{c|ccc|c}
            \toprule 
            \multicolumn{5}{c}{$\pr(\Yz, \Yo \mid X = 1)$} \\ \midrule
        \diagbox{$\Yz$}{$\Yo$}& -1        & 0    & 1        & Total \\ \midrule 
                            -1&        0  & 0    & {\bf 0.5}& {\bf 0.5} \\
                             0&        0  & 0    & 0        & 0 \\
                             1& {\bf 0.5} & 0    & 0        & {\bf 0.5} \\ \midrule
                         Total& {\bf 0.5} & 0    & {\bf 0.5}& {\bf 1.0} \\
            \bottomrule
        \end{tabular}
        }
\label{UAI2022-table_ex_2}

\end{table}




\section{Characteristic kernels} \label{asec:kernel}

This section provides a brief overview of characteristic kernels. For the formal definition, see, e.g., \citet{sriperumbudur2010hilbert} and \citet[Section 3.3.1]{muandet2017kernel}.

The notion of characteristic kernels is closely related to \textit{kernel mean embedding} \citep{smola2007hilbert}, which is defined as the mean of feature mapping induced by a kernel function. Let $k_X\colon \mathcal{X} \times \mathcal{X} \rightarrow \R$ be a symmetric and positive-definite kernel function and $\Phi_X(x) \coloneqq k_X(x, \cdot)$ be the feature mapping of kernel $k_X$ that maps point $x \in \mathcal{X}$ into reproducing kernel Hilbert space (RKHS) $\mathcal{H}_{k_X}$. Then kernel mean embedding is defined as the mean of random variable $\Phi_X(X)$:
\begin{align*}
    \mu_{X} \coloneqq \E_{X}[\Phi_X(X)] \in \mathcal{H}_{k_X}.
\end{align*}
Here, the expectation is taken with respect to distribution $\pr(X)$; therefore, the concept of kernel mean embedding can be regarded as a mapping of distribution $\pr(X)$ into the RKHS, i.e., $\pr(X) \mapsto \mu_X \in \mathcal{H}_{k_X}$.

A characteristic kernel is a kernel function whose kernel mean embedding does not map different distributions to the same point in the RKHS; that is, the mapping by kernel mean embedding is injective \citep{sriperumbudur2010hilbert}. 

Roughly speaking, a kernel function is characteristic if mean $\E_{X}[\Phi_X(X)]$ contains all moments of random variable $X$. For instance, Gaussian kernel $k_X(x, x') = \exp(- \frac{ (x - x')^2}{2 h_X^2})$ for $x, x' \in \mathbb{R}^1$ is characteristic because the feature mapping is given as $\Phi_X(x) = \mathrm{e}^{- x^2 / (2 h_X^2)} [1, \sqrt{\frac{1}{1! h_X^2}}x, \sqrt{\frac{1}{2! h_X^4}}x^2, \dots ]^{\top}$, and its expected value $\mathbb{E}_X[\Phi_X(X)]$ includes all moments: $\mathbb{E}_X[X], \mathbb{E}_X[X^2], \dots$.

By contrast, if $k_X$ is given as a polynomial function (i.e., polynomial kernel), $k_X$ is \textbf{not} a characteristic kernel. For instance, if $k_X$ is formulated as the 2nd-order polynomial kernel $k_X(x, x') = (1 + xx')^2$ for $x, x' \in \mathbb{R}^1$, the feature mapping is given as the finite-dimensional vector $\Phi_X(x) = [1, \sqrt{2} x, x^2]^{\top}$. In this case, no element in expectation $\mathbb{E}_X [\Phi_X(X)]$ is represented as a function of moments of order higher than $2$; hence, kernel $k_X$ is not characteristic.


\section{Proofs}

\subsection{Proposition 1} \label{asubsec:prop1}


\begin{proof}
    Recall the following definition of $\mbox{WCMMD}^2_{X_{\si} = \xn}$:
    \begin{align} 
        &\mbox{WCMMD}^2_{X_{\si} = \xn} \nonumber \\
        \coloneqq &\E_{A, A', \biX_{-\si}, \biX'_{-\si}, Y, Y' \mid X_{\si} = X'_{\si} = \xn}[ \wpz(A, \biX) \wpz(A', \biX') k_Y(Y, Y')] \nonumber \\
        + &\E_{A, A', \biX_{-\si}, \biX'_{-\si}, Y, Y' \mid X_{\si} = X'_{\si} = \xn}[\wpo(A, \biX) \wpo(A', \biX') k_Y(Y, Y')] \nonumber \\%\nonumber\\
        - &2 \E_{A, A', \biX_{-\si}, \biX'_{-\si}, Y, Y' \mid X_{\si} = X'_{\si} = \xn}[\wpz(A, \biX) \wpo(A', \biX') k_Y(Y, Y')]. \tag{5}
    \end{align}

    We show that the first term in Eq. (5) equals the one in $D^2_{\si}(\xn)$ in Eq. (2). Using conditional ignorability and positivity assumptions, we have
    \begin{align*}
        &\E_{A, A', \biX_{-\si}, \biX'_{-\si}, Y, Y' \mid X_{\si} = \xn, X'_{\si} = \xn}[ \wpz(A, \biX) \wpz(A', \biX') k_Y(Y, Y')] \\
        = &\E_{\biX_{-\si}, \biX'_{-\si} \mid X_{\si} = \xn, X'_{\si} = \xn}\left[ \E_{A, A', Y, Y' \mid \biX_{-\si}, \biX'_{-\si}, X_{\si} = \xn, X'_{\si} = \xn}\left[\frac{\I(A=0)}{1 - \mathrm{e}(\biX)} \frac{\I(A'=0)}{1 - \mathrm{e}(\biX')} k_Y(Y, Y') \right] \right]\\
        = &\E_{\biX_{-\si}, \biX'_{-\si} \mid X_{\si} = \xn, X'_{\si} = \xn, A = 0, A' =0}\left[ \E_{\Yz, \yzp \mid \biX_{-\si}, \biX'_{-\si}, X_{\si} = \xn, X'_{\si} = \xn, A = 0, A' = 0}\left[ \frac{\pr(A=0)}{\pr(A=0 \mid \biX)} \frac{\pr(A'=0)}{\pr(A'=0 \mid \biX')}  k_Y(Y, Y') \right] \right] \\
        = &\E_{\biX_{-\si}, \biX'_{-\si} \mid X_{\si} = \xn, X'_{\si} = \xn}[ \E_{\Yz, \yzp \mid \biX_{-\si}, \biX'_{-\si}, X_{\si} = \xn, X'_{\si} = \xn}[k_Y(\Yz, \yzp)] ] \\ 
        = &\E_{\Yz, \yzp \mid X_{\si} = \xn, X'_{\si} = \xn}[k_Y(\Yz, \yzp)].
    \end{align*}
    Similarly, the second and third terms in Eq. (5) equal those in $\mathrm{MMD}^2(\pr(\Yz \mid \xn), \pr(\Yo \mid \xn))$ in Eq. (2). Thus we proved Proposition 1.
\end{proof}

\subsection{Theorem 1} \label{asubsec:thm1}

From Proposition 1, we only have to show that $\widehat{D}^2_{\si}(\xn) \overset{p}{\rightarrow} \mbox{WCMMD}^2_{X_{\si} = \xn}$ ($n \rightarrow \infty$) under the assumptions of conditional ignorability and positivity:
\begin{assumption}[Conditional ignorability] \label{UAI2022-asmp1}
    For treatment $A$, features $\biX$, and potential outcomes $\Yz$ and $\Yo$, the following conditional independence relation holds:
    \begin{align*}
        \{\Yz, \Yo\} \indep A \mid \biX.
    \end{align*} 
\end{assumption}
\begin{assumption}[Positivity] \label{UAI2022-asmp2}
    For any value $\bix$ of features $\biX$, propensity score $\mathrm{e}(\biX)$ satisfies the following support condition:
    \begin{align*}
        0 < \mathrm{e}(\bix) < 1.
    \end{align*}
\end{assumption}

To prove $\widehat{D}^2_{\si}(\xn) \overset{p}{\rightarrow} \mbox{WCMMD}^2_{X_{\si} = \xn}$ ($n \rightarrow \infty$), we make several additional assumptions and impose the condition that the following symmetric function is square integrable: 
\begin{align*}
    &K((A, \biX, Y), (A', \biX', Y')) \nonumber \\
    \coloneqq &\left( \wpz(A, \biX) \wpz(A', \biX') + \wpo(A, \biX) \wpo(A', \biX') - \wpz(A, \biX) \wpo(A', \biX') - \wpo(A, \biX) \wpz(A', \biX') \right) k_Y(Y, Y'). 
\end{align*}
\begin{assumption} \label{UAI2022-asmp3}
    Symmetric function $K$ is square integrable:
    \begin{align*}
        \E_{A, A', \biX, \biX', Y, Y'}[K((A, \biX, Y), (A', \biX', Y'))] < \infty.
    \end{align*}
\end{assumption}
When $X_{\si}$ is continuous-valued, and $\oax$ is given by Eq. (8), we make the following standard assumptions on kernel function $k_{X_{\si}}$:
\begin{assumption} \label{UAI2022-asmp4}
    Let $K_{X_{\si}}$ be the following kernel function that measures the similarity between two values $x_{\si}$ and $x^{\star}_{\si}$ on $\mathcal{X}$:
    \begin{align*}
     K_{X_{\si}}(x_{\si} - x^{\star}_{\si}) \coloneqq \frac{1}{h_{X_{\si}}} k_{X_{\si}}(x_{\si}, x^{\star}_{\si}).
    \end{align*}
    Then the order of function $K_{X_{\si}}(u)$ is given by integer $\delta \geq 2$; in other words, the following holds:
    \begin{align*}
        \int u^{\delta} K_{X_{\si}}(u) du < \infty.
    \end{align*}
\end{assumption}
\begin{assumption} \label{UAI2022-asmp5}
    Bandwidth $h_{X_{\si}}$ of kernel function $k_{X_{\si}}$ satisfies 
    \begin{align*}
       h_{X_{\si}} \rightarrow 0 \quad \mbox{and} \quad nh_{X_{\si}} \rightarrow \infty \quad (n \rightarrow \infty).
    \end{align*}
\end{assumption}
In addition, we impose the smoothness conditions on marginal distribution $\pr(X_{\si})$ and the joint distribution of features $\pr(\biX)$:
\begin{assumption} \label{UAI2022-asmp6}
    Density functions $\pr(X_{\si})$ and $\pr(\biX)$ are $\delta$ times continuously differentiable.
\end{assumption}

Using these assumptions, we prove Theorem 1:

\begin{proof}
    \textbf{The case where weight $\oax_i$ is given by Eq. (6):} Let $K_{i, j} \coloneqq K((a_i, \bix_i, y_i), (a_j, \bix_j, y_j))$ for $i, j \in \{1, \dots, n\}$ and $n_{\xn} \coloneqq \sum_{i=1}^n \I(x_{\si, i} = \xn)$. Then empirical estimator $\widehat{D}^2_{\si}(\xn)$ is given as
    \begin{align*}
        \widehat{D}^2_{\si}(\xn) &= \frac{1}{n^2_{\xn}} \sum_{i=1}^n \sum_{j=1}^n \I(x_{\si, i} = \xn) \I(x_{\si, j} = \xn) K_{i, j}  \\
        &= \left( \frac{n}{n_{\xn}} \right)^2  \frac{1}{n^2} \sum_{i=1}^n \sum_{j=1}^n \I(x_{\si, i} = \xn) \I(x_{\si, j} = \xn) K_{i, j}  \\
        &=  \left( \frac{n}{n_{\xn}} \right)^2 V_n^{\xn},
    \end{align*}
    where 
    \begin{align*}
        V_n^{\xn} \coloneqq \frac{1}{n^2} \sum_{i=1}^n \sum_{j=1}^n \I(x_{\si, i} = \xn) \I(x_{\si, j} = \xn) K_{i, j}
    \end{align*}
    is a V-statistic whose corresponding U-statistic is given as
    \begin{align*}
        U_n^{\xn} \coloneqq \frac{1}{{}_n \mathrm{C}_2} \sum_{i < j} \I(x_{\si, i} = \xn) \I(x_{\si, j} = \xn) K_{i, j}.
    \end{align*}

    We prove the consistency of $\widehat{D}^2_{\si}(\xn)$ by showing the following three relations: 
    \begin{align}
        &U_n^{\xn} \overset{a.s.}{\rightarrow} \E_{A, A', \biX, \biX', Y, Y'}[\I(X_{\si} = \xn) \I(X'_{\si} = \xn)  K((A, \biX, Y), (A', \biX', Y'))] \label{UAI2022-proof_disc1} \\
        &\left( \frac{n}{n_{\xn}} \right)^2 U_n^{\xn} \overset{a.s.}{\rightarrow} \mbox{WCMMD}^2_{X_{\si} = \xn} \label{UAI2022-proof_disc2} \\
        &U_n^{\xn} - V_n^{\xn} \overset{p}{\rightarrow} 0 \label{UAI2022-proof_disc3}. 
    \end{align}
    Relation \eqref{UAI2022-proof_disc1} holds from the Strong Law of Large Numbers for U-statistics \citep{hoeffding1961strong}. By combining this relation with the fact that $\frac{n_{\xn}}{n} = \frac{1}{n} \sum_{i=1}^n \I(x_{\si, i} = \xn)  \overset{a.s.}{\rightarrow} \pr(X_{\si} = \xn)$, we can derive the relation in Eq. \eqref{UAI2022-proof_disc2}. The relation in Eq. \eqref{UAI2022-proof_disc3} can be shown as follows. Under \Cref{UAI2022-asmp3}, since $\E[K((A, \biX, Y), (A', \biX', Y'))] \leq \E[K((A, \biX, Y), (A, \biX, Y))] < \infty$, by employing Lemma 5.7.3 in \citet{serfling2009approximation}, we have $\E[| U_n^{\xn} - V_n^{\xn} |] = O(n^{-1})$,  and thus by applying Markov's inequality, we have 
    \begin{align*}
        \pr(| U_n^{\xn} - V_n^{\xn} | \geq \epsilon) \leq \frac{\E[| U_n^{\xn} - V_n^{\xn} |]}{\epsilon} \rightarrow 0 \quad \mbox{as $n \rightarrow \infty$},
    \end{align*}
    which is sufficient to prove the relation in Eq. \eqref{UAI2022-proof_disc3}.

    By combining Eqs. \eqref{UAI2022-proof_disc1}, \eqref{UAI2022-proof_disc2}, and \eqref{UAI2022-proof_disc3}, we have $\widehat{D}^2_{\si}(\xn) \overset{p}{\rightarrow} \mbox{WCMMD}^2_{X_{\si} = \xn}$ as $n \rightarrow \infty$. Since Proposition 1 holds under \Cref{UAI2022-asmp1,UAI2022-asmp2}, we have $\widehat{D}^2_{\si}(\xn) \overset{p}{\rightarrow} D^2_{\si}(\xn)$ as $n \rightarrow \infty$. Thus, we have proved the consistency of $\widehat{D}^2_{\si}(\xn)$.

    \textbf{The case where weight $\oax_i$ is given by Eq. (8):}

    In this case, empirical estimator $\widehat{D}^2_{\si}(\xn)$ is given as
    \begin{align}
        \widehat{D}^2_{\si}(\xn) &= \frac{\frac{1}{n^2 h^2_{X_{\si}}} \sum_{i=1}^n \sum_{j=1}^n k_{X_{\si}} (x_{\si, i}, \xn) k_{X_{\si}} (x_{\si, j}, \xn) K_{i, j}}{\frac{1}{n^2 h^2_{X_{\si}}} \sum_{i=1}^n \sum_{j=1}^n k_{X_{\si}} (x_{\si, i} , \xn) k_{X_{\si}} (x_{\si, j} , \xn)}. \label{UAI2022-proof_cont}
    \end{align}

    From the Strong Law of Large Numbers, as $n \rightarrow \infty$, the numerator in Eq. \eqref{UAI2022-proof_cont} converges to the following expected value:
    \begin{align*}
        \E_{A, A', \biX, \biX', Y, Y'}\left[ \frac{1}{h^2_{X_{\si}}} K_{X_{\si}} \left(\frac{X_{\si} - \xn}{h_{X_{\si}}}\right) K_{X_{\si}} \left(\frac{X'_{\si} - \xn}{h_{X_{\si}}}\right) K((A, \biX, Y), (A', \biX', Y'))\right].
    \end{align*}
    Under \Cref{UAI2022-asmp4,UAI2022-asmp6}, we can reformulate this expected value by performing a Taylor expansion as follows:
    \begin{align}
        &\E_{A, A', \biX, \biX', Y, Y'}\left[ \frac{1}{h^2_{X_{\si}}} K_{X_{\si}} \left(\frac{X_{\si} - \xn}{h_{X_{\si}}}\right) K_{X_{\si}} \left(\frac{X'_{\si} - \xn}{h_{X_{\si}}}\right) K((A, \biX, Y), (A', \biX', Y'))\right] \nonumber \\
        = &\E_{U=u, V=v}[ \E_{A, A', \biX_{-\si}, \biX'_{-\si}, Y, Y' \mid X_{\si} = \xn + h_{X_{\si}} u, X'_{\si} = \xn + h_{X_{\si}} v}[ \pr(X_{\si} = \xn + h_{X_{\si}} u) \pr(X'_{\si} = \xn + h_{X_{\si}} v) K_{X_{\si}}(u) K_{X_{\si}}(v) K((A, \biX, Y), (A', \biX', Y')) ]] \nonumber \\
        = &\E_{A, A', \biX_{-\si}, \biX'_{-\si}, Y, Y' \mid X_{\si} = \xn, X'_{\si} = \xn}[\pr^2(X_{\si} = \xn) K((A, \biX, Y), (A', \biX', Y'))] + O_p \left(h_{X_{\si}}^{\delta}\right). \label{UAI2022-proof_cont1}
    \end{align}

    Regarding the denominator in Eq. \eqref{UAI2022-proof_cont}, from the consistency results of the kernel density estimator in \citet{wied2012consistency}, we have
    \begin{align}
        \frac{1}{n h_{X_{\si}}} \sum_{j=1}^n k_{X_{\si}} (x_{\si, j}, \xn) \overset{a.s.}{\rightarrow} \pr(X_{\si} = \xn). \label{UAI2022-proof_cont2}
    \end{align}

    By combining Eqs. \eqref{UAI2022-proof_cont1} and \eqref{UAI2022-proof_cont2}, under \Cref{UAI2022-asmp5}, we have $\widehat{D}^2_{\si}(\xn) \overset{p}{\rightarrow} \mbox{WCMMD}^2_{X_{\si} = \xn}$ as $n \rightarrow \infty$. Using Proposition 1, we have $\widehat{D}^2_{\si}(\xn) \overset{p}{\rightarrow} D^2_{\si}(\xn)$ as $n \rightarrow \infty$. Thus we proved the consistency of $\widehat{D}^2_{\si}(\xn)$. 

\end{proof}

\section{Additional Experimental Results}

In what follows, we present several additional synthetic data experiments to further evaluate the performance of our method. \Cref{asubsec:counterexp} shows the performance on the data where the truly relevant features do not affect the discrepancy between marginal potential outcome distributions, which is our inference target. \Cref{asubsec:neuronexp} displays the results when using different neural network architectures in the models of propensity score and CVAE.

\subsection{Examining Counterexamples} \label{asubsec:counterexp}

This section presents the performance of our method on the synthetic data where the features do not influence the discrepancy between conditional distributions $\pr(\Yz \mid X_{\si})$ and $\pr(\Yo \mid X_{\si})$ but affect joint distribution $\pr(\Yz, \Yo \mid X_{\si})$. With such data, our method does not work well because it relies on the discrepancy between $\pr(\Yz \mid X_{\si})$ and $\pr(\Yo \mid X_{\si})$, as described in Section 3.1.

To evaluate the performance, we prepared synthetic data in a similar manner to Section 4.2, which only differs in the generation process of potential outcomes $\Yz$ and $\Yo$. Here, we set the sample size to $n=2000$ and sampled the values of $\Yz$ and $\Yo$ from the following $2$-dimensional Gaussian distributions:
\begin{itemize}
    \item \textbf{LinCovar}:
\begin{align}
    \left[
        \begin{array}{c}
            \Yz \\
            \Yo
        \end{array} \right]
    \sim \mathcal{N}\left(
        \left[
            \begin{array}{c}
                -5 \\
                0
            \end{array} \right],
            \left[
                \begin{array}{cc}
                    1 & 1 - \frac{1}{h(f(X_1,\dots,X_5))}\\
                    1 - \frac{1}{h(f(X_1,\dots,X_5))} & 1
                \end{array} \right]                    
       \right),     
\end{align}
    \item \textbf{NonlinCovar}:
    \begin{align}
        \left[
            \begin{array}{c}
                \Yz \\
                \Yo
            \end{array} \right]
        \sim \mathcal{N}\left(
            \left[
                \begin{array}{c}
                    -5 \\
                    0
                \end{array} \right],
                \left[
                    \begin{array}{cc}
                        1 & 1 - \frac{1}{h(g(X_1,\dots,X_5))}\\
                        1 - \frac{1}{h(g(X_1,\dots,X_5))} & 1
                    \end{array} \right]                    
           \right), 
    \end{align}    
\end{itemize}
where functions $f$, $g$, and $h$ are presented in Section 4.2. Under LinCovar and NonlinCovar, features $X_1, \dots, X_5$ only influence the covariance between potential outcomes $\Yz$ and $\Yo$ and do not affect any functionals of the marginal distributions.

We performed $50$ experiments and evaluated the mean and standard deviation of the TPRs and FPRs. \Cref{UAI2022-table_ex_counter} presents the results. As expected, our method could not correctly select features $X_1, \dots, X_5$ because their values do not affect the discrepancy between conditional potential outcome distributions. 


\begin{table}[t]
    \caption{TPRs and FPRs of our method on LinCovar and NonlinCovar datasets. Mean and standard deviation over $50$ runs are shown.}
    \centering 
        \scalebox{1.}{
        \begin{tabular}{c|cc}
            \toprule 
                              & TPR        & FPR \\ \midrule 
                            LinCovar & 0.02 $\pm$ 0.06 & 0.02 $\pm$ 0.02 \\
                            NonlinCovar & 0.04 $\pm$ 0.08  & 0.02 $\pm$ 0.02  \\
            \bottomrule
        \end{tabular}
        } 

\label{UAI2022-table_ex_counter}

\end{table}

Note, however, that selecting these features is extremely challenging because it is impossible to estimate the covariance since we cannot infer the joint distribution of potential outcomes, as described in Section 3.1. Due to this difficulty, all of the existing mean-based methods also fail, and compared with such methods, ours can detect a wider variety of features.

\subsection{Performance Evaluation with Different Neural Network Architectures} \label{asubsec:neuronexp}

Since our method relies on two neural network models to represent propensity function $\mathrm{e}(\biX)$ and CVAE $\mathcal{L}(X_{\si} \mid \biX_{-\si})$ ($\si = 1, \dots, \nf$), we examined how much the neural network architectures affect the overall feature selection performance.

For this purpose, we performed additional synthetic data experiments with sample size $n = 1000$. We evaluated the mean and standard deviation of TPRs and FPRs over $50$ runs by changing the number of neurons of each layer in the two-layered neural network models, which is fixed to $50$ for propensity score and to $128$ for CVAE in the experiments in Section 4.2.

\Cref{UAI2022-table_neurons_propensity,UAI2022-table_neurons_cvae} display the results. With all synthetic datasets, the number of neurons in propensity score and CVAE did not greatly affect the performance. 

\newpage

\begin{table}[t]
    \caption{TPRs and FPRs of our method with different numbers of neurons in propensity score model. Mean and standard deviation over $50$ runs are shown.}
    \centering 
    \begin{tabular}{llcccc}
        \toprule
        &  & \multicolumn{4}{c}{Number of neurons in propensity score model}           \\

                   & & 25            & 50            & 100           & 200           \\ \midrule
    \multirow{2}{*}{LinMean}    & TPR     & 0.80$\pm$0.21 & 0.79$\pm$0.22 & 0.84$\pm$0.14 & 0.84$\pm$0.16 \\
                                & FPR     & 0.06$\pm$0.06 & 0.06$\pm$0.07 & 0.08$\pm$0.06 & 0.08$\pm$0.06 \\ 
    \multirow{2}{*}{NonlinMean} & TPR     & 0.95$\pm$0.10 & 0.94$\pm$0.12 & 0.98$\pm$0.06 & 0.97$\pm$0.08             \\
                                & FPR     & 0.04$\pm$0.04 & 0.04$\pm$0.04 & 0.03$\pm$0.03 & 0.05$\pm$0.04  \\
    \multirow{2}{*}{LinVar}     & TPR     & 0.71$\pm$0.19 & 0.73$\pm$0.19 & 0.77$\pm$0.16 & 0.76$\pm$0.18    \\
                                & FPR     & 0.08$\pm$0.07 & 0.07$\pm$0.08 & 0.10$\pm$0.07 & 0.09$\pm$0.07    \\
    \multirow{2}{*}{NonlinVar}  & TPR     & 0.64$\pm$0.25 & 0.62$\pm$0.25 & 0.63$\pm$0.26 & 0.64$\pm$0.25    \\
                                & FPR     & 0.04$\pm$0.04 & 0.04$\pm$0.04 & 0.04$\pm$0.04 & 0.04$\pm$0.04  \\ \bottomrule
    \end{tabular}
    \label{UAI2022-table_neurons_propensity}
\end{table}

\begin{table}[t]
    \caption{TPRs and FPRs of our method with different numbers of neurons in CVAE model. Mean and standard deviation over $50$ runs are shown.}
    \centering     
    \begin{tabular}{llcccc}
        \toprule
        &  & \multicolumn{4}{c}{Number of neurons in CVAE model}           \\
        &  & 16            & 64            & 128           & 256           \\ \midrule
        \multirow{2}{*}{LinMean}    & TPR     & 0.82$\pm$0.18 & 0.82$\pm$0.17 & 0.79$\pm$0.22 & 0.83$\pm$0.16 \\
                                & FPR     & 0.08$\pm$0.06 & 0.07$\pm$0.06 & 0.06$\pm$0.07 & 0.10$\pm$0.07 \\ 
    \multirow{2}{*}{NonlinMean} & TPR     & 0.96$\pm$0.09 & 0.98$\pm$0.06 & 0.94$\pm$0.12 & 0.94$\pm$0.05 \\
                                & FPR     & 0.04$\pm$0.04 & 0.03$\pm$0.03 & 0.04$\pm$0.04 & 0.05$\pm$0.04  \\
    \multirow{2}{*}{LinVar}     & TPR     & 0.68$\pm$0.19 & 0.66$\pm$0.17 & 0.73$\pm$0.19 & 0.70$\pm$0.16    \\
                                & FPR     & 0.07$\pm$0.05 & 0.06$\pm$0.05 & 0.07$\pm$0.08 & 0.08$\pm$0.07   \\
    \multirow{2}{*}{NonlinVar}  & TPR     & 0.58$\pm$0.25 & 0.56$\pm$0.25 & 0.62$\pm$0.25 & 0.60$\pm$0.20  \\
                                & FPR     & 0.02$\pm$0.03 & 0.03$\pm$0.03 & 0.04$\pm$0.04 & 0.04$\pm$0.05  \\ \bottomrule
    \end{tabular}
    \label{UAI2022-table_neurons_cvae}
\end{table}

\bibliography{chikahara_61-supp}

\end{document}