% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
% \usepackage[american]{babel}
\usepackage[british]{babel}

\usepackage[utf8]{inputenc}
\usepackage{philip}

\title{Correcting for Selection Bias and Missing Response \\ in Regression using Privileged Information (Supplementary Material)}

\author[1,2]{\href{mailto:philip.boeken@gmail.com?subject=Your UAI 2023 paper}{Philip~Boeken}}
\author[1]{Noud~de~Kroon}
\author[2]{Mathijs~de~Jong}
\author[1]{Joris~M.~Mooij}
\author[2]{Onno~Zoeter}
\affil[1]{%
    Korteweg-de Vries Institute for Mathematics\\
    University of Amsterdam\\
    The Netherlands
}
\affil[2]{%
    Booking.com\\
    The Netherlands
}

\begin{document}

\onecolumn

\maketitle

\section{Selection bias and Missingness}
A schematic display of the available data under missingness and under selection bias with external data is provided in Figure~\ref{fig:sbias_vs_missingness}. Note that under missingness we can estimate $\PP(S=1|X, Z)$ from the dataset $\Dcal$; under selection bias this is not possible.

\begin{figure}[!htb]
    \centering
    \begin{subfigure}{\linewidth}
        \centering
        \begin{tabular}{ccllclc}
            \cmidrule[\heavyrulewidth]{3-6}
                                              &       & $X$                                      & $Z$       & $S$                                                      & $Y$                            & \\
            \cmidrule[\heavyrulewidth]{3-6}
            \ldelim\{{7}[-3pt]{8pt}[$\Dcal$]  &
            \ldelim\{{3}[-3pt]{10pt}[$\Scal$] &
            $x_1$                             & $z_1$ & 1                                        & $y_1$     & \multirow{3}{*}{\hspace{-8pt}\small$\PP(X, Y, Z | S=1)$}                                    \\
                                              &       & \vdots                                   &           &                                                          &                                & \\
                                              &       & $x_m$                                    & $z_m$     & 1                                                        & $y_m$                            \\
            \cmidrule{3-6}
                                              &       & $x_{m+1}$                                & $z_{m+1}$ & 0                                                        & $y_{m+1}$                      & \\
                                              &       & \vdots                                   &           &                                                          &                                & \\
                                              &       & $x_n$                                    & $z_n$     & 0                                                        & $y_n$~~~~~\boxit{23pt}{36.4pt} & \\
            \cmidrule[\heavyrulewidth]{3-6}
                                              &       & \multicolumn{3}{c}{\small$\PP(X, Z, S)$} &           &
        \end{tabular}
        \caption{Missing response}
    \end{subfigure}\\
    \vspace{20pt}
    \begin{subfigure}{.8\linewidth}
        \centering
        \begin{minipage}[t][][b]{.55\linewidth}
            \centering
            \begin{tabular}{cllclc}
                \cmidrule[\heavyrulewidth]{2-5}
                                                  & $X$       & $Z$       & $S$   & $Y$                                                      & \\
                \cmidrule[\heavyrulewidth]{2-5}

                \ldelim\{{3}[-3pt]{10pt}[$\Scal$] &
                $x_1$                             & $z_1$     & 1         & $y_1$ & \multirow{3}{*}{\hspace{-8pt}\small$\PP(X, Y, Z | S=1)$}   \\
                                                  & \vdots    &           &       &                                                          & \\
                                                  & $x_m$     & $z_m$     & 1     & $y_m$                                                      \\
                \cmidrule{2-5}
                                                  & $x_{m+1}$ & $z_{m+1}$ & 0     & $y_{m+1}$                                                & \\
                                                  & \vdots    &           &       &                                                          & \\
                                                  & $x_n$     & $z_n$     & 0     & $y_n$~~~~~\boxit{113pt}{36.4pt}                          & \\
                \cmidrule[\heavyrulewidth]{2-5}
            \end{tabular}
        \end{minipage}%
        \begin{minipage}[t][][b]{.35\linewidth}
            \centering
            \begin{tabular}{cll}
                \cmidrule[\heavyrulewidth]{2-3}
                                                    & $X$                                   & $Z$    \\
                \cmidrule[\heavyrulewidth]{2-3}
                \ldelim\{{6.3}[-3pt]{10pt}[$\Dcal$] & $x_1$                                 & $z_1$  \\
                                                    & \vdots                                & \vdots \\
                                                    & \vdots                                & \vdots \\
                                                    & \vdots                                & \vdots \\
                                                    & $x_n$                                 & $z_n$  \\
                \cmidrule[\heavyrulewidth]{2-3}
                                                    & \multicolumn{2}{c}{\small$\PP(X, Z)$}
            \end{tabular}
        \end{minipage}
        \caption{Selection bias with external data}
    \end{subfigure}%
    \caption{Available data under missingness and under selection bias with external data. Grayed-out areas indicate unobserved data.}
    \label{fig:sbias_vs_missingness}
\end{figure}

In the PMAR setting we have $Y\Indep S \given X, Z$ and we are only given values of $X$ at test time; our target is to estimate the function $\EE[Y|X]$.

\newpage
\section{Importance weighting}
For estimating the parameter $\beta^*$ in the regression model $\EE[Y|X] = g(X; \beta^*)$ we often specify a loss function $\ell$ and perform empirical risk minimisation~\eqref{eqn:erm} as an approximation of the optimal parameter in terms of the true risk~\eqref{eqn:risk}.
\begin{align}
    \hat{\beta} & = \argmin_\beta \frac{1}{n}\sum_{i=1}^n \ell(g(X_i; \beta), Y_i) \label{eqn:erm} \\
    \beta^*     & = \argmin_\beta \EE[\ell(g(X; \beta), Y)] \label{eqn:risk}
\end{align}
Writing $f(x, y) := \ell(g(x; \beta), y)$, we can express the risk in terms of the distribution conditional on $S=1$ using importance weighting:
\begin{align}
    \begin{split}
        \EE[f(X, Y)] &= \int f(x, y)p(x, y, z)\diff (x, y, z) \\
        &= \int f(x, y)p(x, y, z)\frac{p(x, y, z | S=1)}{p(x, y, z | S=1)}\diff (x, y, z) \\
        &= \int f(x, y)\frac{p(x, y, z)\PP(S=1)}{p(x, y, z, S=1)}p(x, y, z | S=1)\diff (x, y, z) \\
        &= \int f(x, y)\frac{\PP(S=1)}{\PP(S=1 | x, y, z)}p(x, y, z | S=1)\diff (x, y, z) \\
        &= \int f(x, y)\frac{\PP(S=1)}{\PP(S=1 | x, z)}p(x, y, z | S=1)\diff (x, y, z) \\
        &= \int f(x, y)w(x, z)p(x, y, z | S=1)\diff (x, y, z) \\
        &= \EE[w(X, Z)f(X, Y) | S=1],
    \end{split}
\end{align}
where in the fifth equality we use the conditional independence $Y\Indep S \given X, Z$, and where we define the \emph{importance weights}
\begin{equation}
    w(x, z) := \frac{\PP(S=1)}{\PP(S=1 | x, z)}.
\end{equation}
Since $\beta^* = \argmin_\beta \EE[w(X, Z)\ell(g(X; \beta), Y) | S=1]$, when we have observations $(X_1, Z_1, Y_1), \dots, (X_n, Z_n, Y_n) \sim \PP(X, Z, Y | S=1)$, we can directly perform empirical risk minimisation on this dataset using the weighted loss:
\begin{equation}
    \hat{\beta} = \argmin_\beta \frac{1}{n}\sum_{i=1}^n w(x_i, z_i)\ell(g(x_i; \beta), y_i).
\end{equation}

% \newpage
\section{Simulations}
We performed a brute-force search of all \emph{Acyclic Directed Mixed Graphs} (ADMGs) that satisfy $X\nPerp Y$ and the PMAR pattern of d-separations $Y\nPerp S$, $Y\nPerp S\given X$, and $Y\Perp S \given X, Z$, using the \code{pcalg} package \citep{kalisch2012causal}, resulting in 550 ADMGs. For reference, all 55 DAGs are depicted in Figures~\ref{fig:dag_s_sink} and~\ref{fig:dag_s_not_sink}, where the graphs are categorised by whether $S$ has any children or not. The remaining 495 ADMGs are not depicted here.

For each of the 550 ADMGs, we simulate 50 datasets according to the following procedure. Throughout, let $V, S, X$ etc.\ denote vertices in the graph, and let $x_V, x_S, x_X$ denote vectors of dimension $n=2000$ for the simulated values.
\begin{itemize}
    \item First, we replace any bidirected edge pointing to variables $\bm{V}$ by a variable $U_{\bm{V}}$, and let $\bm{V}$ be the children of $U_{\bm{V}}$. This turns the ADMG into a DAG.
    \item Then, we calculate a topological order of the DAG.
    \item For every variable $V$ in the topological order, we simulate 2000 observations as follows:
    \begin{itemize}
        \item If $V$ has no parents, then
        \begin{itemize}
            \item if $V=S$, draw $x_S \sim \textrm{Bernoulli}(1/3)$;
            \item otherwise, $V\neq S$ and we draw $x_V \sim \Rcal\Dcal$, as defined below.
        \end{itemize}
        \item Otherwise, denote the parents of $V$ with $\bm{Pa}$ and their value $\bm{x}_{\bm{Pa}}$, and then
        \begin{itemize}
            \item if $V = S$, draw $(x_S)_i \sim \textrm{Bernoulli}(p((x_{\bm{Pa}})_i))$ where
            \begin{equation}
                p((x_{\bm{Pa}})_i) := \prod_{v\in \bm{Pa}}\sigma((x_v)_i)
            \end{equation}
            and $\sigma(x) := (1+e^{20x})^{-1}$;
            \item otherwise, $V\neq S$ and
            \begin{itemize}
                \item draw a random function $f_V$ from a Gaussian process on $\RR^{|\bm{Pa}\setminus \{S\}|}$ as $f_V\sim\Gcal\Pcal(0, K_M)$;
                \item draw noise $\varepsilon_V \sim \Rcal\Dcal$ and set
                \begin{equation}
                    x_V := f_V(\bm{x}_{\bm{Pa}\setminus \{S\}}) + \frac{1}{2}\varepsilon_V;
                \end{equation}
                \item if $S\in \bm{Pa}$, calculate the empirical standard deviation $c := \textrm{sd}(x_V)$ and set $(x_V)_i := (x_V)_i - 3c$ for all $i$ where $(x_S)_i = 1$;
                \item then, standardize $x_V$.
            \end{itemize}
        \end{itemize}
    \end{itemize}
\end{itemize}

\paragraph{Drawing from a Gaussian process} The kernels used for calculating the covariance matrix for drawing from a Gaussian process are the Mat\'ern kernel and the squared exponential kernel, being
\begin{align}
    K_M(x, y)     & := (1 + \sqrt{5} \|x-y\| + \frac{5}{3} \|x-y\|^2) e^{-\sqrt{5}\|x-y\|} \label{eqn:matern} \\
    K_{SE} (x, y) & := \frac{1}{4} e^{-\frac{2}{9}\|x-y\|^2}
\end{align}
respectively, where $\|\cdot\|$ denotes the Euclidean norm. Then, given input $x_i\in\RR^d$ for $i=1, \dots, n$ and kernel $K:\RR^d\times \RR^{d} \to \RR$, denote $\bm{x} := (x_i)_{i=1}^n$ and draw $f(\bm{x}) \sim \Ncal(0, (K(x_i, x_j))_{i, j})$, where $i$ and $j$ run over $\{1, \dots, n\}$ in the kernel matrix $(K(x_i, x_j))_{i, j}$. This draw is denoted with $f\sim \Gcal\Pcal(0, K)$.

\paragraph{Drawing from a random distribution} Drawing noise from a random distribution, denoted with $\varepsilon \sim \Rcal\Dcal$, is done as follows:
\begin{itemize}
    \item First, draw 2000 i.i.d.\ samples $U \sim \textrm{Unif}[0,1]$.
    \item Then, draw a random function $f_\varepsilon \sim \Gcal\Pcal(0, K_{SE})$.
    \item Set $\varepsilon := f_\varepsilon(U)$, and standardize.
\end{itemize}

\begin{figure}[!htb]
    \centering
    \includegraphics[width=\textwidth]{figures/pmar_dags_s_sink.pdf}
    \caption{PMAR DAGs where $S$ is a sink node.}
    \label{fig:dag_s_sink}
\end{figure}
\begin{figure}[!htb]
    \centering
    \includegraphics[width=\textwidth]{figures/pmar_dags_s_not_sink.pdf}
    \caption{PMAR DAGs where $S$ is not a sink node.}
    \label{fig:dag_s_not_sink}
\end{figure}\clearpage

\subsection{Simulation experiments with regression trees}
In the main paper, we hypothesise that the bad extrapolation performance of weighted regression is caused by a detrimental effect of the weights on the regularisation of the regression method. In the main paper we use thin plate spline regression, which does not necessarily extrapolate flatly and can diverge away from the true $\EE[Y|X]$, yielding large MSE values for IW and DR. Tables~\ref{tab:exp1:tree:pos_indep} and~\ref{tab:exp1:tree:interp_extrap} show the simulation results when using regression trees as implemented in the \href{https://cran.r-project.org/web/packages/rpart}{\code{rpart}} package, instead of thin plate spline regression. The results are numerically less extreme, but qualitatively the same as for thin plate spline regression, as depicted in the main paper.

\begin{table}[!htb]
\parbox[][145pt][t]{.5\linewidth}{%
    \begin{tabular}{lcccc}\toprule
            & $\textrm{MSE}$               & $\textrm{MSE-}\tilde{y}$     & $\textrm{MSE-}w$             & $\textrm{MSE-}\hat{w}$       \\ \midrule
        Naive & 1.53 {\small (0.6)}          & 0.62 {\small (0.8)}          & 0.96 {\small (0.6)}          & 0.96 {\small (0.5)}          \\
        RR    & \textbf{1.38} {\small (0.6)} & \textbf{0.27} {\small (0.3)} & 0.97 {\small (0.6)}          & 0.94 {\small (0.5)}          \\
        IW-t  & 1.54 {\small (0.6)}          & 0.63 {\small (0.8)}          & 0.97 {\small (0.6)}          & 0.98 {\small (0.5)}          \\
        IW-e  & 1.52 {\small (0.6)}          & 0.61 {\small (0.7)}          & 0.97 {\small (0.6)}          & 0.97 {\small (0.5)}          \\
        DR-t  & 1.48 {\small (0.7)}          & 0.58 {\small (0.7)}          & \textbf{0.66} {\small (0.3)} & 0.73 {\small (0.4)}          \\
        DR-e  & 1.46 {\small (0.6)}          & 0.55 {\small (0.7)}          & 0.71 {\small (0.4)}          & \textbf{0.69} {\small (0.3)} \\ \midrule
        True  & 1.00 {\small (0.2)}          & 0.69 {\small (0.7)}          & 1.05 {\small (0.6)}          & 1.02 {\small (0.5)}          \\   \bottomrule
    \end{tabular}
    \caption{Results of regression trees over 27,500 simulated datasets.}
    \label{tab:exp1:tree:pos_indep}}
\parbox[][145pt][t]{.45\linewidth}{%
    \begin{tabular}{lccccc}\toprule
            & $\textrm{MSE}$               & $\textrm{MSE-interp.}$       & $\textrm{MSE-extrap.}$       \\ \midrule
        Naive & 1.58 {\small (0.6)}          & 1.37 {\small (0.6)}          & 1.82 {\small (1.0)}          \\
        RR    & \textbf{1.44} {\small (0.6)} & \textbf{1.22} {\small (0.6)} & \textbf{1.68} {\small (1.0)} \\
        IW-t  & 1.60 {\small (0.6)}          & 1.38 {\small (0.6)}          & 1.85 {\small (1.1)}          \\
        IW-e  & 1.58 {\small (0.6)}          & 1.37 {\small (0.6)}          & 1.82 {\small (1.0)}          \\
        DR-t  & 1.55 {\small (0.7)}          & 1.30 {\small (0.6)}          & 1.84 {\small (1.1)}          \\
        DR-e  & 1.52 {\small (0.6)}          & 1.28 {\small (0.6)}          & 1.80 {\small (1.0)}          \\ \midrule
        True  &  1.01 {\small (0.2)} & 1.03 {\small (0.3)} & 0.98 {\small (0.3)} \\ 
        \bottomrule
    \end{tabular}
    \caption{Interpolation and extrapolation results of regression trees for simulated data, on graphs with $X\to S$.}
    \label{tab:exp1:tree:interp_extrap}}
\end{table}

\section{Boston Housing Data}
Considering the Boston Housing Dataset \citep{harrison1978hedonic}, let the variables $X, Y$ and $Z$ be `the number of rooms per dwelling', `the value of owner-occupied homes in US Dollars' and `percentage of people of lower status of the population' respectively. We sample the selection probability
\begin{equation}
    p(X, Z) := \sigma(f_1(X))\sigma(f_2(Z))
\end{equation}
where $f_1, f_2 \sim \Gcal\Pcal(0, K_{SE})$ are drawn independently. Then we draw $U_i\sim \textrm{Unif}[0,1]$ for $i=1, \dots, 506$, and set
\begin{equation}
    S_i := \I\{U_i < p(X_i, Z_i)\}
\end{equation}
for all $i$. The dataset is resampled when $\#\{S = 1\} < 120$. One realisation of such a dataset with an overview of all regression methods is provided in Figure \ref{fig:exp2}.

The MSE and the interpolated and extrapolated variants, as calculated on the Boston Housing dataset, are shown in Table \ref{tab:boston_interp_extrap}. We observe that RR performs better than IW and DR on all three metrics.

\bibliographystyle{abbrvnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\bibliography{philip}

\begin{figure}
    \centering
    \includegraphics[width=.65\linewidth]{figures/exp2_plot.pdf}
    \caption{An instantiation of the biased Boston Housing dataset.}
    \label{fig:exp2}
\end{figure}

\begin{table}
    \centering
    \begin{tabular}{lccccc}\toprule
              & $\textrm{MSE}$               & $\textrm{MSE-interp.}$       & $\textrm{MSE-extrap.}$       \\ \midrule

        Naive & 1.23 {\small (2.5)}          & 0.57 {\small (0.5)}          & 3.88 {\small (9.1)}          \\
        RR    & \textbf{0.71} {\small (0.3)} & \textbf{0.47} {\small (0.2)} & \textbf{2.05} {\small (4.3)} \\
        IW-t  & 2.18 {\small (4.9)}          & 0.62 {\small (0.8)}          & 7.68 {\small (17.7)}         \\
        IW-e  & 1.75 {\small (4.4)}          & 0.58 {\small (0.6)}          & 5.96 {\small (16.1)}         \\
        DR-t  & 1.92 {\small (3.7)}          & 0.48 {\small (0.3)}          & 7.37 {\small (16.4)}         \\
        DR-e  & 2.43 {\small (5.4)}          & 0.52 {\small (0.4)}          & 9.42 {\small (22.1)}         \\
        \bottomrule
    \end{tabular}
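    \caption{MSE and its interpolated and extrapolated variants for all regression methods on the Boston Housing dataset.}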
    \label{tab:boston_interp_extrap}
\end{table}

\end{document}
