\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{ bbold }

\usepackage{ amssymb }
\usepackage{bbm}
\usepackage{bm}

\usepackage{amsmath,amsthm}
\usepackage[utf8]{inputenc}
\usepackage{microtype}
\usepackage{ulem}
% % \usepackage{colortbl,arydshln} % color and dashed lines in arrays
% %\usepackage[charter,cal,scr]{mathdesign}
% \usepackage[mathscr]{euscript}
% \usepackage[all]{xy}
% \usepackage[x11names,dvipsnames]{xcolor}
\usepackage{tikz}
\usetikzlibrary{matrix, positioning}
\usetikzlibrary{automata,arrows}
\usepackage{listings}
\usepackage{mathtools}
\usepackage{verbatim}
\usepackage{subcaption}
\usepackage{algorithmicx}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{xr} 
% \addFileDependency{<file name with extension>}
% Announces file #1 in the log (and in the \listfiles list) so that latexmk can
% pick it up as a dependency of this document.
% The body uses \@addtofilelist, whose name contains `@`; the definition must
% therefore be read while `@` is a letter, hence the \makeatletter ...
% \makeatother pair around it (the opening \makeatletter was missing).
\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}% latexmk will find this if $recorder=0
  % (however, in that case it will ignore #1 if it is a .aux or .pdf file
  % etc. and it exists; if it doesn't exist, it will appear in the list of
  % dependents regardless)
  %
  % Record the file so that it appears in \listfiles output
  % --- not really necessary, and latexmk doesn't use this.
  \@addtofilelist{#1}%
  % latexmk will find this message if #1 doesn't exist (yet)
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}
% Calls \externaldocument on #1 and then announces both #1.tex and #1.aux
% through \addFileDependency.
% NOTE(review): the \externaldocument call further down in this preamble is
% made directly rather than through this wrapper -- confirm which is intended.
\newcommand*{\myexternaldocument}[1]{%
\externaldocument{#1}%
\addFileDependency{#1.tex}%
\addFileDependency{#1.aux}%
}
\externaldocument{mazaheri_521}

\title{Causal Information Splitting: \\ Engineering Proxy Features for Robustness to Distribution Shifts (Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{https://bijanmazaheri.com/}{Bijan Mazaheri}}
\author[2]{Atalanti Mastakouri}
\author[2]{Dominik Janzing}
\author[2]{Michaela Hardt}
% Add affiliations after the authors
\affil[1]{%
    California Institute of Technology\\
    Pasadena, CA, USA
}
\affil[2]{%
    Amazon Web Services\\
    Tübingen, Germany
}

\newcommand{\zset}{\mathbb Z}
\newcommand{\fset}{\mathbb F}
\newcommand{\nset}{\mathbb N}
\newcommand{\rset}{\mathbb R}
\newcommand{\cset}{\mathbb C}
\newcommand\flowsfrom{\mathrel{\reflectbox{$\leadsto$}}}
\def\Real{\mathbb{R}}
\def\Proj{\mathbb{P}}
\def\Hyper{\mathbb{H}}
\def\Integer{\mathbb{Z}}
\def\Natural{\mathbb{N}}
\def\Complex{\mathbb{C}}
\def\Rational{\mathbb{Q}}
\def\Field{\mathbb{K}}
\newcommand{\ep}{{\varepsilon}}

\let\K\Field
\let\N\Natural
\let\Q\Rational
\let\R\Real
\let\C\Complex
\let\Z\Integer

% ---- OPERATORS (requires amsmath) ----
\def\argmax{\operatornamewithlimits{arg\,max}}
\def\argmin{\operatornamewithlimits{arg\,min}}
\def\rank{\operatorname{rk}}
\def\nnrank{{\operatorname{rk}_+}}
\newcommand{\indep}{\perp \!\!\! \perp}
\newcommand{\G}{\mathcal{G}}
\newcommand{\Xin}{\vec{X}_{\mathrm{in}}}
\def\De{\operatorname{\mathbf{DE}}}
\def\Err{\operatorname{Err}}
\def\An{\operatorname{\mathbf{AN}}}
\def\Ch{\operatorname{\mathbf{CH}}}
\def\ch{\operatorname{\mathbf{ch}}}
\def\Pa{\operatorname{\mathbf{PA}}}
\def\CoAn{\operatorname{\mathbf{COAN}}}
\def\Fb{\operatorname{\mathbf{FB}}}
\def\Fm{\operatorname{\mathbf{FM}}}
\def\LF{\operatorname{\mathbf{LF}}}
\def\pa{\operatorname{\mathbf{pa}}}
\def\Mb{\operatorname{\mathbf{MB}}}
\def\Nb{\operatorname{\mathbf{NB}}}
\def\mb{\operatorname{\mathbf{mb}}}
\def\Nd{\operatorname{Nd}}
% Operator-style symbols: \Pr prints an upright "Pr"; \I and \H print
% calligraphic I and H (used in this document for mutual information and
% entropy).
% NOTE(review): \Pr is already a LaTeX kernel operator and \H is the kernel's
% text accent (Hungarian umlaut, e.g. in author names). These \def's silently
% replace both for the whole document -- confirm no bibliography entry relies
% on the accent.
\def\Pr{\operatorname{Pr}}
\def\I{\operatorname{\mathcal{I}}}
\def\H{\operatorname{\mathcal{H}}}
% ---- RELATORS ----
\def\deq{\stackrel{\scriptscriptstyle\triangle}{=}}	% Use := instead.
\def\into{\DOTSB\hookrightarrow}		% = one-to-one
\def\onto{\DOTSB\twoheadrightarrow}
\def\inonto{\DOTSB\lhook\joinrel\twoheadrightarrow}
\def\from{\leftarrow}
\def\tofrom{\leftrightarrow}
\def\mapsfrom{\mathrel{\reflectbox{$\mapsto$}}}
\def\longmapsfrom{\mathrel{\reflectbox{$\longmapsto$}}}

% ---- DELIMITER PAIRS ----
\def\floor#1{\lfloor #1 \rfloor}
\def\ceil#1{\lceil #1 \rceil}
\def\seq#1{\langle #1 \rangle}
\def\set#1{\{ #1 \}}
\def\abs#1{\mathopen| #1 \mathclose|}			% use instead of $|x|$ 
\def\norm#1{\mathopen\| #1 \mathclose\|}		% use instead of $\|x\|$ 
\def\indic#1{\big[#1\big]}		% indicator variable; Iverson notation
								% e.g., Kronecker delta = [x=0]

% --- Self-scaling delimiter pairs ---
\def\Floor#1{\left\lfloor #1 \right\rfloor}
\def\Ceil#1{\left\lceil #1 \right\rceil}
\def\Seq#1{\left\langle #1 \right\rangle}
\def\Set#1{\left\{ #1 \right\}}
\def\Abs#1{\left| #1 \right|}
\def\Card#1{\left| #1 \right|}
\def\Norm#1{\left\| #1 \right\|}
\def\Paren#1{\left( #1 \right)}		% need better macro name!
\def\Brack#1{\left[ #1 \right]}		% need better macro name!
\def\Indic#1{\mathbbm{1}\left[ #1 \right]}		% indicator variable; Iverson notation


%
%  Macros to typeset sets like {foo|bar} with all three delimiters
%  correctly scaled to fit.  What I *really* want is a \middle macro
%  that acts just like \left and \right.  Grumble.
%
\makeatletter
\def\Bigbar#1{\mathrel{\left|\vphantom{#1}\right.\n@space}}
\def\Setbar#1#2{\Set{#1 \Bigbar{#1 #2} #2}}
\def\Seqbar#1#2{\Seq{#1 \Bigbar{#1 #2} #2}}
\def\Brackbar#1#2{\Brack{#1 \Bigbar{#1 #2} #2}}
\makeatother 

\def\complement#1{\overline{#1}}

\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{example}{Example}
\theoremstyle{definition}
\newtheorem{definition}{Definition}
\newtheorem{observation}{Observation}
\newtheorem{claim}{Claim}
\newtheorem{corollary}{Corollary}

\renewcommand{\vec}{\bm}
\newcommand{\dmax}{d_{\mathrm{max}}}

\def\nbor{\mathcal{N}}
\newcommand{\suppress}[1]{}

\def\eps{\varepsilon}
\def\given{\mid}
\def\ggiven{\mid\mid}
% \DKL{P}{Q}: Kullback--Leibler divergence D_KL(P || Q).
% Uses the double-bar glyph \| with thin spaces around it instead of two
% separate single bars `||`, which are set as two ordinary symbols.
\newcommand{\DKL}[2]{\mathrm{D}_{\mathrm{KL}}(#1 \,\|\, #2)}


\newcommand\bijan[1]{\textcolor{red}{B: #1} }%

\newcommand\mila[1]{\textcolor{blue}{Mila: #1} }% Notes for Mila

\newcommand\dominik[1]{\textcolor{brown}{Dominik: #1}}% Notes for Dominik

\newcommand\atalanti[1]{\textcolor{magenta}{Atalanti: #1} }% Notes for Atalanti

\newcommand\leena[1]{\textcolor{orange}{Leena: #1} }% Notes for Leena
% \newcommand\bijan[1]{}%

% \newcommand\mila[1]{}% Notes for Mila

% \newcommand\dominik[1]{}

% \newcommand\atalanti[1]{}

% \newcommand\leena[1]{}% Notes for Leena
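% Commands for types of active paths.
% Naming: \activepathXY draws a zigzag path symbol whose left end is marked
% according to X and whose right end according to Y, where
%   N = open circle,
%   L = arrow tip pointing left,
%   R = arrow tip pointing right,
%   B = bar tip, C = wide (2mm) arrow tip at the inner end of a half-path.
% The \doubleactivepathXY variants draw the same symbols with doubled strokes.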

% General Active path - label over path gives restricted set the path can go through

\usetikzlibrary{decorations,decorations.pathmorphing, decorations.pathreplacing, decorations.shapes, arrows, arrows.meta}

% Meta-decoration `middlezigzag`: a path that is plain at both ends and
% zigzagged in the middle. It runs through three states:
%   straight -- applies `curveto` for half of (path length minus one
%               meta-segment length), then moves to `zigzag`; if the remaining
%               path is shorter than one meta-segment length it switches
%               directly to `final`;
%   zigzag   -- applies the `zigzag` decoration for one meta-segment length;
%   final    -- applies `curveto` to the rest, first moving to the start point
%               of that remaining sub-path.
\pgfdeclaremetadecoration{middlezigzag}{straight}{
    \state{straight}[switch if less than=\pgfmetadecorationsegmentlength to final,
                  width=\pgfmetadecoratedpathlength/2 - \pgfmetadecorationsegmentlength/2 ,
                  next state=zigzag] {
        \decoration{curveto}
    }
    \state{zigzag}[width=\pgfmetadecorationsegmentlength,
                   next state=final] {
        \decoration{zigzag}
    }
    \state{final}{
        \decoration{curveto}
        \beforedecoration{\pgfpathmoveto{\pgfpointmetadecoratedpathfirst}}
    }
}


\tikzset{
    middle zigzag/.style={
        decorate,
        decoration={
            middlezigzag,
            meta-segment length=.36cm,
            segment length=0.12cm,
            amplitude = .05cm,
        }
    }
}


\newcommand\activepathNN{
\mathrel{
\begin{tikzpicture}[baseline={([yshift=-.8ex]current bounding box.center)}]
    \draw [middle zigzag] (-.25,0) -- (.25,0);
    \draw [-{Circle[open, width=1.2mm, length=1.2mm]}](.25,0) -- (.37,0);
    \draw [-{Circle[open, width=1.2mm, length=1.2mm]}](-.25,0) -- (-.37,0);
\end{tikzpicture}}}
\newcommand\activepathLN{\mathrel{
\begin{tikzpicture}[baseline={([yshift=-.8ex]current bounding box.center)}]
    \draw [middle zigzag] (-.25,0) -- (.25,0);
    \draw [-{Circle[open, width=1.2mm, length=1.2mm]}](.25,0) -- (.37,0);
    \draw [-{to [width=1.2mm, length=1.2mm]}](-.25,0) -- (-.33,0);
\end{tikzpicture}}}
\newcommand\activepathRN{\mathrel{
\begin{tikzpicture}[baseline={([yshift=-.8ex]current bounding box.center)}]
    \draw [middle zigzag] (-.25,0) -- (.25,0);
    \draw [-{Circle[open, width=1.2mm, length=1.2mm]}](.25,0) -- (.37,0);
    \draw [-{to [reversed, width=1.2mm, length=1.2mm]}](-.25,0) -- (-.33,0);
\end{tikzpicture}}}

\newcommand\activepathNR{\mathrel{
\begin{tikzpicture}[baseline={([yshift=-.8ex]current bounding box.center)}]
    \draw [middle zigzag] (-.25,0) -- (.25,0);
    \draw [-{Circle[open, width=1.2mm, length=1.2mm]}](-.25,0) -- (-.37,0);
    \draw [-{to [width=1.2mm, length=1.2mm]}](.25,0) -- (.33,0);
\end{tikzpicture}}}

\newcommand\activepathLR{\mathrel{
\begin{tikzpicture}[baseline={([yshift=-.8ex]current bounding box.center)}]
    \draw [middle zigzag] (-.25,0) -- (.25,0);
    \draw [-{to[width=1.2mm, length=1.2mm]}](.25,0) -- (.33,0);
    \draw [-{to [width=1.2mm, length=1.2mm]}](-.25,0) -- (-.33,0);
\end{tikzpicture}}}
\newcommand\activepathRR{\mathrel{
\begin{tikzpicture}[baseline={([yshift=-.8ex]current bounding box.center)}]
    \draw [middle zigzag] (-.25,0) -- (.25,0);
    \draw [-{to[width=1.2mm, length=1.2mm]}](.25,0) -- (.33,0);
    \draw [-{to [reversed, width=1.2mm, length=1.2mm]}](-.25,0) -- (-.33,0);
\end{tikzpicture}}}

\newcommand\activepathNL{\mathrel{
\begin{tikzpicture}[baseline={([yshift=-.8ex]current bounding box.center)}]
    \draw [middle zigzag] (-.25,0) -- (.25,0);
    \draw [-{to[reversed, width=1.2mm, length=1.2mm]}](.25,0) -- (.33,0);
    \draw [-{Circle[open, width=1.2mm, length=1.2mm]}](-.25,0) -- (-.37,0);
\end{tikzpicture}}}
\newcommand\activepathLL{\mathrel{
\begin{tikzpicture}[baseline={([yshift=-.8ex]current bounding box.center)}]
    \draw [middle zigzag] (-.25,0) -- (.25,0);
    \draw [-{to[reversed, width=1.2mm, length=1.2mm]}](.25,0) -- (.33,0);
    \draw [-{to [width=1.2mm, length=1.2mm]}](-.25,0) -- (-.33,0);
\end{tikzpicture}}}
\newcommand\activepathRL{\mathrel{
\begin{tikzpicture}[baseline={([yshift=-.8ex]current bounding box.center)}]
    \draw [middle zigzag] (-.25,0) -- (.25,0);
    \draw [-{to[reversed, width=1.2mm, length=1.2mm]}](.25,0) -- (.33,0);
    \draw [-{to [reversed, width=1.2mm, length=1.2mm]}](-.25,0) -- (-.33,0);
\end{tikzpicture}}}

\newcommand\activepathNB{\mathrel{
\begin{tikzpicture}[baseline={([yshift=-.8ex]current bounding box.center)}]
    \draw [decorate, decoration={zigzag, segment length=.12cm, amplitude=.05cm}] (-.25,0) -- (-.1,0);
    \draw [-{Bar[width = 2mm]}](-.1,0) -- (-.05,0);
    \draw [-{Circle[open, width=1.2mm, length=1.2mm]}](-.25,0) -- (-.44,0);
\end{tikzpicture}}}

\newcommand\activepathBN{\mathrel{
\begin{tikzpicture}[baseline={([yshift=-.8ex]current bounding box.center)}]
    \draw [decorate, decoration={zigzag, segment length=.12cm, amplitude=.05cm}] (.25,0) -- (.1,0);
    \draw [-{Bar[width = 2mm]}](.1,0) -- (.05,0);
    \draw [-{Circle[open, width=1.2mm, length=1.2mm]}](.25,0) -- (.44,0);
\end{tikzpicture}}}

\newcommand\activepathNC{\mathrel{
\begin{tikzpicture}[baseline={([yshift=-.8ex]current bounding box.center)}]
    \draw [decorate, decoration={zigzag, segment length=.12cm, amplitude=.05cm}] (-.25,0) -- (-.1,0);
    \draw [-{to[width = 2mm]}](-.1,0) -- (-.05,0);
    \draw [-{Circle[open, width=1.2mm, length=1.2mm]}](-.25,0) -- (-.44,0);
\end{tikzpicture}}}

\newcommand\activepathCN{\mathrel{
\begin{tikzpicture}[baseline={([yshift=-.8ex]current bounding box.center)}]
    \draw [decorate, decoration={zigzag, segment length=.12cm, amplitude=.05cm}] (.25,0) -- (.1,0);
    \draw [-{to[width = 2mm]}](.1,0) -- (.05,0);
    \draw [-{Circle[open, width=1.2mm, length=1.2mm]}](.25,0) -- (.44,0);
\end{tikzpicture}}}

\newcommand\activepathCR{\mathrel{
\begin{tikzpicture}[baseline={([yshift=-.8ex]current bounding box.center)}]
    \draw [decorate, decoration={zigzag, segment length=.12cm, amplitude=.05cm}] (.25,0) -- (.1,0);
    \draw [-{to[width = 2mm]}](.1,0) -- (.05,0);
    \draw [-{to[width=1.2mm]}](.25,0) -- (.44,0);
\end{tikzpicture}}}

\newcommand\activepathCL{\mathrel{
\begin{tikzpicture}[baseline={([yshift=-.8ex]current bounding box.center)}]
    \draw [decorate, decoration={zigzag, segment length=.12cm, amplitude=.05cm}] (.25,0) -- (.1,0);
    \draw [-{to[width = 2mm]}](.1,0) -- (.05,0);
    \draw [-{to[, reversed, width=1.2mm]}](.25,0) -- (.44,0);
\end{tikzpicture}}}

% \doubleactivepathNB: relation symbol drawn as a double-stroked zigzag with an
% open circle at its left end and a bar tip at its right end.
% The two short single strokes near x = -.25 sit where the double zigzag meets
% the circle-tipped segment (NOTE(review): presumably they patch that seam).
% Fix: removed a stray `]` from the coordinate (-.218,0.02).
\newcommand\doubleactivepathNB{\mathrel{
\begin{tikzpicture}[baseline={([yshift=-.8ex]current bounding box.center)}]
    \draw [decorate, decoration={zigzag, segment length=.12cm, amplitude=.05cm}, double distance=.2mm] (-.25,0) -- (-.1,0);
    \draw [] (-.242,-.02) -- (-.218,0.02);
    \draw [] (-.24,-.017) -- (-.25, -.017);
    \draw [-{Bar[width = 2mm]}, double distance=.2mm](-.1,0) -- (-.05,0);
    \draw [-{Circle[open, width=1.2mm, length=1.2mm]}, double distance=.2mm](-.25,0) -- (-.44,0);
\end{tikzpicture}}}
% \doubleactivepathBN: mirror image of \doubleactivepathNB -- a double-stroked
% zigzag with a bar tip at its left end and an open circle at its right end.
% The two short single strokes near x = .25 sit where the double zigzag meets
% the circle-tipped segment (NOTE(review): presumably they patch that seam).
% Fix: removed a stray `]` from the coordinate (.218,-0.02).
\newcommand\doubleactivepathBN{\mathrel{
\begin{tikzpicture}[baseline={([yshift=-.8ex]current bounding box.center)}]
    \draw [decorate, decoration={zigzag, segment length=.12cm, amplitude=.05cm}, double distance=.2mm] (.25,0) -- (.1,0);
    \draw [] (.242,.02) -- (.218,-0.02);
    \draw [] (.24,.017) -- (.25, .017);
    \draw [-{Bar[width = 2mm]}, double distance=.2mm](.1,0) -- (.05,0);
    \draw [-{Circle[open, width=1.2mm, length=1.2mm]}, double distance=.2mm](.25,0) -- (.44,0);
\end{tikzpicture}}}

% \doubleactivepathNC: relation symbol drawn as a double-stroked zigzag with an
% open circle at its left end and a wide (2mm) arrow tip at its right end; a
% short double-stroked segment joins the zigzag to the tip.
% The two short single strokes near x = -.25 sit where the double zigzag meets
% the circle-tipped segment (NOTE(review): presumably they patch that seam).
% Fix: removed a stray `]` from the coordinate (-.218,0.02).
\newcommand\doubleactivepathNC{\mathrel{
\begin{tikzpicture}[baseline={([yshift=-.8ex]current bounding box.center)}]
    \draw [decorate, decoration={zigzag, segment length=.12cm, amplitude=.05cm}, double distance=.2mm] (-.25,0) -- (-.1,0);
    \draw [] (-.242,-.02) -- (-.218,0.02);
    \draw [] (-.24,-.017) -- (-.25, -.017);
    \draw [-{to[width = 2mm]}](-.055,0) -- (-.05,0);
    \draw [-, double distance=.2mm](-.08,0) -- (-.1,0);
    \draw [-{Circle[open, width=1.2mm, length=1.2mm]}, double distance=.2mm](-.25,0) -- (-.44,0);
\end{tikzpicture}}}
% \doubleactivepathCN: mirror image of \doubleactivepathNC -- a double-stroked
% zigzag with a wide (2mm) arrow tip at its left end and an open circle at its
% right end; a short double-stroked segment joins the zigzag to the tip.
% The two short single strokes near x = .25 sit where the double zigzag meets
% the circle-tipped segment (NOTE(review): presumably they patch that seam).
% Fix: removed a stray `]` from the coordinate (.218,-0.02).
\newcommand\doubleactivepathCN{\mathrel{
\begin{tikzpicture}[baseline={([yshift=-.8ex]current bounding box.center)}]
    \draw [decorate, decoration={zigzag, segment length=.12cm, amplitude=.05cm}, double distance=.2mm] (.25,0) -- (.1,0);
    \draw [] (.242,.02) -- (.218,-0.02);
    \draw [] (.24,.017) -- (.25, .017);
    \draw [-{to[width = 2mm]}](.055,0) -- (.05,0);
    \draw [-, double distance=.2mm](.08,0) -- (.1,0);
    \draw [-{Circle[open, width=1.2mm, length=1.2mm]}, double distance=.2mm](.25,0) -- (.44,0);
\end{tikzpicture}}}

% \bigcomp: a big operator drawn as an open circle whose size follows \sum in
% the current math style (NOTE(review): presumably a "big composition" symbol).
%   \bigcomp               -- wraps the drawing in \mathop, reserving the
%                             height of \sum via \vphantom; \DOTSB and
%                             \slimits@ are amsmath internals (assumed to
%                             handle dots and limits placement -- confirm).
%   \bigcomp@{style}{}     -- measures \sum in the given style, sets
%                             \unitlength to 0.9 of its height plus depth, and
%                             draws a circle of diameter 1 unit, vertically
%                             centred.
%   \bigcomp@linethickness -- line thickness as a multiple of \fontdimen8 of
%                             the math font for the current style (factor 2 in
%                             display style, 1.65 otherwise). The trailing `3`
%                             after the \fi's is the family number completing
%                             \textfont / \scriptfont / \scriptscriptfont.
\makeatletter
\newcommand{\bigcomp}{%
  \DOTSB
  \mathop{\vphantom{\sum}\mathpalette\bigcomp@\relax}%
  \slimits@
}
\newcommand{\bigcomp@}[2]{%
  \begingroup\m@th
  \sbox\z@{$#1\sum$}%
  \setlength{\unitlength}{0.9\dimexpr\ht\z@+\dp\z@}%
  \vcenter{\hbox{%
    \begin{picture}(1,1)
    \bigcomp@linethickness{#1}
    \put(0.5,0.5){\circle{1}}
    \end{picture}%
  }}%
  \endgroup
}
\newcommand{\bigcomp@linethickness}[1]{%
  \linethickness{%
      \ifx#1\displaystyle 2\fontdimen8\textfont\else
      \ifx#1\textstyle 1.65\fontdimen8\textfont\else
      \ifx#1\scriptstyle 1.65\fontdimen8\scriptfont\else
      1.65\fontdimen8\scriptscriptfont\fi\fi\fi 3
  }%
}

\makeatother


\def\tpose{\top}
\def\vv{\mathbf{v}}
\def\TT{\mathbf{T}}
\def\G{\mathcal{G}}
\def\Tin{\vec{T}_{\mathrm{in}}}
\def\Tout{\vec{T}_{\mathrm{out}}}
\def\tin{T_{\mathrm{in}}}
\def\tout{T_{\mathrm{out}}}
\def\Ain{\vec{A}_{\mathrm{in}}}
\def\Aout{\vec{A}_{\mathrm{out}}}
\def\Vmissing{\vec{V}^{\mathrm{MISS}}}
\def\Vextra{\vec{V}^{\mathrm{EXTRA}}}
\def\Ugood{\vec{U}^{\mathrm{GOOD}}}
\def\Ugoodsap{\vec{U}^{\mathrm{GOOD}}_{\mathrm{SAP}}}
\def\Ubadsap{\vec{U}^{\mathrm{BAD}}_{\mathrm{SAP}}}
\def\Ubad{\vec{U}^{\mathrm{BAD}}}
\def\Uperp{\vec{U}^\perp}
\def\Ustar{\vec{U}^*}
\def\Vgood{\vec{V}^{\mathrm{GOOD}}}
\def\Vbad{\vec{V}^{\mathrm{BAD}}}
\def\Vambig{\vec{V}^{\mathrm{AMBIG}}}
\def\Ssub{\vec{S}^{\mathrm{SUB}}}
\def\Ssup{\vec{S}^{\mathrm{SUP}}}
\def\E{\mathbb{E}}
\def\Vsap{V_{\mathrm{SAP}}}
\newcommand{\Fisolate}[2]{F_{\mathrm{ISO}(#1)}(#2)}
\newcommand{\TFisolate}[2]{\tilde{F}_{\mathrm{ISO}(#1)}(#2)}
\newcommand{\Fremove}[2]{F_{\mathrm{REM}(#1)}(#2)}

\usepackage{subfiles} % Best loaded last in the preamble

  
  \begin{document}
  
\onecolumn %% Remove this if a two-column layout is desired for the supplement
\maketitle

\appendix

\section{Additional SCM Background}
\label{apx:d-separation background}
For any DAG $\G = (\vec{V}, \vec{E})$ and vertices $A, B \in \vec{V}$, we call $\vec{P} \subseteq \vec{E}$ a \textbf{path} between $A$ and $B$ if it connects $A$ and $B$ with no repeated vertices. The path is \textbf{directed} if it obeys the directions of the edges and \textbf{undirected} if it does not.
Both directed and undirected paths in a causal DAG can result in dependencies between variables. To understand the conditions for dependence/independence ($d$-connection/$d$-separation) \cite{pearl1988probabilistic}, we will use the concepts of \textbf{active} and \textbf{inactive} paths, which are defined relative to a conditioning set \cite{pearl2009causality, peters2017elements}. Intuitively, whether a path is active or not indicates whether it ``carries dependence'' between the variables.

For an empty conditioning set, a path between $A$ and $B$ is active if it is directed or if it is made up of two directed paths from a common cause along that path. In the same unconditioned setting, \textbf{inactive paths} are paths that contain a \textbf{collider}, i.e., a vertex for which the path has two inward pointing arrows.

When we are given a conditioning set $\vec{Z}$, conditional dependencies differ from unconditional ones. Active paths can be \textbf{blocked} (thus becoming inactive paths) if some vertex $Z$ along the path between $A$ and $B$ is included in $\vec{Z}$. Similarly, inactive paths with a collider $C$ can become \textbf{unblocked} by including $C$ or some descendant of the collider variable in the conditioning set $\vec{Z}$.
If two variables $A, B$ have no active paths between them (there may still be inactive paths), then we say they are $d$-separated ($A \indep_d B \given \vec{Z}$). If two variables have at least one active path between them for a conditioning set $\vec{Z}$, we say that they are $d$-connected.

\citet{pearl1988probabilistic} uses structural causal models to justify the \emph{local Markov condition}, which means that $d$-separation always implies independence and allows DAG structures to be factorized. It is possible that two $d$-connected variables by chance exhibit some unexpected \emph{statistical} independence. The assumption of faithfulness \cite{spirtes2000causation} ensures that $d$-connectedness implies statistical dependence. This assumption is popular in the causal discovery literature, but we will not need it for our theoretical results.

\section{Information Theory Preliminaries}
\label{apx: info theory prelim}
\begin{lemma}[Chain Rule, \citep{cover1999elements}]\label{lem:chainrule}
For sets of variables $\vec{A}, \vec{B}$, and subset $\vec{B}' \subset \vec{B}$
\begin{equation}
    \I(\vec{A} : \vec{B}) = \I(\vec{A}: \vec{B}') + \I(\vec{A}: \vec{B} \setminus \vec{B'} \given \vec{B}')
\end{equation}
\end{lemma}
\begin{definition}[\citep{cover1999elements}]
For sets of variables $\vec{A}, \vec{B}, \vec{C}$, the \textbf{interaction information} is defined,
\begin{equation}
    \I(\vec{A}: \vec{B}: \vec{C}) := \I(\vec{A} : \vec{B}) - \I(\vec{A} : \vec{B} \given \vec{C}).
\end{equation}
\end{definition}
A key property of interaction information is that it is symmetric to permutations in its three inputs,
\begin{equation}
        \I(\vec{A}: \vec{B}: \vec{C}) = \H(\vec{A}, \vec{B}, \vec{C}) + \H(\vec{A}) + \H(\vec{B}) + \H(\vec{C}) - \H(\vec{A}, \vec{B}) - \H(\vec{B}, \vec{C}) - \H(\vec{C}, \vec{A}).
\end{equation}

Another key property is that interaction information can be either positive or negative, differing from mutual information which is non-negative. The following lemmas will describe two common situations in which we can expect positive and negative interaction information.

\begin{lemma}\label{lem:positiveII}
    Given three sets of random variables $\vec{A}, \vec{B}, \vec{C}$ if $\vec{A} \indep \vec{C} \given \vec{B}$ then $\I(\vec{A}:\vec{B}:\vec{C}) \geq 0$.
\end{lemma}
Graphically, Lemma~\ref{lem:positiveII} represents a situation where conditioning on $\vec{B}$ $d$-separates $\vec{A}$ and $\vec{C}$ (i.e., $\vec{B}$ is a \emph{separating set} of $\vec{A}$ and $\vec{C}$). Hence, a sufficient condition to utilize Lemma~\ref{lem:positiveII} is $\vec{A} \doubleactivepathNB \vec{B} \doubleactivepathBN \vec{C}$, in which case the inequality becomes strict. The symmetry of interaction information means that it is not important which set of variables is the separating set. Conveniently $\I(A: B: C) \geq 0 \Rightarrow \I(A:B \given C) \leq \I(A:B)$, meaning we can ``drop'' conditioned variables in our upper bounds.

The \textbf{data processing inequality} uses each ``step'' of an active path to upper bound the mutual information.
\begin{lemma}[Data Processing Inequality (modified from \cite{cover1999elements})]
If $\vec{A} \indep \vec{C} \given \vec{B}, \vec{D}$ then
\begin{align}
       \I(\vec{A} : \vec{C} \given  \vec{D}) & \leq \min(\I(\vec{A}:\vec{B} \given \vec{D}), \I(\vec{B}:\vec{C} \given \vec{D})) \nonumber \\
       &\leq \H(\vec{B} \given \vec{D}).
\end{align}
\label{lem: DPI}
\end{lemma}

\section{Redundancy Bounds without Functional Assumptions} \label{apx:redundancy bounds without func assumtions}
Without the functional assumptions of our framework, we can only provide upper bounds on the context sensitivity.

To decompose the mutual information between sets of vertices $\vec{M}$ we will use the chain rule:
\begin{equation}
    \I(Y:\vec{M} \given \vec{X}) = \I(Y:M \given \vec{X}) + \I(Y:\vec{M} \setminus \{M\} \given \vec{X}, M)
\end{equation}
Repeated application of the chain rule accumulates conditioned $M$ variables for an arbitrary ordering. This conditioning represents the interactive nature of distribution shift mechanisms. We will now consider the context sensitivity of an arbitrary context variable conditioned on some arbitrary set of previously considered $\vec{M}' \subset \vec{M}$. We will be able to drop this conditioning on $\vec{M}'$ in our context sensitivity bounds.

\begin{lemma}
\label{lem: applied DPI}
For some $U_i \in \vec{U}$, and $\vec{M}' \subseteq \vec{M}$, if $M_i \doubleactivepathNB U_i \doubleactivepathBN Y$, then
\begin{equation*}
    \I(M_i:Y \given \vec{X}, \vec{M}') \leq \H(U_i \given \vec{X}) = \H(U_i) - \I(U_i:\vec{X}).
\end{equation*}
\end{lemma}
\begin{proof}
Using the data processing inequality (Lemma~\ref{lem: DPI}),
\begin{align*}
    \I(M_i:&Y \given \vec{X}, \vec{M}') \\
    &\leq \min (\I(U_i:M_i \given \vec{X}, \vec{M}'), \I(U_i:Y \given \vec{X}, \vec{M}'))\\
    &\leq \H(U_i \given \vec{X}, \vec{M}') \\
    &\leq \H(U_i \given \vec{X})\tag*{\qedhere}
\end{align*}
\end{proof}

Lemma~\ref{lem: applied DPI} gives an information-theoretic quantification of the notion of $d$-separation, showing that reducing context sensitivity involves selecting $\vec{X}$ with high $\Ugood$ redundancy. An invariant set is an extreme case of this rule, in which $\vec{X}$ has full redundancy with $\Ugood$.

\begin{lemma}
\label{lem: Collider DPI}
For $U_i \in \vec{U}, \vec{X} \subseteq \vec{V}$ let $\vec{X}' = \vec{X} \cap \Ch(U_i)$. If $U_i \in \Ubad$ then
\begin{equation*}
    \I(M_i:Y \given \vec{X}, \vec{M}') \leq \I(U_i:\vec{X}' \given Y) \leq \I(U_i:\vec{X}').
\end{equation*}
\end{lemma}
Lemma~\ref{lem: Collider DPI} quantitatively states that we should avoid redundancy with $\Ubad$.
\begin{proof}
$M_i \indep_d Y$ by the conditions of the lemma. Because $\vec{X} \setminus \vec{X}'$ has no vertices in $\Ch(U_i)$ and there are no other descendants, $M_i \indep_d Y \given (\vec{X} \setminus \vec{X}')$. 
\begin{equation}
\begin{split}
    \I(M_i:Y \given \vec{X}, \vec{M}') &= -\I(M_i:Y:\vec{X}' \given  \vec{M}')\\
    %\begin{split}
        &=\I(M_i: \vec{X}' \given Y, \vec{M}') \\
        &- \I(M_i:\vec{X}' \given  \vec{M}')\\
    %\end{split} \\
    &\leq \I(M_i: \vec{X}' \given Y, \vec{M}').
\end{split}
\end{equation}

$M_i \doubleactivepathNB U_i \doubleactivepathBN \vec{X}'$ because $U_i$ is the only descendant of $M_i$. The DPI gives,
\begin{equation}
    \I(M_i: \vec{X}' \given Y,  \vec{M}') \leq \I(U_i: \vec{X}' \given Y,  \vec{M}').
\end{equation}
We apply Lemma~\ref{lem:positiveII} because $\vec{X}' \doubleactivepathNB U_i \doubleactivepathBN \vec{M}'$
\begin{equation}
    \I(M_i: \vec{X}' \given Y,  \vec{M}') \leq \I(U_i: \vec{X}' \given Y).
\end{equation}
We also have $\vec{X}' \doubleactivepathNB U_i \doubleactivepathBN Y$ from structural sparsity, so applying Lemma~\ref{lem:positiveII} gives the final loosest bound.
\end{proof}

\section{Guaranteeing Redundancy} \label{apx: guarnateeing redundancy}
Though we cannot ever measure the relevance of a proxy with unobserved $\vec{U}$, Lemma~\ref{lemma:mutualinf} shows that we can harness the graphical structure to obtain lower bounds.
\begin{lemma}[Unobserved common-cause information]\label{lemma:mutualinf}
Given a causal DAG $\mathcal{G} = (\vec{V}, \vec{E})$, for any $U, V_i, V_j \in \vec{V}$ that satisfy $V_i \doubleactivepathNB U \doubleactivepathBN V_j$, we have $\I(V_i, V_j: U) \geq \I(V_i:V_j)$.
\end{lemma}
\begin{proof}
A visualization of this proof is given in Figure~\ref{fig:MI_bound_proof}. Colors are added to the equations in the proof to match this figure. Begin with the definition of mutual information.
\begin{equation}
    \I(V_i, V_j:U) = \H(V_i, V_j) - \H(V_i, V_j \given U)
\end{equation}
We can expand the joint entropy of both terms as follows,
\begin{align}
    \H(V_i, V_j) =& \H(V_i \given V_j) + \H(V_j \given V_i) + \I(V_i:V_j) \label{eq:expand_ABC}\\
    \H(V_i, V_j \given U) =& \H(V_i \given V_j, U) + \H(V_j \given V_i, U) + \underbrace{\I(V_i:V_j \given U)}_{=0 \text{ because } V_i \indep_d^{\mathcal{G}} V_j \given U} \label{eq:expand_ABDC}
\end{align}
Together, Equations~\eqref{eq:expand_ABC} and~\eqref{eq:expand_ABDC} give:
\begin{align*}
    \begin{split}
    \I(V_i, V_j:U) =& \color{red}{\H(V_i \given V_j)} \color{black}{ + } \color{blue}{\H(V_j \given V_i)} \color{black}{ + } \color{purple}{\I(V_i: V_j)} \color{black}{ - } \color{red}{\H(V_i \given U, V_j)} \color{black}{ - } \color{blue}{\H(V_j \given U, V_i)}
    \end{split}\\
    =& \color{red}{\I(V_i:U \given V_j)} \color{black}{ + } \color{blue}{\I(V_j:U \given V_i)} \color{black}{ + } \color{purple}{\I(V_i: V_j)}\\
    \geq& \color{purple}{\I(V_i: V_j)}
\end{align*}
\end{proof}
\def\firstcircle{(0,0.5) circle (1.7cm)}
\def\secondcircle{(1.9, -1) circle (1.5cm and 3.0cm)}
\def\thirdcircle{(-1.9, -1) circle (1.5cm and 3.0cm)}
% Venn-style diagram for the proof of the unobserved common-cause lemma:
% a circle for H(U) and two rotated ellipses for H(V_i) (left) and H(V_j)
% (right). The overlap of U with the right ellipse is filled blue, the overlap
% of U with the left ellipse red, and the overlap of the two ellipses purple,
% matching the colors used in the proof's equations.
% Changes: dropped a duplicated \centering and typeset the U label with \H so
% that all three entropy labels use the same symbol.
\begin{figure}[h]
\setlength{\belowcaptionskip}{-10pt}
\centering
    \scalebox{.7}{
    \begin {tikzpicture}[-latex ,auto ,node distance =1.5 cm and 1.5 cm ,on grid , semithick, state/.style ={ circle, draw, minimum width =.5 cm}]
    % Blue: part of the right ellipse inside the U circle.
    \begin{scope}
      \clip \firstcircle;
      \fill[rotate around= {45:(1.8, -1)}, blue] \secondcircle;
    \end{scope}

    % Red: part of the left ellipse inside the U circle.
    \begin{scope}
      \clip \firstcircle;
      \fill[rotate around= {-45:(-1.8, -1)}, red] \thirdcircle;
    \end{scope}

    % Purple: intersection of the two ellipses.
    \begin{scope}
      \clip[rotate around= {45:(1.8, -1)}] \secondcircle;
      \fill[rotate around= {-45:(-1.8, -1)}, purple] \thirdcircle;
    \end{scope}
    % Outlines and labels.
    \draw \firstcircle node[above=.9] {$\H(U)$};
    \draw[rotate around={45:(1.8, -1)}] \secondcircle;
    \node (hac) at (-2.7, -2.5) {$\H(V_i)$};
    \draw[rotate around={-45:(-1.8, -1)}] \thirdcircle;
    \node (hbc) at (2.7, -2.5) {$\H(V_j )$};
    \node[blue] (hij) at (2.25, -1) {$\I(V_j:U \given V_i)$};
    \node[red] (hji) at (-2.25, -1) {$\I(V_i:U \given V_j)$};
    \node[purple] (i) at (0, -2.3) {$\I(V_i: V_j )$};

        \end{tikzpicture}
        }
    \caption{A visual proof for Lemma~\ref{lemma:mutualinf}.}
    \label{fig:MI_bound_proof}
\end{figure}

Hence, we can lower bound $\Fisolate{V_G}{U_A}$'s hold on good information in $U_A^{(G)}$ using Lemma~\ref{lemma:mutualinf},
\begin{equation}
   \I((V_i, \Fisolate{V_i}{V_A}):\Pa(V_i)) \geq \I(V_A:V_i). 
\end{equation}

\section{Deferred Proofs}\label{apx: deferred proofs}

% LEMMA 1
\subsection{Proof of Lemma~\ref{lem: d conn but not dependent}}
\begin{proof}
 We can rewrite the conditional mutual information making use of $U_1 \indep U_2$ as follows.
\begin{equation*}
\begin{aligned}
        \I(U_1:U_2 \given V) &= -\I(U_1 : U_2 : V)\\
        &=-\I(U_1:V) + \I(U_1:V \given U_2)\\
        &=-\I(U_1:V^{(U_1)}) + \I(U_1:V^{(U_1)} \given U_2)\\
        &=-\I(U_1:V^{(U_1)}) + \I(U_1:V^{(U_1)})= 0.
\end{aligned}\qedhere
\end{equation*}
\end{proof}

% LEMMA 2
\subsection{Proof of Lemma~\ref{lem: setting redundancy}}
\begin{proof}
$\I(U:\vec{X}) = \H(U) - \H(U \given \vec{X})$. If at least one child of $U$ is conditioned on and transmits (i.e., some $V \in \Ch(U) \cap \vec{X}$ has $v_{\vec{x}} \neq \phi$), then $\H(U \given \vec{x}) = 0$. Otherwise, $\H(U \given \vec{X}) = \H(U)$ because every active path to $U$ from any $X \in \vec{X} \setminus \Ch(U)$ must go through a collider in $\Ch(U)$---all of which are either not in $\vec{X}$ or not transmitting.
\end{proof}

% LEMMA 3
\subsection{Proof of Lemma~\ref{lem: new applied DPI}}
\begin{proof}
 $\I(M_i:Y \given \vec{X}) = \sum_{\vec{X}} \I(M_i:Y \given \vec{X})$ is zero unless $(M_i, U_i)$ and $(U_i, Y)$ both transmit. Furthermore, the DPI gives $\I(M_i:Y \given \vec{X}) \leq \H(U_i \given \vec{X})$, which is zero if any of the edges from $U_i \rightarrow X$ for $X \in \vec{X}$ transmit.
\end{proof}

% LEMMA 4
\subsection{Proof of Lemma~\ref{lem: new Collider DPI}}
\begin{proof}
 $\I(M_i:Y \given \vec{X}) = \sum_{\vec{X}} \I(M_i:Y \given \vec{X})$ is zero unless both $(M_i, U_i)$ and $(U_i, Y)$ transmit and at least one of $(U_i, X)$ transmits for $X \in \vec{X}$, in which case $\I(M_i:Y \given \vec{x}) = \I(M_i:Y \given U_i)$.
\end{proof}

% LEMMA 5
\subsection{Proof of Lemma~\ref{lem:conditioning on y separates sets}}
\begin{proof}
($\Rightarrow$) We will prove this with the contrapositive. If there is no shared parent between $V_i$ and $V_j$, then all active paths must go through $Y$. However, because at least one of $V_i, V_j$ is not connected to a cause, no path between $V_i$ and $V_j$ can have a collider at $Y$ (through two causes). This means conditioning on $Y$ blocks the remaining paths.

($\Leftarrow$) If there is a shared parent $U$ between $V_i$ and $V_j$, then $V_i \leftarrow U \rightarrow V_j$ is an active path, $d$-connecting the two vertices. If $V_i$ and $V_j$ each have corresponding parents $U_i, U_j \in \Pa(Y)$, then $V_i \leftarrow U_i \rightarrow Y \leftarrow U_j \rightarrow V_j$ is an active path conditioned on $Y$.
\end{proof}

% LEMMA 6
\subsection{Proof of Lemma~\ref{lem: bootstraping}}
\begin{proof}
 The proof follows from Lemma~\ref{lem:conditioning on y separates sets}. Adjacent vertices in $\G_Y$ either share a parent or both have a (potentially different) cause of $Y$ as their parent.
 
 If $V_i$ and $V_j$ share a parent, then $\Pa(V_i) \subseteq \Ugood$ implies $\Ugood \cap \Pa(V_j) \neq \emptyset$, so $V_j$ has at least one ``good'' parent. The symmetric argument holds for $\Pa(V_i) \subseteq \Ubad$.
 
 If $V_i$ and $V_j$ both have at least one causal parent, then we know $V_i, V_j \not \in \Vbad$. We know both vertices have at least one good $U$ as a parent, so it trivially follows that both are either in $\Vgood$ or $\Vambig$.
 \end{proof}
 
 \subsection{Proof of Theorem~\ref{thm: proxy bootstrapping works}}
\begin{proof}
The only potential conditioned collider is $Y$, which is not allowed to be separable by partial faithfulness. Hence, partial faithfulness guarantees that we can construct $\G_Y$ from conditional independence tests because $d$-connection implies dependence between $V$.

 The requirements on $\vec{V}^*$ given by the theorem ensure that every $V \in \vec{V}$ has at least one label from an adjacency to a $V^* \in \vec{V}^*$ in $\G_Y$.
 
 The algorithm adds ``good'' labels to all vertices with a known good parent and ``bad'' labels to all vertices with a known bad parent. Therefore, all ``ambiguous'' vertices are correctly labeled.
 
 We now only need to guarantee that the ``good'' and ``bad'' vertices are not ambiguous. If $V$ were ambiguous, it would be connected to a $U$ of the opposite label (i.e., a ``good'' vertex would be connected to a bad $U$). Such a $U$ would have at least one $V^* \in \vec{V}^* \cap \Ch(U)$ which would be adjacent to $V$ and have given $V$ the label of $U$, a contradiction.
 \end{proof}
 
 \subsection{Proof of Lemma~\ref{lem: new applied DPI engineering}}
 \begin{proof}
$\I(M_i:Y \given \vec{X}) = \sum_{\vec{x} \in \vec{X}} \Pr(\vec{x}) \I(M_i:Y \given \vec{x})$. Now, we have \[\I(M_i:Y \given \vec{x}) = \H(M_i \given \vec{x}) = \H(U_i \given \vec{x})\] if both the $(M_i, U_i)$ and $(U_i, Y)$ edges transmit, which occurs with probability $\alpha_{M_i, U_i}\alpha_{U_i, Y}$. Pulling this coefficient outside of the sum gives $\I(M_i:Y \given \vec{X}) = \alpha_{M_i, U_i}\alpha_{U_i, Y} \H(U_i \given \vec{X})$.

\end{proof}

\subsection{Proof of Lemma~\ref{lem: if we dont include information with bad u, then we are happy} }
\begin{proof}
    $U_i \in \Ubad$ means $M_i \doubleactivepathNC U_i\doubleactivepathCN Y$, so $\I(M_i: Y) = 0$.
    \begin{equation}
        \begin{aligned}
            \I(M_i:Y \given \vec{X}) &= -\I(M_i: Y: \vec{X})\\
            &\leq \I(M_i:\vec{X} \given Y)\\
            &\leq \I(U_i: \vec{X} \given Y) = 0
        \end{aligned}
    \end{equation}
    The final inequality comes from the data processing inequality.
\end{proof}

\subsection{Proof of Lemma~\ref{lem: isolation bound 2}}
\begin{proof}
Consider the function $F_C(V_A) = F_C(G, B) = \Fisolate{V_G}{G}$. By definition,
\begin{align}
    \I(\Fisolate{V_G}{G}:V_G \given Y) &= \I(G:V_G \given Y)\\
    &= \I(V_A:V_G \given Y).
\end{align}
Hence, $F_C$ is in the feasible set of the optimization function defining isolation functions. Furthermore, $F_C(V_A)$ is only a function of $G$ and $G \indep U_B \given Y$, so we can also conclude that $F_C(V_A) \indep U_B \given Y$. This means
\begin{align*}
\H(F_C(V_A) \given Y) &=\H(F_C(V_A) \given U_B, Y)\\
 &\leq \H(\Fisolate{V_G}{V_A} \given U_B, Y)\\
 &\leq \H(\Fisolate{V_G}{V_A} \given Y) - \I(\Fisolate{V_G}{V_A} : U_B \given Y).
\end{align*}
Hence, if $\I(\Fisolate{V_G}{V_A} : U_B \given Y) > 0$, then $\H(F_C(V_A) \given Y) < \H(\Fisolate{V_G}{V_A} \given Y)$, contradicting the minimality of $\Fisolate{V_G}{V_A}$.
\end{proof}

\subsection{Proof of Theorem~\ref{thm: when help and do no harm}}
\begin{proof}
To shorten some equations, we will use \[\vec{F}(V_A):=\Fisolate{V_G}{V_A \given Y}.\]
We first show that Equation~\ref{eq: thm2} is sufficient for an improvement in relevance. We can expand the relevance of $\vec{X}^+$ as follows
\begin{equation}
\begin{split}
    \I(Y:\vec{X}^+) &= \I(Y:\vec{F}(V_A)) + \I(Y: V_G\given \vec{F}(V_A))\\
& \geq \I(Y: V_G\given \vec{F}(V_A))\\
&\geq \I(Y: V_G) - \I(Y: V_G : \vec{F}(V_A)).
\end{split}
\end{equation}
So, for guaranteed improvement in relevance ($\I(Y: \vec{X}^+) > \I(Y:V_G)$), we need $\I(Y: V_G : \vec{F}(V_A)) < 0$. Expanding,
\begin{equation}
    \I(Y: V_G : \vec{F}(V_A)) = \I(\vec{F}(V_A):V_G) - \I(\vec{F}(V_A):V_G \given Y).
\end{equation}
Thus, Equation~\ref{eq: thm2} gives us the exact condition needed for negative interaction information, guaranteeing improvement.

We can show that the context sensitivity is no worse by separately considering the context sensitivity with $\Pa(\Ubad)$ and $\Pa(\Ugood)$. We begin with $\Ubad$. Applying Lemma~\ref{lem: isolation bound 2},
\begin{equation}
    \I((V_G, \vec{F}(V_A)): \Ubad \given Y) = 0,
\end{equation}
which satisfies the conditions for Lemma~\ref{lem: if we dont include information with bad u, then we are happy} to ensure that $\I(\Pa(\Ubad):Y \given \vec{X}^+) = 0$.

Now, consider an arbitrary $M_G = \Pa(U_G) \in \Pa(\Ugood)$. Lemma~\ref{lem: new applied DPI engineering} tells us that
\begin{equation*}
    \I(M_G:Y \given \vec{X}^+) = \alpha_{M_G, U_G} \alpha_{U_G, Y} \H(U_G \given \widetilde{\Ch}_{\vec{X}^{+}}(U_G)).
\end{equation*}
We then observe that $\H(U_G \given \widetilde{\Ch}_{\vec{X}^{+}}(U_G)) \leq \H(U_G \given \vec{X})$ because entropy is submodular, which leads us to conclude,
\begin{equation*}
     \I(M_G:Y \given \vec{X}^+) \leq \I(M_G:Y \given \vec{X}).
\end{equation*}
This completes the proof.
\end{proof}

\section{Experiments}
\subsection{Synthetic experimental setup}\label{sec:appendix_synthethic}
 $M_G$ and $M_B$ are drawn from normal distributions with mean $0$ and variable standard deviations. All other vertices (other than $Y$) are the average of their parents plus additional Gaussian noise $N(0, 0.2)$. $T_A\in \R^2$ is generated by applying a rotation matrix to $(T_A^{(G)}, T_A^{(B)})^T$.\footnote{Many rotations were tried in our experiments with identical results, so we display results from a $45$-degree rotation.} $Y$ indicates whether its parents sum to a positive number, with a $5\%$ probability of flipping randomly.

\subsection{F1 scores for real world experiment}
We give the F1 scores for the experiment described in Section~\ref{sec:real_data} in Table~\ref{tab:performance_via_f1}.
\begin{table}[htbp]
\centering
\small
\caption{Comparison of out-of-domain (2021) performance on predicting high income via F1 scores.}
\label{tab:performance_via_f1}
\begin{tabular}{|c|l l l|}
\hline
State & All Features & Engineered Features & Limited Features \\
\hline
CA & \textbf{0.684} & \textbf{0.683} & 0.676 \\
FL & \textbf{0.459} & 0.388 & 0.388 \\
GA & 0.541 & \textbf{0.626} & \textbf{0.624} \\
IL & 0.563 & \textbf{0.630} & \textbf{0.628} \\
NY & \textbf{0.688} & \textbf{0.690} & 0.662 \\
NC & \textbf{0.475} & 0.410 & 0.410 \\
OH & 0.519 & \textbf{0.581} & \textbf{0.580} \\
PA & 0.531 & \textbf{0.608} & \textbf{0.606} \\
TX & 0.554 & \textbf{0.619} & \textbf{0.619} \\
\hline
avg & 0.557 & \textbf{0.582} & 0.577 \\
\hline
\end{tabular}
\end{table}



\subsection{Real world experiment in-domain performance}\label{apx:real_data}
Here we provide the results of the in-domain accuracy for the experiment described in Sec.~\ref{sec:real_data}.
Recall that we use US Census data and consider distribution shifts across time as suggested by~\cite{ding2021retiring}.
Table~\ref{tab:test_2019_real_results} shows the accuracy on 2019 data on a held-out dataset (separate from the training split). We repeated the experiment 10 times on different training/testing splits and report the mean and standard deviation of the accuracy for the largest states in the U.S. As expected, using all features has the most predictive power for in-domain tasks. 

% We use US Census data processed through folktables~\cite{ding2021retiring} to predict whether the income of a person exceeds 50k (reflecting the target of the Adult dataset~\cite{Dua:2019}). The data contains information about the education, commute distance, health insurance (among other features) of people alongside with their income. 
% As suggested by~\cite{ding2021retiring} the dataset can be used to study distribution shifts across location and time. 
% Specifically, we consider 2019 (pre-pandemic) to train models and 2021 during the pandemic to evaluate them.\footnote{We skipped 2020 since the Census release is experimental for that year.} Clearly, a lot of things changed during the pandemic. 

% We limit ourselves to a small feature set to see a starker effect of including/excluding individual features. We consider commute time (coded as JWMNP in the dataset), a flag whether the person received Medicaid, Medical Assistance, or any kind of government-assistance plan for those with low incomes or a disability (coded as HINS4) and education level (SCHL) as proxies for the unobserved causes shown in Figure \ref{fig:experiments_proxies}. 
% We purposefully selected  a feature with relatively stable predicitve power (education level) alongside with features that were heavily affected by the pandemic when people started working from home more and medicaid's coverage increased through the continuous enrollment provision.  
% Therefore, our auxiliary task from Sec.~\ref{sec:aux}, \emph{engineered features}, uses the ambiguous proxies, a flag for government-assisted healthcare and commute time, indirectly to predict the good proxy, education, %another feature (i.e. education level) 
% conditioned on the label. The two predictions (assuming high and low income) of the education-level are then used as derived features alongside the actual education-level as input features.  
% We compare it to using all three features directly, \emph{all features}, or using just the stable education feature \emph{limited features}.

% % We compare the accuracy of our proposed method, called \emph{engineered features} against two baselines: In the first baseline we 
% % % One option is to 
% % include the ambiguous features HINS4 and JWMNP %the flag for government-assisted healthcare and commute time
% % directly in the input features of a model, alongside the good proxy, education. We name this baseline \emph{all features}. This  is expected to help %certainly helps 
% % with the in-domain performance. In the second baseline we completely ignore the ambiguous features and only include as input to the model the good proxy, education. We name this baseline \emph{limited features}.%Completely ignoring the feature is another option, we refer to as \emph{limited features}. 
% % This is expected to reduce the predictive power in general, but also to increase robustness to shifts in the ambiguous features. 

% We use logistic regression from sklearn with l1 regularization\footnote{l1 regularization is crucial since l2 regularization leads to significantly worse out-of-domain results as it forces models to use the unreliable features more. } to build models based on the different feature sets that the three methods created. 



% \begin{table}[h!]
%     \centering
%     \small
%     \begin{tabular}{|c | l l l |  }
%     %  & \multicolumn{3}{|c|}{2021 (out-of-domain) accuracy}  \\
%      \hline
%      state & all features & engineered features & limited features   \\ 
%     \hline
%     % evaluate.report_accuracy_with_std_dev(across_repetitions, big_states,  years=[2021], split='test')
% CA  & 0.712 $\pm$ 0.0011 & 0.711 $\pm$ 0.0014 & 0.692 $\pm$ 0.0014 \\
% FL  & 0.683 $\pm$ 0.0012 & 0.678 $\pm$ 0.0018 & 0.68 $\pm$ 0.0013 \\
% GA  & 0.689 $\pm$ 0.0025 & 0.707 $\pm$ 0.0055 & 0.709 $\pm$ 0.0029 \\
% IL  & 0.662 $\pm$ 0.0026 & 0.689 $\pm$ 0.0033 & 0.684 $\pm$ 0.0019 \\
% NY  & 0.707 $\pm$ 0.0022 & 0.702 $\pm$ 0.0025 & 0.687 $\pm$ 0.008 \\
% NC  & 0.691 $\pm$ 0.0031 & 0.684 $\pm$ 0.0034 & 0.683 $\pm$ 0.003 \\
% OH  & 0.689 $\pm$ 0.0022 & 0.703 $\pm$ 0.004 & 0.696 $\pm$ 0.0029 \\
% PA  & 0.672 $\pm$ 0.0017 & 0.695 $\pm$ 0.0023 & 0.688 $\pm$ 0.0022 \\
% TX  & 0.69 $\pm$ 0.0029 & 0.712 $\pm$ 0.0028 & 0.712 $\pm$ 0.0027 \\
% \hline
% avg & 0.688 & 0.698  & 0.692 \\
% \hline
% %2021
% % all
% % mean 0.688  min 0.662 q25 0.683 q50 0.689 q75 0.691
% % aux
% % mean 0.698  min 0.678 q25 0.689 q50 0.702 q75 0.707
% % without
% % mean 0.692  min 0.680 q25 0.684 q50 0.688 q75 0.696
% % old with bootsstrapping confidence intervals
% % % evaluate.report_accuracy(all_states, big_states, years=[2021], split='test')
% % CA  & \textbf{0.715} [0.714, 0.716] & \textbf{0.713} [0.712, 0.714] & 0.693 [0.692, 0.695] \\
% % FL  & 0.682 [0.68, 0.684] & 0.679 [0.677, 0.68] & 0.679 [0.677, 0.68] \\
% % GA  & 0.689 [0.686, 0.691] & 0.687 [0.684, 0.689] & \textbf{0.711} [0.708, 0.713] \\
% % IL  & 0.664 [0.663, 0.666] & \textbf{0.69} [0.688, 0.693] & 0.683 [0.681, 0.686] \\
% % NY  & \textbf{0.707} [0.706, 0.709] & \textbf{0.704} [0.703, 0.706] & 0.694 [0.692, 0.695] \\
% % NC  & \textbf{0.691} [0.689, 0.694] & 0.681 [0.679, 0.683] & 0.685 [0.683, 0.688] \\
% % OH  & 0.684 [0.682, 0.687] & \textbf{0.699} [0.697, 0.701] & 0.692 [0.691, 0.694] \\
% % PA  & 0.669 [0.668, 0.672] & \textbf{0.69} [0.687, 0.692] & \textbf{0.684} [0.683, 0.687] \\
% % TX  & 0.692 [0.69, 0.694] & \textbf{0.713} [0.712, 0.715] & \textbf{0.713} [0.712, 0.715] \\
% % \hline
% % avg & 0.688  & 0.695 & 0.693 \\
% % \hline
% % % mean 0.688  min 0.664 q25 0.682 q50 0.689 q75 0.692
% % % aux
% % % mean 0.695  min 0.679 q25 0.687 q50 0.690 q75 0.704
% % % without
% % % mean 0.693  min 0.679 q25 0.684 q50 0.692 q75 0.694
%     \end{tabular}
%     \caption{Out-of-domain accuracy on the 2021 American Community Survey for the largest states in the United States. 
%     %As input features to the logistic regression models trained on the 2019 to predict high income data serve the all features (education level, medicaid insurance status, and commute time), engineered features with predictions of education level based on medicaid insurance status and commute time along side with education level, and limited features with only education level.
%     Mean of the held-out data from 10 random data splits with std deviation.}
%     \label{tab:test_real_results}
% \end{table}

\begin{table}[ht!]
    \centering
    \small
    \caption{Comparison of in-domain (2019) performance on predicting high income via accuracy.}
    \begin{tabular}{|c | l l l |  }
    %  & \multicolumn{3}{|c|}{2021 (out-of-domain) accuracy}  \\
     \hline
     State & All Features & Engineered Features & Limited Features   \\ 
    \hline
    %evaluate.report_accuracy_with_std_dev(across_repetitions, big_states, years=[2019], split='test')
CA  & \textbf{0.713} $\pm$ 0.0010 & \textbf{0.710} $\pm$ 0.0012 & 0.691 $\pm$ 0.0011 \\
FL  & \textbf{0.700} $\pm$ 0.0014 & 0.693 $\pm$ 0.0020 & 0.694 $\pm$ 0.0017 \\
GA  & \textbf{0.708} $\pm$ 0.0025 & \textbf{0.708} $\pm$ 0.0036 & \textbf{0.707} $\pm$ 0.0036 \\
IL  & \textbf{0.689} $\pm$ 0.0023 & \textbf{0.690} $\pm$ 0.0039 & \textbf{0.685} $\pm$ 0.0021 \\
NY  & \textbf{0.705} $\pm$ 0.0024 & 0.698 $\pm$ 0.0022 & 0.687 $\pm$ 0.0076 \\
NC  & \textbf{0.713} $\pm$ 0.0020 & 0.703 $\pm$ 0.0049 & 0.700 $\pm$ 0.0028 \\
OH  & \textbf{0.717} $\pm$ 0.0029 & \textbf{0.716} $\pm$ 0.0042 & \textbf{0.712} $\pm$ 0.0033 \\
PA  & \textbf{0.702} $\pm$ 0.0028 & \textbf{0.701} $\pm$ 0.0027 & 0.695 $\pm$ 0.0026 \\
TX  & \textbf{0.708} $\pm$ 0.0019 & \textbf{0.705} $\pm$ 0.0025 & \textbf{0.706} $\pm$ 0.0022 \\
\hline
avg & \textbf{0.706} &  0.703  &  0.697 \\
\hline
% 2019
% all
% mean 0.706  min 0.689 q25 0.702 q50 0.708 q75 0.713
% aux
% mean 0.703  min 0.690 q25 0.698 q50 0.703 q75 0.708
% without
% mean 0.697  min 0.685 q25 0.691 q50 0.695 q75 0.706 
% old with bootstrapping
% % evaluate.report_accuracy(all_states, big_states, years=[2019], split='test')
% CA  & 0.715 [0.714, 0.716] & 0.712 [0.711, 0.713] & 0.694 [0.692, 0.695] \\
% FL  & 0.7 [0.698, 0.702] & 0.697 [0.695, 0.699] & 0.697 [0.695, 0.699] \\
% GA  & 0.71 [0.708, 0.712] & 0.714 [0.711, 0.716] & 0.714 [0.712, 0.716] \\
% IL  & 0.688 [0.686, 0.69] & 0.689 [0.687, 0.692] & 0.682 [0.68, 0.685] \\
% NY  & 0.702 [0.701, 0.704] & 0.695 [0.693, 0.697] & 0.691 [0.689, 0.693] \\
% NC  & 0.718 [0.715, 0.72] & 0.702 [0.699, 0.703] & 0.705 [0.702, 0.708] \\
% OH  & 0.72 [0.717, 0.722] & 0.717 [0.716, 0.72] & 0.713 [0.711, 0.716] \\
% PA  & 0.706 [0.704, 0.708] & 0.704 [0.701, 0.707] & 0.698 [0.695, 0.7] \\
% TX  & 0.703 [0.702, 0.705] & 0.701 [0.699, 0.702] & 0.701 [0.7, 0.702] \\
% \hline
% 0.707 & 0.703 & 0.699 \\
% \hline
% % 2019
% % all
% % mean 0.707  min 0.688 q25 0.702 q50 0.706 q75 0.715
% % aux
% % mean 0.703  min 0.689 q25 0.697 q50 0.702 q75 0.712
% % without
% % mean 0.699  min 0.682 q25 0.694 q50 0.698 q75 0.705
    \end{tabular}
    \label{tab:test_2019_real_results}
\end{table}
% \begin{table*}[h!]
%     \centering
%     \scriptsize
%     \begin{tabular}{c | l l l | l l l  }
%      &  \multicolumn{3}{|c|}{2019 (in-domain) accuracy} & \multicolumn{3}{|c|}{2021 (out-of-domain) accuracy}  \\
%      \hline
%      state & all features & engineered features & limited features & all features & engineered features & limited features  \\ 
%     \hline
% CA  & 0.711 [0.71, 0.712] & 0.711 [0.71, 0.713] & 0.69 [0.689, 0.691] & \textbf{0.713} [0.712, 0.714] & \textbf{0.713} [0.712, 0.714] & 0.693 [0.692, 0.694] \\
% FL  & 0.699 [0.697, 0.7] & 0.693 [0.691, 0.694] & 0.693 [0.692, 0.694] & \textbf{0.685} [0.683, 0.687] & 0.681 [0.679, 0.683] & 0.681 [0.679, 0.682] \\
% GA  & 0.709 [0.707, 0.711] & 0.707 [0.704, 0.709] & 0.704 [0.703, 0.706] & 0.689 [0.687, 0.691] & \textbf{0.71} [0.707, 0.713] & \textbf{0.706} [0.704, 0.71] \\
% IL  & 0.696 [0.694, 0.698] & 0.695 [0.693, 0.697] & 0.689 [0.687, 0.692] & 0.66 [0.658, 0.662] & \textbf{0.693} [0.69, 0.695] & 0.686 [0.684, 0.687] \\
% NY  & 0.703 [0.701, 0.704] & 0.697 [0.696, 0.699] & 0.691 [0.689, 0.693] & \textbf{0.705} [0.704, 0.707] & 0.701 [0.7, 0.703] & 0.691 [0.69, 0.693] \\
% NC  & 0.712 [0.709, 0.714] & 0.699 [0.696, 0.7] & 0.702 [0.7, 0.704] & \textbf{0.69} [0.688, 0.694] & 0.679 [0.677, 0.681] & 0.683 [0.68, 0.685] \\
% OH  & 0.716 [0.714, 0.719] & 0.717 [0.715, 0.719] & 0.711 [0.71, 0.714] & 0.689 [0.687, 0.692] & \textbf{0.702} [0.7, 0.703] & 0.695 [0.692, 0.697] \\
% PA  & 0.705 [0.703, 0.707] & 0.704 [0.701, 0.705] & 0.697 [0.695, 0.699] & 0.673 [0.671, 0.675] & \textbf{0.697} [0.695, 0.699] & 0.69 [0.688, 0.691] \\
% TX  & 0.711 [0.709, 0.712] & 0.707 [0.705, 0.708] & 0.707 [0.705, 0.708] & 0.69 [0.689, 0.692] & \textbf{0.708} [0.707, 0.71] & \textbf{0.708} [0.707, 0.71] \\
% \hline
% avg & 0.707 & 0.703 & 0.698 & 0.688  & 0.698 & 0.693 \\
%     \end{tabular}
%     \caption{Accuracy results on the Public Use Microdata Sample of the American Community Survey to predict high income with logistic regression based on data in 2019 and evaluated on data from 2019 and 2021 for the largest states in the United States. As input features serve the all features (education level, medicaid insurance status, and commute time), engineered features with predictions of education level based on medicaid insurance status and commute time along side with education level, and limited features with only education level. 95\%-Confidence intervals are computed using bootstrapping. Best out-of-domain performance are bolded for each state (including methods whose confidence interval overlaps with that of the best method).   }
%     \label{tab:real_results}
% \end{table*}

% Table~\ref{tab:test_real_results} shows a mixed picture of for which states which method performs best. Using all features leads to the best in-domain performance (see Table~\ref{tab:test_2019_real_results}), but not necessarily the best out-of-domain performance. How much dropping the ambiguous features hurts predictive power but helps with robustness varies across the states: For some states it leads to a overall higher accuracy on the 2021 data compared to using all features while for others using all features is better. 
% Our proposed feature engineering using CIS has the best mean out-of-domain  accuracy of 0.698 across states compared to these two baselines. 
% It also achieves close to the best out-of-domain accuracy for 6 out of 9 states. 


% \begin{table*}[]
%     \centering
%     \small
%     \begin{tabular}{c | l l l | l l l  }
%      &  \multicolumn{3}{|c|}{2019 (in-domain) accuracy} & \multicolumn{3}{|c|}{2021 (out-of-domain) accuracy}  \\
%      \hline
%      state & all features & engineered features & limited features & all features & engineered features & limited features  \\ 
%     \hline
% CA  & 0.711 [0.71, 0.712] & 0.707 [0.707, 0.709] & 0.689 [0.687, 0.69] & 0.713 [0.711, 0.714] & 0.711 [0.709, 0.712] & 0.693 [0.692, 0.694] \\
% FL  & 0.699 [0.697, 0.7] & 0.694 [0.693, 0.696] & 0.694 [0.692, 0.695] & 0.68 [0.679, 0.682] & 0.677 [0.676, 0.678] & 0.677 [0.675, 0.678] \\
% GA  & 0.709 [0.707, 0.711] & 0.708 [0.705, 0.711] & 0.708 [0.706, 0.711] & 0.689 [0.687, 0.692] & 0.708 [0.704, 0.71] & 0.708 [0.706, 0.711] \\
% IL  & 0.694 [0.691, 0.695] & 0.693 [0.69, 0.694] & 0.686 [0.684, 0.689] & 0.659 [0.657, 0.662] & 0.691 [0.689, 0.693] & 0.684 [0.682, 0.686] \\
% NY  & 0.708 [0.706, 0.71] & 0.698 [0.696, 0.699] & 0.693 [0.691, 0.694] & 0.708 [0.706, 0.71] & 0.702 [0.701, 0.704] & 0.691 [0.689, 0.692] \\
% NC  & 0.71 [0.708, 0.713] & 0.704 [0.702, 0.706] & 0.7 [0.698, 0.702] & 0.682 [0.68, 0.684] & 0.679 [0.677, 0.68] & 0.677 [0.674, 0.679] \\
% OH  & 0.716 [0.714, 0.718] & 0.716 [0.714, 0.717] & 0.71 [0.708, 0.712] & 0.69 [0.688, 0.693] & 0.706 [0.704, 0.708] & 0.699 [0.695, 0.7] \\
% PA  & 0.704 [0.702, 0.706] & 0.701 [0.699, 0.703] & 0.695 [0.693, 0.696] & 0.673 [0.671, 0.675] & 0.697 [0.695, 0.699] & 0.688 [0.686, 0.691] \\
% TX  & 0.711 [0.71, 0.712] & 0.709 [0.708, 0.71] & 0.709 [0.708, 0.711] & 0.691 [0.689, 0.692] & 0.712 [0.711, 0.714] & 0.712 [0.711, 0.713] \\
% \hline
% avg &  0.707 &  0.703  & 0.698   & 0.687  & 0.698 &  0.692 \\
%     \end{tabular}
%     \caption{REPEAT new random train/validation split \mila{REMOVE}}
% \end{table*}


\bibliography{biblio}

\end{document}
