% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
%\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
 \usepackage{booktabs} % commands to create good-looking tables
%\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage[utf8]{inputenc}
\usepackage{hyperref}
\usepackage{amsmath,amssymb,amsfonts,mathrsfs, amsthm}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{dsfont}
\usepackage{bm}
\usepackage{colortbl}
\usepackage{fullpage}
\usepackage[ruled, linesnumbered]{algorithm2e}
\usepackage{multirow,booktabs,bigdelim}
\usepackage{caption}
\usepackage{comment}
\usepackage{graphicx}
\usepackage{placeins}
\usepackage{siunitx}
\usepackage{url}
\usepackage{enumitem}
\usepackage{layouts}
\usepackage[rgb]{xcolor}
\usepackage{scalerel}
\usepackage{tikz}
\usepackage{tkz-graph}
\usepackage{authblk}
\usetikzlibrary{shapes.geometric}
\usetikzlibrary{backgrounds}
\usetikzlibrary{arrows.meta}
\usepackage[framemethod=TikZ]{mdframed}
\usepackage{xr}
\externaldocument{pfister_731}

%% Theorem-like environments
%\numberwithin{equation}{section}

\theoremstyle{plain}
\newtheorem{theorem}{Theorem} %[section]
\newtheorem{example}[theorem]{Example}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{assumption}{Assumption}

\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{setting}{Setting}

\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

%% Custom commands
%% ===============

\newcommand{\spaceIV}{\texttt{spaceIV} }

%% Sets
\newcommand{\R}{\mathbb{R}}
\newcommand{\N}{\mathbb{N}}

%% Operators
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}


%% Bracket-operators
\DeclarePairedDelimiterX{\norm}[1]{\lVert}{\rVert}{#1}
\DeclarePairedDelimiterX{\abs}[1]{\lvert}{\rvert}{#1}

%% General
\renewcommand{\epsilon}{\varepsilon}

%% Linear Algebra
\newcommand{\rank}[1]{\operatorname{Rank}\!\left(#1\right)}
\newcommand{\im}[1]{\operatorname{Im}\!\left(#1\right)}
\newcommand{\nullspace}[1]{\operatorname{Null}\!\left(#1\right)}
\newcommand{\dimension}{\operatorname{dim}}

%% Probability
\renewcommand{\P}{\mathbb{P}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\iid}{\overset{\text{\tiny iid}}{\sim}}
\newcommand{\independent}{\perp\!\!\!\perp}
\newcommand{\nindependent}{\not\!\perp\!\!\!\perp}
\newcommand{\independentG}{\perp\!\!\!\perp_\mathcal{G}}
\newcommand{\nindependentG}{\not\!\perp\!\!\!\perp_\mathcal{G}}
\newcommand{\var}{\operatorname{Var}}
\newcommand{\cov}{\operatorname{Cov}}
\newcommand{\vI}{\operatorname{Id}}
\newcommand{\vNull}{\mathbf{0}}
\newcommand{\landauO}{\mathcal{O}}
\newcommand{\landauo}{o}
\newcommand{\landauOp}{\mathcal{O}_{\P}}
\newcommand{\landauop}{o_{\P}}
\newcommand{\landauOLp}{\mathcal{O}_{\operatorname{L}^p}}

%% Vectors
\newcommand{\vA}{\mathbf{A}}
\newcommand{\vB}{\mathbf{B}}
\newcommand{\vD}{\mathbf{D}}
\newcommand{\vE}{\mathbf{E}}
\newcommand{\vM}{\mathbf{M}}
\newcommand{\vP}{\mathbf{P}}
\newcommand{\vQ}{\mathbf{Q}}
\newcommand{\vV}{\mathbf{V}}
\newcommand{\vX}{\mathbf{X}}
\newcommand{\vY}{\mathbf{Y}}
\newcommand{\vZ}{\mathbf{Z}}
\newcommand{\vr}{\mathbf{r}}
\newcommand{\vv}{\mathbf{v}}
\newcommand{\vx}{\mathbf{x}}
\newcommand{\vy}{\mathbf{y}}
\newcommand{\vz}{\mathbf{z}}
\newcommand{\vepsilon}{\boldsymbol{\epsilon}}

%% Time series notation
\newcommand{\CPt}{\operatorname{CP}}
\newcommand{\ec}{\bar{e}}

%% Regression notation
\newcommand{\hbeta}{\hat{\beta}}
\newcommand{\hsigma}{\hat{\sigma}}
\newcommand{\hepsilon}{\hat{\epsilon}}
\newcommand{\bhepsilon}{\hat{\boldsymbol{\epsilon}}}
\newcommand{\hgamma}{\hat{\gamma}}
\newcommand{\resid}{\mathbf{R}}
\newcommand{\sresid}{\widetilde{\mathbf{R}}}
\newcommand{\noise}{\boldsymbol{\epsilon}}
\newcommand{\snoise}{\tilde{\boldsymbol{\epsilon}}}

\newcommand{\HOS}{H_{0,S}}
\newcommand{\HOstar}{H_{0,S^*}}
\newcommand{\HAS}{H_{A,S}}
\newcommand{\HO}{H_{0}}
\newcommand{\HA}{H_{A}}
\newcommand{\tildeHOS}{\widetilde{H}_{0,S,p}}
\newcommand{\HOrand}{H_{0,S}^{\text{r\hspace{-0.05em}a\hspace{-0.05em}n\hspace{-0.05em}d\hspace{-0.05em}o\hspace{-0.05em}m}}}
\newcommand{\HOfix}{H_{0,S}^{\text{f\hspace{-0.05em}i\hspace{-0.05em}x\hspace{-0.05em}e\hspace{-0.05em}d}}}

%% Causality
\newcommand{\XPA}{\operatorname{PA}_X}
\newcommand{\IPA}{\operatorname{PA}_I}
\newcommand{\PA}{\operatorname{PA}}
\newcommand{\ND}{\operatorname{ND}}
\newcommand{\CH}{\operatorname{CH}}
\newcommand{\tCH}{\scaleto{\operatorname{CH}}{3.5pt}}
\newcommand{\tPA}{\scaleto{\operatorname{PA}}{3.5pt}}
\newcommand{\DE}{\operatorname{DE}}
\newcommand{\AN}{\operatorname{AN}}
\newcommand{\SB}{\operatorname{SB}_I}
\newcommand{\NSB}{\operatorname{NSB}_I}
\newcommand{\MB}{\operatorname{MB}}

%% More
\newcommand{\stabscore}{\textbf{s}_{\operatorname{stab}}}
\newcommand{\predscore}{\textbf{s}_{\operatorname{pred}}}
\newcommand\ti[1]{{\tilde{#1}}}
\newcommand\tX{{\tilde{X}}}
\newcommand\tY{{\tilde{Y}}}
\newcommand\ttX{{\tilde{\tilde{X}}}}
\newcommand{\supp}{\operatorname{supp}}

%% colors
\definecolor{col1}{RGB}{88,140,126}
\definecolor{col2}{RGB}{242,227,148}
\definecolor{col3}{RGB}{242,174,114}
\definecolor{col4}{RGB}{217,100,89}
\definecolor{col5}{RGB}{140,70,70}
\colorlet{lightgray}{black!15}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Identifiability of Sparse Causal Effects using Instrumental
  Variables (Supplementary material)}
% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1, *]{\href{mailto:np@math.ku.dk}{Niklas~Pfister}{}}
\author[1, *]{\href{mailto:jonas.peters@math.ku.dk}{Jonas~Peters}{}}

% Add affiliations after the authors
\affil[1]{%
  Department of Mathematical Sciences\\
  University of Copenhagen\\
  Denmark
}
\affil[*]{%
  Authors contributed equally.
}
  
  
\begin{document}

% increase counters
\setcounter{equation}{19}
\setcounter{theorem}{8}
\setcounter{figure}{6}

\maketitle

\appendix

\section{Proof of
  Proposition~\ref{thm:partial_identifiability}}\label{app:partial}

\begin{proof}
  Fix $j\in\{1,\ldots,d\}$, then it holds that $\beta^*_j$ is
  identifiable by \eqref{eq:moment_eq} if and only if the space
  $\mathcal{B}$ is degenerate in the $j$-th coordinate, that is,
  $\mathcal{B}_j=\{\beta^*_j\}$.  Next, define $M\coloneqq \cov(I, X)$
  and $v\coloneqq\cov(I, Y)$. Then, denoting the Moore-Penrose inverse
  of $M$ by $M^{\dagger}$, we get that for any solution
  $\beta\in\mathcal{B}$ there exists
  $w\in\nullspace{M} \subseteq \mathbb{R}^d$ such that
  \begin{equation}
    \label{eq:decomposition}
    \beta=M^{\dagger}v + w.
  \end{equation}
  Therefore, the space $\mathcal{B}$ has a degenerate $j$-th
  coordinate if and only if $\nullspace{M}_j=\{0\}$. Using the
  Moore-Penrose inverse $M^{\dagger}$, the null space of $M$ can be
  expressed as
  \begin{equation*}
    \nullspace{M}=\{(\vI-M^{\dagger}M)w\,\vert\, w\in\R^d\}.
  \end{equation*}
  Next, \eqref{eq:scm} and the assumption of joint independence of
  $I$, $\xi^X$, and $\xi^Y$ imply that
  \begin{align*}
    M=\cov[I, X]
    &=\cov\left[I, (\vI-B)^{-1}(AI+\xi^X)\right]\\
    &=\cov[I]A^{\top}(\vI-B)^{-\top}\\
    &=\cov[I]C.
  \end{align*}
  Therefore, using the properties of the Moore-Penrose inverse and that $\cov[I]$ is invertible we get that
  \begin{equation}
    \label{eq:MM_part1}
    M^{\dagger}M=C^{\dagger}\cov[I]^{-1}\cov[I]C.
  \end{equation}
  Hence, we get that $M^{\dagger}M=C^{\dagger}C$ which implies that
  $\nullspace{M}=\nullspace{C}$. This proves the first part of the
  statement. The second part of the proposition uses
  \eqref{eq:decomposition} together with
  $\nullspace{M}=\nullspace{C}$. This completes the proof of
  Proposition~\ref{thm:partial_identifiability}.
\end{proof}


\section{Further Results} \label{app:additional}
\begin{proposition}
  \label{prop:random_proj}
  Let $A\in\R^{n\times m}$ and $B\in\R^{n\times p}$ be two matrices
  satisfying
  \begin{equation*}
    \rank{B}\leq\rank{A}
    \quad\text{and}\quad
    \im{A}\neq\im{B}
  \end{equation*}
  and let $W\in\R^m$ be a random variable with a distribution on
  $\R^m$ that is absolutely continuous with respect to Lebesgue
  measure. Then it holds that
  \begin{equation*}
    \P(AW\in\im{B})=0.
  \end{equation*}
\end{proposition}

\begin{proof}
  We begin by showing that
  \begin{equation}
    \label{eq:orth_intersection}
    \im{A}\not\subseteq\im{B}.
  \end{equation}
  Assume for the sake of contradiction this is not true. Then it would
  hold that $\im{A}\subseteq\im{B}$. Moreover, since
  by assumption $\rank{B}\leq\rank{A}$ this would imply that
  $\im{A}=\im{B}$, which contradicts the assumptions on $A$ and
  $B$. Hence, \eqref{eq:orth_intersection} is true.

  Next, let $b_1,\ldots,b_n\in\R^n$ be an orthogonal basis of $\R^n$ such
  that
  \begin{equation*}
    \operatorname{span}(b_1,\ldots,b_{k})=\im{B}^{\bot}
  \end{equation*}
  and
    \begin{equation*}
    \operatorname{span}(b_{k+1},\ldots,b_{n})=\im{B}.
  \end{equation*}
  Then, for every $\ell\in\{1,\ldots,m\}$ there exist unique
  $\alpha_1^{\ell},\ldots,\alpha_n^{\ell}\in\R$ such that
  \begin{equation*}
    A_{\ell}=\sum_{i=1}^n\alpha_i^{\ell}b_i.
  \end{equation*}
  Furthermore, by \eqref{eq:orth_intersection}, it holds that
  there exists at least one $i^*\in\{1,\ldots,k\}$ and
  $\ell^*\in\{1,\ldots,m\}$ such that $\alpha_{i^*}^{\ell^*}\neq 0$.
  Furthermore, for every $w\in\R^m$ it holds that
  \begin{equation*}
    Aw
    =\sum_{\ell=1}^mw^{\ell}A_{\ell}
    =\sum_{\ell=1}^m\sum_{i=1}^nw^{\ell}\alpha_i^{\ell}b_i
    =\sum_{i=1}^n\left(\sum_{\ell=1}^mw^{\ell}\alpha_i^{\ell}\right)b_i.
  \end{equation*}
  This implies that $Aw\in\im{B}$ if and only if
  $\sum_{\ell=1}^mw^{\ell}\alpha_i^{\ell}=0$ for all
  $i\in\{1,\ldots,k\}$. Using this we get
  \begin{align*}
    \P(AW\in\im{B})
    &=\P(\forall i\in\{1,\ldots,k\}:\,
      \textstyle\sum_{\ell=1}^mW^{\ell}\alpha_i^{\ell}=0)\\
    &\leq \P(\textstyle\sum_{\ell\neq
      \ell^*}W^{\ell}\alpha_{i^*}^{\ell}=-W^{\ell^*}\alpha_{i^*}^{\ell^*})\\
    &=0,
  \end{align*}
  where for the last step we used that the distribution of $W$ is
  absolutely continuous with respect to Lebesgue measure.
  This completes the proof of Proposition~\ref{prop:random_proj}.
\end{proof}


\section{Proof of Theorem~\ref{thm:cons}} \label{app:proofcons}

\begin{proof}
  It is known that for $\beta\in\R^d\setminus\mathcal{B}$ with
  $\beta\neq\beta^*$ the Anderson-Rubin test statistic (given Gaussian
  noise variables and conditioned on the observations of $I$ and $X$)
  satisfies
  \begin{equation*}
    T(\beta)\sim \chi^2\left(1, n\frac{\|\widehat{\cov}(I, X)(\beta^*-\beta)\|_2^2}{\sigma^{2}}\right),
  \end{equation*}
  where $\chi^2(1, \lambda)$ is the non-central $\chi^2$-distribution
  with one degree of freedom and non-centrality parameter $\lambda$,
  see for example \citet{moreira2009tests}.

  We first prove (i).  Fix $s \in \mathbb{N}$ such that
  $s < \|\beta^*\|_0$ (if $\|\beta^*\|_0=1$, the proof simplifies and
  one can consider \eqref{eq:part_1} directly). Then, for all
  $\beta \in \mathbb{R}^d$ such that $\|\beta\|_0 = s$, we have by
  Theorem~\ref{thm:sparse_identifiability} that
  $\cov\left(I, Y-X^{\top}\beta\right) \neq 0$.  Furthermore, there
  exists $\varepsilon>0$ such that for all $\beta \in \mathbb{R}^d$
  with $\|\beta\|_0 = s$ it holds that
  $\|\beta - \beta^*\|_2^2 > \varepsilon$. Therefore, since
  $\beta \mapsto \|\cov\left(I,
    Y-X^{\top}\beta\right)\|_2^2=\|\cov\left(I,
    X^{\top}(\beta^*-\beta)\right)\|_2^2$ is a quadratic form, there
  exists $c>0$ such that
  $\|\cov\left(I, X^{\top}(\beta^*-\beta)\right)\|_2^2 > c$.

  Conditioning on the observed data of $X$ and $I$, we have
  \begin{align}
    &P\left(\inf_{\beta \,:\,\|\beta\|_0 = s} T(\beta)>c_{\alpha} \,\big\vert\, (X_1, I_1), \ldots, (X_n, I_n)\right)\nonumber\\
    &\quad=1-\kappa\left(c_{\alpha}, n\inf_{\beta \,:\,\|\beta\|_0 = s}\frac{\|\widehat{\cov}(I, X)(\beta^*-\beta)\|_2^2}{\sigma^{2}}\right),\label{eq:chi-bound}
  \end{align}
  where $\kappa(\cdot, \lambda)$ is the
  $\chi^2(1, \lambda)$-distribution function; here, we have exploited
  that for all $x$, $\lambda \mapsto \kappa(x, \lambda)$ is
  monotonically decreasing.


  As $n$ tends to infinity, it holds almost surely that
  $\|\widehat{\cov}(I, X)(\beta^*-\beta)\|_2^2\rightarrow\|\cov(I,
  X)(\beta^*-\beta)\|_2^2>c$. Hence, since $c$ does not depend on
  $\beta$, the non-centrality parameter in the $\chi^2$-distribution
  tends to infinity and \eqref{eq:chi-bound} converges to $1$. Thus,
  $$
  \lim_{n\rightarrow\infty}P(\phi_s = 1)= 1.
  $$
  Since this holds for any $s\in\mathbb{N}$ such that
  $s < \|\beta^*\|_0$, we have
  \begin{align}
    &\lim_{n\rightarrow\infty}P(\|\hat{\beta}_{\leq s_{\max}}\|_0 = \|\beta^*\|_0) \nonumber\\
    &\quad =
      \lim_{n\rightarrow\infty}P\left(\min_{s<\|\beta^*\|_0} \phi_s = 1, 
      \phi_{\|\beta^*\|_0} = 0\right)\nonumber\\
    &\quad =
      \lim_{n\rightarrow\infty}P(\phi_{\|\beta^*\|_0} = 0)\nonumber\\
    &\quad = 1-\alpha,\label{eq:part_1}
  \end{align}
  where the last statement follows from 
  the fact that $\phi_s$ has valid level.

  Statement (ii) follows with the same argument noting that for all
  $\varepsilon > 0$ there exists a $c >0$ such that for all
  $\beta \in \mathbb{R}^d$ satisfying $\|\beta\|_0 < \|\beta^*\|_0$ or
  $\|\beta\|_0 = \|\beta^*\|_0$ and
  $\|\beta - \beta^*\|_2 \geq \varepsilon$, we have
  $\|\cov\left(I, Y-X^{\top}\beta\right)\|_2^2 > c > 0$, again, using
  Theorem~\ref{thm:sparse_identifiability}.  This concludes the proof
  of Theorem~\ref{thm:cons}.
\end{proof}

\section{Proof of
  Proposition~\ref{prop:icp-test}} \label{app:prop:icp-test}

\begin{proof}
  To prove the first statement, we note that
  \begin{align*}
    &\left\{\textstyle\bigcap_{\substack{S: |S| = |\PA[Y]| \text{ and }\\
    H_0(S) \text{ accepted}
    }} S \subseteq \PA[Y]\right\}\\
    &\qquad\qquad\qquad
      \supseteq
      \left\{H_0(\PA[Y]) \text{ accepted}\right\}.
  \end{align*}
  But because 
  $$
  T(\beta^*) \geq 
  T(\hat{\beta}_{\operatorname{LIML}}(\PA[Y])),
  $$
  we have 
  $$
  P\left(H_0(\PA[Y]) \text{ accepted }\right) \geq 1 - \alpha. 
  $$

  To prove the second statement, observe that by the definition of $M$
  it holds that
  $$
  \Big\{ 
  M \geq \|\beta^*\|_0
  \Big\}
  \supseteq
  \left\{\min_{s<\|\beta^*\|_0} \phi_s = 1\right\}
  $$
  and therefore
  \begin{align*}
    &\left\{ \textstyle\bigcap_{\substack{S: |S| = M \text{ and }\\
    H_0(S) \text{ accepted}
    }} S \subseteq \PA[Y] \right\}\\
    &\quad\supseteq
      \Bigg\{\big\{\min_{s<\|\beta^*\|_0} \phi_s = 1\big\}\\
    &\qquad\qquad\quad \cap
      \{T(\hat{\beta}_{\operatorname{LIML}}(\PA[Y]))\leq F_{n-m,m}^{-1}(1-\alpha)\} \Bigg\}.
  \end{align*}
  It follows from the first part of
  Theorem~\ref{thm:sparse_identifiability} that for all
  $\beta \in \mathbb{R}^d$ such that $\|\beta\|_0 < \|\beta^*\|_0$, we
  have $\cov\left(I, Y-X^{\top}\beta\right) \neq 0$.  We can therefore
  apply the same arguments as in Theorem~\ref{thm:cons} to argue that
  for all $s < \|\beta^*\|_0$, we have
  $$
  \lim_{n\rightarrow\infty}P(\phi_s = 1)= 1.
  $$
  The statement then follows from
  $ T(\beta^*) \geq T(\hat{\beta}_{\operatorname{LIML}}(\PA[Y]))$ and
  the fact that the Anderson-Rubin test holds level.  This completes
  the proof of Proposition~\ref{prop:icp-test}.
\end{proof}

\section{Example~\ref{ex:1} continued} \label{app:graphs}

Figure~\ref{fig:ex1-marg} discusses the example graph mentioned in
Example~\ref{ex:1}.
\begin{figure}[ht]
  \centering
  \begin{tikzpicture}[scale=1]
    % Graph nodes
    \tikzstyle{VertexStyle} = [shape = circle, minimum width =
    2.5em, fill=lightgray]
    \Vertex[Math,L=Y,x=0,y=0]{Y}
    \Vertex[Math,L=X^1,x=-6,y=0]{X1}
    \Vertex[Math,L=X^2,x=-3,y=0]{X2}
    \Vertex[Math,L=X^3,x=-1.5,y=2]{X3}
    \tikzstyle{VertexStyle} = [draw, shape = rectangle, minimum
    width=2em]
    \Vertex[Math,L=1,x=-6.0,y=2]{1}
    \Vertex[Math,L=2,x=-4.5,y=2]{2}
    % Graph edges
    \tikzstyle{VertexStyle} = [draw, dashed, shape = circle, minimum
    width=2.5em]
    \tikzset{EdgeStyle/.append style = {-Latex, line width=1}}
    \Edge(1)(X1)
    \Edge(X1)(X2)
    \Edge(X2)(Y)
    \Edge(2)(X1)
    \Edge(2)(X3)
    \Edge(2)(X2)
  \end{tikzpicture}\vspace{0.8cm}\\
  \begin{tikzpicture}[scale=1]
    % Graph nodes
    \tikzstyle{VertexStyle} = [shape = circle, minimum width =
    2.5em, fill=lightgray]
    \Vertex[Math,L=Y,x=0,y=0]{Y}
    \Vertex[Math,L=X^2,x=-3,y=0]{X2}
    \tikzstyle{VertexStyle} = [draw, shape = rectangle, minimum
    width=2em]
    \Vertex[Math,L=1,x=-6.0,y=2]{1}
    \Vertex[Math,L=2,x=-4.5,y=2]{2}
    % Graph edges
    \tikzstyle{VertexStyle} = [draw, dashed, shape = circle, minimum
    width=2.5em]
    \tikzset{EdgeStyle/.append style = {-Latex, line width=1}}
    \Edge(1)(X2)
    \Edge(X2)(Y)
    \Edge(2)(X2)
  \end{tikzpicture}
  \caption{Top: Graph copied from Example~\ref{ex:1} and
    Figure~\ref{fig:ex1}.  Assumption (B1) holds because of the path
    $2 \rightarrow X^2$, for example.  For $S=\{1\}$, (B3) (i) is not
    satisfied but (B3) (ii) holds: there is no set $T$ of size one,
    such that all directed paths from $I$ to $\PA(Y)$ go through $T$.
    Therefore, if (B2) holds, the effect $\beta^*$ is identifiable
    (see Theorem~\ref{thm:sparse_identifiability_graph}). If, however,
    we were to remove the second instrument node from
    Example~\ref{ex:1}, (B3)(i) and (ii) would be violated (for set
    $S=\{X^1\}$).  Bottom: Marginalized graph $\mathcal{G}^{\PA(Y)}$.}
  \label{fig:ex1-marg}
\end{figure}

\FloatBarrier
\section{Example violating Assumption (A2)} \label{app:example_A2}

\begin{example} \label{ex:counterA2}
  Consider an SCM of the following form
  {\small
    \begin{align}
      \begin{pmatrix}
        X^1\\
        X^2\\
        X^3
      \end{pmatrix} 
      &:= 
        \begin{pmatrix}
          0 & 0 & 0\\
          0 & 0 & 0\\
          1 & 2 & 0
        \end{pmatrix}
                  \begin{pmatrix}
                    X^1\\
                    X^2\\
                    X^3
                  \end{pmatrix} 
      + 
      \begin{pmatrix}
        4 & 0 \\
        0 & 3 \\
        0 & 0 \\
      \end{pmatrix}
      \begin{pmatrix}
        I^1\\
        I^2
      \end{pmatrix} 
      + h(H, \epsilon^X) 
      \nonumber\\
      Y &:= 
          \begin{pmatrix}
            X^1 & X^2 & X^3
          \end{pmatrix} 
                        \begin{pmatrix}
                          1\\
                          2\\
                          0
                        \end{pmatrix} 
      + g(H, \epsilon^Y), \label{eq:counter_exampleA2}
    \end{align}}
  where   $I^1$, $I^2$, $H$, $\epsilon^Y$, $\epsilon^X$ are jointly independent. 
  Figure~\ref{fig:counter_A2} shows the corresponding graphical representation. In this case, it holds that
  \begin{equation*}
    C=
    \begin{pmatrix}
      4 & 0 & 4\\
      0 & 3 & 6
    \end{pmatrix}.
  \end{equation*}
  Hence, the set $S=\{3\}$ violates Assumption (A2). In particular,
  the coefficient $\tilde{\beta}=(0, 0, 1)^{\top}\in\mathcal{B}$
  yields a sparser solution than the causal coefficient
  $\beta^*=(1, 2, 0)^{\top}$. Therefore, the result of
  Theorem~\ref{thm:sparse_identifiability} cannot be valid.
  Assumption (A2) is violated in this example because the coefficients
  can be matched exactly. If the coefficients are chosen randomly with
  a distribution that is absolutely continuous with respect to
  Lebesgue measure, this happens with probability zero, see
  Proposition~\ref{prop:random_proj}.
  \begin{figure}[ht]
    \centering
    \begin{tikzpicture}[scale=1]
      % Graph nodes
      \tikzstyle{VertexStyle} = [shape = circle, minimum width =
      2.5em, fill=lightgray]
      \Vertex[Math,L=Y,x=0,y=0]{Y}
      \Vertex[Math,L=X^1,x=-3,y=2]{X1}
      \Vertex[Math,L=X^2,x=-3,y=-2]{X2}
      \Vertex[Math,L=X^3,x=-6,y=0]{X3}
      \tikzstyle{VertexStyle} = [draw, shape = rectangle, minimum
      width=2em]
      \Vertex[Math,L=1,x=-6.0,y=2]{1}
      \Vertex[Math,L=2,x=-6,y=-2]{2}
      % Graph edges
      \tikzstyle{VertexStyle} = [draw, dashed, shape = circle, minimum
      width=2.5em]
      \tikzset{EdgeStyle/.append style = {-Latex, line width=1}}
      \Edge[label=4](1)(X1)
      \Edge[label=3](2)(X2)
      \Edge[label=1](X1)(Y)
      \Edge[label=2](X2)(Y)
      \Edge[label=1](X1)(X3)
      \Edge[label=2](X2)(X3)
    \end{tikzpicture}
    \caption{Example graph for which Assumption (A2) can be violated
      if the edge coefficients are fine-tuned to match each other
      exactly.}
  \label{fig:counter_A2}
\end{figure}
\end{example}

\FloatBarrier

\section{Additional simulation results}

\begin{figure}[ht]
  \centering
  \includegraphics[width=\linewidth]{figures/mse_vs_assumptions_TSLS.pdf}
  \caption{Same experiment as in Figure~\ref{fig:mse_vs_assumptions}
    but with TSLS estimator instead of LIML. Results for all $2000$
    random models with $n=1600$. We split the models into three cases
    depending on which of the assumptions (A1) and (A3) are satisfied
    (the group `(A1)' contains $88$ models, the group `(A1) \& (A3)'
    contains $1871$ models and the group `none' contains $41$
    models). If none of the assumptions are satisfied, not even the
    oracle with known parent set works. If only (A1) is satisfied,
    multiple sets of size $2$ are able to satisfy the moment
    equation~\eqref{eq:moment_eq} and \spaceIV may not estimate the
    correct set. These findings are in line with
    Theorem~\ref{thm:sparse_identifiability}.}
  \label{fig:mse_vs_assumptions_TSLS}
\end{figure}
\begin{figure}[ht]
    \centering
    \includegraphics[width=\linewidth]{figures/consistency_experiment_all_TSLS.pdf}
    \caption{Same experiment as in Figure~\ref{fig:consistency_valid}
      but with TSLS estimator instead of LIML. Results for all random
      models that satisfy (A1)-(A3) (in total $1871$ out of $2000$
      models). The median RMSE of the \spaceIV estimator converges to
      zero as the sample size increases, which does not hold for
      \texttt{OLS-sparse}. Note that some of the outliers are cut off
      in this plot.}
    \label{fig:consistency_valid_TSLS}
\end{figure}
\FloatBarrier
\begin{figure}[ht]
    \centering
    \includegraphics[width=\linewidth]{figures/expected_fraction_correct_sparsity_TSLS.pdf}
    \caption{Same experiment as in Figure~\ref{fig:expected_sparsity}
      but with TSLS estimator instead of LIML. Expected fraction of
      random models for which \spaceIV estimated the correct sparsity
      level. Only random models that satisfy (A1)-(A3) are considered
      (in total $1871$ models). As the sample size increases the
      estimation of the sparsity level becomes more accurate.}
    \label{fig:expected_sparsity_TSLS}
\end{figure}

\bibliography{uai2022-template.bib}
 
\end{document}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:
