\documentclass{article}
\usepackage{graphicx} % Required for inserting images

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors

\usepackage{graphicx}
\usepackage{comment}
\usepackage{bm}
\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{enumitem} 
\usepackage{amsthm}
\usepackage{booktabs}
\usepackage{xfrac}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{caption}
\usepackage{subcaption}
\newtheorem{theorem}{Theorem}
\theoremstyle{definition}
\newtheorem{definition}{Definition}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}[theorem]

\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz}
\usetikzlibrary{positioning,shapes, arrows}
\tikzset{events/.style={ellipse, draw, align=center},}

\input{macros.tex}

\newcommand{\SSigma}{\boldsymbol{\Sigma}}
\newcommand{\vect}[1]{\boldsymbol{#1}}
\newcommand{\cov}{\mathrm{Cov}}
\newcommand{\var}{\mathrm{Var}}
\newcommand{\E}{\mathbb{E}}
\title{Tight bounds for partially linear causal effects with imperfect instruments violating either of the untestable IV assumptions,\\even when noise terms are not additively separable from instruments}
\author{Jordan}
\date{February 2024}

\begin{document}

\maketitle

\begin{abstract}
    The instrumental variables setup (IV) is a widely used tool for estimating the causal effect of one variable on another in the presence of unobserved confounding. 
    However, IV relies on two strong and untestable assumptions: the \textit{exclusion criterion} and the criterion of \textit{independent instruments}. 
    A recently published report identifies provably tight bounds for a causal effect in linear IV models when the former assumption (exclusion) is broken to a limited degree, provided it is due to a direct linear causal effect from instrument to outcome. 
    By interpreting the methods geometrically, we extend this result to incorporate any spurious relationships violating either or both assumptions in a non-parametric way. 
    We prove these bounds are tight, in that they cannot be improved upon without making further structural assumptions---and that no causal effect within the bounds can be ruled out. 
    % We prove these bounds are tight, in that, without making further structural assumptions, these bounds cannot be improved upon and all causal effects within these bounds cannot be ruled out. 
\end{abstract}

\begin{figure}
    \centering
    \begin{tikzpicture} [node distance=10mm and 30mm,>=stealth',sh/.style={shade},line width=1.5pt]
    \node [events] (Z) {$\vect{Z}$};
    \node [events, right = of Z ] (X) {$X$};
    \node [events, right = of X ] (Y) {$Y$};
    \draw [red, dashed, ->] (Z) to (X);
    \draw [->] (X) to (Y);
    \draw [dashed,<->] (X) to [out=45, in=135] (Y);
    \draw[red, dashed,<->] (Z) to [out=45, in=135] (X);
    \draw [dashed,->] (Z) to [out=-30, in=-150] (Y);
    \draw [dashed,<->] (Z) to [out=75, in=105] (Y);
    \end{tikzpicture}
    \caption{The most general acyclic directed mixed graph entailed by the structural equations (plus any spurious correlations) in our problem setup. 
    The black dashed lines between $\vect{Z}$ and $Y$ show deviations from the typical IV setup, and indicate that our model is agnostic between whether the deviations are due to violation of (A2), (A3) or both. 
    The red dashed lines indicate that we still rely on the relevance assumption $\SSigma_{\vect{z} x} \neq \vect{0}$. 
    Bi-directional arrows may indicate common confounding or spurious correlation or both. 
    The solid line indicates the causal effect from $X$ to $Y$, which we wish to estimate and assume is linear. 
    Note that any individual line need not appear for our model to remain valid.}\vspace{-3mm}
    \label{fig:BenignDisequilibrium}
\end{figure}



\section{Problem Setup}

We begin with the following model:
% \begin{align}
%     & X = \vect{\beta} \cdot \vect{Z} + \epsilon_x, \label{eqn:X} \\
%     & Y = \theta X + g_y (\vect{Z}, \epsilon_y), \label{eqn:Y} \\
%     & \SSigma_{(\epsilon_x, g_y, \vect{z}) (\epsilon_x, g_y, \vect{z})} = \begin{bmatrix} 
%     \eta_x^2 & \rho \eta_x \eta_y & \SSigma_{\epsilon_x \vect{z}}^\mathrm{T} \\
%     \rho \eta_x \eta_y & \eta_y^2 & \SSigma_{g_y \vect{z}}^\mathrm{T} \\ 
%     \SSigma_{\epsilon_x \vect{z}} & \SSigma_{g_y \vect{z}} & \SSigma_{\vect{z} \vect{z}}
%     \end{bmatrix}, \label{eqn:cov_ee}
% \end{align}
\begin{align}
    & X = \vect{\beta} \cdot \vect{Z} + \epsilon_x, \label{eqn:X} \\
    & Y = \theta X + g_y (\vect{Z}, \epsilon_y), \label{eqn:Y} \\
    & \SSigma_{(\epsilon_x, g_y, \vect{z}) (\epsilon_x, g_y, \vect{z})} = \begin{bmatrix} 
    \eta_x^2 & \rho \eta_x \eta_y & \SSigma_{\epsilon_x \vect{z}}^\mathrm{T} \\
    \rho \eta_x \eta_y & \eta_y^2 & \SSigma_{g_y \vect{z}}^\mathrm{T} \\ 
    \SSigma_{\epsilon_x \vect{z}} & \SSigma_{g_y \vect{z}} & \SSigma_{\vect{z} \vect{z}}
    \end{bmatrix}, \label{eqn:cov_ee}
\end{align}
under the condition 
\begin{align}
    \left\lVert \SSigma_{\vect{z}\vect{z}}^{-1} \cdot \SSigma_{\vect{z} g_y} \right\rVert < \tau_y, \label{eqn:tau criterion}
\end{align}
for some choice of norm $\lVert \cdot \rVert$. The transformation by $\SSigma_{\vect{z}\vect{z}}^{-1}$ is chosen so that this condition reduces to the ``tau-exclusion'' criterion in the setup of \citet{LeakyIV}, with $\tau = \tau_y$. 

The structural equations above entail the following relationships for the covariances amongst $(X, Y, \vect{Z})$:
\begin{align}
    & (X\vect{Z}) & \vect{\beta} = \SSigma_{\vect{z}\vect{z}}^{-1} \cdot ( \SSigma_{\vect{z} x} - \SSigma_{\vect{z} \epsilon_x}), \label{eqn:XZ} \\
    & (Y\vect{Z}) & \SSigma_{y\vect{z}} = \theta \SSigma_{x\vect{z}} + \SSigma_{\vect{z} g_y}, \label{eqn:YZ} \\
    & (XX) & C_{xx} = \kappa_{xx}, \label{eqn:XX} \\
    & (XY) & \kappa_{xy} = \theta \kappa_{xx} + C_{xy}, \label{eqn:XY} \\
    & (YY) & \kappa_{yy} = \theta \kappa_{xy} + C_{yy} + \theta C_{xy}, \label{eqn:YY}
\end{align}
where 
\begin{align}
    & C_{xx} := \var (\epsilon_x \mid \vect{Z}) = \eta_x^2 - \SSigma_{\vect{z} \epsilon_x}^\mathrm{T} \cdot \SSigma_{\vect{z} \vect{z}}^{-1} \cdot \SSigma_{\vect{z} \epsilon_x}, \label{eqn:cond cov exex}\\
    & C_{xy} := \cov (\epsilon_x, g_y \mid \vect{Z}) = \rho \eta_x \eta_y - \SSigma_{\vect{z} \epsilon_x}^\mathrm{T} \cdot \SSigma_{\vect{z} \vect{z}}^{-1} \cdot \SSigma_{\vect{z} g_y}, \label{eqn:cond cov exey}\\
    & C_{yy} := \var (g_y \mid \vect{Z}) = \eta_y^2 - \SSigma_{\vect{z} g_y}^\mathrm{T} \cdot \SSigma_{\vect{z} \vect{z}}^{-1} \cdot \SSigma_{\vect{z} g_y} \label{eqn:cond cov eyey}
\end{align}
are the variances and covariance of the noise terms $\epsilon_x$, $g_y$ and $(\epsilon_x,g_y)$, each conditioned on $\vect{Z}$. The other new quantities are:
\begin{align}
    & \kappa_{xx} := \var (X \mid \vect{Z}) = \Sigma_{xx} - \SSigma_{\vect{z} x}^\mathrm{T} \cdot \SSigma_{\vect{z}\vect{z}}^{-1} \cdot \SSigma_{\vect{z} x}, \\
    & \kappa_{xy} := \cov (X, Y \mid \vect{Z}) = \Sigma_{xy} - \SSigma_{\vect{z} x}^\mathrm{T} \cdot \SSigma_{\vect{z}\vect{z}}^{-1} \cdot \SSigma_{\vect{z} y}, \\
    & \kappa_{yy} := \var (Y \mid \vect{Z}) = \Sigma_{yy} - \SSigma_{\vect{z} y}^\mathrm{T} \cdot \SSigma_{\vect{z}\vect{z}}^{-1} \cdot \SSigma_{\vect{z} y}, 
\end{align}
which define the variances and covariance of $X$, $Y$ and $(X,Y)$, each conditioned on $\vect{Z}$. 

\newpage

\section{Results}

\begin{lemma}[] \label{lemma:Unique minimal leakge}
    For any choice of vectors $\vect{A} \in \mathbb{R}^{\lvert \vect{Z} \rvert}$, $\vect{B} \in \mathbb{R}^{\lvert \vect{Z} \rvert} \setminus \{\vect{0}\}$ (where $\lvert \cdot \rvert$ denotes set cardinality), and any choice of norm $\lVert \cdot \rVert$, there exists a finite $\check{\theta}$ satisfying
    \begin{equation}
        \check{\theta} = \arginf_{\theta \in \mathbb{R}} \left\lVert \vect{A} - \vect{B} \theta \right\rVert.
    \end{equation}
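    Proof: this norm is continuous in $\theta$, bounded below by zero, and (since $\vect{B} \neq \vect{0}$) approaches $+ \infty$ as $\theta$ approaches $\pm \infty$, so the infimum is attained at a finite $\theta$. We do not demand that $\check{\theta}$ is unique. \qedsymbol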
\end{lemma}

Rewriting the covariance equation \ref{eqn:YZ} between $Y$ and $\vect{Z}$, and enforcing the condition in equation \ref{eqn:tau criterion} by taking the relevant norm yields:
\begin{equation}
    \left\lVert \vect{A} - \vect{B} \theta \right\rVert < \tau_y,
    \label{eqn:regression problem}
\end{equation}
where
\begin{align}
    & \vect{A} = \SSigma_{\vect{z} \vect{z}}^{-1} \cdot \SSigma_{y \vect{z}}, \\
    & \vect{B} = \SSigma_{\vect{z} \vect{z}}^{-1} \cdot \SSigma_{x \vect{z} }.
\end{align}

Equivalently, the vector inside the norm is
\begin{equation}
    \boldsymbol{\gamma} := \SSigma_{\vect{z} \vect{z}}^{-1} \cdot \SSigma_{\vect{z} g_y} = \vect{A} - \vect{B} \theta.
\end{equation}

\begin{theorem}[Bound entailed by equation \ref{eqn:regression problem}] \label{thm:general ate_bounds}
     Let $\left\lVert \vect{A} - \vect{B} \check{\theta} \right\rVert < \tau_y$. Then there are finite solutions for $\theta^-$ and $\theta^+$ defined below: 
     \begin{align}
         & \theta^- := \inf_{\theta < \check{\theta}} \theta \quad \mathrm{_{S.T.}} \quad  \left\lVert \vect{A} - \vect{B} \theta \right\rVert < \tau_y \\
         & \theta^+ := \sup_{\theta > \check{\theta}} \theta \quad \mathrm{_{S.T.}} \quad  \left\lVert \vect{A} - \vect{B} \theta \right\rVert < \tau_y
     \end{align}
     Proof: a norm is convex, meaning convex along all lines, and so is convex along the line traced by $\vect{A} - \theta \vect{B}$ (another way to put this is that a convex function remains convex after an affine transformation of its argument). The norm approaches $+ \infty$ as $\theta$ approaches $\pm \infty$, so $\theta$ leaves the open set on which $\left\lVert \vect{A} - \vect{B} \theta \right\rVert < \tau_y$ exactly twice, once in each direction. \qedsymbol
\end{theorem}

Any $\theta \in (\theta^-, \theta^+)$ is valid according to equation \ref{eqn:regression problem}. We can also prove that such $\theta$ are valid according to the other covariance equations (particularly the last three, since equation \ref{eqn:XZ} defines $\vect{\beta}$). These are the only equations that $\theta$ must satisfy in our setup, since equations involving any higher order cumulants introduce new free parameters such as $\mathrm{Coskew} (\epsilon_x, g_y, \vect{Z})$.

\begin{theorem}[Tightness of bound] \label{thm:Tightness of theta bound}
    The bound given in theorem \ref{thm:general ate_bounds} is tight in the following sense: all $\theta$ within the bounds are permissible, unless further structural assumptions are made. 
    
    Proof: proving that the bound is tight is equivalent to saying there is a permissible choice $(\rho, \eta_x^2, \eta_y^2, \SSigma_{\vect{z} \epsilon_x}, \SSigma_{\vect{z} g_y})$ for every $\theta$ in the bound that satisfies equations \ref{eqn:XX}, \ref{eqn:XY} and \ref{eqn:YY}. 

    We illustrate this by showing that there is a permissible choice of $(\rho, \eta_x^2, \eta_y^2, C_{xy})$ for any valid choice of $(C_{xx}, C_{yy}, \theta)$.

    For ease of notation, let's rewrite equations \ref{eqn:cond cov exex}, \ref{eqn:cond cov exey}, \ref{eqn:cond cov eyey} as follows:
    \begin{align}
        & C_{xx} = \eta_x^2 - \psi, \\
        & C_{xy} = \rho \eta_x \eta_y - \phi, \\
        & C_{yy} = \eta_y^2 - \chi, 
    \end{align}
    which yields the following covariance equations: 
    \begin{align}
        & (XX) & \eta_x^2 = \kappa_{xx} + \psi, \\
        & (XY) & \rho \eta_x \eta_y = \kappa_{xy} - \theta \kappa_{xx} + \phi, \label{eqn:XY rho}  \\
        & (YY) & \eta_y^2 = \theta^2 \kappa_{xx} - 2 \theta \kappa_{xy} + \kappa_{yy} + \chi.
    \end{align}
    These can be considered equations giving $\eta_x^2$, $\rho$ and $\eta_y^2$ respectively with all other values fixed. We must show that $\eta_x^2, \eta_y^2 \geq 0$. We know $\psi, \chi \geq 0$ because $\SSigma_{\vect{z} \vect{z}}^{-1}$ is positive definite. This immediately implies $\eta_x^2$ is valid. For $\eta_y^2$, we can apply the Cauchy--Schwarz inequality to the $\kappa_{xy}$ term to find:
    \begin{equation}
        \left( \sqrt{\kappa_{yy}} - \lvert \theta \rvert \sqrt{\kappa_{xx}} \right)^2 + \chi \leq \eta_y^2 \leq \left( \sqrt{\kappa_{yy}} + \lvert \theta \rvert \sqrt{\kappa_{xx}} \right)^2 + \chi,
    \end{equation}
    so that $\eta_y^2 \geq 0$ is always satisfied. 
    
    Finally, we must choose a valid $\rho \in [-1, 1]$ through equation \ref{eqn:XY rho}.  Let us choose the direction of $\SSigma_{\vect{z} \epsilon_x}$ so that it is orthogonal to $\SSigma_{\vect{z} \vect{z}}^{-1} \cdot \SSigma_{\vect{z} g_y}$, meaning $\phi = 0$. Then, $\rho$ is uniquely determined by the equation. 

    We can prove this choice of $\rho$ (and thus $\phi$) is valid by applying the Cauchy--Schwarz inequality to the other set of conditional (co)variances: $C_{xy}^2 \leq C_{xx} C_{yy}$. This implies the following: 
    \begin{equation}
        \rho^2 \eta_x^2 \eta_y^2 \leq (\eta_x^2 - \psi) (\eta_y^2 - \chi).
    \end{equation}
    It follows immediately from the facts we have already discussed---$\eta_x^2 \geq \psi \geq 0$, $\eta_y^2 \geq \chi \geq 0$---that the above inequality implies $\rho^2 \leq 1$. \qedsymbol
\end{theorem}

Note: we may be concerned that the Cauchy--Schwarz inequality for the conditional (co)variances $(C_{xx}, C_{xy}, C_{yy})$ imposes further restrictions for $(\psi, \phi, \chi)$ other than the ordering $\eta_x^2 \geq \psi \geq 0$, $\eta_y^2 \geq \chi \geq 0$. Substituting equations \ref{eqn:cond cov exex}, \ref{eqn:cond cov exey} and \ref{eqn:cond cov eyey} into the statement $C_{xy}^2 \leq C_{xx} C_{yy}$, however, reduces the relationship to $\kappa_{xy}^2 \leq \kappa_{xx} \kappa_{yy}$, which is always true, and testable since this other set of conditional (co)variances is observed from data. 

\newpage

\begin{corollary}[Tightness against more general class of models] \label{coly:Extension}
    The bound on $\theta$ in theorem \ref{thm:general ate_bounds} is also a tight bound for the following, more general, class of models:
    \begin{align}
        & X = f_x (\vect{Z}, \epsilon_x), \\
        & Y = \theta X + g_y (\vect{Z}, \epsilon_y), \\
        & \SSigma_{(f_x, g_y, \vect{z}) (f_x, g_y, \vect{z})} = \begin{bmatrix} 
        \eta_x^2 & \rho \eta_x \eta_y & \SSigma_{\vect{z} f_x}^\mathrm{T} \\
        \rho \eta_x \eta_y & \eta_y^2 & \SSigma_{\vect{z} g_y}^\mathrm{T} \\ 
        \SSigma_{\vect{z} f_x} & \SSigma_{\vect{z} g_y} & \SSigma_{\vect{z} \vect{z}}
        \end{bmatrix}. \label{eqn:cov_ee general fx}
    \end{align}
    The proof of this is straightforward: equation \ref{eqn:YZ} is unchanged by the extension to general $f_x$, and therefore entails the same bound on $\theta$ as before. However, all $\theta$ within this bound are permissible under the special case of $f_x = \vect{\beta} \cdot \vect{Z} + \epsilon_x$ from theorem \ref{thm:Tightness of theta bound}. \qedsymbol
\end{corollary}

\begin{corollary}[Tightness against any bound on $\SSigma_{\vect{z} f_x}$] \label{coly:No better with extra cov}
    The following constraint:
    \begin{equation}
        \left\lVert \SSigma_{\vect{z} \vect{z}}^{-1} \cdot \SSigma_{\vect{z} f_x} \right\rVert < \tau_x
    \end{equation}
    does not tighten the bound further. This is valid for the linear case $X = \vect{\beta} \cdot \vect{Z} + \epsilon_x$ as well as the general case $X = f_x (\vect{Z}, \epsilon_x)$. \\
    Proof: this follows directly from the fact that there was a permissible choice of $(\rho, \eta_x^2, \eta_y^2, \phi)$ for any $(\psi, \chi, \theta)$ defined in the proof of theorem \ref{thm:Tightness of theta bound}, including for $\psi, \phi$ which don't violate this new bound. \qedsymbol
\end{corollary}

\begin{corollary}[Tightness against even more general class of models] \label{coly:Extension 2}
    The bound on $\theta$ in theorem \ref{thm:general ate_bounds} is also a tight bound for the following, even more general, class of models:
    \begin{align}
        & X = f_x (\vect{Z}, \epsilon_x), \\
        & Y = \theta h_y (X) + g_y (\vect{Z}, \epsilon_y), \\
        & \SSigma_{(f_x, g_y, \vect{z}) (f_x, g_y, \vect{z})} = \begin{bmatrix} 
        \eta_x^2 & \rho \eta_x \eta_y & \SSigma_{\vect{z} f_x}^\mathrm{T} \\
        \rho \eta_x \eta_y & \eta_y^2 & \SSigma_{\vect{z} g_y}^\mathrm{T} \\ 
        \SSigma_{\vect{z} f_x} & \SSigma_{\vect{z} g_y} & \SSigma_{\vect{z} \vect{z}}
        \end{bmatrix}, \label{eqn:cov_ee general hy}
    \end{align}
    provided the functional form of $h_y$ is known. Here, $\theta$ is interpreted as a scale factor of the parametric causal effect from $X$ to $Y$. 

    Proof: we can recode $X \gets \tilde{X} := (h_y \circ f_x)(\vect{Z}, \epsilon_x)$, so that the structural equation model above takes the same form as that in corollary \ref{coly:Extension}. \qedsymbol
\end{corollary}

\begin{theorem}[Natural search space]
    We can pretend $\psi = \phi = \chi = 0$ to get a continuous, invertible mapping $F: \rho \rightarrow \theta$. 
\end{theorem}


\begin{equation}
    \mathbf{A} := 
\end{equation}

\end{document}