\section{Background}

Throughout, we use capital letters (e.g. $A$) to denote a random variable on a measurable space. We denote measurable spaces by calligraphic letters ($\mathcal{A}$), with one exception: $\mathcal{P}$, which we use to denote a probability measure. We use lowercase letters to denote realizations of a random variable ($A=a$). We will use the structural causal model (SCM) formulation of \cite{pearl2009causality}, where causal relationships are represented as directed acyclic graphs (DAGs). The operator $do(\cdot)$ is defined in these models to describe the process of forcing a random variable to take a particular value, which isolates its effect on downstream variables (i.e. $\E[Y~|~do(A=a)]$ describes the isolated effect of $A$ on $Y$). 

Our goal is to estimate the average treatment effect (ATE) $\mathbb E[Y~|~do(X = x)]$ given the graph in Figure~\ref{fig:merror_iv_graph} (equivalent to the structural function $f: \mathcal{X} \longrightarrow \mathcal{Y}$). Here $X,\epsilon$ are unobserved. We only have access to an instrument $Z$, the effect $Y$, and corrupted measurements of $X$: $M$ and $N$.
%Specifically, we seek to estimate the structural function $f: \mathcal{X} \longrightarrow \mathcal{Y}$, corresponding to the estimand $\mathbb E[Y~|~do(X = x)]$, when $X$ is unobserved and measured with error. To do so, we are given access to two error-ed measurements of $X$, as well as an instrumental variable $Z$. Moreover, $X$ and $Y$ are confounded by an unobserved confounder $\epsilon$. Formally, given access to samples from the conditional distribution with density $p(m, n, y~|~z)$, we provide an estimate $\hat f(\cdot)$. 

\subsection{Structural Assumptions on $p(x, y~|~z)$} 

When the treatment $X$ is observed and an instrument $Z$ is available, the structural function is identified under the following assumptions.

\begin{assumption}\label{assp: iv_additive_error}
$Y = f(X) + \epsilon$ and $\E[\epsilon|Z] = 0$
\end{assumption}
\begin{assumption}\label{assp: relevance_of_instrument}
$p(x|z)$ is not constant in $z$.
\end{assumption}
Under Assumptions~\ref{assp: iv_additive_error} and~\ref{assp: relevance_of_instrument}, the structural function satisfies the following equation $\mathcal{P}_{Z}$-almost surely:
\begin{align}
    \E[Y|Z] = \E[f(X)|Z] = \int f(x)\,d\mathcal{P}_{X|Z}(x)
\end{align}
Typical methods fall into two categories: 1) two-stage methods (\cite{kiv, Hartford17:DIV, xu2020}): first identify the conditional distribution directly or through estimating conditional expectations of basis functions; this is followed by identifying $f$ under the identified conditional distribution or vector of conditional expected values of basis functions; 2) moment-based methods (\cite{zhang2020maximum, Bennett19:DeepGMM}): estimate $f$ using moment conditions generated by the conditional moment restriction: $\E[(Y-f(X))g(Z)]=0$ for all measurable $g$. A practical difference between two-stage methods and moment-based methods is that two-stage methods require separate data for each stage, and the first stage does not require the outcome observations $Y$, whereas moment-based methods require data for all variables simultaneously. In this work, we seek to identify the measurement process before identifying the structural function. We thus naturally adopt the two-stage framework since the measurement process requires only the instrument and the measurements, and not the outcome labels. 

% Kernel methods are a class of flexible methods which are easy to provide guarantees for; we now outline a kernel method for instrumental variable regression with observed $X$ which we later use to 

\subsection{Reproducing kernel Hilbert spaces} %(RKHSes)}
For any space $\mathcal{S} \in \{\X, \Y, \cM, \cN, \Z\}$, let $k: \mathcal{S} \times \mathcal{S} \rightarrow \mathbb{R}$ be a positive definite kernel. We denote by $\phi$ its associated canonical feature map $\phi(x)=k(x, \cdot)$ for any $x \in \mathcal{S}$, and $\mathcal{H_S}$ its corresponding Reproducing Kernel Hilbert Space (RKHS) of real-valued functions on $\mathcal{S}$. The space $\mathcal{H_S}$ is a Hilbert space with inner product $\langle \cdot, \cdot \rangle_{\mathcal{H_S}}$. It satisfies two important properties: (i) $k(x, \cdot) \in \mathcal{H_S}$ for all $x \in \mathcal{S}$, (ii) the reproducing property: for all $h \in \mathcal{H_S}$ and $x\in \mathcal{S}, h(x) = \langle h, k(x, \cdot) \rangle_{\mathcal{H_S}}$. For any distribution $p$ on $\mathcal{S}$, $\mu_p \defeq \int k(x, \cdot) p(x)dx $ is an element of $\mathcal{H_S}$ and is referred to as the kernel mean embedding of $p$ (\cite{Smola07Hilbert}). Similarly, for any conditional distribution $p(x|z)$, $\mu_{\cP_{X|z}} \defeq \int k(x, \cdot)p(x|z)dx$ is a \textit{conditional mean embedding} (CME) of $p(x|z)$ (\cite{song2009hilbert, song2013kernel}); see \cite{Muandet17:KME} for a review. 

\subsection{Structure learning using kernel mean embeddings} \label{subsec: kiv_method}
Provided that the structural function $f$ lies in the RKHS $\cH_{\cX}$, then its conditional expectation under $\cP_{X|Z}$ can be written as $ \E[f(X)|Z]=\langle f, \mu_{\cP_{X|Z}}\rangle_{\mathcal{H_X}}$. In \cite{singh2019kernel}, the conditional mean embedding is estimated by the standard regression formula using the observed samples $\{z_j, x_j\}_{j=1}^{s_1}$ before the structural function $f$ is estimated using a second-stage sample $\{\check{z}_j, \check{y}_j\}_{j=1}^{s_2}$. We present their solution here. 

The CME of $\cP_{X|z}$ is estimated using the samples $\{z_j, x_j\}_{j=1}^{s_1}$ as
\begin{align}
    \hat{\mu}^{(s_1)}_{\cP_{X|z}} &= \Phi_{X} (K_{ZZ} + s_1 \hat{\lambda} I)^{-1} \Phi'_Z\phi(z) \label{eq: kivs1}
\end{align}
where $\hat{\lambda}$ is the ridge regression hyperparameter chosen using the validation procedure described in \cite[App.7.4.2]{kiv}, $K_{ZZ}$ denotes the kernel matrix with entries $(K_{ZZ})_{jl} = k(z_j, z_l)$, and $\Phi_X$, $\Phi_Z$ are the feature matrices with columns $(\Phi_X)_{(:, j)} = \phi(x_j)$ and $(\Phi_Z)_{(:, j)} = \phi(z_j)$. This is precisely ridge regression over multi-dimensional feature spaces, adapted to the case where the number of features can be infinite. Furthermore, if we assume that the structural function lies in an RKHS, then we can learn the function $f$ in two steps of regression: first a regression to get the CME, followed by a regression from the CME to $Y$ to obtain $f$. We therefore make this assumption. Importantly, we stress that the purpose of this assumption is the nonparametric modelling of $f$, and it is \emph{not} related to the correction of measurement error.
\begin{assumption}\label{assump: f_rkhs}
$f \in \cH_{\cX}$
\end{assumption}
Assuming $f\in\cH_{\cX}$, the estimated CME $\hat{\mu}^{(s_1)}_{\cP_{X|z}}$ is used to learn the structural function $f$ by solving the empirical analogue of the following:
\begin{align}
    \E[Y|Z=z]&=\langle f, \mu_{\cP_{X|z}}\rangle_{\mathcal{H_X}} \label{eq: cme_to_y}
\end{align}
We solve for $f$ via least squares in two stages: 1. Use the first-stage sample $\{z_j, x_j\}_{j=1}^{s_1}$ to estimate $\mu_{\cP_{X|z}}$ via Eq.~\eqref{eq: kivs1} (call the estimate $\hat{\mu}^{(s_1)}_{\cP_{X|z}}$); 2. Use the second-stage sample $\{\check{z}_j, \check{y}_j\}_{j=1}^{s_2}$, in which each $\check{y}_j$ serves as a single-sample Monte-Carlo estimate of $\E[Y|Z=\check{z}_j]$, to estimate $f$ via
\begin{align}
    \hat{f}^{(s_2)}(x) &= \hat{\beta}'K_{Xx} \label{eq: kiv_1} \\ 
    \hat{\beta} &= (VV' + s_2 \hat{\xi}K_{XX})^{-1}V\check{y} \label{eq: kiv_2} \\
    V &= K_{XX}(K_{ZZ} + s_1 \hat{\lambda}I)^{-1}K_{Z\check{Z}} \label{eq: kiv_3} 
    % \text{where } V_{jl} &= \phi(x_j)^T\Phi_{X} (K_{ZZ} + s_1 \hat{\lambda} I)^{-1} \Phi'_Z\phi(\check{z}_l)\\
    % &= \hat{\mu}^{(s_1)}_{\cP_{X|\check{z}_l}}(x_j)
\end{align}
where $\hat{\xi}$ is a hyperparameter.
Note that $\hat{\mu}^{(s_1)}_{\cP_{X|z}}$ enters in Eq.~\eqref{eq: kiv_3}: $V_{jl} = \phi(x_j)'\Phi_{X} (K_{ZZ} + s_1 \hat{\lambda} I)^{-1} \Phi'_Z\phi(\check{z}_l)= \hat{\mu}^{(s_1)}_{\cP_{X|\check{z}_l}}(x_j)$. We refer the reader to \citet{singh2019kernel} for the full derivation and for tuning $\hat{\xi}$.
%also chosen by the validation procedure described in \cite[App.7.4.2]{kiv}. 

This approach works when we observe treatment $X$. 
When $X$ is unobserved, this is not possible. Thus, we propose a method to learn the CME directly from corrupted measurements of $X$; then, $f$ is obtained as a mapping from the learnt CME to $Y$ as in Eqs.~\eqref{eq: kiv_1}--\eqref{eq: kiv_3}. Our method is detailed in Section~\ref{sec: method}. 
% But first we introduce the relevant kernel learning concepts which we later use in our proposed methodology.
We note that the solution of \citet{singh2019kernel} requires standard conditions for kernel causal learning, which we inherit. For clarity of presentation, we detail them in Section~\ref*{app: kernel_assumptions} of the Supplementary Materials.

% The conditional mean embedding of $p(x|z)$ can be estimated consistently using kernel ridge regression. Denoting the estimated CME as $\hat\mu_{X|Z}$, the structural function f can be found by solving the following IV regression equation \citep{kiv}:
% \begin{equation}
%     \E[Y|Z] = \langle f, \hat\mu_{X|Z} \rangle_{\mathcal{F}}
% \end{equation}

\subsection{Characteristic function identification of a latent variable using mismeasured observations}
The main obstacle in learning the CME $\mu_{\cP_{X|Z}}$ is the lack of observations of $X$. To this end, we first review a strongly related problem, which is to identify the characteristic function of $p(x|z)$ using corrupted observations $M$ and $N$. The following assumptions are needed.

\begin{assumption}\label{assp: additive_merror}
Measurement errors enter additively:
\begin{align}
    M &= X + \Delta M \label{eq:m}\\
    N &= X + \Delta N  \label{eq:n}
\end{align}
\end{assumption}
\begin{assumption}\label{assp: merror_correlation}
The measurement errors are uncorrelated with each other, $\Delta M$ is uncorrelated with $X$, $\Delta N$ is independent of $X$, and $\epsilon$ is uncorrelated with $\Delta N$:
\begin{align}
&\E[\Delta M | X, \Delta N] =0 \label{eq:M_XN}\\ 
&X \indep \Delta N \label{eq:X_N}\\
&\E[\epsilon|\Delta N] = 0 \label{eq:E_N}
\end{align}
\end{assumption}
As $X$ is unobserved and can be redefined up to any invertible transformation, Eq.~\eqref{eq:m} does not impose further constraints beyond a monotonic relation between $M$ and $X$ in expectation. Eqs.~\eqref{eq:M_XN} and~\eqref{eq:E_N} are weaker (mean-independence) formulations of the independence statements $\Delta M \indep \{X, \Delta N\}$ and $\epsilon \indep \Delta N$. 

% \caroline{Need explanation of the assumptions. The merror assumptions are proposed in \cite{schennach04} } \ricardo{What about the following intuition for the UAI audience: ``As $X$ is unobserved and can be redefined up to any invertible transformation, Eq. (\ref{eq:m}) is not imposing further constraints besides a monotonic relation between $M$ and $X$ in expectation. Eqs. (\ref{eq:M_XN}) and (\ref{eq:E_N}) are weaker formulations of conditional independence statements $M \indep \{X, \Delta N\}$ and $\epsilon \indep \Delta N$ if we were to write a likelihood function explicitly.''} 

% \ricardo{I have to say I still don't get what the deal is with making Eq. (\ref{eq:n}) to have the same identity map as Eq. (\ref{eq:m}). I can't explain why this isn't a very restrictive (and unnecessary?) assumption.}

\begin{remark}
Eq.~\eqref{eq:n} is a restrictive assumption. However, we point out that it can be relaxed, and that the relaxed setting can be reduced to our setting. Thus we focus on the simplified setting, from which future methods can extend; we discuss one way to relax the assumption in Section~\ref*{app: relax_merror_assumptions} of the Supplementary Materials.
\end{remark}
% \caroline{Hu Y, Sasaki Y. 2015. Closed-form estimation of nonparametric models with non-classical measurement errors. Here, they show that the characteristic function method can be adapted for one of the measurements being only a polynomial function of x^* (with additive noise).  

% Instrumental variable estimation of nonlinear errors-in-variables models might allow us to reduce the number of measurements from 2 to 1.

% Describe what these methods assume.}

With two measurements, \cite{schennach04} provides a constructive estimator for the moments of latent variables. Our work uses a special case of their theorem which we state below.
\begin{assumption}\label{assp: schennach_technical_assump}
$\E[|X|] < \infty$ and $\E[|\Delta M|] < \infty$
\end{assumption}
% \begin{thm}[\cite{schennach04}, Theorem 1]\label{thm: schennach} Given Assumptions \ref{assp: additive_merror}-\ref{assp: schennach_technical_assump}, the characteristic function of $X$, $\psi(\chi) \defeq \E[e^{i\chi X}]$, is identified by 
% \begin{align}
%     \psi(\chi) &= \exp \left(\int_0^{\chi} i \frac{\E[Me^{i\nu N}]}{\E[e^{i\nu N}]}d\nu\right)
% \end{align}
% where $i=\sqrt{-1}$. 
% %Moreover, $\psi(\chi) \defeq \E[e^{i\chi X}]$, is the characteristic function of $X$.
% \end{thm}

% Although the above only deals with the marginal expectation over the latent variable $X$, the theorem still holds over the measure $\mathcal{P}_{\X|z}$, leading us to the following result.

\begin{corollary}\label{cor: conditional_psi}
Given Assumptions~\ref{assp: additive_merror}--\ref{assp: schennach_technical_assump}, the characteristic function of $X$ given $Z = z$, i.e. $\psi_{\cP_{X|z}}(\alpha)$, is equal to % \defeq \E_{\cP_{X|z}}[e^{i\alpha X}]$, is identified with 
\begin{align}
    \overbrace{\E_{\cP_{X|z}}[e^{i\alpha X}]}^{\psi_{\cP_{X|z}}(\alpha) :=} &= \exp \left(\int_0^{\alpha} i \frac{\E[Me^{i\nu N}|z]}{\E[e^{i\nu N}|z]}d\nu\right).\label{eq: char_cme}
\end{align}
\end{corollary}
\begin{proof}
Follows directly from \cite[Theorem 1]{schennach04}, which originally phrased the equality for the marginal distribution $p(x, m, n)$.
\end{proof}
Since characteristic functions are exact representations of probability distributions, Corollary~\ref{cor: conditional_psi} says that we may model $\cP_{X|z}$ through modelling $\cP_{M,N|z}$ and $\cP_{N|z}$, and the mathematical relation is specified by Eq.~\eqref{eq: char_cme}.

% \ricardo{To save space, can't we just have Corollary one and say that this follows from Theorem such-and-such of Schennach, original phrased for the marginal distribution of $X$?}

% \section{Measurement Error in Nonlinear Models}




% \begin{remark}
% The assumption where we need two measurements with independent error can be relaxed. For example, one measurement can be relaxed to a monotonic polynomial function of the latent plus independent symmetric noise \cite{hu_sasaki_15}; their approach is a two-stage one: first identify the polynomial, then identify the marginal of the latent using the same method as in Theorem \ref{thm: schennach}. Therefore, in this work we focus on the simplified scenario and leave the extension to more general measurement error assumptions for future work.
% \end{remark} \caroline{this should be said at a different place.}
% \section{Connection of characteristic functions with Reproducing Kernel Hilbert Spaces (RKHSes)}
% Differing from traditional latent variable model approaches, our upshot is to learn the conditional mean embedding of
% \section{Confounded Causal Inference with Instrumental Variables}

% Figure-\ref{fig:merror_iv_graph} describes the independence structure of the variables in our setting. We assume that highlighted variables, $X$ and $\epsilon$, are unobserved and the rest are observed variables. Within this setting, we ask to estimate the structural function $f: \mathcal{X} \longrightarrow \mathcal{Y}$.

