\section{Method} \label{sec: method}
In this section we will show how to recover $\mathbb E[Y~|~do(X \!=\! x)]$. To do so, recall that all we need is the kernel mean embedding $\mu_{\cP_{X|Z}}$. We begin by demonstrating that estimating $\mu_{\cP_{X|Z}}$ boils down to estimating the characteristic function $\psi_{\cP_{X|Z}}$. We then introduce a trick for solving integral equations that we call the \emph{differentiation trick} which allows us to estimate $\psi_{\cP_{X|Z}}$ without explicitly estimating the integral. Finally, we give a full procedure for estimating $\mathbb E[Y~|~do(X \!=\! x)]$ and describe advantages of our approach.

%We start by observation that the characteristic function uniquely determines a distribution, and under certain conditions, so does a kernel mean embedding. If we can find the characteristic function of a latent variable using the corrupted observations, then there is also a way to find the kernel mean embedding. Once we have found the kernel mean embedding of the latent variable given the instrument, i.e. the quantity $\mu_{\cP_{X|Z}}$, we can use Eq.~\eqref{eq: kiv_1}-Eq.~\eqref{eq: kiv_3} to estimate $f$. 

% The upshot is that we do not need to find the distribution $p(x|z)$ first, which is a more general and difficult task. \caroline{What are the reasons? 1. modelling the distributions directly might result in high variance, and afterwards we still need to sample from the distribution - but i think that's only a problem with high dimensional data? 2. ... }

% In order to connect estimation of the kernel mean embedding $\mu_{\cP_{X|Z}}$ with  estimating the characteristic function $\psi_{\cP_{X|Z}}$
% We start with identifying the connection between the characteristic function and the kernel embedding of a distribution.

\subsection{From kernel mean embeddings to characteristic functions} \label{subsec: kme_cf}
For simplicity, we limit our description to $\R$. However, all of the following arguments can be extended trivially to $\R^d, d>1$. First recall the Fourier transform:
\begin{align*}
    \tilde{h}(\alpha) &= \frac{1}{2\pi} \int_{-\infty}^{\infty} h(x)e^{-i\alpha x}dx
\end{align*}
and the inverse Fourier transform:
\begin{align*}
    h(x) &= \int_{-\infty}^{\infty} \tilde{h}(\alpha)e^{i\alpha x} d\alpha .
\end{align*}
%For more details on these representations see \citet{fukumizu2008lecturenotes}.

Further, we assume the following.
%Given these, we require the following weak assumption to construct our link between characteristic functions and kernel mean embeddings.
\begin{assumption}[Symmetric, characteristic and translation-invariant kernels] \label{assump: kernel_additional}
$k(x, \cdot), k(m, \cdot), k(n, \cdot)$ are symmetric, characteristic and translation-invariant kernels.
\end{assumption}
Kernel symmetry is a standard assumption in ML as kernel functions are generally real. Characteristic kernels allow us to embed probability distributions uniquely in an RKHS. Translation-invariant kernels allow us to consider the probability measure associated with kernel functions.

Under Assumption~\ref{assump: kernel_additional}, we can write $k(x,y)=k(x-y)$ and $k(t)$ is positive definite. By Bochner's theorem, we know that $k$ can be written as the Fourier transform of a unique measure $\tilde{k}$:
\begin{align*}
    k(t) &= \frac{1}{2\pi} \int_{-\infty}^{\infty} e^{-i\alpha t}\tilde{k}(\alpha) d\alpha \\
    \text{i.e. }\; k(x, y) &= \int_{-\infty}^{\infty} e^{-i\alpha (x-y)} q(\alpha)d\alpha
    % k(x,y) &= \frac{1}{2\pi}\int_{-\infty}^{\infty} e^{-i\alpha x}e^{i\alpha y} \tilde{k}(\alpha)d\alpha
\end{align*}
where $q(\alpha) := \frac{1}{2\pi}\tilde{k}(\alpha)$.
% , we obtain:
% \begin{align*}
%     k(x, y) = \int_{-\infty}^{\infty} e^{-i\alpha (x-y)} q(\alpha)d\alpha
% \end{align*}
As illustrated in e.g. \cite{fukumizu2008lecturenotes}, we may construct an RKHS on the entire real line using Fourier transforms as feature maps:
\begin{align*}
    &\mathcal{H}_X = \left\{f\in \cL^2(\mathbb{R}, dx) \Bigg| \int_{-\infty}^{\infty} \frac{\left|\tilde{f}(\alpha)\right|^2}{q(\alpha)}d\alpha < \infty\right\}\\
    &\langle f, g \rangle_{\cH_X} = \int_{-\infty}^{\infty} \frac{\tilde{f}(\alpha) \overline{\tilde{g}(\alpha)}}{q(\alpha)}d\alpha
\end{align*}
Now consider the Fourier transform of $k(x, \cdot)$, where $x$ is fixed. Since we know that $k(x,y) = \int e^{-i\alpha x} e^{i\alpha y} q(\alpha) d\alpha$, by inspection we realise $\tilde{k}(x, \alpha) = e^{-i\alpha x} q(\alpha)$, recovering the identity that $k(x, y) = \int_{-\infty}^{\infty} \frac{e^{-i\alpha x} q(\alpha) e^{i\alpha y} q(\alpha)}{q(\alpha)} d\alpha = \langle k(x, \cdot), k(y, \cdot) \rangle_{\cH_X}$. 

Recall the definition of the conditional mean embedding of $\mathcal{P}_{X|z}$ for a particular $z$: $\mu_{\mathcal{P}_{X|z}}(y) \defeq \int k(x, y)p(x|z) dx$. When all variables are observed, the conditional mean embedding (CME) can be estimated by samples $\{x_j, z_j\}_{j=1}^s$:
\begin{align}
    \hat{\mu}^{(s)}_{X|z}(y) %&= \sum_{i=1}^n \gamma_i(z) k(y, x_i)\\
    &=\sum_{j=1}^s \hat{\gamma}_j^{(s)}(z) k(x_j, y) \label{eq: empirical_cme}
\end{align}
% where the last step is due to the symmetry of the kernel matrix, and
where
\begin{align}
    \hat{\gamma}_j^{(s)}(z) = \left[(K_{ZZ}+ s\hat{\lambda}^{(s)} I)^{-1}K_{Zz}\right]_j \label{eq: gamma}
\end{align}

% $K_{ZZ}$ denotes the kernel matrix where $(K_{ZZ})_{jl} = \langle k(z_j, \cdot), k(z_l, \cdot) \rangle_{\cH_X}$. $K_{Zz}$ denotes a column vector where $(K_{Zz})_j = \langle k(z_j, \cdot), k(z, \cdot) \rangle_{\cH}$.

Taking the Fourier transform of $\hat{\mu}^{(s)}_{X|z}(y)$:
\begin{align*}
    \tilde{\hat{\mu}}^{(s)}_{X|z}(\alpha) &= \sum_{j=1}^s \hat{\gamma}_j^{(s)}(z) e^{-i\alpha x_j} q(\alpha)\\
    &= q(\alpha)\underbrace{\sum_{j=1}^s \hat{\gamma}_j^{(s)}(z) e^{-i\alpha x_j}}_{\eqdef \hat{\psi}^{(s)}_{ \mathcal{P}_{X|z}}(-\alpha)}
\end{align*}
Define the $s-$sample estimate of the characteristic function $\hat{\psi}^{(s)}_{\mathcal{P}_{X|z}}(\alpha) \defeq \sum_{j=1}^s \hat{\gamma}_j^{(s)}(z)e^{i\alpha x_j}$ with $\{x_j\}_{j=1}^s \sim \mathcal{P}_{X|z}$. Next, we show that $\hat{\psi}^{(s)}_{\mathcal{P}_{X|z}} \longrightarrow \psi_{\mathcal{P}_{X|z}}$ in $\mathcal{L}^2(\mathbb{R}, q)$ if and only if $\hat{\mu}_{\mathcal{P}_{X|z}}^{(s)} \longrightarrow \mu_{\mathcal{P}_{X|z}}$ in $\mathcal{H}_X$. 
% \begin{assumption}\label{assump: k_symmetry}
% $k$ is a symmetric kernel, i.e. $k(x,y) = k(y,x)$.
% \end{assumption}
% \caroline{ask Arthur if there is any reference.}
\begin{thm}[Convergence in CME is identical to convergence in characteristic function]\label{prop: charfun_cme_equiv}
Let $k: \mathcal{X} \times \mathcal{X} \longrightarrow \mathbb{R}$ be a symmetric, positive definite, and translationally invariant characteristic kernel, then for a (conditional) probability measure on $\mathcal{X}$, denoted $\mathcal{P}_{X|z}$, we have that $\hat{\psi}^{(s)}_{\mathcal{P}_{X|z}} \longrightarrow \psi_{\mathcal{P}_{X|z}}$ in $\mathcal{L}^2(\mathbb{R}, q)$ if and only if $\hat{\mu}_{\mathcal{P}_{X|z}}^{(s)} \longrightarrow \mu_{\mathcal{P}_{X|z}}$ in $\mathcal{H}_X$. Moreover, whenever either converges, the other converges at the same rate.
\end{thm}
We provide the proof in Section~\ref*{app: proofs} of the Supplementary Materials.

This means learning the characteristic function $\psi_{\cP_{X|z}}$ in $\cL^2(\mathbb{R}, q)$ simultaneously gives us a precise estimate of the kernel mean embedding $\mu_{\cP_{X|z}}$ in $\cH_{X}$.



\subsection{Learning the Latent Characteristic Function}

We now show how to learn the latent characteristic function which will give us the latent kernel mean embedding.

\textbf{Notation.} To lighten notation, from now on we will use $\hat{f}$ to denote the empirical estimate of a quantity $f$, and only use $\hat{f}^{(s)}$ when we need to specify the sample size $s$.

\textbf{What if we are able to observe $X$?} When $X$ is observed, $\hat{\mu}^{(s)}_{\mathcal{P}_{X|z}}$ can be obtained directly and it can be shown that $\hat{\mu}^{(s)}_{\mathcal{P}_{X|z}} \longrightarrow \mu_{\mathcal{P}_{X|z}}$ as $s \longrightarrow \infty$. By Theorem~\ref{prop: charfun_cme_equiv}, the same samples and $\lambda$ which closely estimate the CME $\mu_{\cP_{X|z}}$ would also closely estimate the characteristic function $\psi_{\mathcal{P}_{X|z}}$, and vice versa.\footnote{This should be possible for $z$ from an unseen distribution $\cP_{\check{\mathcal{Z}}}$ provided the unseen distribution has the same support as the training distribution $\cP_Z$.} Thus, when $s$ is suitably large, we can accurately approximate the right hand side of Eq.~\eqref{eq: char_cme} as %$\sum_{j=1}^s \hat{\gamma}_j(z) e^{i\alpha x_j}$:
% \caroline{need to rewrite all occurances of $\gamma$ by $\gamma^{(s)}.$}
\begin{align}
    \exp\left(\int_{0}^{\alpha} i\frac{\E[Me^{i\nu N}|z]}{\E[e^{i\nu N}|z]}d \nu\right) &\approx \sum_{j=1}^s \hat{\gamma}_j(z) e^{i\alpha x_j}   \label{eq: finite_sample_approx}
\end{align}
where $\hat{\gamma}_j(z)$ is specified by Eq.~\eqref{eq: gamma}. Recall that this term also depends on $\hat{\lambda}$. To make this explicit we write $\hat{\gamma}^{\hat{\lambda}}_j(z)$.
%we keep in mind that $\hat{\gamma}$ depends on $\hat{\lambda}$ and $\{z_j\}_j$. 

\textbf{Solving for $X$.} Given Eq.~\eqref{eq: finite_sample_approx} we make the following observation: given samples of $\{z_j\}_j$, the estimate $\hat{\psi}_{\mathcal{P}_{X|z}}$ only depends on $\{x_j\}_j$ and $\hat{\lambda}$.

% \matt{MATT TODO: write out full equality from equation 12, then write below}
% \begin{align}
%     \psi_{\cP_{X|z}}(\alpha) &= \exp \left(\int_0^{\alpha} i \frac{\E[Me^{i\nu N}|z]}{\E[e^{i\nu N}|z]}d\nu\right).\label{eq: char_cme}
% \end{align}

Therefore, we can solve for $\{x_j\}_j,\hat{\lambda}$ by minimising the discrepancy between both sides of Eq.~\eqref{eq: finite_sample_approx} over $\{x_j\}_j,\hat{\lambda}$:

\begin{align}
    \{\hat{x}_j\}_j, \hat{\lambda}_X &=\argmin_{\{x_j\}_j, \hat{\lambda}}\E_{q(\alpha), \cP_{\check{Z}}}\left[\left(
    \sum_{j=1}^s \hat{\gamma}^{\hat{\lambda}}_j(\check{Z}) e^{i\alpha x_j}
    %\hat{\psi}_{\mathcal{P}_{X|\check{Z}}}(\alpha)
    - \eta\right)^2\right] \label{eq: original_loss}\\
    \text{with } \eta &= \exp\left(\int_{0}^{\alpha} i\frac{\E[Me^{i\nu N}|\check{Z}]}{\E[e^{i\nu N}|\check{Z}]} d\nu \right) \nonumber
\end{align}

The expectation is taken over $q(\alpha)$ and $\cP_{\check{Z}}$ because, had the $X$-samples been observed, the convergence of the characteristic function is in $\cL^2(\mathbb{R}, q)$ and the $\check{Z}$ distribution does not have to equal the one used to learn the CME, as long as the two have the same support. To estimate $\eta$ requires two components of approximation: a) finite-sample approximation of $\E[e^{i\nu N}|\check{Z}]$ and $\E[Me^{i\nu N}|\check{Z}]$, b) computation of the integral $\int_{0}^{\alpha} i\frac{\E[Me^{i\nu N}|\check{Z}]}{\E[e^{i\nu N}|\check{Z}]} d\nu$, given a). While it is possible to use numerical methods such as quadrature to approximate the integral, we propose to avoid the second component altogether by differentiation. 

\textbf{The Differentiation Trick.} We now describe a trick for handling intractable integrals when solving a system of equations. First, let us reproduce 
%in target function estimation of the form in Eq.~\eqref{eq: char_cme}. 
% First we revisit 
eq.~\eqref{eq: char_cme} below
\begin{align*}
    \overbrace{\E_{\cP_{X|z}}[e^{i\alpha X}]}^{\psi_{\cP_{X|z}}(\alpha) :=} &= \exp \left(\int_0^{\alpha} i \frac{\E[Me^{i\nu N}|z]}{\E[e^{i\nu N}|z]}d\nu\right). %. \hspace{0.7cm} (12) \nonumber
\end{align*}

% \matt{MATT TODO: write above in generic form}


We can take the natural logarithm and differentiate both sides of eq.~\eqref{eq: char_cme}, and substitute the samples of $\check{\mathcal{Z}}$:
\begin{align}
   \frac{\E[Xe^{i\alpha X}|\check{z}]}{\E[e^{i\alpha X}|\check{z}]} &= \frac{\E[Me^{i\alpha N}|\check{z}]}{\E[e^{i\alpha N}|\check{z}]} \label{eq: diff_char_cme}
\end{align}

Since differentiation is a many-to-1 operation, we need to verify that the solution to Eq.~\eqref{eq: diff_char_cme} is also the solution to Eq.~\eqref{eq: char_cme}.
\begin{lemma} \label{lemma}
Let $f, g: \mathbb{C}^n \rightarrow \mathbb{C}$ be differentiable functions, and denote $f'(x):= \frac{d}{dx}f(x)$. If $f'=g'$ and $f(a) = g(a) = b$ for some $a\in\mathbb{C}^n$, $b\in\mathbb{C}$, then $f = g$.
\end{lemma}
\begin{proof}
If $f' = g'$, then $f = g + C$ for some $C\in \mathbb{C}$. But $f(a) - g(a) = b-b=0$, so $C=0$. 
\end{proof}
\begin{thm}\label{thm: unique_solution}
The (conditional) distribution of $X$, denoted by $\mathcal{P}_{X|z}$, which satisfies Eq.~\eqref{eq: diff_char_cme} is unique, and therefore is the same as the solution to \eqref{eq: char_cme}.
\end{thm}
The proof relies on the fact that characteristic functions are always $1$ at $\alpha=0$ (Section~\ref*{app: proofs} of the Supplementary Materials).

\textbf{When should one use the differentiation trick?} When estimation for the target function/parameter requires evaluating an intractable integral, one can think of using the differentiation trick. Lemma~\ref{lemma} specifies one condition where this can be done. Note that there are more situations where the differentiation trick can be applied, such as when all functions in the target class have the same normalization constant. We summarize two situations where the differentiation trick can be applied:
\begin{itemize}
    \item When the target function class is itself normalized, or fixed at certain input values. Examples of this which may be of interest to machine learning practitioners are: a) probability densities, which always integrate to $1$, b) cumulative distribution functions, which always tend to $1$ at $+\infty$.
    \item When an invertible transformation of the function class is normalized or fixed at certain inputs. In those cases, one can in principle solve the problem in the normalized function class, and then apply the invertible transformation to go back to original class.
    % \item When the constant can be fixed: having a normalized class of functions essentially stops us from gaining a constant through differentiation. If we have \textit{a priori} knowledge on what the constant is, we can also go ahead and use the differentation trick. This point can be absorbed into the first bullet, but we state it here for clarity. 
\end{itemize}

\textbf{Towards a sample-based estimator.} As discussed, we may replace $\E[e^{i\alpha X}|\check{z}]$ and $\E[e^{i\alpha N}|\check{z}]$ with their finite-sample estimates $\hat{\psi}_{\mathcal{P}_{X|\check{z}}}$ and $\hat{\psi}_{\mathcal{P}_{N|\check{z}}}$. For $\E[Xe^{i\alpha X}|\check{z}]$ and $\E[Me^{i\alpha N}|\check{z}]$, we realise that $\E[Xe^{i\alpha X}|\check{z}] = -i\frac{\partial }{\partial \alpha} \E[e^{i\alpha X}|\check{z}]$, and $\E[Me^{i\alpha N}|\check{z}] = -i\frac{\partial }{\partial \upsilon}\bigg|_{\upsilon=0} \E[e^{i(\alpha N + \upsilon M)}|\check{z}]$. Thus, we replace them with $-i\frac{\partial }{\partial \alpha} \hat{\psi}_{\cP_{X|z}}(\alpha)$ and $-i\frac{\partial }{\partial \upsilon}\bigg|_{\upsilon=0} \hat{\psi}_{\cP_{M,N|z}}(\upsilon, \alpha)$ respectively; the common factor $-i$ cancels between the two sides of Eq.~\eqref{eq: diff_char_cme}.
The full expressions of $s-$sample estimates for $\hat{\psi}_{\cP_{X|z}}(\alpha)$, $\hat{\psi}_{\cP_{N|z}}(\alpha)$, $\hat{\psi}_{\cP_{M,N|z}}(\upsilon, \alpha)$ and the relevant derivatives are stated in Section~\ref*{app: sample_estimates} of the Supplementary Materials.

% We can now replace each of the four terms in Eq.~\eqref{eq: diff_char_cme} with the sample estimates. 

% Moreover, we show that the transformations preserve the quality of approximation: 
% \begin{thm}\label{theorem: diff_preserves_quality}
% If $X$ is an observed variable, then taking logarithm and differentiating both sides of Eq.~\eqref{eq: char_cme}, we obtain $\forall z$\caroline{for all z which was seen in training}:
% \begin{align}
% \frac{\sum_{j=1}^{n} x_{j} \gamma_{j}^{(n), X}(z) e^{i \alpha x_{j}}}{\sum_{j=1}^{n} \gamma_{j}^{(n), X}(z) e^{i \alpha x_{j}}} &-\underbrace{\frac{\sum_{j=1}^{n} m_{j} \gamma_{j}^{(n), MN}(z) e^{i \alpha n_{j}}}{\sum_{j=1}^{s} \gamma_{j}^{(n), N}(z) e^{i \alpha n_{j}}}}_{\mathcal{Y}} \longrightarrow = 0 
% \end{align}
% \caroline{in what norm?}
% \end{thm}
% \begin{proof}
% \caroline{insert proof.}
% \end{proof}

Therefore, we arrive at the new objective function:
\begin{align}
&\{\hat{x}_j\}_{j=1}^s, \hat{\lambda}_X = \nonumber \\
&\argmin_{\{x_j\}_{j=1}^s, \hat{\lambda}_X}\E_{q(\alpha), \cP_{\check{Z}}} \left[\left( w_X(\alpha, \check{Z})- w_{MN}(\alpha, \check{Z}) \right)^2\right]  \label{eq: obj}\\
% \end{align}
% with
% \begin{align}
    &\text{with} \hspace{0.5cm}w_X(\alpha, \check{Z}) = \frac{\sum_{j=1}^{s} x_{j} \hat{\gamma}_X(\check{Z})_j e^{i \alpha x_{j}}}{\sum_{j=1}^{s} \hat{\gamma}_X(\check{Z})_j e^{i \alpha x_{j}}} \label{eq: step_2_inputs}\\
    &w_{MN}(\alpha, \check{Z}) = \frac{\sum_{j=1}^{s} m_{j} \hat{\gamma}_{M,N}(\check{Z})_j e^{i \alpha n_{j}}}{\sum_{j=1}^{s} \hat{\gamma}_N(\check{Z})_j e^{i \alpha n_{j}}} \label{eq: step_2_labels}
\end{align}
Here, $w_{MN}$ is the sample estimate of the integrand in $\eta$ from Eq.~\eqref{eq: original_loss} (up to the factor $i$). We can interpret the output values of $w_{MN}$ as the labels for the supervised learning task defined by Eq.~\eqref{eq: obj}, the pairs $(\alpha, \check{z})$ as the inputs, and $\{x_j\}$ and $\hat{\lambda}_X$ as the parameters. As soon as we have obtained the optimal $\{\hat{x}_j\}_{j=1}^s$ and $\hat{\lambda}_X$, we can substitute into Eq.~\eqref{eq: empirical_cme} and Eq.~\eqref{eq: gamma} to obtain the CME estimate $\hat{\mu}_{\cP_{X|z}}$.

\subsection{Algorithm}


We propose \textit{MEKIV}: \textbf{M}easurement-\textbf{E}rror-corrected \textbf{K}ernel \textbf{I}nstrumental \textbf{V}ariable regression. Two independent samples are needed: $\{z_j, m_j, n_j\}_{j=1}^{s_1}$ and $\{\check{z}_j, \check{y}_j\}_{j=1}^{s_2}$.

In Step 1 of MEKIV, we use $\{z_j, m_j, n_j\}_{j=1}^{s_1}$ to compute the sample estimates of the conditional kernel mean embeddings of $\cP_{N|z}$ and $\cP_{M,N|z}$, which are guaranteed to converge to the ground truth as the sample size grows~\cite{singh2019kernel}. By Theorem~\ref{prop: charfun_cme_equiv}, this also gives us sample estimates of the characteristic functions, which converge in $\cL^2$ with respect to the measures induced by their respective kernels. 

Step 2 of the MEKIV learns the characteristic function of $\mathcal{P}_{X|Z}$ by optimising for the $X$ samples using the training objective in Eq.~\eqref{eq: obj}. Again by Theorem~\ref{prop: charfun_cme_equiv}, a good estimate of the characteristic function gives us a good estimate of the conditional kernel mean embedding. 

In Step 3, MEKIV uses the learnt kernel conditional mean embedding and the second sample $\{\check{z}_j, \check{y}_j\}_{j=1}^{s_2}$ to estimate the structural function $f$; this is equivalent to Stage 2 of KIV~\cite{singh2019kernel}. 

The pseudocode of our complete algorithm can be found in Algorithm~\ref*{alg: all}~and~\ref*{alg: step_2} in the Supplementary Materials.
% The pseudocode of our complete algorithm can be found in Algorithm~1~and~2.

\textbf{Step 1.} From the first sample $\{z_j, m_j, n_j\}_{j=1}^{s_1}$, learn the conditional mean embeddings of $p(n|z)$ and $p(m, n|z)$ using the result stated in Eq.~\eqref{eq: kivs1}, Section~\ref{subsec: kiv_method}:
% i.e. $\mu^{(s_1)}_{\mathcal{P}_{N|z}} (\cdot) \defeq C^{(s_1)}_{\mathcal{P}_{N|Z}}(\phi(z))(\cdot) $ where $C^{(s_1)}_{\cP_{N|Z}}$ and $C^{(s_1)}_{\cP_{M, N|Z}}$ denote the conditional mean embedding operators; these are obtained as the solution to:
% \begin{align}
% C^{(s_1)}_{\cP_{N|Z}} &= \argmin_{C\in \cH_{\Gamma}} E^{(s_1)}(C), \hspace{0.3cm} \text{with}\\
% E^{(s_1)}(C) &= \frac{1}{s_1} \sum_{j=1}^{s_1} \|\phi(n_j) - C\phi(z_j)\|_{\cH_{\mathcal{Z}}}^2 + \lambda_{N} \|C\|^2_{\cH_{\Gamma}}
% \end{align}
% where $\cH_{\Gamma}$ is the vector-valued RKHS of operators mapping $\cH_{\cZ}$ to $\cH_{\cN}$. It can be shown that $C^{(s_1)}_{\cP_{N|Z}} = \Phi(N)(K_Z + s_1 \lambda_N I)^{-1}\Phi^T(Z)$, where $\Phi(N)$ is a vector of $s_1$ columns with $\phi(n_j)$ being its \textit{j}th column. \cite{song2009hilbert, grunewalder2012conditional, singh2019kernel}. Therefore,
\begin{align}
    \hat{\mu}^{(s_1)}_{\cP_{N|z}} (\cdot) &= \sum_{j=1}^{s_1}(\hat{\gamma}^{(s_1)}_N(z))_j k(n_j, \cdot), \\  \text{with} \hspace{0.3cm} \hat{\gamma}^{(s_1)}_N(z)&=(K_{ZZ} + s_1 \hat{\lambda}_{N} I)^{-1}K_{Zz}
\end{align}
Similarly, it can be shown that:
\begin{align}
    \hat{\mu}^{(s_1)}_{\cP_{M,N|z}} (\cdot) &= \sum_{j=1}^{s_1}(\hat{\gamma}^{(s_1)}_{M,N}(z))_j k((m_j, n_j), \cdot), \label{eq: mn_cme} \\ \text{where} \hspace{0.3cm} \hat{\gamma}^{(s_1)}_{M,N}(z)&=(K_{ZZ} + s_1 \hat{\lambda}_{M,N} I)^{-1}K_{Zz}
\end{align}
\begin{remark}
\eqref{eq: mn_cme} allows the use of product kernels.
\end{remark}
% From $\hat{\mu}^{(s_1)}_{\cP_{M, N|z}}$, we may obtain the sample estimate for the characteristic function of $\cP_{M,N|z}$:
% \begin{align}
%     \hat{\psi}^{(s_1)}_{\cP_{M,N|z}}(\upsilon, \alpha) &= \sum_{j=1}^n (\gamma_{M,N}(z))_j e^{i(\upsilon m_j +\alpha n_j)}
% \end{align}
% And consequentially the $\upsilon-$derivative evaluated at $\upsilon=0$:
% \begin{align}
%     \frac{\partial}{\partial \upsilon} \psi^{(s_1)}_{\cP_{M,N|z}}(\upsilon, \alpha)\bigg|_{\upsilon=0} &= \sum_{j=1}^n m_j (\gamma_{M,N}(z))_j e^{i \alpha n_j}
% \end{align}

\textbf{Step 2.}
After obtaining from Step 1 the quantities $\hat{\gamma}_N$ and $\hat{\gamma}_{M,N}$, Step 2 creates samples $\{\alpha_j\}$, $\{\check{z}_j\}$ and $\{(w_{MN})_j\}$. To this end, Step 2 samples $\{\alpha_j\}_{j=1}^{s_2}$ from $q(\alpha)$, and uses $\{\check{z}_j\}_{j=1}^{s_2}$ unseen in Step 1. In general, $\{\check{z}_j\}_{j=1}^{s_2}$ can be drawn from any distribution $\cP_{\check{Z}}$ with the same support as $\cP_Z$. To maximize sample usage, we take all pairs in the cross product $\{\alpha_j\}_{j=1}^{s_2} \times \{\check{z}_j\}_{j=1}^{s_2}$, giving $(s_2)^2$ pairs: $\{\alpha_j, \check{z}_j\}_{j=1}^{(s_2)^2}$---here we overload the notation $\{\check{z}_j\}$ to denote the samples both before and after taking the cross product. We input each pair $(\alpha_j, \check{z}_j)$ into Eq.~\eqref{eq: step_2_labels} to generate the labels $\{(w_{MN})_j\}_{j=1}^{(s_2)^2}$. The process of sampling $\{\alpha_j\}$ from $q(\alpha)$ has a close connection with the Random Fourier Features literature~\cite{Bach15randomfourierfeatures, sriperumbudur15_optimal_rates_rff, rahimi_recht_rff_07}.

We now seek $\{x_j\}_{j=1}^{s_1}$ and $\hat{\lambda}_X$ in order to minimize the following objective, which is the empirical analogue of Eq.~\eqref{eq: obj}:
\begin{align}
\{\hat{x}_j\}_{j=1}^{s_1}, \hat{\lambda}_X &=\argmin_{\{x_j\}_{j=1}^{s_1}, \hat{\lambda}_X}\sum_{j=1}^{(s_2)^2} \left[\left( w_X(\alpha_j, \check{z}_j)- (w_{MN})_j\right)^2\right] \label{eq: emp_obj}
\end{align}
For clarity, Step 2 is illustrated in Algorithm~\ref*{alg: step_2} (see Supplementary Materials).

% \textbf{Random Fourier Features.}
% Since what we have is the convergence of the empirical characteristeric function under $q(\alpha)$, we sample $\{\alpha_q\}$ for the training dataset from $q (\alpha)$. This is related to the Random Fourier Features literature (\caroline{cite random fourier features literature}). 

% The interpretation of $\{x_j\}_{j=1}^{s_1}$ as the parameters of the function from $\alpha$ and $z$ to $w$ allows us to make use of modern differentiable machine learning techniques. We present the algorithm in Algorithm~\ref{step_2}. The optimized $\{\hat{x}_j\}_{j=1}^{s_1}$ and $\lambda^{(s_1)}_X$ allow us to construct an estimator for $\mu_{\cP_{X|z}}(\cdot)$.

\textbf{Step 3.}
Given the estimates $\{\hat{x}_j\}_{j=1}^{s_1}$ and $\hat{\lambda}_X$, we obtain the empirical estimate $\hat{\mu}_{\cP_{X|z}}$. Along with the samples $\{\check{z}_j, \check{y}_j\}_{j=1}^{s_2}$, we obtain the solution $\hat{f}^{(s_2)}$. The procedure is identical to the Stage 2 estimation of KIV~\cite{singh2019kernel}, for which we stated the derived estimator in Section~\ref{subsec: kiv_method}. Our solution for $f$ is:
% Provided that the kernels $k_X$ and $k_Z$ are measurable, continuous and bounded, and $k_X$ is characteristic, 
% \begin{align}
%     \E_{\cP_{X|z}}[f(X)] = \langle f, \mu_{\cP_{X|z}} \rangle_{\cH_X}
% \end{align}
% where $f \in \cH_{X}$, a scalar-valued RKHS. Given an estimate for $\hat{\mu}_{\cP_{X|z}}$, the procedure for obtaining the solution $f$ is identical to the second stage of KIV \cite{kiv}. We present the solution below.
\begin{align}
    \hat{f}^{(s_2)}(x) &= (\hat{\beta})'K_{\hat{X}x}\\
    \text{with } \hat{\beta} &= (VV' + s_2{\hat{\xi}} K_{\hat{X}\hat{X}})^{-1}V\check{y}\\
    V &= K_{\hat{X}\hat{X}}(K_{ZZ} + s_1 \hat{\lambda}_X I)^{-1}K_{Z\check{Z}}
\end{align}

\subsection{Advantages of MEKIV}
We highlight the benefits of MEKIV:
\begin{itemize}
    \item MEKIV is \textbf{free of distributional assumptions}: as long as the measurement error satisfies the mean-independence conditions in Eq.~\eqref{eq:M_XN}-\eqref{eq:E_N}, the distributions can have any shape.
    \item \textbf{Computational efficiency}: MEKIV models only the CME of $\cP_{X|Z}$; in particular, it does not model the full joint distribution $\cP_{X|Z,M,N}$, as is commonly done in standard latent variable modelling.
    \item \textbf{Ease of implementation}: Unlike standard latent variable modelling, which is typically hard to train due to the large number of hyperparameters, MEKIV is easy to implement and works stably without large efforts in tuning.
\end{itemize}







% Now, we know that given a probability distribution $p(x)$, the mean embedding of the distribution $p(x)$ satisfies the following property:
% \begin{align}
%     \mu_X(y) &= \langle \mu_X(\cdot), k(y, \cdot)\rangle_{\mathcal{H}}\\
%     &= \E_{X}[k(y, X)]\\
%     &= \int k(y-x) p(x) dx
% \end{align}
% which is to say that the mean embedding is the convolution of the kernel with the probability measure. By convolution theorem, the Fourier transform of a convolution is the product of Fourier transforms:
% \begin{align}
%     \hat{\mu}_{X}(\alpha) = \psi_{p}(\alpha) \hat{k}(\alpha)
% \end{align}

% Since we require $k(t)$ to be positive definite, $\hat{k}(\alpha)$ must also be positive definite i.e. $\hat{k}(\alpha) > 0$. \caroline{cite something for this.}

% Note that this tells us that we can replace the left hand side of \eqref{eq: char_cme} by $\frac{\hat{\mu}_{X|z}(\alpha)}{\hat{k}(\alpha)}$:
% \begin{align}
%     \frac{\hat{\mu}_{X|z}(\alpha)}{\hat{k}(\alpha)} &= \exp \left(\int_{0}^{\alpha} i \frac{\E[Me^{i\nu N}|z]}{\E[e^{i\nu N}|z]} d\nu\right)
% \end{align}
