\setcounter{assumption}{7}
\setcounter{equation}{28}
\setcounter{figure}{4}
\section{Pseudocode}
See Figure~\ref{fig:algorithm} for the pseudocode of our method.
\begin{figure*}[t]
\centering
\begin{minipage}[t]{.41\textwidth}
\begin{algorithm}[H]
\caption{MEKIV training}
\label{alg: all}
\textbf{Input: $M_1$, $M_2$, $N_1$, $N_2$, $Z_1$, $Z_2$, $Y_1$, $Y_2$} , kernelType:=RBF kernel\\
\\
\textbf{Step 1} \textbf{Input}: $M_1$, $N_1$, $Z_1$, kernelType 
\begin{algorithmic}[1]
\State $\hat{\gamma}_N \leftarrow$ KRR($N_1, Z_1$, kernelType) \cite[Stage 1 estimate]{kiv_supp}
\State Stack $M_1$ and $N_1$ to get $M_1N_1$.
\State $\hat{\gamma}_{MN} \leftarrow$ KRR($M_1N_1, Z_1$, kernelType)
\State\Return $\hat{\gamma}_{MN}, \hat{\gamma}_N$
\StateX
\end{algorithmic}

\textbf{Step 2} \textbf{Input}: kernelType,  $\gamma_N^{(s_1)}$, $\gamma_{MN}^{(s_1)}$, $M_1$, $N_1$, $Z_2$, number of $\alpha$ samples $\defeq C$,
\begin{algorithmic}[1]
\State Take $q(\alpha)$ as the Inverse Fourier Transform of the kernel rescaled by $\frac{1}{2\pi}$.
\State Take $\{\check{z}_j\}_{j=1}^{s_2}$ to be the set of data points in $Z_2$.
\State $\{\hat{x}_j\}_{j=1}^{s_1}, \hat{\lambda}_X \leftarrow$ OptimiseX1($q(\alpha)$,  $\gamma_N^{(s_1)}$, $\gamma_{MN}^{(s_1)}$, $M_1$, $N_1$, $\{\check{z}_j\}_{j=1}^{s_2}$, $C$)
\State\Return $\{\hat{x}_j\}_{j=1}^{s_1}, \hat{\lambda}_X$
\StateX
\end{algorithmic}

\textbf{Step 3} \textbf{Input}: $\{\hat{x}_j\}_{j=1}^{s_1}, \hat{\lambda}_X$, $Y_2$, $Z_2$, $Z_1$
\begin{algorithmic}[1]
\State $\xi \leftarrow $ KIVStage2Validation \cite[A.5.2]{kiv_supp}
% \State $\hat{f}^{(s_2)}_{\xi} \leftarrow$ KIVStage2($\{\hat{x}_j\}_{j=1}^{s_1}, \hat{\lambda}_X$, $Y_2$, $Z_2$, $Z_1$, $\xi$)
\State $\hat{f} \leftarrow$ KIVStage2($\{\hat{x}_j\}_{j=1}^{s_1}, \hat{\lambda}_X$, $Y_2$, $Z_2$, $Z_1$, $\xi$)
\State \Return $\hat{f}$
\end{algorithmic}
\end{algorithm}
% \end{subfigure}
\end{minipage}
\hfill
\begin{minipage}[t]{0.57\textwidth}
\begin{algorithm}[H]
\caption{Step 2: Learning the CME for $\mathcal{P}_{X|Z}$}
\label{alg: step_2}
\textbf{Input}: $q(\alpha)$,  $\gamma_N^{(s_1)}$, $\gamma_{MN}^{(s_1)}$, $M_1$, $N_1$, number of $\alpha$ samples $\defeq s_2$, $\{\check{z}_j\}_{j=1}^{s_2}$, 
\begin{algorithmic}[1]
\Function{OptimiseX1}{}
\State $\{\alpha_j, \check{z}_j, (w_{MN})_j\}_{j=1}^{(s_2)^2}$ = CreateTrainData($q(\alpha)$, $\hat{\gamma}_N$, $\hat{\gamma}_{MN}$, $M_1$, $N_1$ )
\State initialize $\hat{X} = (M_1 + N_1)/2$
\State initialize $\hat{\lambda}_X = \hat{\lambda}_N$
% \StateX \textbf{while} not converged \textbf{do}
\While{not converged}
\State Use Eq.~\eqref{eq: step_2_inputs} to calculate $\{(w_X)_j\}_{j=1}^{(s_2)^2}$ from $\{\alpha_j, \check{z}_j\}_{j=1}^{(s_2)^2}$. 
\State Compute loss = MSE($\{(w_X)_j, (w_{MN})_j\}_{j=1}^{(s_2)^2}$) \Comment{Eq.\eqref{eq: emp_obj}}
\State Compute $\nabla_{\hat{X}}(loss)$, $\nabla_{\hat{\lambda}_X}(loss)$
\State $\hat{X} \leftarrow \hat{X} - \text{step}\times \nabla_{\hat{X}}(loss) $; $\hat{\lambda}_X \leftarrow \hat{\lambda}_X - \text{step} \times \nabla_{\hat{\lambda}_X}(loss) $
% \State $\hat{\lambda}_X \leftarrow \hat{\lambda}_X - \text{step\_size} \nabla_{\hat{\lambda}_X}(loss) $
\EndWhile \\
\Return $\hat{X}$, $\hat{\lambda}_X$
\EndFunction
% \StateX \textbf{end while}
\StateX
\Function{CreateTrainData}{}
\State Sample $\{\alpha_j\}_{j=1}^C$ from $q(\alpha)$
\State Take all pairs in $\{\alpha_j\}_{j=1}^C\times \{\check{z}_j\}_{j=1}^{s_2}$ to get $\{(\alpha_j, \check{z}_j)\}_{j=1}^{C\times s_2}$
\State Substitute $\{(\alpha_j, \check{z}_j)\}_{j=1}^{C\times s_2}$, along with $M_1$, $N_1$, $\hat{\gamma}_N$, $\hat{\gamma}_{MN}$ into Eq.~\eqref{eq: step_2_labels} to calculate the labels $\{w_j\}_{j=1}^{C\times s_2}$.\\
\Return $\{\alpha_j, \check{z}_j, w_j\}_{j=1}^{C\times s_2}$
\EndFunction
\end{algorithmic}
\end{algorithm}
\end{minipage}
% \caption{Step 2 for training MEKIV - finding the $X1$ and $\lambda_{X}^{(s_1)}$}
\caption{Our proposed algorithm. Algorithm 1 outlines the end-to-end algorithm from training data to the structural estimator $\hat{f}$. Algorithm 2 outlines our main contribution, step 2 of the algorithm where we learn the CME for the latent variable $X$.}
\label{fig:algorithm}
\end{figure*}
\section{Relaxing classical measurement error assumptions} \label{app: relax_merror_assumptions}
In~\cite{HU2015392_supp}, the authors assume the noise on the second measurement $N$ is an unknown monotonic polynomial function of $X$ with additive noise. The estimation procedure amounts to first identifying the polynomial function of $X$, before applying a technique similar to~\cite{schennach04_supp}. In this work, we take the first step to extend the estimator of~\cite{schennach04_supp} to the confounded setting, and leave for future work the relaxation of the assumptions on the second measurement.

\section{Further assumptions on kernel identification} \label{app: kernel_assumptions}

We also employ the following technical assumptions to enable causal effect estimation in the latent treatment setting.

\begin{assumption} \label{assump: polish_space}
$\mathcal{Z, X, M, N, Y}$ are measurable, separable Polish spaces.
\end{assumption}
Assumption~\ref{assump: polish_space} is a regularity condition that allows us to define the conditional mean embedding operator.
\begin{assumption}
$Y$ is bounded.
\end{assumption}
\begin{assumption} \label{assump: kernel_reg}
(i) $k(x, \cdot), k(m, \cdot), k(n, \cdot)$ are continuous, bounded by $\kappa >0$, and their feature maps are measurable. (ii) $k(x, \cdot), k(m, \cdot), k(n, \cdot)$ are characteristic kernels.
\end{assumption}
Assumption~\ref{assump: kernel_reg} is a standard assumption employed in kernel causal learning (\cite{kiv_supp, proximal2021_supp}).

\section{$s$-sample estimates}\label{app: sample_estimates}
For clarity, we state the $s$-sample estimates for $\hat{\psi}_{\cP_{X|z}}(\alpha)$, $\hat{\psi}_{\cP_{N|z}}(\alpha)$, $\hat{\psi}_{\cP_{M,N|z}}(\upsilon, \alpha)$, which are obtained from Kernel Ridge Regression, and the relevant derivatives below:
\begin{align}
    \hat{\psi}_{\cP_{X|z}}(\alpha) &= \sum_{j=1}^s \hat{\gamma}_X(z)_{j} e^{i \alpha x_{j}}\\
    \hat{\psi}_{\cP_{N|z}}(\alpha) &= \sum_{j=1}^s \hat{\gamma}_N(z)_{j} e^{i \alpha n_{j}}\\
    \hat{\psi}_{\cP_{M,N|z}}(\upsilon, \alpha) &= \sum_{j=1}^s \hat{\gamma}_{M,N}(z)_{j} e^{i (\upsilon m_j + \alpha n_{j})}
\end{align}
With:
\begin{align}
            \hat{\gamma}_X(z) &= (K_{ZZ} + s\hat{\lambda}_XI)^{-1}K_{Zz}\\
            \hat{\gamma}_N(z) &= (K_{ZZ} + s\hat{\lambda}_NI)^{-1}K_{Zz}\\
            \hat{\gamma}_{M,N}(z) &= (K_{ZZ} + s\hat{\lambda}_{M,N}I)^{-1}K_{Zz}
\end{align}
And the derivatives:
\begin{align}
            \frac{\partial }{\partial \alpha} \hat{\psi}_{\cP_{X|z}}(\alpha) &= \sum_{j=1}^s ix_j\hat{\gamma}_X(z)_{j} e^{i \alpha x_{j}}\\
            \frac{\partial }{\partial \upsilon}\bigg|_{\upsilon=0} \hat{\psi}_{\cP_{M,N|z}}(\upsilon, \alpha) &= \sum_{j=1}^s im_j\hat{\gamma}_{M,N}(z)_{j} e^{i \alpha n_{j}}
\end{align}
\section{Demand design - further results}
\begin{figure*}[t]
\centering
\begin{minipage}[t]{.28\textwidth}
\centering
\includegraphics[scale=0.38]{figures/demand_rho=0.25_Gaussian.pdf}
\caption*{(a) $\rho=0.25$}
\end{minipage}
\begin{minipage}[t]{.28\textwidth}
\centering
\includegraphics[scale=0.38]{figures/demand_rho=0.5_Gaussian.pdf}
\caption*{(b) $\rho=0.5$}
\end{minipage}
\begin{minipage}[t]{.28\textwidth}
\centering
\includegraphics[scale=0.38]{figures/demand_rho=0.9_Gaussian.pdf}
\caption*{(c) $\rho=0.9$}
\end{minipage}
\begin{minipage}[t]{.14\textwidth}
\centering
\includegraphics[scale=0.7]{figures/legend_tall.pdf}
\end{minipage}
% \caption{Step 2 for training MerrorKIV - finding the $X1$ and $\lambda_{X}^{(s_1)}$}
\caption{Demand design - Gaussian Measurement Error.}
\label{fig:demand_gaussian}
\end{figure*}
See Figure~\ref{fig:demand_gaussian} for further results on Demand design with Gaussian measurement error.

\section{Proofs} \label{app: proofs}
\textbf{Proof of Theorem~\ref{prop: charfun_cme_equiv}.}
\begin{proof}
First we note that by Fubini's theorem the Fourier Transform of the (ground truth) conditional mean embedding $\mu_{\cP_{X|z}}$ can be computed as:
\begin{align}
    \tilde{\mu}_{\cP_{X|z}}(\alpha) &= q(\alpha) \psi_{\cP_{X|z}}(-\alpha)
\end{align}
% This is seen by noticing that 
% \begin{align}
    
% \end{align}
\begin{align}
    &\|\hat{\mu}_{\mathcal{P}_{X|z}}^{(s)} - \mu_{\mathcal{P}_{X|z}}\|^2_{\cH_{\cX}}\\ &= \int_{-\infty}^{\infty} \frac{\left| \tilde{\hat{\mu}}_{\mathcal{P}_{X|z}}^{(s)}(\alpha) - \tilde{\mu}_{\mathcal{P}_{X|z}}(\alpha)\right|^2}{q(\alpha)}d\alpha\\
    &= \int_{-\infty}^{\infty} q(\alpha)\left| \hat{\psi}_{\mathcal{P}_{X|z}}^{(s)}(-\alpha) - \psi_{\mathcal{P}_{X|z}}(-\alpha)\right|^2d\alpha
\end{align}
Since $k$ is a symmetric kernel, i.e.\ an even function, $q(\alpha) = \frac{1}{2\pi} \tilde{k}(\alpha)$ is a real and even measure, so
\begin{align}
    &\int_{-\infty}^{\infty} q(\alpha)\left| \hat{\psi}_{\mathcal{P}_{X|z}}^{(s)}(-\alpha) - \psi_{\mathcal{P}_{X|z}}(-\alpha)\right|^2d\alpha\\
    &= \int_{-\infty}^{\infty} q(\alpha)\left| \hat{\psi}_{\mathcal{P}_{X|z}}^{(s)}(\alpha) - \psi_{\mathcal{P}_{X|z}}(\alpha)\right|^2d\alpha\\
    &= \|\hat{\psi}_{\mathcal{P}_{X|z}}^{(s)}(\alpha) - \psi_{\mathcal{P}_{X|z}}(\alpha)\|^2_{\mathcal{L}^2(\mathbb{R}, q)}
\end{align}
Consequently, whenever $\|\hat{\mu}_{\mathcal{P}_{X|z}}^{(s)} - \mu_{\mathcal{P}_{X|z}}\|_{\cH_{\cX}} < \epsilon$, we have $\|\hat{\psi}_{\mathcal{P}_{X|z}}^{(s)} - \psi_{\mathcal{P}_{X|z}}\|_{\mathcal{L}^2(\mathbb{R}, q)} < \epsilon$, and vice versa. Therefore, $\hat{\psi}^{(s)}_{\mathcal{P}_{X|z}} \longrightarrow \psi_{\mathcal{P}_{X|z}}$ in $\mathcal{L}^2(\mathbb{R}, q)$ if and only if $\hat{\mu}_{\mathcal{P}_{X|z}}^{(s)} \longrightarrow \mu_{\mathcal{P}_{X|z}}$ in $\mathcal{H_X}$. Moreover, if they converge, the convergence happens at the same rate.
\end{proof}

\textbf{Proof of Theorem~\ref{thm: unique_solution}.}
\begin{proof}
Since there is a bijection between characteristic functions and probability distributions, we only have to show that the characteristic function satisfying Eq.~\eqref{eq: diff_char_cme} is unique. 

Now Eq.~\eqref{eq: diff_char_cme} can be rewritten as
\begin{align}
    \frac{d\psi_{\cP_{X|\check{z}}}(\alpha)/d\alpha}{\psi_{\cP_{X|\check{z}}}(\alpha)} &= i g(\alpha) \\
    \frac{d}{d\alpha} \log(\psi_{\cP_{X|\check{z}}}(\alpha)) &= ig(\alpha) \\
    \text{with } g(\alpha) &= \frac{\E[Me^{i\alpha N}|\check{z}]}{\E[e^{i\alpha N}|\check{z}]} \label{eq: thm_uniq_char_1}
\end{align}
Now suppose there is another characteristic function $\psi(\alpha)$ which also satisfies Eq.~\eqref{eq: diff_char_cme} for all $\alpha \in \R$, i.e.
\begin{align}
            \frac{d}{d\alpha} \log(\psi(\alpha)) &= ig(\alpha)  \label{eq: thm_uniq_char_2}
\end{align}
Let $f(\alpha) = \log(\psi_{\cP_{X|\check{z}}}(\alpha))$ and $h(\alpha) = \log (\psi(\alpha))$, so that $f'(\alpha) = h'(\alpha) = ig(\alpha)$ for all $\alpha \in \R$.
Since characteristic functions are always $1$ at $\alpha=0$, $f(0) = h(0) = \log(1) = 0$. So by Lemma~\ref{lemma}, $f = h$. Since $\log$ is an invertible function whose inverse is $\exp$, we must have $\psi_{\cP_{X|\check{z}}} = \exp(f) = \exp(h) = \psi$, i.e.\ the solution to Eq.~\eqref{eq: diff_char_cme} is unique.
% But $\log(\psi_{\cP_{X|z}}(\alpha)) = \int^{\alpha}_0 i g(\nu) d\nu$, so $\log(\psi(\alpha)) - \log(\psi_{\cP_{X|z}}(\alpha)) = C$ i.e.
% \begin{align}
%     \log \left( \frac{\psi(\alpha)}{\psi_{\cP_{X|z}}(\alpha)}\right) &= C \\
%     \frac{\psi(\alpha)}{\psi_{\cP_{X|z}}(\alpha)} &= e^{C}
% \end{align}
% But characteristic functions are always $1$ at $\alpha=0$, thus:
% \begin{align}
%         \frac{\psi(0)}{\psi_{\cP_{X|z}}(0)} &= \frac{1}{1} = e^{C} \label{eq: char_normalised} \\
%         C &= 0 \label{eq: C_is_0}
% \end{align}
% Therefore $\frac{\psi(\alpha)}{\psi_{\cP_{X|z}}(\alpha)} = e^{0} = 1$, which is to say that $\psi(\alpha) = \psi_{\cP_{X|z}}(\alpha)$ $\forall \alpha \in \R$.
\end{proof}

\section{Real-world experiment: Income on children's outcome}
\begin{table}[ht]
    \centering
    \begin{tabular}{|c|c|}
    \hline
        Method  &  MSE \\
       \hline \hline
       KIV-Oracle  & 0.0345 $\pm$ 0.0190\\
       \hline
       MEKIV  & 0.0295 $\pm$ 0.0144\\
       \hline
       KIV-M  & 0.0318 $\pm$ 0.0199\\
       \hline
       KIV-MN  & 0.0310 $\pm$ 0.0142\\
       \hline
    \end{tabular}
    \caption{MSE, income-on-children's-outcome data}
    \label{tab:dahl_lochner}
\end{table}

As described in Section 6 of the main paper, we apply our algorithm to the dataset described in~\cite{dahl_lochner_2012_supp}. In order to obtain causal ground truth, we fit a simulation model to the observed data, obtaining the structural equation $f$. We then generate data from the fitted simulation model, for which we now have access to causal ground truth. We then run MEKIV along with the baselines on the generated dataset. Table~\ref{tab:dahl_lochner} presents the results. We observe that the performance across all methods does not differ much, and in particular the perturbations around the average MSE overlap. This prompts us to look into the performance of the learnt estimators, and we plot the estimated $\E[Y|do(A)]$ in Figure~\ref{fig:dahl_lochner}. In Figure~\ref{fig:dahl_lochner}, we observe that in fact none of the methods work well, including KIV-Oracle. This suggests that the instrumental variable is only weakly associated with the input. A simple analysis on the dataset suggests exactly this: the average increase in average yearly income from 1985 to 2000 is around \$2000, whereas the largest increase between the EITC credit rate of two consecutive years is about 10\%, which corresponds to only a 10\% portion of the increase in income.



\begin{figure}[t]
\centering
\includegraphics[scale=0.6]{figures/dahl_lochner.pdf}
% \caption{Step 2 for training MerrorKIV - finding the $X1$ and $\lambda_{X}^{(s_1)}$}
\caption{Dahl-Lochner Income on Cognitive outcome}
\label{fig:dahl_lochner}
\end{figure}