\newpage
\section{Some minor thoughts on Covariance Operators.}
Suppose we have two RKHSs $\calH, \calG$ with kernels $k:\calX \times \calX \to \R$ and $\ell: \calY \times \calY \to \R$, respectively. Suppose that $X, Y$ are random variables on $\calX, \calY$ with distribution functions $\mathbb{P}_X, \mathbb{P}_Y$ and joint distribution $\mathbb{P}_{XY}$. Define the tensor product space $\calG \otimes \calH$.

Define a function $f_\x \in \calG$, $f_\x : \calY \to \R$, and a conditional distribution $\pi_{Y|X=\x}$. \HC{I am not able to define everything rigorously in measure theory... So I tried not to use the term measure.}
The problem is to estimate the conditional expectation from samples $\{\x_i, \y_i\}_{i=1}^n$, especially when $\x$ is not observed.
\begin{align}
    \Pi[f_\x] = \E_{Y|X=\x}[f_\x(Y)|X=\x]
\end{align}

The conditional expectation can also be viewed as a linear functional $\calG \to \R$, so it can be written as the inner product of the function $f_\x$ and the conditional kernel mean embedding $\calU_{Y|X=\x}$.
\begin{align}
    \Pi[f_\x] = \langle f_\x, \calU_{Y|X=\x} \rangle_{\calG}
\end{align}

Based on the property that $\calU_{Y|X=\x} = \calU_{Y|X}k(\x, \cdot)$, we have the following
\begin{align}
\begin{split}
    \langle f_\x, \calU_{Y|X=\x} \rangle_{\calG}
    &= \langle f_\x, \calU_{Y|X} k(\x, \cdot) \rangle_{\calG} \\
    &= \langle f_\x \otimes k(\x, \cdot), \calU_{Y|X} \rangle_{\calG \otimes \calH}
\end{split}
\end{align}
where $\calU_{Y|X} = \calC_{Y X} \calC_{X X}^{-1}$. $\calC_{YX}$ and $\calC_{XX}$ are covariance operators defined as below:
\begin{align}
    \calC_{YX} &= \E_{Y X}[\ell(Y, \cdot) \otimes k(X, \cdot)] \\
    \calC_{XX} &= \E_{X}[k(X, \cdot) \otimes k(X, \cdot)]
\end{align}

The estimation error of $\hat{\Pi}[f_\x]$ can be bounded as:
\begin{align}
\begin{split}
    |\Pi[f_\x] - \hat{\Pi}[f_\x]| &=
    |\langle f_\x, \calU_{Y|X=\x} - \hat{\calU}_{Y|X=\x} \rangle_{\calG}| \\
    &= |\langle f_\x, (\calU_{Y|X} - \hat{\calU}_{Y|X}) k(\x, \cdot) \rangle_{\calG}| \\
    &= |\langle f_\x \otimes k(\x, \cdot), \calU_{Y|X} - \hat{\calU}_{Y|X} \rangle_{\calG \otimes \calH}| \\
    & \leq || f_\x \otimes k(\x, \cdot)||_{\calG \otimes \calH} ||\calU_{Y|X} - \hat{\calU}_{Y|X}||_{\calG \otimes \calH} \\
    &= \sqrt{k(\x,\x)} \, ||f_\x||_\calG ||\calU_{Y|X} - \hat{\calU}_{Y|X}||_{\calG \otimes \calH}
\end{split}
\end{align}

For fixed $\x$, the error is bounded, up to a constant factor, by the norm $||\calU_{Y|X} - \hat{\calU}_{Y|X}||_{\calG \otimes \calH}$. So, if we choose our kernels $k, \ell$ carefully, we should have an analytic form for $\calU_{Y|X} = \calC_{Y X} \calC_{X X}^{-1}$, and the main goal becomes choosing empirical estimates $\hat{\calC}_{Y X}, \hat{\calC}_{X X}$ that minimize this norm.

The standard Monte Carlo estimate for $\calC_{YX}$ is
\begin{align*}
    \hat{\calC}_{YX}^{MC} = \frac{1}{n} \sum_{i=1}^n \ell(\y_i, \cdot) \otimes k (\x_i, \cdot) 
\end{align*}
For Bayesian Monte Carlo, we give a weight term $w_i$ for every sample pair $(\x_i, \y_i)$. And the Bayesian Monte Carlo estimate becomes
\begin{align*}
    \hat{\calC}_{YX}^{BMC} = \frac{1}{n} \sum_{i=1}^n w_i \ell(\y_i, \cdot) \otimes k (\x_i, \cdot) 
\end{align*}
Note that although the tensor products seem terrifying, they all vanish after taking the inner product.
\begin{align*}
\begin{split}
\langle \hat{\calC}_{YX}^{BMC}, \hat{\calC}_{YX}^{BMC} \rangle_{\calG \otimes \calH} &= \langle \frac{1}{n} \sum_{i=1}^n w_i \ell(\y_i, \cdot) \otimes k (\x_i, \cdot), \frac{1}{n} \sum_{j=1}^n w_j \ell(\y_j, \cdot) \otimes k (\x_j, \cdot) \rangle_{\calG \otimes \calH} \\
&= \frac{1}{n^2}\sum_{i=1}^n \sum_{j=1}^n w_i w_j \ell(\y_i, \y_j) k(\x_i, \x_j) \\
&= \frac{1}{n^2} \bW^\top (\bK \odot \bL) \bW
\end{split}
\end{align*}
where $\bK, \bL$ are the $n \times n$ kernel matrices with entries $k(\x_i, \x_j)$ and $\ell(\y_i, \y_j)$, $\odot$ denotes the elementwise (Hadamard) product, and $\bW$ is the $n \times 1$ vector of weights $w_i$.

I have no idea how to proceed with $\calC_{XX}^{-1}$, so this note pauses here.

\section{Old CKME regression}
Based on the representer theorem, the solution to \eqref{eq:loss_reg_1} is unique and has the form

\begin{align}\label{eq:linear-comb}
    F = \sum_{i=1}^n K_{\Gamma x_i} (f_i)
\end{align}

if we recall that $K_{\Gamma x_i}:\calH_\calY \to \calH_\Gamma$ is the feature map and $k_\Gamma(x, x') \in \calL(\calH_\calY)$ is a bounded linear operator. The coefficients in \eqref{eq:linear-comb} $f_i \in \calH_\calY$ satisfy the following linear equations.

\begin{align}
    \sum_{j=1}^n (k_\Gamma(x_i, x_j) + \lambda \delta_{ij}) (f_j) = k_{\calY}(y_i, \cdot), \quad \forall i \in \{1,2, \cdots, n\}
\end{align}

\subsection{$\calH_\Gamma = \calH_\calY \otimes \calH_\calX$.}

Here we discuss the case that $\calH_\calX$ is a RKHS with reproducing kernel $k_\calX: \calX \times \calX \to \R$ and the reproducing kernel $k_\Gamma(x,x') = k_\calX(x,x')Id_{\calH_\calY}$. Now the reproducing property \eqref{eq:reproducing_1} becomes

\begin{align}
\begin{split}
    \PSi{K_{\Gamma x}(g_y), K_{\Gamma x'} (g_{y'})}{\calH_\Gamma} &= \PSi{g_y, K_{\Gamma x'} (g_{y'})(x)}{\calH_\calY} \\
    &= \PSi{g_y, k_\Gamma(x, x')(g_{y'})}{\calH_\calY} \\
    &= \PSi{k_\calX(x, \cdot), k_\calX(x', \cdot)}{\calH_\calX} \PSi{g_y, g_{y'}}{\calH_\calY}
\end{split}
\end{align}

This form reminds us of the definition of tensor product Hilbert space $\calH_\calX \otimes \calH_\calY$, which can be further written as:

\begin{align}
    \PSi{K_{\Gamma x}(g_y), K_{\Gamma x'} (g_{y'})}{\calH_\Gamma} = \PSi{k_\calX(x, \cdot) \otimes g_y, k_\calX(x', \cdot) \otimes g_{y'}}{\calH_\calX \otimes \calH_\calY}
\end{align}

The tensor product form reveals a close relation with covariance operators and we will discuss that in the next subsection.

Now the solution to the regression problem as defined in \eqref{eq:loss_reg_1} has a much simpler form.

\begin{align}\label{eq:linear_comb_2}
\begin{split}
    F(\cdot) = \sum_{i=1}^n K_{\Gamma x_i} (f_i)(\cdot)
    = \sum_{i=1}^n k_\calX(x_i, \cdot) f_i
\end{split}
\end{align}
where the coefficients $f_i \in \calH_\calY$ satisfy the linear equations

\begin{align}
    \sum_{j=1}^n (k_\calX(x_i, x_j) + \lambda \delta_{ij}) f_j = k_{\calY}(y_i, \cdot), \quad \forall i \in \{1,2, \cdots, n\}
\end{align}

The coefficients $\bff = [f_1, f_2, \cdots, f_n]^\top$ have the closed-form expression $\bff = (k_\calX(\bX, \bX) + \lambda \calI_N)^{-1} k_\calY(\by)$ and the solution \eqref{eq:linear_comb_2} becomes

\begin{align}
    F(\cdot) = k_\calX(\bX, \cdot)^\top (k_\calX(\bX, \bX) + \lambda \calI_N)^{-1} k_{\calY_\by}
\end{align}
where $k_{\calY_\by}$ is a vector of length $n$ consisting of the $n$ functions $k_\calY(y_i, \cdot)$, $i \in \{1,2, \cdots, n\}$.

A little sanity check here: $F$ takes a value $x^* \in \calX$ and then $k_\calX(\bX, x^*)^\top (k_\calX(\bX, \bX) + \lambda \calI_N)^{-1}$ is a $1 \times n$ dimension vector, so $F(x^*)$ is a linear combination of $k_\calY(y_i, \cdot)$ and so $F(x^*) \in \calH_\calY$, which satisfies the requirement that $F: \calX \to \calH_\calY$.

Actually, if we consider $F$ as an element in the tensor product space $\calH_\calX \otimes \calH_\calY$, then $F$ is a function that takes two values $x^* \in \calX, y^* \in \calY$ and maps them to $\R$, i.e., $F: \calX \times \calY \to \R$.

\subsection{The Second Perspective: Minimizing the Estimate Error.}
We follow the exact same setting as the first perspective. Suppose that $F \in \calH_\Gamma$ and we hope to minimize the empirical estimate error

\begin{align*}
    \argmin_{F \in \calH_\Gamma} \widehat{\calE(F)}, \quad \widehat{\calE(F)} = \sum_{p=1}^{n} \norm{\calU_{Y|X=\x_p} - F(\x_p)}{\calH_\calY}^2
\end{align*}


The whole sample is $\bX = \{\x_i\}_{i=1}^{n}$, $\bY = \{\y_i\}_{i=1}^{m}$, and for a given $\x_i$ we have $\y_1^{i}, \cdots, \y_{m_i}^{i} \sim \pi_{Y|X=\x_i}$, collected as $\bY^i = [\y_1^{i}, \cdots, \y_{m_i}^{i}]^\top$. Since $F \in \calH_\Gamma$, we express it as the weighted sum of feature maps.

\begin{align}
    \begin{split}
        F &= \sum_{i=1}^{n} \sum_{j=1}^{m_i} w_{ij} K_{\Gamma \x_i} (k_\calY(\y_j^i, \cdot)) \\
        &= \sum_{i=1}^{n} \sum_{j=1}^{m_i} w_{ij} k_\calX(\x_i, \cdot) k_\calY(\y_j^i, \cdot)
    \end{split}
\end{align}
Sanity check: If $F$ takes a value $\x'$, then $F(\x') = \sum_{i=1}^{n} \sum_{j=1}^{m_i} w_{ij} k_\calX(\x_i, \x') k_\calY(\y_j^i, \cdot)$, which is a linear combination of $k_\calY(\y_j^i, \cdot) \in \calH_\calY$. Good!

\paragraph{Remark:} Note here we are not using the more general form $F = \sum_{i=1}^{n} \sum_{j=1}^{m} w_{ij} k_\calX(\x_i, \cdot) k_\calY(\y_j, \cdot)$, because $\{\x_i\}, \{\y_j\}$ are not sampled i.i.d.\ jointly; only $\{\y_j^i\}$ is sampled i.i.d.\ after conditioning on $\x_i$.


The empirical estimate error is now
\begin{align*}
    \widehat{\calE(F)} = \sum_{p=1}^{n} \norm{\calU_{Y|X=\x_p} - \sum_{i=1}^{n} \sum_{j=1}^{m_i} w_{ij} k_\calX(\x_i, \x_p) k_\calY(\y_j^i, \cdot)}{\calH_\calY}^2
\end{align*}

We write out the norm term by term.
The first term is
\begin{align*}
    \PSi{\calU_{Y|X=\x_p}, \calU_{Y|X=\x_p}}{\calH_\calY} = \int \int k_\calY(\y, \y') d\pi_{Y|X=\x_p}(\y) d\pi_{Y|X=\x_p}(\y')
\end{align*}
The second term is
\begin{align*}
\begin{split}
    \PSi{\calU_{Y|X=\x_p},  \sum_{i=1}^{n} \sum_{j=1}^{m_i} w_{ij} k_\calX(\x_i, \x_p) k_\calY(\y_j^i, \cdot)}{\calH_\calY}
    &= \sum_{i=1}^{n} \sum_{j=1}^{m_i} w_{ij} k_\calX(\x_i, \x_p) \int k_\calY(\y, \y_j^i)d \pi_{Y|X=\x_p}(\y) \\
    &= \sum_{i=1}^{n} k_\calX(\x_p, \x_i) \w_i^\top \int k_\calY(\bY^i, \y) d \pi_{Y|X=\x_p}(\y) \\
    &= \sum_{i=1}^{n} k_\calX(\x_p, \x_i) \w_i^\top \Phi_p^i
\end{split}    
\end{align*}
where
\begin{align*}
    \quad \w_i &\in \R^{m_i \times 1}, \\
    \Phi_p &= \left[\left(\int k_\calY(\bY^1, \y) d \pi_{Y|X=\x_p}(\y)\right)^\top, \cdots, \left(\int k_\calY(\bY^{n}, \y) d \pi_{Y|X=\x_p}(\y)\right)^\top\right]^\top \\
    &= \left[\left(\Phi_p^1\right)^\top, \cdots, \left(\Phi_p^{n}\right)^\top\right]^\top \in \R^{m \times 1}, \quad \text{and}
    \\
    \Phi_p^i &= \int k_\calY(\bY^i, \y) d \pi_{Y|X=\x_p}(\y) \in \R^{m_i \times 1}
\end{align*}
The third term is
\begin{align*}
\begin{split}
    &\PSi{\sum_{i=1}^{n} \sum_{j=1}^{m_i} w_{ij} k_\calX(\x_i, \x_p) k_\calY(\y_j^i, \cdot), \sum_{s=1}^{n} \sum_{t=1}^{m_s} w_{st} k_\calX(\x_s, \x_p) k_\calY(\y_t^s, \cdot)}{\calH_\calY} \\
    &= \sum_{i=1}^{n} \sum_{j=1}^{m_i} \sum_{s=1}^{n} \sum_{t=1}^{m_s} w_{ij} w_{st} k_\calX(\x_s, \x_p) k_\calX(\x_i, \x_p) k_\calY(\y_j^i, \y_t^s) \\
    &= \sum_{i=1}^{n} \sum_{s=1}^{n} k_\calX(\x_p, \x_i) \w_i^\top k_\calY(\bY^i, \bY^s) \w_s k_\calX(\x_s, \x_p)
\end{split}
\end{align*}

So we summarize the terms in $\widehat{\calE(F)}$ that depend on $\bw_i$.
\begin{align*}
    \widehat{\calE(F)} = \sum_{p=1}^{n} \left[ -2 \sum_{i=1}^{n} k_\calX(\x_p, \x_i) \w_i^\top \Phi_p^i + \sum_{i=1}^{n} \sum_{s=1}^{n} k_\calX(\x_p, \x_i) \w_i^\top k_\calY(\bY^i, \bY^s) \w_s k_\calX(\x_s, \x_p) \right] + \text{const}
\end{align*}

We take the derivative with respect to $\bw_i$, set it to zero, and obtain
\begin{align*}
    \sum_{p=1}^{n} - k_\calX(\x_i, \x_p) \Phi_p^i + \sum_{p=1}^n \sum_{s=1}^n k_\calX(\x_i, \x_p) k_\calY(\bY^i, \bY^s) \bw_s k_\calX(\x_s, \x_p) = 0
\end{align*}
which can be written as
\begin{align}
    \sum_{p=1}^n D_p \left[\begin{array}{c}
    \w_1 \\
    \vdots \\
    \w_n 
    \end{array}\right]_{m \times 1} = 
    \sum_{p=1}^n
    \left[\begin{array}{c}
    k_\calX(\x_p, \x_1) \Phi_p^1 \\
    \vdots \\
    k_\calX(\x_p, \x_n) \Phi_p^n
    \end{array}\right]_{m \times 1}
\end{align}
where
\begin{align*}
    D_p = 
    \left[
    \begin{array}{ccc}
    \cdots 
    &
    k_\calX(\x_{i-1}, \x_p) k_\calY(\bY^{i-1}, \bY^s) k_\calX(\x_s, \x_p)
    &
    \cdots
    \\
    k_\calX(\x_p, \x_i) k_\calY(\bY^i, \bY^{s-1}) k_\calX(\x_{s-1}, \x_p)     
    &
    k_\calX(\x_i, \x_p) k_\calY(\bY^i, \bY^s) k_\calX(\x_s, \x_p)
    & \cdots \\
    \cdots 
    &
    k_\calX(\x_{i+1}, \x_p) k_\calY(\bY^{i+1}, \bY^s) k_\calX(\x_s, \x_p)
    & \cdots
    \end{array}
    \right]_{m \times m}
\end{align*}
and we have
\begin{align*}
    D = \sum_{p=1}^n D_p = 
    \left[
    \begin{array}{ccc}
    \cdots 
    &
    k_\calX(\x_{i-1}, \bX) k_\calX(\bX, \x_s) k_\calY(\bY^{i-1}, \bY^s)
    & \cdots \\
    k_\calX(\x_i, \bX) k_\calX(\bX, \x_{s-1}) k_\calY(\bY^i, \bY^{s-1})
    &
    k_\calX(\x_{i}, \bX) k_\calX(\bX, \x_s) k_\calY(\bY^{i}, \bY^s)
    & \cdots \\
    \cdots &
    k_\calX(\x_{i+1}, \bX) k_\calX(\bX, \x_s) k_\calY(\bY^{i+1}, \bY^s)
    & \cdots
    \end{array}
    \right]_{m \times m}
\end{align*}
Finally, we arrive at the optimal weight vector

\begin{align}
    \left[\begin{array}{c}
    \w_1^\dagger \\
    \vdots \\
    \w_n^\dagger
    \end{array}\right]_{m \times 1}
    =
    D^{-1}
    \left[\begin{array}{c}
    \sum_{p=1}^n k_\calX(\x_p, \x_1) \Phi_p^1 \\
    \vdots \\
    \sum_{p=1}^n k_\calX(\x_p, \x_n) \Phi_p^n
    \end{array}\right]_{m \times 1}
\end{align}
And the optimal estimate of the kernel mean embedding is
\begin{align}
\begin{split}
    F^\dagger &= \sum_{i=1}^{n} \sum_{j=1}^{m_i}
    w_{ij}^\dagger k_\calX(\x_i, \cdot) k_\calY(\y_j^i, \cdot)
    \\
    &= \sum_{i=1}^{n} k_\calX(\cdot, \x_i) (\bw_i^\dagger)^\top k_\calY(\bY^i, \cdot)
\end{split}
\end{align}
and the optimal estimate of the conditional integral is
\begin{align}
\begin{split}
    \Pi[g](\x') 
    &= \PSi{g, \calU_{Y|X=\x'}}{\calH_\calY} \\
    &\approx \PSi{g, F^\dagger(\x')}{\calH_\calY} \\
    &= \sum_{i=1}^n k_\calX(\x', \x_i) (\bw_i^\dagger)^\top g(\bY^i)
\end{split}
\end{align}
