\section{Kernel Bayes Rule}
In this section, we assume a joint distribution $P(X,Y)$ with marginals $P(X), P(Y)$ and conditional distributions $P(X|Y), P(Y|X)$. The prior distribution is $\Pi(Y)$. Note that there is an important difference between $Q(X) = \int P(X|Y) \, d\Pi(Y)$ and $P(X)$. The superscript $\pi$ is used to mark quantities defined under the prior $\Pi(Y)$ (and the induced $Q$) rather than under $P$. Denote the samples drawn from the joint distribution $P(X,Y)$ as $\{(x_i, y_i)\}_{i=1}^n$, and the samples drawn from $\Pi(Y)$ as $\{\tilde{y}_i\}_{i=1}^n$.

\subsection{Sum rule}
We denote the embedding for $Q(X)$ as $\mu_X^{\pi}$ and we have the kernel sum rule
\begin{align}
    \mu_X^{\pi} = \E_Y \E_{X|Y}[k_\calX(X, \cdot)|Y] = \E_Y [C_{X|Y} k_\calY(Y, \cdot)] = C_{X|Y} \E_Y[k_\calY(Y, \cdot)] = C_{X|Y} \mu_Y^{\pi},
\end{align}
where $\mu_Y^{\pi}$ is the embedding of the prior $\Pi(Y)$.
Note that the conditional embedding operator $C_{X|Y}$ does not depend on $\Pi(Y)$.

Moreover, we can embed $Q(X)$ using the tensor product feature $k_\calX(X, \cdot) \otimes k_\calX(X, \cdot)$, which gives $C_{XX}^\pi = \E_X[k_\calX(X, \cdot) \otimes k_\calX(X, \cdot)]$ with $X \sim Q(X)$. So the kernel sum rule becomes
\begin{align}
    C_{XX}^\pi = C_{XX|Y} \mu_Y^\pi
\end{align}
where $C_{XX|Y} k_\calY(y, \cdot) = \E_{X|Y}[k_\calX(X, \cdot) \otimes k_\calX(X, \cdot)|Y=y]$ is the conditional embedding of the tensor product feature.

Suppose the empirical estimate of the prior embedding is $\widehat{\mu_Y^\pi}= \balpha^\top k_\calY(\tilde{\bY}, \cdot)$, and recall that the empirical estimate of the conditional embedding operator is $\widehat{C_{X|Y}} = k_\calX(\cdot, \bX) (K_\calY(\bY, \bY) + \lambda I)^{-1} k_\calY(\bY, \cdot)$. Then the empirical estimate of $\mu_X^\pi$ is
\begin{align}
    \widehat{\mu_X^\pi} = \widehat{C_{X|Y}} \widehat{\mu_Y^\pi} = k_\calX(\cdot, \bX) (K_\calY(\bY, \bY) + \lambda I)^{-1} k_\calY(\bY, \tilde{\bY}) \balpha
\end{align}
and similarly,
\begin{align}
    \widehat{C_{XX}^\pi} = k_\calX(\cdot, \bX) \operatorname{diag}\left( (K_\calY(\bY, \bY) + \lambda I)^{-1} k_\calY(\bY, \tilde{\bY}) \balpha \right) k_\calX(\bX, \cdot)
\end{align}
\HC{I am not sure about the math details here, ask Liyuan about this.}

\subsection{Product Rule}
The joint distribution is $Q(X,Y)=P(X|Y) \Pi(Y)$.
We are interested in the joint embedding of $Q(X,Y)$.
\begin{align}
\begin{split}
    C_{XY}^\pi = \E_Y\left[\E_{X|Y}\left[ k_\calX(X, \cdot)| Y \right]\otimes k_\calY(Y, \cdot )\right] = C_{X|Y} \E_Y\left[k_\calY(Y, \cdot) \otimes k_\calY(Y, \cdot)\right] = C_{X|Y}C_{YY}^\pi
\end{split}
\end{align}
The empirical estimate is
\begin{align}
    \widehat{C_{XY}^\pi} = \widehat{C_{X|Y}} \widehat{C_{YY}^\pi} = k_\calX(\cdot, \bX) \left( K_\calY(\bY, \bY) + \lambda I \right)^{-1} k_\calY(\bY, \tilde{\bY}) \operatorname{diag}(\balpha) k_\calY(\tilde{\bY}, \cdot)
\end{align}


\subsection{Kernel Bayes Rule}
The goal of the kernel Bayes rule is to obtain the mean embedding of the posterior, $\mu^\pi_{Y|x}$, from the mean embedding of the prior $\Pi(Y)$ and the likelihood $P(X|Y)$. The posterior distribution is $Q(Y|x) = \frac{P(x|Y)\Pi(Y)}{\int P(x|Y) \, d\Pi(Y)}$.

We have the kernel Bayes rule
\begin{align}
    C_{Y|X}^\pi = C_{YX}^\pi (C_{XX}^\pi)^{-1}
\end{align}
Here $C_{Y|X}^\pi$ is the conditional embedding operator of the posterior, and the mean embedding of the posterior distribution $Q(Y|x)$ is $\mu_{Y|x}^\pi = C_{Y|X}^\pi k_\calX(x, \cdot)$.
The empirical estimate is
\begin{align}
    \widehat{C_{Y|X}^\pi} = \widehat{C_{YX}^\pi} (\widehat{C_{XX}^\pi})^{-1} = \left( \widehat{C_{X|Y}} \widehat{C_{YY}^\pi} \right)^\top \left(\widehat{C_{XX|Y}} \widehat{\mu_Y^\pi} \right)^{-1}
\end{align}
The explicit form of this empirical estimate is rather involved; it can be found in~\cite{song2013kernel}.

