\section{Conditional Kernel Mean Embedding}

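This section summarizes the main results from the literature on conditional kernel mean embeddings. % TODO(review): the citation appears to be missing here.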
\paragraph{Conditional Distribution.}
Let $(\Omega, \calF, \mathbb{P})$ be a probability space, and let $X: \Omega \to \calX$ and $Y: \Omega \to \calY$ be two random variables with distributions $P_X$ and $P_Y$. For each $x \in \calX$, the conditional distribution is $P_{Y|x}(B) = \E[I_B(Y)|X=x]$ for every measurable set $B \subseteq \calY$. We assume that the conditional distribution $P_{Y|x}$ is regular, which means that it is a valid probability measure for any $x \in \calX$. We denote observations as $\{(x_i, y_i)\}_{i=1}^n = (\bX, \bY)$.

\paragraph{Reproducing Kernel Hilbert Space.}
We define two RKHSs $\calH_\calY, \calH_\calX$ with inner products $\langle \cdot, \cdot \rangle_{\calH_\calY}, \PSi{\cdot, \cdot}{\calH_\calX}$ and kernels $k_\calY: \calY \times \calY \to \R, k_\calX: \calX \times \calX \to \R$. $\calL(\calH_\calY)$ is the set of all bounded linear operators from $\calH_\calY$ to $\calH_\calY$. We use $x, y$ to denote elements of the sets $\calX$ and $\calY$, and we use $f: \calX \to \R$ and $g: \calY \to \R$ to denote functions in $\R^\calX, \R^\calY$. The Gram matrices are denoted as $K_\calX(\bX, \bX), K_\calY(\bY, \bY)$.

We also define a vector-valued RKHS $\calH_\Gamma$ which contains bounded linear operators $F: \calH_\calX \to \calH_\calY$ under the reproducing kernel $k_\Gamma: \calH_\calX \times \calH_\calX \to \calL(\calH_\calY)$ and inner product $\PSi{\cdot, \cdot}{\calH_\Gamma}$. The kernel is $k_\Gamma(f_1, f_2) = \PSi{f_1, f_2}{\calH_\calX} \mathrm{Id}_{\calH_\calY}$ and, for each $f \in \calH_\calX$, the feature map ${k_\Gamma}_{f}: \calH_\calY \to \calH_\Gamma$ is ${k_\Gamma}_{f}(g) = g \otimes f$.

To summarize, we have the following
\begin{align}
\begin{split}
    \langle g_1, k_\Gamma(f_1, f_2) g_2\rangle_{\calH_\calY}
    &=\left\langle {k_\Gamma}_{f_1}(g_1), {k_\Gamma}_{f_2}(g_2)\right\rangle_{\calH_\Gamma} \\
    &= \langle g_1 \otimes f_1, g_2 \otimes f_2 \rangle_{\calH_\Gamma}  \\
    &= \PSi{f_1, f_2}{\calH_\calX} \PSi{g_1, g_2}{\calH_\calY} 
\end{split}
\end{align}

Equivalently, $\calH_\Gamma$ is the space of Hilbert--Schmidt operators, i.e., $\calH_\Gamma = \calH_\calY \otimes \calH_\calX$. The property of the tensor product tells us that
\begin{align*}
    \PSi{f_2, (f_1 \otimes g_1) g_2}{} &= \PSi{f_2, \PSi{g_1, g_2}{} f_1 {}}{} \\
    &= \PSi{f_1, f_2}{} \PSi{g_1, g_2}{} \\
    &= \PSi{g_2, (g_1 \otimes f_1) f_2}{}
\end{align*}

\paragraph{Covariance Operator.}
The (uncentered) covariance operator $C_{YX}$ is defined as 
\begin{align*}
    C_{YX}=\mathbb{E}_{X Y}[k_\calY(Y, \cdot) \otimes k_\calX(X, \cdot)]
\end{align*}
$C_{YX}$ can also be viewed as an element of $\calH_\calY \otimes \calH_\calX$. For two functions $f \in \calH_\calX$ and $g \in \calH_\calY$, we have 
\begin{align*}
    \E_{XY}[f(X)g(Y)] = \PSi{g \otimes f, C_{YX}}{\calH_\calY \otimes \calH_\calX} = \PSi{g, C_{YX}f}{\calH_\calY}
\end{align*}
Similarly, we can have four covariance operators $C_{XX}, C_{XY}, C_{YX}, C_{YY}$, and $C_{XY} = C_{YX}^\ast$. 

It is interesting to see that the covariance operators are exactly the same as the mean embedding of the joint features. 
\begin{align*}
    C_{XY} = \mu_{XY} = \E_{XY}[k_\calX(X, \cdot) \otimes k_\calY(Y, \cdot) ]
\end{align*}

The empirical estimate of the covariance operator $C_{YX}$ is
\begin{align*}
    \widehat{C_{YX}} 
    &= \frac{1}{n} \sum_{i=1}^n k_\calY(y_i, \cdot) \otimes k_\calX(x_i, \cdot) \\
    &= \frac{1}{n} k_\calY(\bY, \cdot) k_\calX(\cdot, \bX)
\end{align*}
which is the outer product of two feature map vectors.

\subsection{A Regression Perspective.}
The embedding for conditional distribution $P_{Y|x}$ is
\begin{align}
    \calU_{Y|X=x} = \int k_\calY(y, \cdot) dP_{Y|X=x}(y)
\end{align}

For the conditional case, we want to have an embedding $\calU_{Y|X=x}$ for any $x \in \calX$. The embedding of a conditional distribution is not a single element in the RKHS, but will instead sweep out a family of points in the RKHS, each indexed by $x\in \calX$. 

We want to approximate an operator $C_{Y|X}: \calH_\calX \to \calH_\calY$ with the operator $F \in \calH_\calY \otimes \calH_\calX$. The operator $C_{Y|X}$ has the property that $C_{Y|X} k_\calX(x, \cdot) = \calU_{Y|X=x}$, so we want the operator $F$ to satisfy $F k_\calX(x, \cdot) = \E[k_\calY(Y, \cdot)|X=x]$. We can then define the adjoint $F^\ast$ through
\begin{align*}
    \PSi{F k(x, \cdot), g}{\calH_\calY} = \E[g(Y)|X=x] = \PSi{k(x, \cdot), F^\ast g}{\calH_\calX}
\end{align*}
The above line contains a very strong assumption, namely that $F^\ast g = \E[g(Y)|X=\cdot] \in \calH_\calX$, which does not hold in general. For example, given Gaussian RKHSs $\calH_\calX, \calH_\calY$, when $X$ and $Y$ are independent, $\E[g(Y)|X=\cdot]$ is a constant function of $x$, and nonzero constant functions are known not to be in a Gaussian RKHS.

Before we move on, we repeat the definition of the operators $F, F^\ast$ as they will be used many times later on.
\begin{align}
    F \in \calH_\calY \otimes \calH_\calX, &\quad F k_\calX(x, \cdot) = \E[k_\calY(Y, \cdot)|X=x] \\
    F^\ast \in \calH_\calX \otimes \calH_\calY, &\quad F^\ast g = \E[g(Y)|X=\cdot]
\end{align}
The loss function is defined as

\begin{align}\label{eq:ckme_loss}
\begin{split}
    \calE_s(F) &= \sup_{\norm{g}{} \leq 1} \E_X\left[\left(\E[g(Y)|X] - [F^\ast g](X)\right)^2\right] \\
    &\leq \sup_{\norm{g}{} \leq 1} \E_{X, Y} \left(\PSi{g, k_\calY(Y, \cdot)}{\calH_\calY} - \PSi{g, F k_\calX(X, \cdot)}{\calH_\calY}\right)^2\\
    &\leq \E_{X,Y}\left[\norm{k_\calY(Y, \cdot) - F k_\calX(X, \cdot)}{\calH_\calY}^2 \right] \\
    &\coloneqq \calE(F)
\end{split}
\end{align}

In practice, the expectation will be replaced by the empirical estimate and regularization is added 

\begin{align}\label{eq:loss_reg_1}
    \widehat{\calE(F)} = \sum_{i=1}^n \norm{k_\calY(y_i, \cdot) - F k_\calX(x_i, \cdot)}{\calH_\calY}^2 + \lambda \norm{F}{\calH_\Gamma}^2
\end{align}

The optimal solution is
\begin{align}\label{eq:fdagger-regression}
    \widehat{F^\dagger} f = f(\bX)^\top (K_\calX(\bX, \bX) + \lambda I)^{-1} k_\calY(\bY, \cdot)
\end{align}
For any $x$, the estimated conditional mean embedding is 
\begin{align}\label{eq:ckme-regression}
    \widehat{\calU}_{Y|X=x} = \widehat{F^\dagger} k_\calX(x, \cdot) = k_\calX(x, \bX) (K_\calX(\bX, \bX) + \lambda I)^{-1} k_\calY(\bY, \cdot)
\end{align}
and the adjoint is
\begin{align}\label{eq:fdagger-adjoint-regression}
    \widehat{F^{\dagger\ast}} g = g(\bY)^\top (K_\calX(\bX, \bX) + \lambda I)^{-1} k_\calX(\bX, \cdot)
\end{align}
The practical consequence of the regression perspective is that the hyperparameter $\lambda$ can be selected via cross-validation.

\subsection{A Covariance Operator Perspective.}

We start from~\eqref{eq:ckme_loss} again, and find that the solution has an analytic form in terms of covariance operators.

First, we need to prove the following theorem.
\begin{align}\label{eq:thm-operator}
    C_{XX} \E_{Y|X} [g(Y)|X=\cdot] = C_{XY} g
\end{align}
The theorem above holds because, for any $f \in \calH_\calX$, the tower property of conditional expectation gives
\begin{align*}
    \PSi{f, C_{XX} \E_{Y|X} [g(Y)|X=\cdot]}{\calH_\calX} &= \E_X\left[f(X) \E_{Y|X} \left[g(Y)|X\right]\right] \\
    &= \E_{XY}\left[f(X) g(Y)\right] \\
    &= \PSi{f, C_{XY} g}{\calH_\calX}
\end{align*}
So, using the definition in the last section, we have the analytic form of the adjoint operator $F^{\dagger \ast}$.
\begin{align}
    F^{\dagger \ast} = C_{XX}^{-1} C_{XY}
\end{align}
Given that $C_{XX}^{-1}$ is self-adjoint and $C_{YX} = C_{XY}^\ast$, we have the analytic form of the operator $F^\dagger$.
\begin{align}
    F^{\dagger} = C_{YX} C_{XX}^{-1}
\end{align}
Replacing the covariance operators in the above analytic forms with their empirical estimates, we have
\begin{align}
    \widehat{F^{\dagger \ast}} = \widehat{C_{XX}}^{-1} \widehat{C_{XY}} , 
    \quad \widehat{F^{\dagger}} = \widehat{C_{YX}} \widehat{C_{XX}}^{-1}
\end{align}

A little more computation will convince us that the two perspectives result in the same empirical form. Suppose that $f'= \widehat{C_{XX}}^{-1}k_\calX(x, \cdot)$, then we have
\begin{align*}
    \frac{1}{n} \sum_{i=1}^n f'(x_i) k_\calX(x_i, \cdot) = k_\calX(x, \cdot), \\
    \frac{1}{n} \sum_{i=1}^n f'(x_i) k_\calX(x_i, x_p) = k_\calX(x, x_p), \quad \forall p \in \{1, \dots, n\}, \quad \text{so}\\ 
    f'(\bX) = n K_\calX(\bX, \bX)^{-1} k_\calX(\bX, x)
\end{align*}
Then
\begin{align*}
    \widehat{C_{YX}} f' &= \frac{1}{n} \sum_{i=1}^n f'(x_i) k_\calY(y_i, \cdot) \\
    \widehat{C_{YX}} f' &= \frac{1}{n} f'(\bX)^\top k_\calY(\bY, \cdot) \\
    \widehat{C_{YX}} f' &= k_\calX(x, \bX) K_\calX(\bX, \bX)^{-1} k_\calY(\bY, \cdot)
\end{align*}
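Finally, since $\widehat{C_{XX}}$ is in general not invertible, we regularize it and replace $\widehat{C_{XX}}^{-1}$ by $(\widehat{C_{XX}} + \frac{\lambda}{n} I)^{-1}$. Repeating the computation above with $f' = (\widehat{C_{XX}} + \frac{\lambda}{n} I)^{-1} k_\calX(x, \cdot)$ gives $f'(\bX) = n (K_\calX(\bX, \bX) + \lambda I)^{-1} k_\calX(\bX, x)$, and therefore
\begin{align*}
    \widehat{F^\dagger} k_\calX(x, \cdot) = \widehat{C_{YX}} f' =  k_\calX(x, \bX) (K_\calX(\bX, \bX) + \lambda I)^{-1} k_\calY(\bY, \cdot)
\end{align*}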
which agrees with \eqref{eq:ckme-regression}.
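In the operator form, after regularizing $\widehat{C_{XX}}$ with $\frac{\lambda}{n} I$, we write
\begin{align}
    \widehat{C_{Y|X}} = \widehat{C_{YX}} \left(\widehat{C_{XX}} + \tfrac{\lambda}{n} I\right)^{-1} = k_\calX(\cdot, \bX) (K_\calX(\bX, \bX) + \lambda I)^{-1} k_\calY(\bY, \cdot)
\end{align}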
In fact, the operator form allows us to extend to more complicated operators such as $C_{(XX)Y} = \E[k_\calX(X, \cdot) \otimes k_\calX(X, \cdot) \otimes k_\calY(Y, \cdot)]$, which is an operator from $\calH_\calY$ to $\calH_\calX \otimes \calH_\calX$.

\begin{figure}
\centering
    \includegraphics[width=0.49\linewidth]{discussion_notes/figures/ckme_operator.png}
    \hfill
    \includegraphics[width=0.49\linewidth]{discussion_notes/figures/ckme_regression.png}
    \vspace*{-5pt}
\vspace*{-5pt}
\end{figure}
