
\section*{Table of Contents}
\begin{itemize}
    \item[A] \hyperref[sec:proofs]{Main Proofs}
    \begin{itemize}
        \item[A.1] \hyperref[subsec:proof_prop1]{Proof of Proposition 1}
        \item[A.2] \hyperref[subsec:proof-prop2]{Proof of Proposition 2}
        \item[A.3] \hyperref[subsec:extension-prop1]{Proof of Proposition 3}
        \item[A.4] \hyperref[subsec:extension-prop2]{Proof of Proposition 4}
        \item[A.5] \hyperref[subsec:empirical]{Empirical Validation of Relationship between Different Divergence Measures}
    \end{itemize}
    \item[B] \hyperref[sec:estimation-2]{Empirical Estimator of CS and Conditional CS}
    \begin{itemize}
        \item[B.1] \hyperref[subsec:est-cs]{Empirical Estimator of CS}
        \item[B.2] \hyperref[subsec:est-ccs]{Empirical Estimator of CCS}
    \end{itemize}
    \item[C] \hyperref[sec:experiments]{Additional Experimental Results and Details}
    \begin{itemize}
        \item[C.1] \hyperref[subsec:details-test]{Details on Conditional Divergence Test}
        \item[C.2] \hyperref[subsec:details-distance]{Details on Distance Metric Minimization}
        \item[C.3] \hyperref[subsec:compare-fdal-digits]{Comparison on Digits}
        \item[C.4] \hyperref[subsec:add-abl]{Additional Ablation Study}
        \item[C.5] \hyperref[subsec:hyperparameters]{The Effect of Hyperparameters}
    \end{itemize}
\end{itemize}


% \subsection{Additional Ablation Study}
% \label{subsec:add-abl}


\section{Main Proofs}
\label{sec:proofs}

\subsection{Proof of Proposition 1}
\label{subsec:proof_prop1}
\begin{customprop}{1}
\label{theorem}
For any $d$-variate Gaussian distributions $p\sim \mathcal{N}(\mu_1,\Sigma_1)$ and $q\sim \mathcal{N}(\mu_2,\Sigma_2)$, where $\Sigma_1$ and $\Sigma_2$ are positive definite, we have:
\begin{equation}\label{eq:gaussian}
D_{\mathrm{CS}}(p;q) \leq \min \big\{ D_{\mathrm{KL}}(p;q), D_{\mathrm{KL}}(q;p)\big\}.
\end{equation}
%and $\log$ is the natural logarithm.
\end{customprop}


\begin{proof}
% Given two $d$-dimensional Gaussian distributions $p\sim \mathcal{N}(\mu_1,\Sigma_1)$ and $q\sim \mathcal{N}(\mu_2,\Sigma_2)$, the 
The KL divergence for $p$ and $q$ is given by:
\begin{equation}
\begin{split}
& D_{\text{KL}}(p; q) = \frac{1}{2}\left( \tr(\Sigma_2^{-1}\Sigma_1) - d + (\mu_2 - \mu_1)^{\top} \Sigma_2^{-1} (\mu_2 - \mu_1) + \log\left( \frac{|\Sigma_2|}{|\Sigma_1|} \right) \right),
\end{split}
\end{equation}
where $|\cdot|$ signifies the determinant of a matrix, and $\tr$ denotes the trace of a matrix.
% 
% 
Moreover, the CS divergence for $p$ and $q$ can be written as~\citep{kampa2011closed}:
\begin{equation}
\begin{split}
& D_{\text{CS}}(p; q) = -\log(z_{12}) + \frac{1}{2}\log(z_{11}) + \frac{1}{2}\log(z_{22}),
\end{split}
\end{equation}
where 
\begin{equation}
\begin{split}
& z_{12} 
% = \mathcal{N}(\mu_1;\mu_2, \Sigma_1 + \Sigma_2) 
= \frac{\exp\left(-\frac{1}{2}(\mu_1 - \mu_2)^{\top}(\Sigma_1 + \Sigma_2)^{-1}(\mu_1 - \mu_2)\right)}{\sqrt{(2\pi)^d|\Sigma_1 + \Sigma_2|}}, \\
& z_{11} = \frac{1}{\sqrt{(2\pi)^d |2\Sigma_1|}}, \\
& z_{22} = \frac{1}{\sqrt{(2\pi)^d |2\Sigma_2|}}.
\end{split}
\end{equation}
We can simplify the expression to:
% Combining these terms, one can write:
\begin{equation}
\begin{split}
D_{\text{CS}}(p; q) & =  \frac{1}{2}(\mu_2 - \mu_1)^{\top}(\Sigma_1 + \Sigma_2)^{-1}(\mu_2 - \mu_1) + \log \left(\sqrt{(2\pi)^d |\Sigma_1 + \Sigma_2|}\right) \\
&  \qquad - \frac{1}{2}\log \left(\sqrt{(2\pi)^d |2 \Sigma_1|}\right) - \frac{1}{2}\log \left(\sqrt{(2\pi)^d |2 \Sigma_2|}\right) \\
& = \frac{1}{2}(\mu_2 - \mu_1)^{\top}(\Sigma_1 + \Sigma_2)^{-1}(\mu_2 - \mu_1) + \frac{1}{2}\log \left( \frac{|\Sigma_1 + \Sigma_2|}{2^d\sqrt{|\Sigma_1||\Sigma_2|}} \right).
\end{split}
\end{equation}


\textit{Part 1.} We first consider the case in which the difference between $D_{\text{CS}}(p; q)$ and $D_{\text{KL}}(p; q)$ results solely from a mean vector discrepancy, i.e., $\mu_1-\mu_2\neq 0_{d\times 1}$ and $\Sigma_1=\Sigma_2=\Sigma$.
Consider two positive definite Hermitian matrices $A$ and $B$ of size $n\times n$. It is known that $A-B$ is positive semi-definite if and only if $B^{-1}-A^{-1}$ is also positive semi-definite~\citep{horn2012matrix}.
% Note that, for any two positive semi-definite Hermitian matrices $A$ and $B$ of size $n\times n$, then $A-B$ is positive semi-definite if and only if $B^{-1}-A^{-1}$ is also positive semi-definite~\cite{horn2012matrix}.
%\pagebreak
Using this result, we observe that $\Sigma_2^{-1} - (\Sigma_1+\Sigma_2)^{-1}$ is positive semi-definite, as $(\Sigma_1+\Sigma_2) - \Sigma_2$ is positive semi-definite. Therefore, conditional on $\Sigma_1=\Sigma_2$, we have
\begin{equation}\label{eq:mean_difference}
% \begin{split}
2(D_{\text{CS}}(p; q) - D_{\text{KL}}(p; q))
=   (\mu_2 - \mu_1)^{\top} (\Sigma_1+\Sigma_2)^{-1} (\mu_2 - \mu_1)  - (\mu_2 - \mu_1)^{\top} \Sigma_2^{-1} (\mu_2 - \mu_1)
    % \\
    % = (\mu_2 - \mu_1)^{\top} \left[\Sigma_2^{-1} - (\Sigma_1+\Sigma_2)^{-1} \right] (\mu_2 - \mu_1) 
    \leq 0.
% \end{split}
\end{equation}

\bigskip
\textit{Part 2.} Now we consider the case in which the difference between $D_{\text{CS}}(p; q)$ and $D_{\text{KL}}(p; q)$ results solely from a covariance matrix discrepancy, i.e., $\Sigma_1-\Sigma_2\neq 0_{d\times d}$ and $\mu_1 = \mu_2$. We have
\begin{equation}\label{eq:cov_diff1}
\begin{split}
2(D_{\text{CS}}(p; q) - D_{\text{KL}}(p; q)) & = \log \left( \frac{|\Sigma_1 + \Sigma_2|}{2^d\sqrt{|\Sigma_1||\Sigma_2|}} \right) - \log \left( \frac{|\Sigma_2|}{|\Sigma_1|} \right) - \tr(\Sigma_2^{-1}\Sigma_1)+d
\\
& = -d\log 2 + \log\left( |\Sigma_1 +\Sigma_2| \right) -\frac{1}{2} (\log |\Sigma_1| + \log|\Sigma_2|) \\
& - \log|\Sigma_2| + \log|\Sigma_1| - \tr(\Sigma_2^{-1}\Sigma_1) + d \\
& = -d\log 2 + \log\left( \frac{|\Sigma_1 +\Sigma_2|}{|\Sigma_2|} \right) + \frac{1}{2}\log\left( \frac{|\Sigma_1|}{|\Sigma_2|} \right) - \tr(\Sigma_2^{-1}\Sigma_1) + d \\ 
& = -d\log 2 + \log\left( |\Sigma_2^{-1}\Sigma_1 + I| \right) + \frac{1}{2}\log\left( |\Sigma_2^{-1}\Sigma_1| \right) - \tr(\Sigma_2^{-1}\Sigma_1) + d,
\end{split}
\end{equation}
where $I$ represents a $d$-dimensional identity matrix.
Consider the terms $|\Sigma_2^{-1}\Sigma_1|$ and $|\Sigma_2^{-1}\Sigma_1 + I|$. For convenience, let $\{\lambda_i\}_{i=1}^d$ denote the eigenvalues of $\Sigma_2^{-1}\Sigma_1$. Since $\Sigma_2^{-1}\Sigma_1$ is positive semi-definite, we have $\lambda_i\geq 0$, $i=1,\ldots,d$.
We have:
\begin{equation}\label{eq:cov_diff2}
|\Sigma_2^{-1}\Sigma_1| = \left[ \left( \prod_{i=1}^d \lambda_i \right)^{1/d} \right]^d \leq \left[ \frac{1}{d}\sum_{i=1}^d \lambda_i \right]^d = \left( \frac{1}{d} \tr(\Sigma_2^{-1}\Sigma_1) \right)^d,
\end{equation}
where the inequality arises from the property that a geometric mean is no greater than its arithmetic counterpart.
Similarly, one can have
\begin{equation}\label{eq:cov_diff3}
\begin{split}
    |\Sigma_2^{-1}\Sigma_1 + I| = \prod_{i=1}^d (1+\lambda_i) \leq \left[ \frac{1}{d}\sum_{i=1}^d (1+\lambda_i) \right]^d = \left( 1+\frac{1}{d}\tr(\Sigma_2^{-1}\Sigma_1) \right)^d.
\end{split}
\end{equation}

Substituting Eqs.~(\ref{eq:cov_diff2}) and (\ref{eq:cov_diff3}) into Eq.~(\ref{eq:cov_diff1}), we arrive at
\begin{equation}
% \begin{split}
2(D_{\text{CS}}(p; q) - D_{\text{KL}}(p; q))
% & = -d\log 2 + \log\left( |\Sigma_2^{-1}\Sigma_1 + I| \right) + \frac{1}{2}\log\left( |\Sigma_2^{-1}\Sigma_1| \right) - \tr(\Sigma_2^{-1}\Sigma_1) + d \\
% &\hspace{-1cm} 
\leq -d\log 2 + d\log\Big (1+ \frac{1}{d}\tr(\Sigma_2^{-1}\Sigma_1)\Big) + \frac{d}{2} \log \Big(\frac{1}{d}\tr(\Sigma_2^{-1}\Sigma_1)\Big) - \tr(\Sigma_2^{-1}\Sigma_1) + d.
% \end{split}
\end{equation}

We now show that $2(D_{\text{CS}}(p; q) - D_{\text{KL}}(p; q))\leq 0$ conditional on $\mu_1=\mu_2$. Let $f$ be a map given by
\begin{equation}
f(x) = -d\log2 + d\log\Big(1+\frac{x}{d}\Big) + \frac{d}{2}\log\Big(\frac{x}{d}\Big) - x + d,
\qquad
x\geq 0.
\end{equation}
Since $f'(d)=0$ and $f''(x)<0$ for all $x>0$, the function $f$ is concave and attains its global maximum at $x=d$. We then conclude that
\begin{equation}\label{eq:cov_difference}
    2(D_{\text{CS}}(p; q) - D_{\text{KL}}(p; q))\leq f\Big(\tr(\Sigma_2^{-1}\Sigma_1)\Big) \leq f(d)=0,
\end{equation}
where $\tr(\Sigma_2^{-1}\Sigma_1) = \sum_{i=1}^d \lambda_i \geq 0$, conditional on $\mu_1=\mu_2$. 

\bigskip 
\textit{Part 3.} Note that $D_{\text{CS}}(p; q) - D_{\text{KL}}(p; q)$ captures the differences in both the mean vector and covariance matrix discrepancies when $\mu_1 \neq \mu_2$ and $\Sigma_1 \neq \Sigma_2$. Namely,
\begin{multline*}
2(D_{\text{CS}}(p; q) - D_{\text{KL}}(p; q))
=
\Big[(\mu_2 - \mu_1)^{\top}(\Sigma_1 + \Sigma_2)^{-1}(\mu_2 - \mu_1) 
-
(\mu_2 - \mu_1)^{\top} \Sigma_2^{-1} (\mu_2 - \mu_1)
\Big]\\
+\bigg[\log \left( \frac{|\Sigma_1 + \Sigma_2|}{2^d\sqrt{|\Sigma_1||\Sigma_2|}} \right)
-
\log \left( \frac{|\Sigma_2|}{|\Sigma_1|} \right) 
-
\tr(\Sigma_2^{-1}\Sigma_1) + d \bigg]\leq 0,
\end{multline*}
using Eqs.~\eqref{eq:mean_difference} and \eqref{eq:cov_difference}.

\bigskip
The above analysis also applies to $D_{\text{KL}}(q; p)$. That is, $2(D_{\text{CS}}(p; q)-D_{\text{KL}}(q; p))\leq 0$ regardless of the parameter values. The combination of these results implies \eqref{eq:gaussian}.
% Specifically, we have:
% 
% \begin{equation}
% 2(D_{\text{KL}}(q; p) - D_{\text{CS}}(p; q) )_{\Sigma_1=\Sigma_2} =
%     (\mu_2 - \mu_1)^{\top} \left[\Sigma_1^{-1} - (\Sigma_1+\Sigma_2)^{-1} \right] (\mu_2 - \mu_1) \geq 0,
% \end{equation}
% 
% \begin{equation}
% \begin{split}
% 2(D_{\text{CS}}(p; q) - D_{\text{KL}}(q; p))_{\mu_1=\mu_2} & = \log \left( \frac{|\Sigma_1 + \Sigma_2|}{2^d\sqrt{|\Sigma_1||\Sigma_2|}} \right) - \log \left( \frac{|\Sigma_1|}{|\Sigma_2|} \right) - \tr(\Sigma_1^{-1}\Sigma_2)+d.
% \\
% & = -d\log 2 + \log\left( |\Sigma_1 +\Sigma_2| \right) -\frac{1}{2} (\log |\Sigma_1| + \log|\Sigma_2|) \\
% & - \log|\Sigma_1| + \log|\Sigma_2| - \tr(\Sigma_1^{-1}\Sigma_2) + d \\
% & = -d\log 2 + \log\left( \frac{|\Sigma_1 +\Sigma_2|}{|\Sigma_1|} \right) + \frac{1}{2}\log\left( \frac{|\Sigma_2|}{|\Sigma_1|} \right) - \tr(\Sigma_1^{-1}\Sigma_2) + d \\ 
% & = -d\log 2 + \log\left( |\Sigma_1^{-1}\Sigma_2 + I| \right) + \frac{1}{2}\log\left( |\Sigma_1^{-1}\Sigma_2| \right) - \tr(\Sigma_1^{-1}\Sigma_2) + d \\
% & \leq -d\log 2 + d\log \left(1+ \frac{1}{d}\tr(\Sigma_1^{-1}\Sigma_2) \right) + \frac{d}{2} \log (\frac{1}{d}\tr(\Sigma_1^{-1}\Sigma_2)) - \tr(\Sigma_1^{-1}\Sigma_2) + d \\
% & \leq 0.
% \end{split}
% \end{equation}
% 
% Hence,
% \begin{equation} \label{eq:CSreverseKL}
%     D_{\text{CS}}(p; q) - D_{\text{KL}}(q; p) \leq 0.
% \end{equation}
% 
% Combining Eq.~(\ref{eq:CSforwardKL}) and Eq.~(\ref{eq:CSreverseKL}), we obtain:
% \begin{equation} 
%     D_{\text{CS}}(p;q) \leq \min \{ D_{\text{KL}}(p;q), D_{\text{KL}}(q;p)\}.
% \end{equation}
% 
% 
\end{proof}





\subsection{Proof of Proposition 2}
\label{subsec:proof-prop2}

We first present Lemma~\ref{lemma_TV} that proves to be useful in Proposition~\ref{proposition_Gaussian_TV}.

\begin{lemma}\label{lemma_TV}
Let $p\sim \mathcal{N}(\mu_1, \Sigma_1)$ and $q\sim \mathcal{N}(\mu_2, \Sigma_2)$ be any $d$-dimensional Gaussian distributions. In the case $\Sigma_1=\Sigma_2=\Sigma$ (positive definite), the TV distance between $p$ and $q$ can be expressed as:
\begin{equation}
    D_{\text{TV}} = 2\Phi\Big(\frac{1}{2}\|\Sigma^{-1/2}\delta\|_2\Big)-1,
\end{equation}
where $\delta=\mu_1-\mu_2$, and $\Phi$ is the cumulative distribution function of a standard normal distribution.
\end{lemma}

\begin{proof}
Recall:
\begin{equation}
% \begin{aligned}
 D_{\text{TV}} 
 % &
 = \frac{1}{2}\int |p(\mathbf{x})-q(\mathbf{x})| \,\der \mathbf{x}  = \frac{1}{2}\int \Big|1-\frac{q(\mathbf{x})}{p(\mathbf{x})}\Big|\, p(\mathbf{x}) \,\der \mathbf{x}.
 % \end{aligned}
\end{equation}

Before continuing, we first note that for any $\mathbf{a}, \mathbf{b} \in \mathbb{R}^d$, and any symmetric $S\in \mathbb{R}^{d\times d}$,
\begin{equation}
    \mathbf{a}^{\top}S\mathbf{a}- \mathbf{b}^{\top}S \mathbf{b} = (\mathbf{a}-\mathbf{b})^{\top} S (\mathbf{a}-\mathbf{b}) + 2(\mathbf{a}-\mathbf{b})^{\top} S \mathbf{b}.
\end{equation}

Using this identity, we have 
\begin{equation}
\begin{aligned}
    \frac{q}{p}(\mathbf{x}) &= \exp \Big\{ -\frac{1}{2}[(\mathbf{x}-\mu_2)^{\top} \Sigma^{-1}(\mathbf{x}-\mu_2) - (\mathbf{x}-\mu_1)^{\top} \Sigma^{-1}(\mathbf{x}-\mu_1)] \Big\} \\
    & = \exp \Big\{ -\frac{1}{2}[(\mu_1-\mu_2)^{\top} \Sigma^{-1}(\mu_1-\mu_2) + 2(\mu_1-\mu_2)^{\top} \Sigma^{-1}(\mathbf{x}-\mu_1)] \Big\} \\
    & = \exp \Big\{ -\frac{1}{2}\|\tilde{\delta}\|^2_2 + \tilde{\delta}^{\top}\Sigma^{-1/2}(\mu_1 -\mathbf{x}) \Big\},
\end{aligned}
\end{equation}
where $\tilde{\delta} = \Sigma^{-1/2}\delta$.
Therefore, 
\begin{equation*}
     D_{\text{TV}} = \frac{1}{2} \int \Big|1-\exp \Big\{ -\frac{1}{2}\|\tilde{\delta}\|^2_2 + \tilde{\delta}^{\top}\Sigma^{-1/2}(\mu_1 -\mathbf{x}) \Big\} \Big|p(\mathbf{x})\,\der \mathbf{x}.
\end{equation*}

Define a transformation $Y = \tilde{\delta}^{\top} \Sigma^{-1/2} (\mu_1 -X)$, where $X\sim \mathcal{N}(\mu_1, \Sigma)$. 
% Clearly, $Y \sim \mathcal{N}(0, \|\tilde{\delta}\|^2_2)$. 
Then, $D_{\text{TV}}$ can be equivalently written as
\begin{equation}
    D_{\text{TV}} =\frac{1}{2}\, \mathbb{E}_{Y}\Big|1 - \exp\Big (Y - \frac{1}{2}\|\tilde{\delta}\|^2_2 \Big)\Big|,
\end{equation}
where $Y \sim \mathcal{N}(0, \|\tilde{\delta}\|^2_2) $. Note that for any $Z \sim \mathcal{N}(\mu, \sigma^2)$, one can derive
\begin{equation}
    \mathbb{E}|1 - \exp(Z)| = 1 - 2\Phi\Big(\frac{\mu}{\sigma}\Big) + \exp \Big(\mu +\frac{1}{2}\sigma^2\Big)\Big(2\Phi\Big(\frac{\mu + \sigma^2}{\sigma}\Big) - 1\Big).
\end{equation}

Taking $\mu = -\frac{1}{2}\|\tilde{\delta}\|^2_2$ and $\sigma = \|\tilde{\delta}\|_2$ above, we have
\begin{equation}
D_{\text{TV}}
=\frac{1}{2}\Big\{ 1-2\Phi\Big(-\frac{1}{2}\|\tilde{\delta}\|_2\Big) + 2\Phi\Big(\frac{1}{2}\|\tilde{\delta}\|_2\Big) -1 \Big\}
= 2\Phi\Big(\frac{1}{2}\|\tilde{\delta}\|_2\Big) -1
= 2\Phi \Big(\frac{1}{2}\|\Sigma^{-1/2}\delta\|_2\Big)-1,
\end{equation}
% \begin{equation}
% \begin{aligned}
%     D_{\text{TV}} &=\frac{1}{2}\Big\{ 1-2\Phi\Big(-\frac{1}{2}||\delta'||_2\Big) + 2\Phi\Big(\frac{1}{2}||\delta'||_2\Big) -1 \Big\} \\
%     &= 2\Phi\Big(\frac{1}{2}||\delta'||_2\Big) -1 \\
%     & = 2\Phi \Big(\frac{1}{2}\|\Sigma_1^{-1/2}\delta\|_2\Big)-1,
% \end{aligned}
% \end{equation}
where the second equality follows from the symmetry property of $\Phi$.
\end{proof}

\bigskip
Now, we proceed to the proof of Proposition~\ref{proposition_Gaussian_TV}.
\begin{customprop}{2}%\label{proposition_Gaussian_TV}
Let $\Phi$ be the cumulative distribution function of a standard normal distribution. Let $p\sim \mathcal{N}(\mu_1, \Sigma_1)$ and $q\sim \mathcal{N}(\mu_2, \Sigma_2)$ be any $d$-dimensional Gaussian distributions. We have:
\begin{equation}
 D_{\mathrm{TV}} \leq \sqrt{D_{\mathrm{CS}} },
\end{equation}
if one of the following conditions is satisfied:
\begin{enumerate}
\item $\Sigma_1=\Sigma_2=\Sigma$ and $\frac{1}{2}\sqrt{ \delta^{\top} \Sigma^{-1} \delta } \geq 2\Phi (\|\Sigma^{-1/2}\delta\|_2/2)-1$, where $\delta = \mu_1 - \mu_2$;
\item $\sum_{i=1}^d \log\left( \frac{2+\lambda_i + 1/\lambda_i}{4} \right) \geq 4 $, where $\lambda_i$ is the $i$-th eigenvalue of $\Sigma_2^{-1}\Sigma_1$.
\end{enumerate}
\end{customprop}

\begin{proof}
First, note that $D_{\text{TV}}\leq 1/2\big(\int p(\mathbf{x})\,\der \mathbf{x}+\int q(\mathbf{x})\,\der \mathbf{x}\big)\leq 1$, whereas $D_{\text{CS}}$ is unbounded and can easily exceed values of 1. 
% This is because,
% \begin{equation}
% \begin{split}
%     D_{\text{TV}}(p;q) & = \frac{1}{2} \int |p(x)-q(x)| \,\der x \\
%     & \leq \frac{1}{2} \int |p(x)|+|q(x)| \,\der x \\
%     & = \frac{1}{2} (\int p(x)\,\der x + \int q(x)\,\der x) \\
%     & = 1.
% \end{split}    
% \end{equation}
Recall $\delta = \mu_1-\mu_2$. The closed-form expression of CS divergence is:
\begin{equation}
D_{\text{CS}}(p; q) = \frac{1}{2}\delta^{\top}(\Sigma_1 + \Sigma_2)^{-1}\delta + \frac{1}{2}\log \left( \frac{|\Sigma_1 + \Sigma_2|}{2^d\sqrt{|\Sigma_1||\Sigma_2|}} \right),
\end{equation}
where the first term and the second term quantify the discrepancy resulting from the difference of mean vectors and covariance matrices, respectively.

\bigskip
\textit{Part 1.} Consider $\Sigma_1 = \Sigma_2 = \Sigma$. By Lemma~\ref{lemma_TV}, we have $D_{\text{TV}} = 2\Phi (\|\Sigma^{-1/2}\delta\|_2/2)-1$.
% \begin{equation}
%     D_{\text{TV}} = 2\Phi (\|\Sigma^{-1/2}\delta\|_2/2)-1,
% \end{equation}
% in which $\Phi$ is the cumulative distribution function of a standard normal distribution.
Hence,
\begin{equation}
     D_{\text{TV}} \leq \sqrt{D_{\text{CS}} }\, \iff\, \frac{1}{2}\sqrt{ \delta^{\top} \Sigma^{-1} \delta } \geq 2\Phi (\|\Sigma^{-1/2}\delta\|_2/2)-1.
\end{equation}

\bigskip
\textit{Part 2.} More generally,
given that the TV distance for two Gaussian distributions lacks a closed-form expression, it suffices to examine the conditions under which $D_{\text{CS}} \geq 1$. We have
\begin{equation}
\begin{split}
    D_{\text{CS}} & \geq \frac{1}{2}\ln \left( \frac{|\Sigma_1 + \Sigma_2|}{2^d\sqrt{|\Sigma_1||\Sigma_2|}} \right) \\
    & = \frac{1}{2} \left( \frac{1}{2} \ln\left( \frac{|\Sigma_1+\Sigma_2|}{|\Sigma_1|} \right) + \frac{1}{2} \ln\left( \frac{|\Sigma_1+\Sigma_2|}{|\Sigma_2|} \right)  -d\ln 2 \right) \\
    & = \frac{1}{2} \left( \frac{1}{2} \ln\left( |\Sigma_1^{-1}\Sigma_2+I| \right) + \frac{1}{2} \ln\left( |\Sigma_2^{-1}\Sigma_1+I| \right)  -d\ln 2 \right).
\end{split}
\end{equation}

Let $\lambda_i$ denote the $i$-th eigenvalue of $\Sigma_2^{-1}\Sigma_1$; then $1/\lambda_i$ is the $i$-th eigenvalue of $\Sigma_1^{-1}\Sigma_2$. We have
\begin{equation}
% \begin{split}
    |\Sigma_1^{-1}\Sigma_2+I|  = \prod_{i=1}^d (1/\lambda_i+1), 
    \qquad
    |\Sigma_2^{-1}\Sigma_1+I|  = \prod_{i=1}^d (\lambda_i+1).
% \end{split}
\end{equation}
It leads to
\begin{equation} \label{eq:condition2}
D_{\text{CS}}
 \geq \frac{1}{2} \sum_{i=1}^d \left( \frac{1}{2}\ln(\lambda_i+1) + \frac{1}{2}\ln(1/\lambda_i+1) - \ln 2 \right)
 = \frac{1}{4} \sum_{i=1}^d \log\left( \frac{2+\lambda_i + 1/\lambda_i}{4} \right).
\end{equation}
% \begin{equation} \label{eq:condition2}
% \begin{split}
%     D_{\text{CS}} & = \frac{1}{2} \left( \frac{1}{2} \ln\left( |\Sigma_1^{-1}\Sigma_2+I| \right) + \frac{1}{2} \ln\left( |\Sigma_2^{-1}\Sigma_1+I| \right)  -d\ln 2 \right) \\
%     & = \frac{1}{2} \sum_{i=1}^d \left( \frac{1}{2}\ln(\lambda_i+1) + \frac{1}{2}\ln(1/\lambda_i+1) - \ln 2 \right) \\
%     & = \frac{1}{4} \sum_{i=1}^d \log\left( \frac{2+\lambda_i + 1/\lambda_i}{4} \right).
% \end{split}
% \end{equation}

Given the condition $ \sum_{i=1}^d \log\left( \frac{2+\lambda_i + 1/\lambda_i}{4} \right)\geq 4$, we have $D_{\text{TV}} \leq 1 \leq \sqrt{D_{\text{CS}} }$.
The proof is now completed.
\end{proof}

\bigskip
\begin{remark}
In fact, the conditions outlined in Proposition~\ref{proposition_Gaussian_TV} are easily satisfied, particularly when $p$ and $q$ exhibit significant dissimilarity, and the variable dimension $d$ is large. For simplicity, let's consider a diagonal covariance matrix $\Sigma=\diag \Big(\sigma_1^2,\sigma_2^2,\ldots,\sigma_d^2\Big)$.
% given by:
% \begin{equation}
% \Sigma = \begin{bmatrix}
% \sigma_1^2 &  &  &  \\
%  & \sigma_2^2 &  &  \\
%  &  & \ddots &  \\
%  &  &  & \sigma_d^2
% \end{bmatrix}.
% \end{equation}
% 
In this case, the condition $\frac{1}{2}\sqrt{ \delta^{\top} \Sigma^{-1} \delta } \geq 2\Phi (\|\Sigma^{-1/2}\delta\|_2/2)-1$ reduces to:
\begin{equation}\label{eq:condition1_more}
    \frac{1}{2}\sqrt{\sum_{i=1}^d \left( \frac{\delta_i}{\sigma_i} \right)^2 } \geq 2\Phi \left( \frac{ \sqrt{\sum_{i=1}^d ( {\delta_i}/{\sigma_i} )^2 } }{2} \right) - 1.
\end{equation}
The R.H.S. of Eq.~(\ref{eq:condition1_more}) is upper bounded by $1$, whereas the L.H.S. of Eq.~(\ref{eq:condition1_more}) is unbounded and is prone to increase with the addition of new dimension (if other dimensions remain unchanged).
On the other hand, since $\log\left( \frac{2+\lambda_i + 1/\lambda_i}{4} \right) \geq 0$,
the L.H.S. of Eq.~(\ref{eq:condition2}) is unbounded and is prone to increase with the addition of a new dimension (if $\lambda_i$, $i=1,2,\ldots,d-1$, remain unchanged).

\begin{figure} [htbp]
%\hfill
\centering 
     \begin{subfigure}[b]{0.45\textwidth}
         \centering
         \includegraphics[width=\textwidth]{Figures/mu_TV_CS.png}
         \caption{$|\delta|=\mu$, $\sigma_1=\sigma_2=1$.}
         %\label{fig:percent_tv}
     \end{subfigure}
     \begin{subfigure}[b]{0.45\textwidth}
         \centering
         \includegraphics[width=\textwidth]{Figures/sigma_TV_CS.png}
         \caption{$\delta=0$, $\sigma_1\neq\sigma_2$.}
         %\label{fig:percent_KL}
     \end{subfigure}
     \caption{Values of $D_{\text{TV}}$ and $\sqrt{D_{\text{CS}}}$ for 1-dimensional Gaussian data in case (a) $\mu$ is different, $\sigma>0$ is the same; and (b) $\sigma$ is different, $\mu$ is the same.}
     \label{fig:one_dimension_TV_CS}
\end{figure}
Moreover, from Fig.~\ref{fig:one_dimension_TV_CS}, it is easy to observe that, when $d=1$, the TV distance is too conservative and quickly reaches its upper bound $1$, whereas the CS divergence is unbounded and larger than the TV distance if $p$ and $q$ are not sufficiently similar.
\end{remark}



%\subsection{An Extension of Proposition 1 to General Distributions}
\subsection{Proof of Proposition 3}
\label{subsec:extension-prop1}
%\subsection{Proof to Proposition 2}

We first present a lemma (without proof), referred to as the Jensen weighted integral inequality, which proves to be useful in the subsequent proof.
\begin{lemma}\citep{dragomir2003interpolations}\label{corollary_weighted_Jensen}
Let $f:I\to \mathbb{R}$ be a convex function. Moreover, let $g,h: [x_1,x_2]\to \mathbb{R}$ be measurable functions such that $g(x)\in I$ and $h(x)\geq 0$, $\forall x\in [x_1,x_2]$. Also suppose that $h$, $gh$, and $(f\circ g)\cdot h$ are all integrable functions on $[x_1,x_2]$ and $\int_{x_1}^{x_2} h(x)\,\der x > 0$, then
\begin{equation}
    f\left( \frac{\int_{x_1}^{x_2} g(x)h(x) \,\der x}{\int_{x_1}^{x_2} h(x) \,\der x} \right) \leq \frac{ \int_{x_1}^{x_2} (f\circ g)(x)h(x) \,\der x}{ \int_{x_1}^{x_2} h(x) \,\der x}.
\end{equation}
\end{lemma}

\bigskip 
Let $f(x)=x\log(x)$, which is a convex function. For some positive functions $a,b$, set $h=b$ and $g={a}/{b}$ in Lemma~\ref{corollary_weighted_Jensen}. We have
\begin{equation}\label{eq:continuous_log_sum}
    \left(\int_{x_1}^{x_2} a(x) \,\der x \right) \log\left(\frac{\int_{x_1}^{x_2} a(x) \,\der x}{\int_{x_1}^{x_2} b(x) \,\der x} \right) \leq \int_{x_1}^{x_2} a(x) \log\frac{a(x)}{b(x)} \,\der x.
\end{equation}
The inequality above holds for any integration range, provided the Riemann integrals exist. Moreover, this inequality can be easily extended to general ranges, including possibly disconnected sets, using Lebesgue integrals. In fact, Eq.~(\ref{eq:continuous_log_sum}) can be understood as a continuous extension of the well-known log sum inequality. For simplicity, we denote $\int_{x_1}^{x_2} a(x) \,\der x = \int_K a(x) \,\der x$, where $|K|=x_2-x_1\gg 0$ refers to the length of the integral's interval.



\bigskip 
\begin{customprop}{3}
\label{proposition_general_supp}
For any density functions $p:\,\mathbb{R}^d\to \mathbb{R}_{\geq 0}$ and $q:\,\mathbb{R}^d\to \mathbb{R}_{\geq 0}$, let $K$ be an integration domain over which $p$ and $q$ are Riemann integrable.
Suppose $|K|<\infty$, where $|K|$ denotes the volume. Then
% For any density functions $p$ and $q$, let $|K|$ denote the length of the integral's integration range $K$ with $|K| \gg 0$, we have:
\begin{equation}
C_1 \left[D_{\mathrm{CS}}(p;q) - \log{|K|} + 2\log C_2 \right] \leq D_{\mathrm{KL}}(p;q),
\end{equation}
where $C_1=\int_K p(\mathbf{x})\,\der \mathbf{x}$, $C_2 = { C_1 }{ \left(\int_K p^2(\mathbf{x})\,\der \mathbf{x} \int_K q^2(\mathbf{x})\,\der \mathbf{x} \right)^{-1/4} }$.  Clearly, for $K$ such that $|K\cap S| \gg 0$, where $S=\big\{\mathbf{x}:\, p(\mathbf{x})>0\big\}$, one can have $C_1 \approx 1$.
% $\approx \dfrac{ 1 }{ \left(\int_K p^2(\mathbf{x})\,\der \mathbf{x} \int_K q^2(\mathbf{x})\,\der \mathbf{x} \right)^{1/4} }$
\end{customprop}


\begin{proof}
The following results hold for multivariate density functions. Without loss of generality, we focus on the univariate case.
Construct the following two functions:
\begin{equation}
 a(x) = p(x)/C_2, 
 % = \frac{p(x)}{\int_K p(x)\,\der x} \left( \int_K p^2(x)\,\der x \int_K q^2(x)\,\der x \right)^{1/4},
 \qquad
 b(x)  = \sqrt{p(x)q(x)}.
\end{equation}
% \begin{equation} \label{eq:ax_bx}
% \begin{split}
%     a(x) & = \frac{p(x)}{\int_K p(x)\,\der x} \left( \int_K p^2(x)\,\der x \int_K q^2(x)\,\der x \right)^{1/4} ; \\
%     b(x) & = \sqrt{p(x)q(x)}.
% \end{split}
% \end{equation}
% 
% Let us denote $C_2 = \frac{ \int_K p(x)\,\der x }{ \left( \int_K p^2(x)\,\der x \int_K q^2(x)\,\der x \right)^{1/4} } $, by Eq.~(\ref{eq:ax_bx}), we have:
% \begin{equation}
%     p(x) = a(x)C_2.
% \end{equation}
% 
% Similarly, 
% \begin{equation}
%     \frac{a(x)}{b(x)} = \sqrt{\frac{p(x)}{q(x)}} \frac{1}{C_2}.
% \end{equation}
% 
% That is,
% \begin{equation}
%     \sqrt{\frac{p(x)}{q(x)}} = \frac{a(x)}{b(x)} C_2.
% \end{equation}
Clearly, $\sqrt{{p(x)}/{q(x)}} = \big({a(x)}/{b(x)}\big) C_2$.
We have
\begin{equation}\label{eq:KL_intermediate}
\begin{split}
    D_{\text{KL}}(p;q) & = \int_K p(x)\log \frac{p(x)}{q(x)} \,\der x \\
    & = 2 \int_K p(x)\log \sqrt{\frac{p(x)}{q(x)}} \,\der x \\
    & = 2 \int_K a(x) C_2 \log \left(\frac{a(x)}{b(x)} C_2\right) \,\der x \\
    & = 2 C_2 \left[ \int_K a(x) \log \left(\frac{a(x)}{b(x)}\right)\,\der x + \log C_2 \int_K a(x)\,\der x \right] \\
    & \geq 2 C_2 \left[ \left(\int_K a(x) \,\der x \right) \log\left(\frac{\int_K a(x) \,\der x}{\int_K b(x) \,\der x} \right) + \log C_2 \int_K a(x)\,\der x \right] \\
    & = 2 C_2 \int_K a(x)\,\der x \left[ \log\left(\frac{\int_K a(x) \,\der x}{\int_K b(x) \,\der x} \right) + \log C_2 \right],
\end{split}
\end{equation}
where the inequality is due to Eq.~(\ref{eq:continuous_log_sum}).
Note that
\begin{equation}\label{eq:ax_int}
\int_K a(x)\,\der x 
= \int_K \frac{p(x)}{C_2} \,\der x
= \frac{1}{C_2} \int_K p(x)  \,\der x
= \left( \int_K p^2(x)\,\der x \int_K q^2(x)\,\der x \right)^{1/4},
\end{equation}
% \begin{equation}\label{eq:ax_int}
% \begin{split}
%     \int_K a(x)\,\der x & = \int_K \frac{p(x)}{C_2} \,\der x \\
%     & = \frac{1}{C_2} \int_K p(x)  \,\der x \\
%     & = \left( \int_K p^2(x)\,\der x \int_K q^2(x)\,\der x \right)^{1/4},
% \end{split}
% \end{equation}
and, using the Cauchy-Schwarz inequality,
\begin{equation}\label{eq:bx_int}
\left(\int_K b(x)\,\der x\right)^2  
% = \left(\int_K \sqrt{p(x)q(x)} \,\der x \right)^2 
= \left(\int_K \sqrt{p(x)q(x)}\cdot 1 \,\der x\right)^2
\leq \left( \int_K p(x)q(x)\,\der x  \right) \left( \int_K 1\,\der x \right)
= \left( \int_K p(x)q(x) \,\der x \right) |K|.
\end{equation}
% \begin{equation}\label{eq:bx_int}
% \begin{split}
%     \left(\int_K b(x)\,\der x\right)^2  & = \left(\int_K \sqrt{p(x)q(x)} \,\der x \right)^2 \\
%     & = \left(\int_K \sqrt{p(x)q(x)}\cdot 1 \,\der x\right)^2 \\
%     & \leq \left( \int_K p(x)q(x) \right) \left( \int_K 1\,\der x \right) \\
%     & = \left( \int_K p(x)q(x) \right) |K|,
% \end{split}    
% \end{equation}


Substituting \eqref{eq:ax_int} and \eqref{eq:bx_int} into \eqref{eq:KL_intermediate}, we have
\begin{equation}
\begin{split}
    D_{\text{KL}}(p;q) & \geq 2 C_2 \int_K a(x)\,\der x \left[ \log\left(\frac{\int_K a(x) \,\der x}{\int_K b(x) \,\der x} \right) + \log C_2 \right] \\
    & = C_1 \left[ \log\left(\frac{\int_K a(x) \,\der x}{\int_K b(x) \,\der x} \right)^2 + 2 \log C_2 \right] \\
    & = C_1 \left[ \log\left(\frac{\left( \int_K p^2(x)\,\der x \int_K q^2(x)\,\der x \right)^{1/2}}{\left(\int_K b(x)\,\der x\right)^2} \right) + 2 \log C_2 \right] \\
    & \geq C_1 \left[ \log\left(\frac{\left( \int_K p^2(x)\,\der x \int_K q^2(x)\,\der x \right)^{1/2}}{ \left( \int_K p(x)q(x)\,\der x \right) |K| } \right) + 2 \log C_2 \right] \\
    % & = C_1 \big[ D_{\text{CS}}(p;q) - \log|K| +2\log C_2 \big] \\
    & = C_1 \left[ D_{\text{CS}}(p;q) - \log|K| +2\log C_2 \right].
\end{split}
\end{equation}
The proof is completed.  
\end{proof}



\subsection{Proof of Proposition 4}
\label{subsec:extension-prop2}
\begin{figure} [htbp]
%\hfill
\centering 
 \includegraphics[width=0.7\textwidth]{Figures/TV_threshold_illustration.pdf}
 \caption{A graphical illustration of the sets $\mathcal{A}_{\epsilon}$ and $\mathcal{A}_{\epsilon}^{\complement}$ defined in Proposition \ref{proposition_general_TV}.}
\label{fig:TV_threshold}
\end{figure}

\begin{customprop}{4}
\label{proposition_general_TV}
For any density functions $p$ and $q$, and any $\epsilon>0$, let $\calA_{\epsilon}=\left\{\mathbf{x}:\, p(\mathbf{x})\leq \epsilon\right\}\cup \left\{\mathbf{x}:\, q(\mathbf{x})\leq \epsilon\right\}$ and $\calA_{\epsilon}^{\complement}$ be its complement. Moreover, define $T_{\calA_\epsilon^{\complement}}=\sup\left\{p(\mathbf{x})q(\mathbf{x}),\, \mathbf{x}\in\calA_\epsilon^{\complement}\right\}$
and $\left|\calA_\epsilon^{\complement}\right|$ to denote the ``length'' of the set $\calA_\epsilon^{\complement}$ (strictly speaking, the Lebesgue measure of the set $\calA_\epsilon^{\complement}$). Suppose there exists an $\epsilon>0$ such that $T_{\calA_\epsilon^{\complement}}\left|\calA_\epsilon^{\complement}\right|<\infty$ and $C_3 = \int p^2(\mathbf{x})\, \der \mathbf{x} \int q^2(\mathbf{x})\, \der \mathbf{x} \geq \exp(2) \left(2\epsilon+T_{\calA_\epsilon^{\complement}}\left|\calA_\epsilon^{\complement}\right|\right)^2$, then 
\begin{equation}
D_{\mathrm{TV}}(p;q)\leq \sqrt{D_{\mathrm{CS}}(p;q)}.
\end{equation}
\end{customprop}
%in which $C_3 = \int_K p^2(x) \,\der x \int_K q^2(x) \,\der x$.

\begin{proof}
Note that $D_{\text{CS}}(p;q)= -\log(\int p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x}) + 1/2\log(C_3)$. For the term $\int p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x}$, we can write
% \begin{equation}
% \begin{split}
%     D_{\text{CS}}(p;q) & = -\log(\int p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x}) +\frac{1}{2} \log(\int p^2(\mathbf{x})\,\der \mathbf{x}) + \frac{1}{2} \log(\int q^2(\mathbf{x})\,\der \mathbf{x}) \\
%     & = -\log(\int p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x}) + \frac{1}{2}\log(C_3).
% \end{split}
% \end{equation}
% 
%Given a threshold $\epsilon\in(0,1)$, let $\mathcal{A}=\{x: p(x)\leq \epsilon \} \cup \{x: q(x)\leq \epsilon \}$ refers to the union of the sets $\{x: p(x)\leq \epsilon \}$ and $\{x: q(x)\leq \epsilon \}$, and $\mathcal{A}^C$ refers to the complement of set $\mathcal{A}$. See Fig.~\ref{fig:TV_threshold} for an illustration.
\begin{equation}
\begin{split}
    \int p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x} & = \int_{\calA_\epsilon} p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x} + \int_{\calA_\epsilon^{\complement}} p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x} \\
    & \leq \int_{\calA_\epsilon} \epsilon \max\{p(\mathbf{x}), q(\mathbf{x})\}\,\der \mathbf{x} + \int_{\calA_\epsilon^{\complement}} p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x} \\
    & \leq \epsilon \int ( p(\mathbf{x}) + q(\mathbf{x}) )\,\der \mathbf{x}
 + \int_{\calA_\epsilon^{\complement}} p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x} \\
   & = 2\epsilon + \int_{\calA_\epsilon^{\complement}} p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x}\\
   & \leq 2\epsilon+T_{\calA_\epsilon^{\complement}}\left|\calA_\epsilon^{\complement}\right|.
\end{split}
\end{equation}
% That is,
% \begin{equation}
% \begin{split}
%     \int p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x} & \leq 2\epsilon + \int_{\mathcal{A}^C} p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x} \\
%     & \leq 2\epsilon + T|\mathcal{A}^C| \\
%     & \leq 4\epsilon.
% \end{split}
% \end{equation}
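Hence, $D_{\text{CS}}(p;q)\geq -\log\Big(2\epsilon+T_{\calA_\epsilon^{\complement}}\left|\calA_\epsilon^{\complement}\right|\Big)+\frac{1}{2}\log(C_3)\geq 1$, where the last inequality follows from the assumed lower bound on $C_3$. Consequently, $\sqrt{D_{\text{CS}}(p;q)}\geq 1 \geq D_{\text{TV}}(p;q)$.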
% \begin{equation}
% \begin{split}
%     D_{\text{CS}}(p;q) & = -\log(\int p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x}) + \log(C_3) \\
%     & \geq -\log(4\epsilon) + \log(C_3).
% \end{split}
% \end{equation}
% 
% If $\frac{C_3}{4\epsilon}\geq e$,
% \begin{equation}
%     \sqrt{D_{\text{CS}}(p;q)} \geq 1 \geq D_{\text{TV}}(p;q).
% \end{equation}
% 
\end{proof}

\bigskip 
\begin{remark}
\label{remark:exp-prop6}
We provide some explanation of the conditions in Proposition \ref{proposition_general_TV}. 
These conditions imply that as two densities $p,q$ exhibit less and less overlap (i.e., in the case of a small $\epsilon>0$) the integral of $pq$ tends toward 0. Consequently, $-\log\left(\int p(x)q(x)\der x\right)\gg 0$ in $D_{\mathrm{CS}}(p;q)$ dominates $\log\left(\int p^2(x)\der x\right)+\log\left(\int q^2(x)\der x\right)$ because $\int p^2(x)\der x$ and $\int q^2(x)\der x$ are constants unaffected by the extent of overlap between $p$ and $q$. Therefore, $D_{\mathrm{CS}}(p;q)$ can rapidly surpass 1 when the shapes of $p$ and $q$ are markedly distinct.

For illustration, 
let $p$ be the pdf of $\calN(\mu_1,\sigma_1^2)$ and $q$ be the pdf of $\calN(\mu_2,\sigma_2^2)$. 
For $\epsilon>0$,
we consider two examples: (i) $\mu_2=\mu_1 +\delta_\epsilon > \mu_1$ and $\sigma_1=\sigma_2=\sigma>0$; (ii) $\mu_1= \mu_2 = \mu$ and $\sigma_2=\sigma_1 + \delta_\epsilon > \sigma_1$, where $\delta_{\epsilon}>0$ depends on $\epsilon$.
%Note that if $\epsilon$ is excessively large, then $\calA = \SR$, resulting in  $\calA^\complement = \SR\backslash \calA = \emptyset$. If $\calA^\complement =\emptyset$, then $T_{\calA^\complement}\left|\calA^\complement\right| = 0 $. We observe that the conditions in Proposition 4 are not hard to fulfill. As such, we consider 

(i) For $\epsilon>0$, we have 
\begin{align*}
\calA &=\left(-\infty, \mu_2-\sigma\sqrt{-\log\left(2\pi\sigma^2\epsilon^2\right)}\right]\cup \left[\mu_1+\sigma\sqrt{-\log\left(2\pi\sigma^2\epsilon^2\right)},+\infty\right),\\
\left|\calA^{\complement}\right|
&=
2\sigma\sqrt{-\log\left(2\pi\sigma^2\epsilon^2\right)}-\delta_\epsilon,\\
T_{\calA^\complement}
&\leq \left(2\pi \sigma^2\right)^{-1}\exp\left(-\frac{\delta_\epsilon^2}{4\sigma^2}\right),\\
C_3
&=
\left(4\pi \sigma^2\right)^{-1}.
\end{align*}
It is not hard to see that for any $\epsilon > 0$, when $\delta_{\epsilon}$ is sufficiently large, indicating that $p$ and $q$ substantially differ from each other, one can achieve $T_{\calA^\complement}\left|\calA^{\complement}\right| \leq 2\epsilon$ because $T_{\calA^\complement}$ decays to 0 exponentially fast when $\delta_{\epsilon}$ increases. Additionally, satisfying $C_3 \geq \exp(2) \left(2\epsilon+T_{\calA_\epsilon^{\complement}}\left|\calA_\epsilon^{\complement}\right|\right)^2$ is not challenging if $\epsilon$ is chosen small.

(ii) Similarly, for $\epsilon>0$, we have 
\begin{align*}
\calA &=\left(-\infty, \mu_1-\sigma_1\sqrt{-\log\left(2\pi\sigma_1^2\epsilon^2\right)}\right]\cup \left[\mu_1+\sigma_1\sqrt{-\log\left(2\pi\sigma_1^2\epsilon^2\right)},+\infty\right),\\
\left|\calA^{\complement}\right|
&=
2\sigma_1\sqrt{-\log\left(2\pi\sigma_1^2\epsilon^2\right)},\\
T_{\calA^\complement}
&\leq \left[2\pi \sigma_1(\sigma_1+\delta_{\epsilon})\right]^{-1},\\
C_3
&=
\left[4\pi \sigma_1(\sigma_1+\delta_{\epsilon})\right]^{-1}.
\end{align*}
As before, for any $\epsilon > 0$, as long as $\delta_{\epsilon}$ is sufficiently large, we can have $T_{\calA^\complement}\left|\calA^{\complement}\right| \leq 2\epsilon$ and $C_3  \geq \exp(2) \left(2\epsilon+T_{\calA_\epsilon^{\complement}}\left|\calA_\epsilon^{\complement}\right|\right)^2$.

\end{remark}

% \subsection{Proof to Lemma 1}

% \begin{lemma}\label{chain_rule}
%     Let $p^s(\mathbf{z},y)$ and $p^t(\mathbf{z},y)$ be two distributions for a pair of variables $\mathbf{z}$and $y$, then:
% \begin{equation}
% \begin{split}
%     & D_{\text{CS}} (p^t (\mathbf{z},y); p^s(\mathbf{z},y)) \\
%     & = D_{\text{CS}} (p^t (\mathbf{z}); p^s(\mathbf{z}) ) + D_{\text{CS}} (p^t (y|\mathbf{z}); p^s(y|\mathbf{z})).
% \end{split}
% \end{equation}
% \end{lemma}

% \begin{proof}

% We change the notation $p^s(\mathbf{z},y)$ and $p^t(\mathbf{z},y)$ as $p(\mathbf{x},y)$ and $q(\mathbf{x},y)$.

% \begin{equation} 
% \begin{aligned}
%  D_{\text{CS}}(p(y|\mathbf{x});q(y|\mathbf{x})) & =  -2\log(\iint_{\mathcal{X}, \mathcal{Y}} p(y|\mathbf{x})q(y|\mathbf{x})\,\der \mathbf{x}dy )  + \log (\iint_{\mathcal{X}, \mathcal{Y}} p^2(y|\mathbf{x})\,\der \mathbf{x}dy ) + \log (\iint_{\mathcal{X}, \mathcal{Y}}  q^2(y|\mathbf{x})\,\der \mathbf{x}dy )  \\
%  &  = -2\log(\iint_{\mathcal{X}, \mathcal{Y}} \frac{p(\mathbf{x},y)q(\mathbf{x},y)}{p(\mathbf{x})q(\mathbf{x})}\,\der \mathbf{x}dy )  + \log (\iint_{\mathcal{X}, \mathcal{Y}}  \frac{p^2(\mathbf{x},y)}{p^2(\mathbf{x})}\,\der \mathbf{x}dy )
%   + \log (\iint_{\mathcal{X}, \mathcal{Y}}  \frac{q^2(\mathbf{x},y)}{q^2(\mathbf{x})}\,\der \mathbf{x}dy ) \\
% & = -2\log(\iint_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x},y)q(\mathbf{x},y)\,\der \mathbf{x}dy ) + \log (\iint_{\mathcal{X}, \mathcal{Y}}  p^2(\mathbf{x},y)\,\der \mathbf{x}dy ) + \log (\iint_{\mathcal{X}, \mathcal{Y}}  q^2(\mathbf{x},y)\,\der \mathbf{x}dy ) \\
% & - \left[ - 2\log(\iint_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x}dy ) + \log (\iint_{\mathcal{X}, \mathcal{Y}}  p^2(\mathbf{x})\,\der \mathbf{x}dy ) + \log (\iint_{\mathcal{X}, \mathcal{Y}}  q^2(\mathbf{x})\,\der \mathbf{x}dy ) \right].
% \end{aligned}
% \label{eq.ccs_divergence}
% \end{equation}

% Let us take $\iint_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x}dy$ as an example,
% \begin{equation}
%     \iint_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x}dy = \int_{\mathcal{Y}} \left( \int_{\mathcal{X}} p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x} \right) dy = \int_{\mathcal{X}} p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x} \int_{\mathcal{Y}} 1 dy.
% \end{equation}

% Hence,
% \begin{equation}
%     \log( \iint_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x}dy ) = \log (\int_{\mathcal{X}} p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x}) + \log (\int_{\mathcal{Y}} 1 dy).
% \end{equation}

% \begin{equation} 
% \begin{aligned}
%  D_{\text{CS}}(p(y|\mathbf{x});q(y|\mathbf{x})) & = -2\log(\iint_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x},y)q(\mathbf{x},y)\,\der \mathbf{x}dy ) + \log (\iint_{\mathcal{X}, \mathcal{Y}}  p^2(\mathbf{x},y)\,\der \mathbf{x}dy ) + \log (\iint_{\mathcal{X}, \mathcal{Y}}  q^2(\mathbf{x},y)\,\der \mathbf{x}dy ) \\
% & - \left[ - 2\log(\int_{\mathcal{X}} p(\mathbf{x})q(\mathbf{x})\,\der \mathbf{x} ) + \log (\int_{\mathcal{X}}  p^2(\mathbf{x})\,\der \mathbf{x} ) + \log (\int_{\mathcal{X}}  q^2(\mathbf{x})\,\der \mathbf{x} ) \right] \\
% & = D_{\text{CS}} (p(\mathbf{x},y); q(\mathbf{x},y)) - D_{\text{CS}} (p (\mathbf{x}); q(\mathbf{x}) ).
% \end{aligned}
% \label{eq.ccs_divergence}
% \end{equation}


    
% \end{proof}


\subsection{Empirical Validation of Relationship between Different Divergence Measures}
\label{subsec:empirical}
We finally provide an empirical validation to show that the following relationship generally holds:
\begin{equation}\label{eq:divergence_relation}
    D_{\text{TV}} \lesssim \sqrt{D_{\text{CS}} } \quad \text{and} \quad D_{\text{CS}} \lesssim D_{\text{KL}},
\end{equation}
where $p$ and $q$ need not be Gaussian, and the symbol $\lesssim$ denotes ``less than or similar to''.


We start our analysis for discrete $p$ and $q$ for simplicity. This is because, unlike the CS divergence, the TV distance and the KL divergence do not have closed-form expressions for either Gaussian distributions or a mixture-of-Gaussians (MoG)~\citep{devroye2018total}. Hence, it becomes challenging to perform Monte Carlo simulations for continuous cases.



Consider the probability mass functions $p$ and $q$ with the support $\mathcal{X}=\{x_1,x_2,\ldots,x_K\}$ (i.e., there are $K$ different discrete states). Namely, $\sum^K_{i=1}p(x_i)=\sum^K_{i=1}q(x_i) = 1$. We have
\begin{align}
D_{\text{TV}}(p; q) &= \frac{1}{2}\sum |p(x_i) - q(x_i)|, \label{eq:discrete_TV}\\
D_{\text{CS}}(p; q) &= -\log \left( \frac{\sum p(x_i)q(x_i)}{\sqrt{\sum p(x_i)^2}\sqrt{\sum q(x_i)^2}} \right), \label{eq:discrete_CS}\\
D_{\text{KL}}(p; q) &= \sum^K_{i=1}p(x_i)\log \left( \frac{p(x_i)}{q(x_i)} \right). \label{eq:discrete_KL}
\end{align}

For some $K$, we randomly generate probability pairs $\big\{(p_i,q_i),\, i=1,\ldots, K\big\}$. 
Fig.~\ref{fig:MC_simulation} demonstrates the values of $D_{\text{TV}}$ with respect to $\sqrt{D_{\text{CS}}}$ (first row) and $D_{\text{KL}}$ with respect to $D_{\text{CS}}$ (second row) when $K=2$ (first column), $K=3$ (second column), and $K=10$ (third column), respectively. We only show results of $1,000$ replicates.

\begin{figure} [htbp]
%\hfill
\centering 
     \begin{subfigure}[b]{0.3\textwidth}
         \centering
         \includegraphics[width=\textwidth]{Figures/K2_CS_TV.png}
         \caption{$K=2$}
     \end{subfigure}
     \begin{subfigure}[b]{0.3\textwidth}
         \centering
         \includegraphics[width=\textwidth]{Figures/K3_CS_TV.png}
         \caption{$K=3$}
     \end{subfigure}
     \begin{subfigure}[b]{0.3\textwidth}
         \centering
         \includegraphics[width=\textwidth]{Figures/K10_CS_TV.png}
         \caption{$K=10$}
     \end{subfigure}
     \\
     \begin{subfigure}[b]{0.3\textwidth}
         \centering
         \includegraphics[width=\textwidth]{Figures/K2_CS_KL.png}
         \caption{$K=2$}
     \end{subfigure}
     \begin{subfigure}[b]{0.3\textwidth}
         \centering
         \includegraphics[width=\textwidth]{Figures/K3_CS_KL.png}
         \caption{$K=3$}
     \end{subfigure}
     \begin{subfigure}[b]{0.3\textwidth}
         \centering
         \includegraphics[width=\textwidth]{Figures/K10_CS_KL.png}
         \caption{$K=10$}
     \end{subfigure}
     \caption{Values of $D_{\text{TV}}$ with respect to $\sqrt{D_{\text{CS}}}$ (first row) and $D_{\text{KL}}$ with respect to $D_{\text{CS}}$ (second row) for $1,000$ replicates of randomly generated probability pairs $\big\{(p_i,q_i),\, i=1,\ldots K\big\}$, when $K=2$ (first column), $K=3$ (second column), and $K=10$ (third column). The diagonal indicates $D_{\text{TV}}=\sqrt{D_{\text{CS}}}$ or $D_{\text{KL}}=D_{\text{CS}}$.}
     \label{fig:MC_simulation}
\end{figure}





\section{Empirical Estimator of CS and Conditional CS}
\label{sec:estimation-2}
%In this section, we first review Proposition 1 and Proposition 2 in the main manuscript, which give the empirical estimator of CS and CCS divergences. Then, we provide detailed proof of each. 

We now use subscripts to denote the domain index for notational convenience.

%This way, we can represent the square of distributions as $p_s(z)^2$, rather than $(p^s(z))^2$, which is too complex.

\subsection{Empirical Estimator of CS}
\label{subsec:est-cs}
\begin{customprop}{5}[Empirical Estimator of $D_{\text{CS}}(p_s(\mathbf{z});p_t(\mathbf{z}))$~\citep{jenssen2006cauchy}]
Given observations $\{\mathbf{z}_i^s\}_{i=1}^M$ and 
$\{\mathbf{z}_i^t\}_{i=1}^N$, the empirical estimator of $D_{\text{CS}}(p_s(\mathbf{z});p_t(\mathbf{z}))$ is given by:
\begin{equation}
\label{eq.cs_est_supp}
\begin{aligned}
& \widehat{D}_{\text{CS}} (p_s(\mathbf{z});p_t(\mathbf{z})) = \log\left(\frac{1}{M^2}\sum_{i,j=1}^M \kappa(\mathbf{z}_i^s,\mathbf{z}_j^s)\right) +  \\ & \log\left(\frac{1}{N^2}\sum_{i,j=1}^N \kappa(\mathbf{z}_i^t,\mathbf{z}_j^t)\right)
-2 \log\left(\frac{1}{MN}\sum_{i=1}^M \sum_{j=1}^N \kappa(\mathbf{z}_i^s,\mathbf{z}_j^t)\right),
\end{aligned}
\end{equation}
where $\kappa$ is a kernel function such as Gaussian $\kappa_{\sigma}(\mathbf{z},\mathbf{z}')=\exp(-\|\mathbf{z}-\mathbf{z}'\|_2^2/2\sigma^2)$.
\end{customprop}

\begin{proof}
%Given $M$ samples $\{\mathbf{z}_i^s\}_{i=1}^M$ drawn from distribution $p_s$ and $N$ samples $\{\mathbf{z}_i^t\}_{i=1}^N$ drawn from distribution $p_t$.

The CS divergence is defined by:
\begin{equation} 
D_{\text{CS}}(p_s;p_t)=-\log \left(\frac{(\int p_s(\mathbf{z})p_t(\mathbf{z})\,\der \mathbf{z})^2}{\int p_s(\mathbf{z})^2\,\der \mathbf{z} \int p_t(\mathbf{z})^2\,\der \mathbf{z}}\right).
\label{eq.cs_divergence}
\end{equation}
By the kernel density estimation (KDE), we have:
\begin{equation}\label{eq:appendix_KDE1}
\hat{p}_s(\mathbf{z}) = \frac{1}{M} \sum_{i=1}^M \kappa_\sigma (\mathbf{z}-\mathbf{z}_i^s),
\end{equation}
and
\begin{equation}\label{eq:appendix_KDE2}
\hat{p}_t(\mathbf{z}) = \frac{1}{N} \sum_{i=1}^N \kappa_\sigma (\mathbf{z}-\mathbf{z}_i^t).
\end{equation}


By substituting Eq.~(\ref{eq:appendix_KDE1}) into $\int \hat{p}_s^2(\mathbf{z})\,\der \mathbf{z}$, we have:
\begin{equation}\label{eq:appendix_KDE3}
\begin{split}
\int \hat{p}_s^2(\mathbf{z})\,\der \mathbf{z} & = \int \left( \frac{1}{M} \sum_{i=1}^M \kappa_\sigma (\mathbf{z}-\mathbf{z}_i^s) \right)^2 \,\der \mathbf{z} \\
 & = \frac{1}{M^2} \int \left( \sum_{i=1}^M\sum_{j=1}^M \kappa_\sigma (\mathbf{z}-\mathbf{z}_j^s) \cdot \kappa_\sigma (\mathbf{z}-\mathbf{z}_i^s) \right) \,\der \mathbf{z} \\
 & = \frac{1}{M^2} \sum_{i=1}^M\sum_{j=1}^M \int \kappa_\sigma (\mathbf{z}-\mathbf{z}_j^s) \cdot \kappa_\sigma (\mathbf{z}-\mathbf{z}_i^s) \,\der \mathbf{z} \\
 & = \frac{1}{M^2} \sum_{i=1}^M\sum_{j=1}^M \kappa_{\sqrt{2}\sigma} (\mathbf{z}_j^s-\mathbf{z}_i^s).
\end{split}
\end{equation}

%The last equation is obtained by noticing that the integral of the product of two Gaussians is exactly evaluated as the value of the Gaussian computed at the difference of the arguments and whose variance is the sum of the variances of the two original Gaussian functions~\cite{bromiley2003products}.

The final equation is derived by using the property that the integral of the product of two Gaussians equals the value of the Gaussian computed at the difference of the arguments with the variance being the sum of the variances of the two original Gaussian functions~\citep{bromiley2003products}.

Similarly,
\begin{equation}
\int \hat{p}_t^2(\mathbf{z})\,\der z = \frac{1}{N^2} \sum_{i=1}^N\sum_{j=1}^N \kappa_{\sqrt{2}\sigma} (\mathbf{z}_j^t-\mathbf{z}_i^t),
\end{equation}
and
\begin{equation}\label{eq:appendix_KDE5}
\int \hat{p}_s(\mathbf{z})\hat{p}_t(\mathbf{z})\,\der z = \frac{1}{MN} \sum_{i=1}^M\sum_{j=1}^N \kappa_{\sqrt{2}\sigma} (\mathbf{z}_j^t-\mathbf{z}_i^s).
\end{equation}


By substituting Eqs.~(\ref{eq:appendix_KDE3})-(\ref{eq:appendix_KDE5}) into the definition of CS divergence in Eq.~(\ref{eq.cs_divergence}), we obtain:
\begin{equation}
\begin{aligned}
\widehat{D}_{\text{CS}} (p_s; p_t) &= \log\left(\frac{1}{M^2}\sum_{i,j=1}^M \kappa_{\sqrt{2}\sigma}(\mathbf{z}_j^s-\mathbf{z}_i^s)\right) +
\log\left(\frac{1}{N^2}\sum_{i,j=1}^N \kappa_{\sqrt{2}\sigma}(\mathbf{z}_j^t-\mathbf{z}_i^t)\right) \\
&\quad -2 \log\left(\frac{1}{MN}\sum_{i=1}^M \sum_{j=1}^N \kappa_{\sqrt{2}\sigma}(\mathbf{z}_j^t-\mathbf{z}_i^s)\right).
\end{aligned}
\end{equation}

\paragraph{Connection to MMD} Interestingly, we found the CS divergence is closely related to the MMD. Here, we demonstrate the connection between the CS divergence and MMD. A natural choice for measuring the dissimilarity between $p_s$ and $p_t$ is the Euclidean distance:
\begin{equation}
\begin{split}
D_{\text{ED}} (p_s;p_t ) & = \int (\hat{p}_s(\mathbf{z}) - \hat{p}_t(\mathbf{z}))^2\,\der \mathbf{z} \\
 & = \int \hat{p}_s^2(\mathbf{z})\,\der \mathbf{z} + \int \hat{p}_t^2(\mathbf{z})\,\der \mathbf{z} - 2\int \hat{p}_s(\mathbf{z})\hat{p}_t(\mathbf{z}) \,\der \mathbf{z}.
\end{split}
\end{equation}

Combining Eqs.~(\ref{eq:appendix_KDE3})-(\ref{eq:appendix_KDE5}), we have:
\begin{equation}
\label{eq:appendix_ED}
\begin{split}
D_{\text{ED}} (p_s;p_t ) & = {\frac{1}{M^2} \sum_{i,j=1}^M \kappa_{\sqrt{2}\sigma} (\mathbf{z}_j^s-\mathbf{z}_i^s)}%_{\encircle{A}}
+ {\frac{1}{N^2} \sum_{i,j=1}^N \kappa_{\sqrt{2}\sigma} (\mathbf{z}_j^t-\mathbf{z}_i^t)} \\ %_{\encircle{B}}  \\ 
& - {\frac{2}{MN} \sum_{i,j=1}^{M,N} \kappa_{\sqrt{2}\sigma} (\mathbf{z}_j^t-\mathbf{z}_i^s)}.%_{\encircle{C}}.
\end{split}
\end{equation}

Note that Eq.~(\ref{eq:appendix_ED}) is exactly the same (in terms of mathematical expression) as the square of MMD using \emph{V}-statistic estimator~\citep{gretton2012kernel}:
\begin{equation}
\widehat{\text{MMD}}_v [p_s (\mathbf{z}),p_t (\mathbf{z})]
 = \left[ \frac{1}{M^2}\sum_{i,j=1}^M \kappa(\mathbf{z}_i^s,\mathbf{z}_j^s) + \frac{1}{N^2}\sum_{i,j=1}^N \kappa(\mathbf{z}_i^t,\mathbf{z}_j^t)
 - \frac{2}{MN}\sum_{i,j=1}^{M,N}\kappa(\mathbf{z}_j^t,\mathbf{z}_i^s) \right]^{\frac{1}{2}},
\end{equation}
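by using a Gaussian kernel $\kappa$ with kernel width $\sqrt{2}\sigma$. Also, letting $\mu_s = \frac{1}{M}\sum_{i=1}^M \phi(\mathbf{z}_i^s)$ and $\mu_t = \frac{1}{N}\sum_{i=1}^N \phi(\mathbf{z}_i^t)$ denote the empirical kernel mean embeddings, where $\phi$ is the feature map of $\kappa$, we have: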
\begin{equation}
\label{eq:mmd_est}
\widehat{\text{MMD}}^2(p_s;p_t)  = \| \mu_s - \mu_t \|_\mathcal{H}^2 = \frac{1}{M^2}\sum_{i,j=1}^M \kappa({\bf z}_i^s,{\bf z}_j^s) + \frac{1}{N^2}\sum_{i,j=1}^N \kappa({\bf z}_i^t,{\bf z}_j^t) - \frac{2}{MN}\sum_{i=1}^M \sum_{j=1}^N \kappa({\bf z}_i^s,{\bf z}_j^t).
\end{equation}

%\paragraph{Comment} As we discussed in the \textbf{Remark 2} in the main manuscript, the CS divergence measures the cosine similarity, whereas MMD uses Euclidean distance. 

By comparing Eq.~(\ref{eq.cs_est_supp}) with Eq.~(\ref{eq:mmd_est}), it is interesting to find that the empirical estimator of CS divergence just adds a logarithm on each term of that of MMD.  


%Given a characteristic kernel $\kappa(\mathbf{z},\mathbf{z}')=\langle \phi(\mathbf{z}),\phi(\mathbf{z}') \rangle_\mathcal{H}$, let us denote the (empirical) mean embedding for $\{\mathbf{z}_i^s\}_{i=1}^M$ and $\{\mathbf{z}_i^t\}_{i=1}^N$ as $\mu_s = \frac{1}{M}\sum_{i=1}^M \phi(\mathbf{z}_i^s)$ and $\mu_t = \frac{1}{N}\sum_{i=1}^n \phi(\mathbf{z}_i^t)$, the empirical estimators of CS divergence as:
%\begin{equation}
%\widehat{D}_{\text{CS}} (p^s;p^t) = -2\log \left( \frac{\langle \mu_s,\mu_t \rangle_\mathcal{H}}{\|\mu_s\|_\mathcal{H} \|\mu_t\|_\mathcal{H} } \right) = -2\log \cos(\mu_s,\mu_t),
%\end{equation}


% To summarize, the mathematical expression of MMD can be derived by either taking the distance of kernel mean embedding in a reproducing kernel Hilbert space (RKHS) or taking the Euclidean distance of two distributions which are estimated by KDE. More interestingly, one can estimate MMD by $\left( \encircle{A}+\encircle{B}-2\encircle{C} \right)^\frac{1}{2}$, and CS divergence by $\log\left(\encircle{A}\right)+ \log\left(\encircle{B}\right)-2\log\left(\encircle{C}\right)$. Note, however, that this observation does not hold for conditional MMD and conditional CS divergence.

\end{proof}

%\subsection{Proof to Proposition 2}
\subsection{Empirical Estimator of CCS}
\label{subsec:est-ccs}
% \begin{proposition}[Empirical Estimator of $D_{\text{CCS}}(p_s(\hat{y}|\mathbf{x});p_t(\hat{y}|\mathbf{x}))$~\cite{yu2023conditional}]
% Given observations $\{\mathbf{x}_i^s,\hat{y}_i^s \}_{i=1}^M$ and $\{\mathbf{x}_i^t,\hat{y}_i^t \}_{i=1}^N$. Let $K^s$ and $L^s$ denote, respectively, the Gram matrices for the variable $\mathbf{x}$ and the predicted output $\hat{y}$ in the source distribution. Similarly, let $K^t$ and $L^t$ denote, respectively, the Gram matrices for the variable $\mathbf{x}$ and the predicted out $\hat{y}$ in the target distribution. Meanwhile, let $K^{st}\in \mathbb{R}^{M\times N}$ (i.e., $\left(K^{st}\right)_{ij}=\kappa(\mathbf{x}^s_i - \mathbf{x}^t_j)$) denote the Gram matrix from source distribution to target distribution for input variable $\mathbf{x}$, and $L^{st}\in \mathbb{R}^{M\times N}$ the Gram matrix from source distribution to target distribution for predicted output $\hat{y}$.
% Similarly, let $K^{ts}\in \mathbb{R}^{N\times M}$ (i.e., $\left(K^{ts}\right)_{ij}=\kappa(\mathbf{x}^t_i - \mathbf{x}^s_j)$) denote the Gram matrix from target distribution to source distribution for input variable $\mathbf{x}$, and $L^{ts}\in \mathbb{R}^{N\times M}$ the Gram matrix from target distribution to source distribution for predicted output $\hat{y}$.
% The empirical estimation of $D_{\text{CCS}}(p_s(\hat{y}|\mathbf{x});p_t(\hat{y}|\mathbf{x}))$ is given by:
% \begin{equation}\label{eq:conditional_CS_est}
% \begin{split}
% & \widehat{D}_{\text{CS}}(p^s(\hat{y}|\mathbf{x});p^t(\hat{y}|\mathbf{x})) \approx \log( \sum_{j=1}^M ( \frac{ \sum_{i=1}^M K_{ji}^s L_{ji}^s }{ (\sum_{i=1}^M K_{ji}^s)^2 } ) ) + \log (\sum_{j=1}^N ( \frac{ \sum_{i=1}^N K_{ji}^t L_{ji}^t }{ (\sum_{i=1}^N K_{ji}^t)^2 } ) ) \\
% & - \log ( \sum_{j=1}^M ( \frac{ \sum_{i=1}^N K_{ji}^{st} L_{ji}^{st} }{ (\sum_{i=1}^M K_{ji}^s) (\sum_{i=1}^N K_{ji}^{st}) } ) ) - \log ( \sum_{j=1}^N ( \frac{ \sum_{i=1}^M K_{ji}^{ts} L_{ji}^{ts} }{ (\sum_{i=1}^M K_{ji}^{ts}) (\sum_{i=1}^N K_{ji}^t) } ) ).
% \end{split}
% \end{equation}
% \end{proposition}

 The conditional CS divergence for $p_s(\mathbf{y}|\mathbf{z})$ and $p_t(\mathbf{y}|\mathbf{z})$ is expressed as:
\begin{equation} 
\begin{aligned}
& D_{\text{CS}}(p_s(\mathbf{y}|\mathbf{z});p_t(\mathbf{y}|\mathbf{z})) = \\
 &    -2\log\left(\int_{\mathcal{Z}} \int_{\mathcal{Y}} \frac{p_s(\mathbf{z},\mathbf{y})p_t(\mathbf{z},\mathbf{y})}{p_s(\mathbf{z})p_t(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y} \right)  + \log \left(\int_{\mathcal{Z}} \int_{\mathcal{Y}} \frac{{p_s}^2(\mathbf{z},\mathbf{y})}{{p_s}^2(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y} \right) \\
 & + \log \left(\int_{\mathcal{Z}} \int_{\mathcal{Y}} \frac{{p_t}^2(\mathbf{z},\mathbf{y})}{{p_t}^2(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y} \right).
\end{aligned}
\label{eq.ccs_divergence}
\end{equation}
which contains two conditional quadratic terms (i.e., $\int_\mathcal{Z}\int_\mathcal{Y} \frac{{p_s}^2(\mathbf{z},\mathbf{y})}{{p_s}^2(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y}$ and $\int_\mathcal{Z}\int_\mathcal{Y} \frac{{p_t}^2(\mathbf{z},\mathbf{y})}{{p_t}^2(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y}$) and a cross term (i.e., $\int_\mathcal{Z}\int_\mathcal{Y} \frac{p_s(\mathbf{z},\mathbf{y})p_t(\mathbf{z},\mathbf{y})}{p_s(\mathbf{z})p_t(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y}$). Note that, for convenience and clarity, we use $y$ here, instead of the $\hat{y}$ used in Proposition 2 of the main manuscript, to represent the label.


\begin{customprop}{6}[Empirical Estimator of $D_{\text{CCS}}(p_s({y}|\mathbf{z});p_t({y}|\mathbf{z}))$]
Given observations $\{\mathbf{z}_i^s, y_i^s \}_{i=1}^M$ and $\{\mathbf{z}_i^t,y_i^t \}_{i=1}^N$, let $K^s$ and $L^s$ denote, respectively, the Gram matrices for the variable $\mathbf{z}$ and the label $y$ in the source distribution. Similarly, let $K^t$ and $L^t$ denote, respectively, the Gram matrices for the variable $\mathbf{z}$ and the label $y$ in the target distribution. Meanwhile, let $K^{st}\in \mathbb{R}^{M\times N}$ (i.e., $\left(K^{st}\right)_{ij}=\kappa(\mathbf{z}^s_i - \mathbf{z}^t_j)$) denote the Gram matrix from source distribution to target distribution for input variable $\mathbf{z}$, and $L^{st}\in \mathbb{R}^{M\times N}$ the Gram matrix from source distribution to target distribution for the label $y$.
Similarly, let $K^{ts}\in \mathbb{R}^{N\times M}$ (i.e., $\left(K^{ts}\right)_{ij}=\kappa(\mathbf{z}^t_i - \mathbf{z}^s_j)$) denote the Gram matrix from target distribution to source distribution for input variable $\mathbf{z}$, and $L^{ts}\in \mathbb{R}^{N\times M}$ the Gram matrix from target distribution to source distribution for the label $y$.
The empirical estimation of $D_{\text{CCS}}(p_s(y|\mathbf{z});p_t(y|\mathbf{z}))$ is given by:
\begin{equation}%\label{eq:conditional_CS_est}
\begin{split}
& \widehat{D}_{\text{CCS}}(p_s(y|\mathbf{z});p_t(y|\mathbf{z})) \approx \log( \sum_{j=1}^M ( \frac{ \sum_{i=1}^M K_{ji}^s L_{ji}^s }{ (\sum_{i=1}^M K_{ji}^s)^2 } ) ) + \log (\sum_{j=1}^N ( \frac{ \sum_{i=1}^N K_{ji}^t L_{ji}^t }{ (\sum_{i=1}^N K_{ji}^t)^2 } ) ) \\
& - \log ( \sum_{j=1}^M ( \frac{ \sum_{i=1}^N K_{ji}^{st} L_{ji}^{st} }{ (\sum_{i=1}^M K_{ji}^s) (\sum_{i=1}^N K_{ji}^{st}) } ) ) - \log ( \sum_{j=1}^N ( \frac{ \sum_{i=1}^M K_{ji}^{ts} L_{ji}^{ts} }{ (\sum_{i=1}^M K_{ji}^{ts}) (\sum_{i=1}^N K_{ji}^t) } ) ).
\end{split}
\end{equation}
\end{customprop}

In the following, we first demonstrate how to estimate the two conditional quadratic terms (i.e., $\int_\mathcal{Z}\int_\mathcal{Y} \frac{{p_s}^2(\mathbf{z},\mathbf{y})}{{p_s}^2(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y}$ and $\int_\mathcal{Z}\int_\mathcal{Y} \frac{{p_t}^2(\mathbf{z},\mathbf{y})}{{p_t}^2(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y}$) from samples. We then demonstrate how to estimate the cross term (i.e., $\int_\mathcal{Z}\int_\mathcal{Y} \frac{p_s(\mathbf{z},\mathbf{y})p_t(\mathbf{z},\mathbf{y})}{p_s(\mathbf{z})p_t(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y}$). We finally explain the empirical estimation of $D_{\text{CS}}(p_s(\mathbf{y}|\mathbf{z});p_t(\mathbf{y}|\mathbf{z}))$.

\begin{proof}

The following proof follows directly from~\citep{yu2023conditional}.

% \definecolor{lightmintbg}{rgb}{.88,.96,.99}
% \colorbox{lightmintbg}{[The conditional quadratic term]}
[The conditional quadratic term]

The empirical estimation of $\int_\mathcal{Z}\int_\mathcal{Y} \frac{p_s^2(\mathbf{z},\mathbf{y})}{p_s^2(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y}$ can be expressed as:
\begin{equation}
\int_\mathcal{Z}\int_\mathcal{Y} \frac{{p_s}^2(\mathbf{z},\mathbf{y})}{{p_s}^2(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y} = \mathbb{E}_{p_s(Z,Y)} \left[ \frac{p_s(Z,Y)}{{p_s}^2(Z)} \right] \approx \frac{1}{M} \sum_{j=1}^M \frac{p_s(\mathbf{z}_j,\mathbf{y}_j)}{{p_s}^2(\mathbf{z}_j)}.
\end{equation}

By kernel density estimator (KDE), we have:
\begin{equation}
\frac{p_s(\mathbf{z}_j,\mathbf{y}_j)}{{p_s}^2(\mathbf{z}_j)} \approx M \frac{\sum_{i=1}^M \kappa_\sigma(\mathbf{z}_j^{p_s} - \mathbf{z}_i^{p_s})\kappa_\sigma(\mathbf{y}_j^{p_s} - \mathbf{y}_i^{p_s}) }{ \left(\sum_{i=1}^M \kappa_\sigma (\mathbf{z}_j^{p_s} - \mathbf{z}_i^{p_s})\right)^2 }.
\end{equation}

Therefore,
\begin{equation}
\int_\mathcal{Z}\int_\mathcal{Y} \frac{{p_s}^2(\mathbf{z},\mathbf{y})}{{p_s}^2(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y} \approx \sum_{j=1}^M \left( \frac{\sum_{i=1}^M \kappa_\sigma(\mathbf{z}_j^{p_s} - \mathbf{z}_i^{p_s})\kappa_\sigma(\mathbf{y}_j^{p_s} - \mathbf{y}_i^{p_s}) }{ \left(\sum_{i=1}^M \kappa_\sigma (\mathbf{z}_j^{p_s} - \mathbf{z}_i^{p_s})\right)^2 } \right).
\end{equation}

Similarly, the empirical estimation of $\int_\mathcal{Z}\int_\mathcal{Y} \frac{{p_t}^2(\mathbf{z},\mathbf{y})}{{p_t}^2(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y}$ is given by:
\begin{equation}
\int_\mathcal{Z}\int_\mathcal{Y} \frac{{p_t}^2(\mathbf{z},\mathbf{y})}{{p_t}^2(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y} \approx \sum_{j=1}^N \left( \frac{\sum_{i=1}^N \kappa_\sigma(\mathbf{z}_j^{p_t} - \mathbf{z}_i^{p_t})\kappa_\sigma(\mathbf{y}_j^{p_t} - \mathbf{y}_i^{p_t}) }{ \left(\sum_{i=1}^N \kappa_\sigma (\mathbf{z}_j^{p_t} - \mathbf{z}_i^{p_t})\right)^2 } \right).
\end{equation}



%\colorbox{lightmintbg}{[The cross term]}
[The cross term]

Again, the empirical estimation of $\int_\mathcal{Z}\int_\mathcal{Y} \frac{p_s(\mathbf{z},\mathbf{y}){p_t}(\mathbf{z},\mathbf{y})}{p_s(\mathbf{z}){p_t}(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y}$ can be expressed as:
\begin{equation}
\int_\mathcal{Z}\int_\mathcal{Y} \frac{p_s(\mathbf{z},\mathbf{y}){p_t}(\mathbf{z},\mathbf{y})}{p_s(\mathbf{z}){p_t}(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y} = \mathbb{E}_{p_s(Z,Y)} \left[ \frac{{p_t}(Z,Y)}{p_s(Z){p_t}(Z)} \right] \approx \frac{1}{M} \sum_{j=1}^M \frac{{p_t}(\mathbf{z}_j,\mathbf{y}_j)}{p_s(\mathbf{z}_j){p_t}(\mathbf{z}_j)}.
\end{equation}

By KDE, we further have:
\begin{equation}
\frac{{p_t}(\mathbf{z}_j,\mathbf{y}_j)}{p_s(\mathbf{z}_j){p_t}(\mathbf{z}_j)} \approx M \frac{\sum_{i=1}^N \kappa_\sigma(\mathbf{z}_j^{p_s} - \mathbf{z}_i^{p_t})\kappa_\sigma(\mathbf{y}_j^{p_s} - \mathbf{y}_i^{p_t}) }{\sum_{i=1}^M \kappa_\sigma (\mathbf{z}_j^{p_s} - \mathbf{z}_i^{p_s}) \sum_{i=1}^N \kappa_\sigma (\mathbf{z}_j^{p_s} - \mathbf{z}_i^{p_t})}.
\end{equation}

Therefore,
\begin{equation}\label{eq:cross}
\int_\mathcal{Z}\int_\mathcal{Y} \frac{p_s(\mathbf{z},\mathbf{y}){p_t}(\mathbf{z},\mathbf{y})}{p_s(\mathbf{z}){p_t}(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y} \approx \sum_{j=1}^M \left( \frac{\sum_{i=1}^N \kappa_\sigma(\mathbf{z}_j^{p_s} - \mathbf{z}_i^{p_t})\kappa_\sigma(\mathbf{y}_j^{p_s} - \mathbf{y}_i^{p_t}) }{\sum_{i=1}^M \kappa_\sigma (\mathbf{z}_j^{p_s} - \mathbf{z}_i^{p_s}) \sum_{i=1}^N \kappa_\sigma (\mathbf{z}_j^{p_s} - \mathbf{z}_i^{p_t})} \right).
\end{equation}

Note that one can also empirically estimate $\int_\mathcal{Z}\int_\mathcal{Y} \frac{p_s(\mathbf{z},\mathbf{y}){p_t}(\mathbf{z},\mathbf{y})}{p_s(\mathbf{z}){p_t}(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y}$ over ${p_t}(\mathbf{z},\mathbf{y})$, which can be expressed as:
\begin{equation}\label{eq:cross_alternative}
\begin{split}
\int_\mathcal{Z}\int_\mathcal{Y} \frac{p_s(\mathbf{z},\mathbf{y}){p_t}(\mathbf{z},\mathbf{y})}{p_s(\mathbf{z}){p_t}(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y} & = \mathbb{E}_{{p_t}(Z,Y)} \left[ \frac{p_s(Z,Y)}{p_s(Z){p_t}(Z)} \right]
\approx \frac{1}{N} \sum_{j=1}^N \frac{p_s(\mathbf{z}_j,\mathbf{y}_j)}{p_s(\mathbf{z}_j){p_t}(\mathbf{z}_j)} \\
& \approx \sum_{j=1}^N \left( \frac{\sum_{i=1}^M \kappa_\sigma(\mathbf{z}_j^{p_t} - \mathbf{z}_i^{p_s})\kappa_\sigma(\mathbf{y}_j^{p_t} - \mathbf{y}_i^{p_s}) }{\sum_{i=1}^M \kappa_\sigma (\mathbf{z}_j^{p_t} - \mathbf{z}_i^{p_s}) \sum_{i=1}^N \kappa_\sigma (\mathbf{z}_j^{p_t} - \mathbf{z}_i^{p_t})} \right).
\end{split}
\end{equation}



%\colorbox{lightmintbg}{[Empirical Estimation]}
[Empirical Estimation]
%Denote $K_p$ and $L_p$ the Gram matrices for input variable $x$ and output variable $y$ from the distribution $p$, respectively. Further, let us further denote $A_{j*}$ the $j$-th row of a matrix $A$.

Let $K^{s}$ and $L^{s}$ denote, respectively, the Gram matrices for the input variable $\mathbf{z}$ and output variable $\mathbf{y}$ in the distribution $p_s$ from the source domain. Further, let $\left(K\right)_{ji}$ denote the $(j,i)$-th element of a matrix $K$ (i.e., the entry in the $j$-th row and $i$-th column of $K$). We have:
\begin{equation}\label{eq:estimate_quadratic}
\int_\mathcal{Z}\int_\mathcal{Y} \frac{{p_s}^2(\mathbf{z},\mathbf{y})}{{p_s}^2(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y} \approx \sum_{j=1}^M \left( \frac{ \sum_{i=1}^M K_{ji}^{s} L_{ji}^{s} }{ (\sum_{i=1}^M K_{ji}^s)^2 } \right).
\end{equation}


%\begin{equation}
%\int_\mathcal{X}\int_\mathcal{Y} \frac{p^2(x,y)}{p^2(x)} \, \der x \der y \approx \sum_{j=1}^N \left( \frac{\text{sum}(K_p \odot L_p)_{j*}}{(\text{sum}(K_p)_{j*})^2} \right)
%\end{equation}

Similarly, let $K^t$ and $L^t$ denote, respectively, the Gram matrices for input variable $\mathbf{z}$ and output variable $\mathbf{y}$ in the distribution ${p_t}$ from the target domain. We have:
\begin{equation}
\int_\mathcal{Z}\int_\mathcal{Y} \frac{{p_t}^2(\mathbf{z},\mathbf{y})}{{p_t}^2(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y} \approx \sum_{j=1}^N \left( \frac{ \sum_{i=1}^N K_{ji}^t L_{ji}^t }{ (\sum_{i=1}^N K_{ji}^t)^2 } \right).
\end{equation}


%\begin{equation}
%\int_\mathcal{X}\int_\mathcal{Y} \frac{q^2(x,y)}{q^2(x)} \, \der x \der y \approx \sum_{j=1}^N \left( \frac{\text{sum}(K_q \odot L_q)_{j*}}{(\text{sum}(K_q)_{j*})^2} \right)
%\end{equation}

Further, let $K^{st}\in \mathbb{R}^{M\times N}$ denote the Gram matrix between distributions $p_s$ and ${p_t}$ for input variable $\mathbf{z}$, and $L^{st}$ the Gram matrix between distributions $p_s$ and ${p_t}$ for output variable $\mathbf{y}$. According to Eq.~(\ref{eq:cross}), we have:
\begin{equation}\label{eq:estimate_cross1}
\int_\mathcal{Z}\int_\mathcal{Y} \frac{p_s(\mathbf{z},\mathbf{y}){p_t}(\mathbf{z},\mathbf{y})}{p_s(\mathbf{z}){p_t}(\mathbf{z})} \,\der \mathbf{z}\der \mathbf{y} \approx \sum_{j=1}^M \left( \frac{ \sum_{i=1}^N K_{ji}^{st} L_{ji}^{st} }{ (\sum_{i=1}^M K_{ji}^s) (\sum_{i=1}^N K_{ji}^{st}) } \right).
\end{equation}


%\begin{equation}
%\int_\mathcal{X}\int_\mathcal{Y} \frac{p(x,y)q(x,y)}{p(x)q(x)} \, \der x \der y \approx \sum_{j=1}^N \left( \frac{\text{sum}(K_{pq} \odot L_{pq})_{j*}}{\text{sum}(K_{p})_{j*}\times \text{sum}(K_{q})_{j*}} \right)
%\end{equation}


Therefore, according to Eqs.~(\ref{eq:estimate_quadratic})-(\ref{eq:estimate_cross1}), an empirical estimate of $D_{\text{CS}}(p_s(\mathbf{y}|\mathbf{z});{p_t}(\mathbf{y}|\mathbf{z}))$ is given by:
\begin{equation}\label{eq:conditional_CS_est1}
\begin{split}
D_{\text{CS}}(p_s(\mathbf{y}|\mathbf{z});{p_t}(\mathbf{y}|\mathbf{z})) & \approx \log\left( \sum_{j=1}^M \left( \frac{ \sum_{i=1}^M K_{ji}^s L_{ji}^s }{ (\sum_{i=1}^M K_{ji}^s)^2 } \right) \right)
+ \log\left( \sum_{j=1}^N \left( \frac{ \sum_{i=1}^N K_{ji}^t L_{ji}^t }{ (\sum_{i=1}^N K_{ji}^t)^2 } \right) \right) \\
& - 2 \log \left( \sum_{j=1}^M \left( \frac{ \sum_{i=1}^N K_{ji}^{st} L_{ji}^{st} }{ (\sum_{i=1}^M K_{ji}^s) (\sum_{i=1}^N K_{ji}^{st}) } \right) \right).
\end{split}
\end{equation}


Note that, according to Eq.~(\ref{eq:cross_alternative}), $D_{\text{CS}}(p_s(\mathbf{y}|\mathbf{z});{p_t}(\mathbf{y}|\mathbf{z}))$ can also be expressed as:
\begin{equation}
\begin{split}
D_{\text{CS}}(p_s(\mathbf{y}|\mathbf{z});{p_t}(\mathbf{y}|\mathbf{z})) & \approx \log\left( \sum_{j=1}^M \left( \frac{ \sum_{i=1}^M K_{ji}^s L_{ji}^s }{ (\sum_{i=1}^M K_{ji}^s)^2 } \right) \right)
+ \log\left( \sum_{j=1}^N \left( \frac{ \sum_{i=1}^N K_{ji}^t L_{ji}^t }{ (\sum_{i=1}^N K_{ji}^t)^2 } \right) \right) \\
& - 2 \log \left( \sum_{j=1}^N \left( \frac{ \sum_{i=1}^M K_{ji}^{ts} L_{ji}^{ts} }{ (\sum_{i=1}^M K_{ji}^{ts}) (\sum_{i=1}^N K_{ji}^t) } \right) \right).
\end{split}
\end{equation}

Therefore, to obtain a consistent and symmetric expression, we estimate $D_{\text{CS}}(p_s(\mathbf{y}|\mathbf{z});{p_t}(\mathbf{y}|\mathbf{z}))$ by:
\begin{equation}
\begin{split}
& D_{\text{CS}}(p_s(\mathbf{y}|\mathbf{z});{p_t}(\mathbf{y}|\mathbf{z}))  \approx \\ & \log\left( \sum_{j=1}^M \left( \frac{ \sum_{i=1}^M K_{ji}^s L_{ji}^s }{ (\sum_{i=1}^M K_{ji}^s)^2 } \right) \right)
+ \log\left( \sum_{j=1}^N \left( \frac{ \sum_{i=1}^N K_{ji}^t L_{ji}^t }{ (\sum_{i=1}^N K_{ji}^t)^2 } \right) \right) \\
& - \log \left( \sum_{j=1}^M \left( \frac{ \sum_{i=1}^N K_{ji}^{st} L_{ji}^{st} }{ (\sum_{i=1}^M K_{ji}^s) (\sum_{i=1}^N K_{ji}^{st}) } \right) \right)
- \log \left( \sum_{j=1}^N \left( \frac{ \sum_{i=1}^M K_{ji}^{ts} L_{ji}^{ts} }{ (\sum_{i=1}^M K_{ji}^{ts}) (\sum_{i=1}^N K_{ji}^t) } \right) \right).
\end{split}
\end{equation}

\end{proof}




%\section{ADDITIONAL EXPERIMENTAL RESULTS and DETAILS}
\section{Additional Experimental Results and Details}
\label{sec:experiments}

The demo code of the proposed CS-adv on the Office-Home dataset is provided at \url{https://anonymous.4open.science/r/CS-adv-58E5}.

\subsection{Details on Conditional Divergence Test}
\label{subsec:details-test}


The conditional KL divergence, by the chain rule, can be decomposed as:
\begin{equation}\label{eq:conditional_KL}
\begin{split}
    D_{\text{KL}}(p^s(y|\mathbf{x});p^t(y|\mathbf{x})) & = D_{\text{KL}}(p^s(\mathbf{x},y);p^t(\mathbf{x},y)) \\
    & - D_{\text{KL}}(p^s(\mathbf{x});p^t(\mathbf{x})).
\end{split}
\end{equation}
We estimate both terms in Eq.~(\ref{eq:conditional_KL}) with the $k$-NN estimator~\citep{wang2009divergence} ($k=3$), due to its popularity, simplicity and effectiveness. However, we would like to emphasize here that the $k$-NN estimator itself is not differentiable, which hinders its practical usage in deep UDA.


%\subsection{Conditional MMD vs Conditional CS}
%\label{sec:condtional-mmd-vs-cs}
The empirical estimation of $D_{\text{CCS}}(p^s(\hat{y}|\mathbf{x});p^t(\hat{y}|\mathbf{x}))$ is given by:
\begin{equation}%\label{eq:conditional_CS_est}
\begin{split}
& \widehat{D}_{\text{CS}}(p^s(\hat{y}|\mathbf{x});p^t(\hat{y}|\mathbf{x})) \\
& \approx \log\left( \sum_{j=1}^M \left( \frac{ \sum_{i=1}^M K_{ji}^s L_{ji}^s }{ (\sum_{i=1}^M K_{ji}^s)^2 } \right) \right)
 + \log\left( \sum_{j=1}^N \left( \frac{ \sum_{i=1}^N K_{ji}^t L_{ji}^t }{ (\sum_{i=1}^N K_{ji}^t)^2 } \right) \right) \\
& - \log\left( \sum_{j=1}^M \left( \frac{ \sum_{i=1}^N K_{ji}^{st} L_{ji}^{st} }{ (\sum_{i=1}^M K_{ji}^s) (\sum_{i=1}^N K_{ji}^{st}) } \right) \right)  - \log\left( \sum_{j=1}^N \left( \frac{ \sum_{i=1}^M K_{ji}^{ts} L_{ji}^{ts} }{ (\sum_{i=1}^M K_{ji}^{ts}) (\sum_{i=1}^N K_{ji}^t) } \right) \right).
\end{split}
\end{equation}

As an alternative, the conditional MMD can be estimated as~\citep{ren2016conditional}:
\begin{equation}
\begin{split}
    & \widehat{D}_{\text{MMD}}(p^s(\hat{y}|\mathbf{x});p^t(\hat{y}|\mathbf{x})) \\
    & = \tr \left( L^s (\tilde{K}^s)^{-1} K^s (\tilde{K}^s)^{-1} \right) +  \tr \left( L^t (\tilde{K}^t)^{-1} K^t (\tilde{K}^t)^{-1} \right) - 2\cdot \tr \left( L^{st} (\tilde{K}^t)^{-1} K^{ts} (\tilde{K}^s)^{-1} \right),
\end{split}
\end{equation}
in which $\tilde{K} = K +\lambda I$.

Fig.~\ref{fig:conditional_distance} shows the three synthetic datasets: sets (a) and (b) differ markedly in the conditional density $p(y|\mathbf{x})$, whereas the difference between sets (a) and (c) is relatively weak. Algorithm~\ref{PermutationAlg} summarizes the procedure for testing the equivalence between two conditional densities.

\begin{algorithm}
\caption{Test for the equivalence between two conditional densities}
\label{PermutationAlg}
 \begin{algorithmic}[1]
 \renewcommand{\algorithmicrequire}{\textbf{Input:}}
 \renewcommand{\algorithmicensure}{\textbf{Output:}}
 \REQUIRE Two groups of observations $\psi_s = \{(\mathbf{x}_i^s,\mathbf{y}_i^s)\}_{i=1}^{M}$ and $\psi_t = \{(\mathbf{x}_i^t,\mathbf{y}_i^t)\}_{i=1}^{N}$;
Permutation number $P$;
Significance level $\alpha$.
 \ENSURE  Test \emph{decision} (Is $\mathcal{H}_0: p_s(\mathbf{y}|\mathbf{x})=p_t(\mathbf{y}|\mathbf{x})$ $True$ or $False$?).
  \STATE Compute conditional divergence value $d_0$ on $\psi_s$ and $\psi_t$ with one of the conditional divergence measures (e.g., conditional KL, or class conditional MMD, or conditional MMD, or conditional CS divergence).
 \\ % \textit{LOOP Process}
  \FOR {$m = 1$ to $P$}
  \STATE $(\psi^m_s, \psi^m_t)\leftarrow$ random split of $\psi_s\cup \psi_t$.
  \STATE Compute conditional divergence value $d_{m}$ on $\psi^m_s$ and $\psi^m_t$ with the selected conditional divergence measure.
  \ENDFOR
  \IF {$\frac{1+\sum\nolimits_{m=1}^P\mathbf{1}[d_{0}\leq d_m]}{1+P}\leq\alpha$}
  \STATE \emph{decision}$\leftarrow$$False$
  \ELSE
  \STATE \emph{decision}$\leftarrow$$True$
  \ENDIF  
 \RETURN \emph{decision} 
 \end{algorithmic} 
\end{algorithm}


\begin{figure} [htbp]
%\hfill
\centering 
     \begin{subfigure}[b]{0.3\textwidth}
         \centering
         \includegraphics[width=\textwidth]{Figures/sync_cond1.pdf}
         \caption{Synthetic distribution (a)}
     \end{subfigure}
     \begin{subfigure}[b]{0.3\textwidth}
         \centering
         \includegraphics[width=\textwidth]{Figures/sync_cond2.pdf}
         \caption{Synthetic distribution (b)}
     \end{subfigure}
     %\hfill
     \begin{subfigure}[b]{0.3\textwidth}
         \centering
         \includegraphics[width=\textwidth]{Figures/sync_cond3.pdf}
        \caption{Synthetic distribution (c)}
     \end{subfigure}
        \caption{Visualization of the synthetic datasets used to test the power to discriminate two conditional distributions. In each plot, the $x$-axis is the first dimension of $\mathbf{x}$, denoted as $x_1$; the $y$-axis is the second dimension of $\mathbf{x}$, denoted as $x_2$. Different labels are marked with \textcolor{red}{red} and \textcolor{green}{green}, respectively.}
        \label{fig:conditional_distance}
\end{figure}

\begin{figure} [htbp]
%\hfill
\centering 
     \begin{subfigure}[b]{0.23\textwidth}
         \centering
         \includegraphics[width=\textwidth]{Figures/TSNE_25_no_adaptation.png}
         \caption{No adaptation}
         \label{fig:tsne-no-ad}
     \end{subfigure}
     \begin{subfigure}[b]{0.23\textwidth}
         \centering
         \includegraphics[width=\textwidth]{Figures/TSNE_25_cs.png}
         \caption{CS divergence}
         \label{fig:tsne-cs}
     \end{subfigure}
     %\hfill
     \begin{subfigure}[b]{0.23\textwidth}
         \centering
         \includegraphics[width=\textwidth]{Figures/TSNE_25_ccs.png}
        \caption{CCS divergence}
         \label{fig:tsne-ccs}
     \end{subfigure}
     \begin{subfigure}[b]{0.23\textwidth}
         \centering
         \includegraphics[width=\textwidth]{Figures/TSNE_25_cs_ccs.png}
        \caption{CCS+CS}
         \label{fig:tsne-cs-ccs}
     \end{subfigure}
        \caption{t-SNE visualization of features trained without adaptation (\ref{fig:tsne-no-ad}), with CS divergence (\ref{fig:tsne-cs}), with CCS divergence (\ref{fig:tsne-ccs}), and with both CCS and CS divergences (\ref{fig:tsne-cs-ccs}).}
        \label{fig:tsne-overall}
\end{figure}


\subsection{Details on Distance Metric Minimization}
\label{subsec:details-distance}

\begin{figure}[htbp]
\centering
%\includegraphics[scale=.5]{Figures/da_dis.pdf}
\includegraphics[scale=.5]{Figures/distance-figure.pdf}
\caption{The diagram of the distance metric minimization framework. $D_{\text{CS}}$ is used to align the latent representation $p(\mathbf{z})$, while $D_{\text{CCS}}$ matches the conditional distribution $p(\hat{y}|\mathbf{z})$.}
\label{Fig.distance-framework}
\end{figure}

We illustrate the training scheme of our CS divergence-based distance metric minimization method in Fig.~\ref{Fig.distance-framework}. For matching the latent representation $\mathbf{z}$ extracted by the feature extractor $f$, we use $D_{\text{CS}}$. For the conditional distribution $p(\hat{y}|\mathbf{z})$ alignment (classifier adaptation), we adopt $D_{\text{CCS}}$. Additionally, we use cross entropy loss $L_{\text{CE}}$ on the source domain. In the end, we train three losses jointly:
\begin{equation}
L = L_{\text{CE}} + \lambda D_{\text{CS}} + \beta D_{\text{CCS}},     
\end{equation}
where $\lambda$ and $\beta$ are the weighting hyperparameters.

\subsection{Comparison on Digits}
\label{subsec:compare-fdal-digits}
\input{Tables/abl-fdl-ccs}
In Table~\ref{tab:abl-fdal-ccs}, we present the comparison between the proposed CS-adv method and other methods on the Digits dataset. It shows that the proposed method outperforms other methods, including f-DAL. 

\subsection{Additional Ablation Study}
\label{subsec:add-abl}

\paragraph{t-SNE visualization} In order to better understand the adaptation ability of CS and CCS divergence, we use t-SNE~\citep{van2008visualizing} to visualize the features trained without adaptation (Fig.~\ref{fig:tsne-no-ad}), with CS divergence (Fig.~\ref{fig:tsne-cs}), with CCS divergence (Fig.~\ref{fig:tsne-ccs}), and with both CCS and CS divergences (Fig.~\ref{fig:tsne-cs-ccs}). The model is trained as introduced in Section~\ref{sec:adv-results} in the main text. Fig.~\ref{fig:tsne-overall} shows the alignment quality on the \textbf{M}$\rightarrow$\textbf{U} task. As shown in Fig.~\ref{fig:tsne-overall}, CS divergence has a worse performance on inter-class separability, while CCS divergence can alleviate this issue. This can also be observed in Fig.~\ref{fig:tsne-cs-ccs}, where CCS divergence is added on top of CS divergence and leads to better separability compared with Fig.~\ref{fig:tsne-cs}. Hence, modeling the conditional distribution alignment is necessary and the proposed CCS divergence has an advantage.


\paragraph{CCS with kSHOT} We investigate integrating the CCS divergence into kSHOT~\citep{sun2022prior}, a representative SOTA UDA approach. As kSHOT is based on SHOT~\citep{liang2020we}, which freezes the classifier for the target domain, we only fine-tune the classifier part using CCS divergence to further enhance it by transferring the conditional distribution. The results in Table~\ref{tab:office-home-kshot} show improvements on the Office-Home dataset.

\begin{table}[htbp] % "r" for right alignment and width of the table
\centering
\resizebox{0.8\textwidth}{!}{
\begin{tabular}{lccccccccccccc}
\toprule
Method & Ar$\rightarrow$Cl & Ar$\rightarrow$Pr & Ar$\rightarrow$Rw & Cl$\rightarrow$Ar & Cl$\rightarrow$Pr & Cl$\rightarrow$Rw & Pr$\rightarrow$Ar & Pr$\rightarrow$Cl & Pr$\rightarrow$Rw & Rw$\rightarrow$Ar & Rw$\rightarrow$Cl & Rw$\rightarrow$Pr & Avg \\
\midrule
kSHOT~\citep{sun2022prior} &58.2&	80.0	&82.9&	71.1	&80.3	&80.7&	71.3&	56.8	&83.2	&75.5	&60.3	&86.6	&73.9 \\
CCS+kSHOT &\textbf{58.9}&	\textbf{81.6}&	\textbf{83.4}&	\textbf{71.3}&	\textbf{81.2}	& \textbf{80.8} &	\textbf{71.6} & 56.5	& 82.9	&\textbf{75.8}& \textbf{61.2} 	&	\textbf{86.7}& \textbf{74.3}\\
\bottomrule
\end{tabular}%
}
\caption{Comparison with kSHOT~\citep{sun2022prior} on \textbf{Office-Home}.}
\label{tab:office-home-kshot}
\end{table}


\begin{figure}[htbp]
    \centering
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=1\linewidth]{Figures2/source.png}
        \caption{Without adaptation.}
        \label{fig:sub1}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=1\linewidth]{Figures2/CS.png}
        \caption{Adapt only $p(z)$.}
        \label{fig:sub2}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=1\linewidth]{Figures2/CS+CCS.png}
        \caption{Adapt both $p(z)$ and $p(\hat{y}|z)$.}
        \label{fig:sub3}
    \end{subfigure}
    %\caption{No adversarial training. Kernel Density Estimation visualization of $p(z,y)$ and $p(z,\hat{y})$ in the target domain (after dimension reduction).}
    \caption{No adversarial training. Kernel Density Estimation (KDE) visualization of $p(\mathbf{z},y)$ and $p(\mathbf{z},\hat{y})$ in the target domain (after dimension reduction). $y$ and $\hat{y}$ are ground truth and predicted pseudo labels, respectively. $p(\mathbf{z},\hat{y})$ is close to $p(\mathbf{z},y)$ only when $p(\hat{y}|\mathbf{z})$ effectively approximates $p(y|\mathbf{z})$. Aligning $p(\mathbf{z})$ only~(\ref{fig:sub2}) cannot ensure the approximation of $p(\mathbf{z},y)$, while adding conditional alignment with pseudo labels closely approximates $p(\mathbf{z},y)$.}
    \label{fig:joint_dist}
\end{figure}


\paragraph{Kernel Density Estimation visualization} In order to show that it is reasonable to use the predicted pseudo labels $\hat{y}$ (similar to previous papers) and the necessity of aligning both marginal and conditional distributions, we draw the Kernel Density Estimation (KDE) visualization of $p(\mathbf{z}, y)$ and $p(\mathbf{z},\hat{y})$ in Fig.~\ref{fig:joint_dist}. We train our model on the Digits M$\rightarrow$U task and visualize the KDE of $p(\mathbf{z},y)$ and $p(\mathbf{z},\hat{y})$ in the target domain (dimension reduction is performed). As the same $\mathbf{z}$ is used for both $p(\mathbf{z},y)$ and $p(\mathbf{z},\hat{y})$, $p(\mathbf{z},\hat{y})$ is close to $p(\mathbf{z},y)$ only when $p(\hat{y}|\mathbf{z})$ effectively approximates $p(y|\mathbf{z})$. In each subfigure, the left shows the joint distribution $p(\mathbf{z},\hat{y})$ from the model prediction, while the right illustrates the joint distribution with the ground truth label $p(\mathbf{z},y)$. Fig.~\ref{fig:joint_dist} shows that aligning $p(\mathbf{z})$ only~(Fig.~\ref{fig:sub2}) cannot ensure the approximation of $p(\mathbf{z},y)$, while adding conditional alignment with pseudo labels (Fig.~\ref{fig:sub3}) shows a close approximation of $p(\mathbf{z},y)$.


\paragraph{Comparison on Office-Caltech-10} In this section, we present an additional ablation study analogous to Section \ref{sec:compare-mmd} in the main manuscript. We employ another toy dataset, Office-Caltech-10~\citep{fernando2014subspace}, to conduct a comprehensive comparison with both MMD and Joint Distribution MMD (JPMMD). The Office-Caltech-10 dataset comprises $10$ classes with an image size of $3\times 28\times 28$. It includes four domains: Amazon, Webcam, Caltech, and DSLR. We selected the Webcam-to-DSLR task for demonstration. The network architecture used mirrors that in Section 4.1.2, where LeNet and two fully connected layers serve as the feature extractor and nonlinear classifier, respectively. Results are depicted in Fig.~\ref{fig:abl-office-caltech}. Both CS and CCS methods surpass MMD and Joint Distribution MMD, with CS+CCS delivering the best performance.


\begin{figure} [htbp]
%\hfill
\centering 
         \includegraphics[width=0.45\textwidth]{Figures/compare_office-caltech.png}
         \caption{The ablation study of the \textbf{CS} and \textbf{CCS} components in Webcam to DSLR task, comparing with MMD and joint distribution MMD (JPMMD).}
        \label{fig:abl-office-caltech}
\end{figure}


% \paragraph{More comparisons including MMD+CCS, MMD+CMMD in the rebuttal. We also report more evaluations on challenging tasks (Webcam to DSLR and Amazon to DSLR) from the Office-Caltech-10 dataset.}
% \begin{table}[ht]
% \centering
% \begin{tabular}{lccc}
% \hline
% \textbf{Method} & \textbf{M$\rightarrow$U} & \textbf{Webcam to DSLR} & \textbf{Amazon to DSLR} \\
% \hline
% No Adaptation & $57.9 \pm 1.46$ & $70.4 \pm 2.94$ & $31.9 \pm 1.85$ \\
% MMD           & $85.3 \pm 1.06$ & $77.5 \pm 0.71$ & $36.2 \pm 3.91$ \\
% JPMMD          & $87.2 \pm 0.36$ & $76.8 \pm 1.26$ & $36.8 \pm 1.12$ \\
% CS            & $87.4 \pm 0.96$ & $80.0 \pm 2.42$ & $36.6 \pm 1.41$ \\
% CCS           & $89.1 \pm 0.89$ & $77.4 \pm 1.98$ & $42.1 \pm 2.87$ \\
% MMD+CCS       & $89.8 \pm 0.62$ & $77.6 \pm 1.34 $ & $40.8 \pm 4.41$ \\
% MMD+CMMD      & $13.2$          & $14.1$          & $9.6$ \\
% CS+CCS        & $90.6 \pm 0.83$ & $80.6 \pm 1.43$ & $43.2 \pm 3.46$ \\
% \hline
% \end{tabular}
% \caption{Your caption here}
% \label{table:your_label}
% \end{table}



\subsection{The Effect of Hyperparameters}
\label{subsec:hyperparameters}
We conduct ablation studies on batch size and kernel size in Fig.~\ref{fig:abl-study} on the MNIST to USPS task. First, we fix the kernel size as $1$ and increase the batch size. With a larger batch size, the method has a better performance. Subsequently, with the batch size set at $128$, we explore various kernel sizes within a specific range. It shows that our method has a stable performance with respect to kernel size in a certain range.

\begin{figure*}[htbp]
\centering 
     \begin{subfigure}[b]{0.4\textwidth}
         \centering
         \includegraphics[width=\textwidth]{Figures/compare_mnist_batch_size2.png}
         \caption{The ablation study of Batch Size.}
         \label{fig:abl_batch}
     \end{subfigure}
     \hfill
     \begin{subfigure}[b]{0.4\textwidth}
         \centering
         \includegraphics[width=\textwidth]{Figures/compare_mnist_sigma.png}
         \caption{The ablation study of Gaussian kernel size.}
         \label{fig:abl_sigma}
     \end{subfigure}
        \caption{The ablation study of batch size and kernel size on MNIST to USPS task.}
        \label{fig:abl-study}
\end{figure*}


Additionally, in Table~\ref{tab:senstivity_lambda_beta}, we provide additional sensitivity analysis for $\lambda$ and $\beta$ for CS and CCS in the MNIST to USPS task. It shows that CS and CCS have stable performance for different regularization strengths. To have the same regularization strength with MMD, we keep $\lambda$ and $\beta$ as 1. 
 
\begin{table}[ht]
\centering
\begin{tabular}{lcccccc}
\hline
$\lambda$ or $\beta$ & 0.1 & 0.5 & 1 & 2 & 5 & 10 \\
\hline
CS & 84.8 & 86.0 & 87.4 & 87.8 & 88.7 & 88.7 \\
CCS & 90.1 & 90.3 & 90.1 & 90.2 & 90.1 & 89.8 \\
\hline
\end{tabular}
\caption{Sensitivity analysis for $\lambda$ and $\beta$.}
\label{tab:senstivity_lambda_beta}
\end{table}

%\vfill

%\end{document}
