\section{Proofs}\label{app:proof}
\subsection{Additional Lemmas}

\begin{lemma}\label{lemma:1}
Given two domains $D_{t}$ and $D_{t'}$, then for any classifier $h \in \mathcal{H}$, the expected  error of $h$ in domain $D_t$ can be upper bounded:
\begin{align*}
\epsilon_{{D}_t} \left( h \right) \leq  \epsilon_{D_{t'}}\left( h \right) + \sqrt{2}C \times  \mathcal{D}_{JS}\left(P^{X,Y}_{D_t} \parallel P^{X,Y}_{D_{t'}} \right)^{1/2}
\end{align*}
where $\mathcal{D}_{JS}\left( \cdot \parallel \cdot \right)$ is JS-divergence between two distributions.
\end{lemma}

\paragraph{Proof of Lemma~\ref{lemma:1}}
Let $\mathcal{D}_{KL} \left( \cdot \parallel \cdot \right)$ be the KL-divergence, and let $U = (X,Y)$ and $L(U) = L\left(h(X), Y\right)$. We first prove the identity ($\ast$): $\int_{\mathcal{E}} \left | P^{U=u}_{D_{t}} - P^{U=u}_{D_{t'}}  \right | du = \frac{1}{2} \int \left | P^{U=u}_{D_{t}} - P^{U=u}_{D_{t'}}  \right | du$, where $\mathcal{E}$ is the event that $P^{U=u}_{D_{t}} \geq P^{U=u}_{D_{t'}}$. Indeed:

\begin{align*}
    \int_{\mathcal{E}} \left | P^{U=u}_{D_{t}} - P^{U=u}_{D_{t'}}  \right | du &= \int_{\mathcal{E}} \left ( P^{U=u}_{D_{t}} - P^{U=u}_{D_{t'}}  \right ) du \\
    &= \int_{\mathcal{E} \cup \overline{\mathcal{E}}} \left ( P^{U=u}_{D_{t}} - P^{U=u}_{D_{t'}}  \right ) du - \int_{ \overline{\mathcal{E}}} \left ( P^{U=u}_{D_{t}} - P^{U=u}_{D_{t'}}  \right ) du \\
    &\overset{(1)}{=}  \int_{ \overline{\mathcal{E}}} \left ( P^{U=u}_{D_{t'}} - P^{U=u}_{D_{t}}  \right ) du \\
    &=  \int_{ \overline{\mathcal{E}}} \left | P^{U=u}_{D_{t}} - P^{U=u}_{D_{t'}}  \right | du \\
    &= \frac{1}{2} \int \left | P^{U=u}_{D_{t}} - P^{U=u}_{D_{t'}}  \right | du
\end{align*}

where $\overline{\mathcal{E}}$ is the complement of $\mathcal{E}$. We have $\overset{(1)}{=}$ because $\int_{\mathcal{E} \cup \overline{\mathcal{E}}} \left ( P^{U=u}_{D_{t}} - P^{U=u}_{D_{t'}}  \right ) du = \int_{\mathcal{U}} \left ( P^{U=u}_{D_{t}} - P^{U=u}_{D_{t'}}  \right ) du = 0$. Then, we have:

\begin{align*}
\epsilon_{D_{t}}\left( h \right) &= \mathbb{E}_{D_{t}}\left[ L(U) \right] \\
&= \int_{\mathcal{U}} L(u) P^{U=u}_{D_{t}} du \\
&= \int_{\mathcal{U}} L(u) P^{U=u}_{D_{t'}} du + \int_{\mathcal{U}} L(u) \left( P^{U=u}_{D_{t}} - P^{U=u}_{D_{t'}} \right) du \\
&= \mathbb{E}_{D_{t'}}\left[ L(U) \right] + \int_{\mathcal{U}} L(u) \left( P^{U=u}_{D_{t}} - P^{U=u}_{D_{t'}} \right) du \\
&= \epsilon_{D_{t'}}\left( h \right) + \int_{\mathcal{E}} L(u) \left( P^{U=u}_{D_{t}} - P^{U=u}_{D_{t'}} \right) du + \int_{\overline{\mathcal{E}}} L(u) \left( P^{U=u}_{D_{t}} - P^{U=u}_{D_{t'}} \right) du \\
&\overset{(2)}{\leq} \epsilon_{D_{t'}}\left( h \right) + \int_{\mathcal{E}} L(u) \left( P^{U=u}_{D_{t}} - P^{U=u}_{D_{t'}} \right) du \\
&\overset{(3)}{\leq} \epsilon_{D_{t'}}\left( h \right) + C \int_{\mathcal{E}} \left( P^{U=u}_{D_{t}} - P^{U=u}_{D_{t'}} \right) du \\
&= \epsilon_{D_{t'}}\left( h \right) + C \int_{\mathcal{E}} \left| P^{U=u}_{D_{t}} - P^{U=u}_{D_{t'}} \right| du \\
&\overset{(4)}{=} \epsilon_{D_{t'}}\left( h \right) + \frac{C}{2} \int \left| P^{U=u}_{D_{t}} - P^{U=u}_{D_{t'}} \right| du \\
&\overset{(5)}{\leq} \epsilon_{D_{t'}}\left( h \right) + \frac{C}{2} \sqrt{2  \min \left(\mathcal{D}_{KL}\left(P^U_{D_{t'}} \parallel P^U_{D_{t}} \right) , \mathcal{D}_{KL}\left(P^U_{D_{t}} \parallel P^U_{D_{t'}} \right) \right)} \\
&\leq \epsilon_{D_{t'}}\left( h \right) + \frac{C}{\sqrt{2}} \sqrt{ \mathcal{D}_{KL}\left(P^U_{D_{t'}} \parallel P^U_{D_{t}} \right) } \;\;\;\; (\ast \ast)
\end{align*}
We have $\overset{(2)}{\leq}$ because $\int_{\overline{\mathcal{E}}} L(u) \left( P^{U=u}_{D_{t}} - P^{U=u}_{D_{t'}} \right) du \leq 0$; $\overset{(3)}{\leq}$ because $L(u)$ is a non-negative function bounded by $C$; $\overset{(4)}{=}$ by using ($\ast$); $\overset{(5)}{\leq}$ by using Pinsker's inequality between the total variation norm and the KL-divergence.

Let $P^U_{{D}_{t,t'}} = \frac{1}{2} \left ( P^U_{D_{t}} + P^U_{D_{t'}} \right )$. Applying $(\ast \ast)$ to the two domains $D_{t}$ and ${D}_{t,t'}$, we have:
\begin{align}\label{eq:s1}
    \epsilon_{D_{t}}\left( h \right) \leq \epsilon_{{D}_{t,t'}}\left( h \right) + \frac{C}{\sqrt{2}}\sqrt{\mathcal{D}_{KL}\left( P^U_{D_{t}} \parallel P^U_{{D}_{t,t'}} \right)}
\end{align}
Applying $(\ast \ast)$ again to the two domains ${D}_{t,t'}$ and $D_{t'}$, we have:
\begin{align}\label{eq:s2}
    \epsilon_{{D}_{t,t'}}\left( h \right) \leq \epsilon_{D_{t'}}\left( h \right) + \frac{C}{\sqrt{2}}\sqrt{\mathcal{D}_{KL}\left( P^U_{D_{t'}} \parallel P^U_{{D}_{t,t'}} \right)}
\end{align}
Adding Eq. (\ref{eq:s1}) to Eq. (\ref{eq:s2}) and subtracting $\epsilon_{{D}_{t,t'}}$, we have:
\begin{align*}
    \epsilon_{D_{t}}\left( h \right) &\leq \epsilon_{D_{t'}}\left( h \right) + \frac{C}{\sqrt{2}} \left ( \sqrt{\mathcal{D}_{KL}\left( P^U_{D_{t}} \parallel P^U_{{D}_{t,t'}} \right)} + \sqrt{\mathcal{D}_{KL}\left( P^U_{D_{t'}} \parallel P^U_{{D}_{t,t'}} \right)} \right ) \\
    &\overset{(6)}{\leq} \epsilon_{D_{t'}}\left( h \right) + \frac{C}{\sqrt{2}}  \sqrt{2 \left(\mathcal{D}_{KL}\left( P^U_{D_{t}} \parallel P^U_{{D}_{t,t'}} \right) + \mathcal{D}_{KL}\left( P^U_{D_{t'}} \parallel P^U_{{D}_{t,t'}} \right) \right)} \\
    &= \epsilon_{D_{t'}}\left( h \right) + \frac{C}{\sqrt{2}}  \sqrt{4 \mathcal{D}_{JS}\left( P^U_{D_{t'}} \parallel P^U_{D_{t}} \right) } \\
    &= \epsilon_{D_{t'}}\left( h \right) + \sqrt{2}C \sqrt{\mathcal{D}_{JS}\left( P^U_{D_{t'}} \parallel P^U_{D_{t}} \right) }
\end{align*}
We have $\overset{(6)}{\leq}$ by using Cauchy–Schwarz inequality.

\begin{lemma}\label{lemma:2}
Given a domain $D$, for any $\delta > 0$, with probability at least $1 - \delta$ over samples $S$ of size $n$ drawn i.i.d.\ from domain $D$, for all $h \in \mathcal{H}$, the expected error of $h$ in domain $D$ can be upper bounded:
\begin{align*}
    \epsilon_{D}\left( h \right) \leq  \epsilon_{S}\left( h \right) + \frac{2B}{\sqrt{n}} + C\sqrt{\frac{\log(1 / \delta)}{2n}}
\end{align*}
\end{lemma}

\paragraph{Proof of Lemma~\ref{lemma:2}}
We start from the Rademacher bound~\cite{koltchinskii2000rademacher} which is stated as follows.
\begin{lemma}\label{lemma:3}
Rademacher Bounds.  Let $\mathcal{F}$ be a family of functions mapping from $Z$ to $[0,1]$. Then, for any $0 < \delta < 1$, with probability at least $1 - \delta$ over sample $S = \{z_1,\cdots,z_n\}$, the following holds for all $f \in \mathcal{F}$:
\begin{align*}
    \mathbb{E}\left[ f(Z) \right] \leq \frac{1}{n} \sum_{i=1}^{n} f(z_i) + 2 \mathcal{R}_n(\mathcal{F}) + \sqrt{\frac{\log (1 / \delta)}{2n}}
\end{align*}
where $\mathcal{R}_n \left(\mathcal{F}\right)$ is a Rademacher complexity of function class $\mathcal{F}$.
\end{lemma}
We then apply Lemma~\ref{lemma:3} to our setting with $Z = (X,Y)$, the loss function $L$ bounded by $C$, and the function class $\mathcal{L}_{\mathcal{H}} = \left\{ (x,y) \rightarrow L\left(h\left(x\right),y\right): h \in \mathcal{H} \right\}$. In particular, we scale the loss function $L$ to $[0,1]$ by dividing by $C$ and denote the new class of scaled loss functions as $\mathcal{L}_{\mathcal{H}} / C$. Then, for any $\delta > 0$, with probability at least $1 - \delta$, we have:
\begin{align}
   \frac{\epsilon_{D}\left( h \right)}{C} &\leq \frac{\epsilon_{S}\left( h \right)}{C} + 2 \mathcal{R}_n\left(\mathcal{L}_{\mathcal{H}} / C\right) + \sqrt{\frac{\log (1 / \delta)}{2n}} \nonumber \\
   &\overset{(1)}{=} \frac{\epsilon_{S}\left( h \right)}{C} + \frac{2}{C}\mathcal{R}_n\left(\mathcal{L}_{\mathcal{H}}\right) + \sqrt{\frac{\log (1 / \delta)}{2n}} \nonumber \\
   &\overset{(2)}{\leq} \frac{\epsilon_{S}\left( h \right)}{C} + \frac{2B}{C\sqrt{n}} + \sqrt{\frac{\log (1 / \delta)}{2n}} \label{eq:s3}
\end{align}
We have $\overset{(1)}{=}$ by using the property of Rademacher complexity that $\mathcal{R}_n(\alpha\mathcal{F}) = \alpha \mathcal{R}_n(\mathcal{F})$ for $\alpha \geq 0$, and $\overset{(2)}{\leq}$ because of the bounded Rademacher complexity assumption. We derive Lemma~\ref{lemma:2} by multiplying Eq. (\ref{eq:s3}) by $C$.

\begin{lemma}\label{lemma:4}
Given domain sequence $\left\{D_t\right\}$, dataset sequence  $\left\{S_t\right\}$ sampled from $\left\{D_t\right\}$, $M$-optimal model sequence $H^{M} = \left\{ h_{1}^{M}, \cdots, h_{T+K}^{M} \right\}$, $M$-empirical optimal model sequence $\widehat{H}^{M} = \left\{ \widehat{h}_{1}^{M}, \cdots, \widehat{h}_{T+K}^{M} \right\}$, then for any $t$ and any $\delta > 0$, with probability at least $1 - \delta$ over samples $S_t$ of size $n$ drawn i.i.d.\ from domain $D_t$,  we have:
\begin{align*}
    \epsilon_{D_t}\left( \widehat{h}^M_t \right) \leq  \epsilon_{D_t}\left( h^M_t \right) + 2\sqrt{2}C\mathcal{D}_{JS}\left( D_t \parallel D^M_t \right)^{1/2} + \frac{4B}{\sqrt{n}} + \sqrt{\frac{2 \log (1 / \delta)}{n}}
\end{align*}
\end{lemma}

\paragraph{Proof of Lemma~\ref{lemma:4}}
We have:
\begin{align*}
    \epsilon_{D_t}\left( \widehat{h}^M_t \right) &\overset{(1)}{\leq} \epsilon_{D^M_t}\left( \widehat{h}^M_t \right) + \sqrt{2}C\mathcal{D}_{JS}\left( D_t \parallel D^M_t \right)^{1/2} \\
    &\overset{(2)}{\leq} \epsilon_{S^M_t}\left( \widehat{h}^M_t \right) + \sqrt{2}C\mathcal{D}_{JS}\left( D_t \parallel D^M_t \right)^{1/2} + \frac{2B}{\sqrt{n}} + \sqrt{\frac{ \log (1 / {\delta}')}{2n}} \;\;\;\; (\text{w.p} \geq 1 - {\delta}') \\
    &\overset{(3)}{\leq} \epsilon_{S^M_t}\left( h^M_t \right) + \sqrt{2}C\mathcal{D}_{JS}\left( D_t \parallel D^M_t \right)^{1/2} + \frac{2B}{\sqrt{n}} + \sqrt{\frac{ \log (1 / {\delta}')}{2n}} \\
    &\overset{(4)}{\leq} \epsilon_{D^M_t}\left( h^M_t \right) + \sqrt{2}C\mathcal{D}_{JS}\left( D_t \parallel D^M_t \right)^{1/2} + \frac{4B}{\sqrt{n}} + \sqrt{\frac{2 \log (1 / {\delta}')}{n}} \;\;\;\; (\text{w.p} \geq 1 - {\delta}') \\
    &\overset{(5)}{\leq} \epsilon_{D_t}\left( h^M_t \right) + 2\sqrt{2}C\mathcal{D}_{JS}\left( D_t \parallel D^M_t \right)^{1/2} + \frac{4B}{\sqrt{n}} + \sqrt{\frac{2 \log (1 / {\delta}')}{n}}
\end{align*}
We have $\overset{(1)}{\leq}$ by using Lemma~\ref{lemma:1} for $\epsilon_{D_t}\left( \widehat{h}^M_t \right)$, $\overset{(2)}{\leq}$ by using Lemma~\ref{lemma:2} for $\epsilon_{D^M_t}\left( \widehat{h}^M_t \right)$, $\overset{(3)}{\leq}$ because $\widehat{h}^M_t = \arg\min_{h \in \mathcal{H}} \epsilon_{S^M_t}\left( h \right)$, $\overset{(4)}{\leq}$ by using Lemma~\ref{lemma:2} for $\epsilon_{S^M_t}\left( h^M_t \right)$, and $\overset{(5)}{\leq}$ by using Lemma~\ref{lemma:1} for $\epsilon_{D^M_t} \left( h^M_t \right)$. Finally, using the union bound for $\overset{(2)}{\leq}$ and $\overset{(4)}{\leq}$, and denoting $\delta = 2 {\delta}'$, we have:
\begin{align*}
    \epsilon_{D_t}\left( \widehat{h}^M_t \right) \leq  \epsilon_{D_t}\left( h^M_t \right )+ 2\sqrt{2}C\mathcal{D}_{JS}\left( D_t \parallel D^M_t \right)^{1/2} + \frac{4B}{\sqrt{n}} + \sqrt{\frac{2 \log (1 / \delta)}{n}}
\end{align*}

\begin{lemma}\label{lemma:5}
Given domain sequence $\left\{D_t\right\}$, dataset sequence  $\left\{S_t\right\}$ sampled from $\left\{D_t\right\}$, $M$-optimal model sequence $H^{M} = \left\{ h_{1}^{M}, \cdots, h_{T+K}^{M} \right\}$, $M$-empirical optimal model sequence $\widehat{H}^{M} = \left\{ \widehat{h}_{1}^{M}, \cdots, \widehat{h}_{T+K}^{M} \right\}$, then for any $t$,  we have:
\begin{align*}
    \epsilon_{D_t}\left( h^M_t \right) \leq  \epsilon_{D_t}\left( \widehat{h}^M_t \right) + 2\sqrt{2}C\mathcal{D}_{JS}\left( D_t \parallel D^M_t \right)^{1/2}
\end{align*}
\end{lemma}
\paragraph{Proof of Lemma~\ref{lemma:5}}
We have:
\begin{align*}
    \epsilon_{D_t}\left( h^M_t \right) &\overset{(1)}{\leq} \epsilon_{D^M_t}\left(h^M_t \right) + \sqrt{2}C\mathcal{D}_{JS}\left( D_t \parallel D^M_t \right)^{1/2} \\
    &\overset{(2)}{\leq} \epsilon_{D^M_t}\left(\widehat{h}^M_t \right) + \sqrt{2}C\mathcal{D}_{JS}\left( D_t \parallel D^M_t \right)^{1/2} \\
    &\overset{(3)}{\leq} \epsilon_{D_t}\left(\widehat{h}^M_t \right) + 2\sqrt{2}C\mathcal{D}_{JS}\left( D_t \parallel D^M_t \right)^{1/2}
\end{align*}
We have $\overset{(1)}{\leq}$ by using Lemma~\ref{lemma:1} for $\epsilon_{D_t}\left( h^M_t \right)$, $\overset{(2)}{\leq}$ because $h^M_t = \arg\min_{h \in \mathcal{H}} \epsilon_{D^M_t}\left( h \right)$, $\overset{(3)}{\leq}$ by using Lemma~\ref{lemma:1} for $\epsilon_{D^M_t}\left( \widehat{h}^M_t \right)$.

\subsection{Proofs of Main Theorems}
\subsubsection{Proof of Theorem~\ref{thm-1}}
We have:
\begin{align*}
    E_{tgt}\left( \widehat{H}^{M} \right) &= \frac{1}{K} \sum_{t=T+1}^{T+K} \epsilon_{D_t} \left( \widehat{h}^M_t \right) \\
    &\overset{(1)}{\leq} \frac{1}{K} \sum_{t=T+1}^{T+K} \left( \epsilon_{D_t} \left( h^M_t \right) + 2\sqrt{2}C \times \mathcal{D}_{JS}\left( D_t \parallel D^M_t \right)^{1/2} + \frac{4B}{\sqrt{n}} + \sqrt{\frac{2 \log (1 / {\delta}')}{n}} \right) \;\;\;\; (\text{w.p} \geq 1 - {\delta}') \\
    &=  E_{tgt}\left( H^{M} \right) + 2\sqrt{2}C \times D_{tgt}\left( M \right) + \frac{4B}{\sqrt{n}} + \sqrt{\frac{2 \log (1 / {\delta}')}{n}} \\
    &= E_{src}\left( H^{M} \right) + \left( E_{tgt}\left( H^{M} \right) - E_{src}\left( H^{M} \right) \right) + 2\sqrt{2}C \times \left( D_{src}\left( M \right) + \left( D_{tgt}\left( M \right) - D_{src}\left( M \right) \right) \right) \\
    &+ \frac{4B}{\sqrt{n}} + \sqrt{\frac{2 \log (1 / {\delta}')}{n}} \\
    &\overset{(2)}{\leq} \frac{1}{T} \sum_{t=1}^{T} \epsilon_{D_t} \left( h^M_t \right) + \Phi(\mathcal{M}) + 2\sqrt{2}C \times D_{src}\left( M \right) + 2\sqrt{2}C \times \Phi(\mathcal{M}, \mathcal{H}) + \frac{4B}{\sqrt{n}} + \sqrt{\frac{2 \log (1 / {\delta}')}{n}} \\
    &\overset{(3)}{\leq} \frac{1}{T} \sum_{t=1}^{T} \left( \epsilon_{D_t} \left( \widehat{h}^M_t \right) + 2\sqrt{2}C  \times \mathcal{D}_{JS}\left( D_t \parallel D^M_t \right)^{1/2} \right) + \Phi(\mathcal{M}) + 2\sqrt{2}C \times D_{src}\left( M \right) + 2\sqrt{2}C \times \Phi(\mathcal{M}, \mathcal{H}) \\
    &+ \frac{4B}{\sqrt{n}} + \sqrt{\frac{2 \log (1 / {\delta}')}{n}} \\
    &\overset{(4)}{\leq} \frac{1}{T} \sum_{t=1}^{T}  \epsilon_{D^M_t} \left( \widehat{h}^M_t \right) + 5\sqrt{2}C  \times D_{src}\left( M \right) + \Phi(\mathcal{M}) + 2\sqrt{2}C \times \Phi(\mathcal{M}, \mathcal{H}) + \frac{4B}{\sqrt{n}} + \sqrt{\frac{2 \log (1 / {\delta}')}{n}} \\
    &\overset{(5)}{\leq} \frac{1}{T} \sum_{t=1}^{T} \left( \epsilon_{S^M_t} \left( \widehat{h}^M_t \right) + \frac{2B}{\sqrt{n}} + \sqrt{\frac{\log(1 / {\delta}')}{2n}} \right) + 5\sqrt{2}C  \times D_{src}\left( M \right) + \Phi(\mathcal{M}) + 2\sqrt{2}C \times \Phi(\mathcal{M}, \mathcal{H}) \\
    &+ \frac{4B}{\sqrt{n}} + \sqrt{\frac{2 \log (1 / {\delta}')}{n}} \;\;\;\; (\text{w.p} \geq 1 - {\delta}') \\
    &= \widehat{E}^M_{src}\left( \widehat{H}^{M} \right) + 5\sqrt{2}C  \times D_{src}\left( M \right) + \Phi(\mathcal{M}) + 2\sqrt{2}C \times \Phi(\mathcal{M}, \mathcal{H}) + \frac{6B}{\sqrt{n}} + 3 \sqrt{\frac{\log (1 / {\delta}')}{2n}}
\end{align*}
We have $\overset{(1)}{\leq}$ by using Lemma~\ref{lemma:4} for $\epsilon_{D_t}\left( \widehat{h}^M_t \right)$, $\overset{(2)}{\leq}$ because $ \Phi\left(\mathcal{M}, \mathcal{H}\right) = \underset{M' \in \mathcal{M}}{\sup} \left( E_{tgt} \left( H^{M'} \right) - E_{src} \left( H^{M'} \right)  \right)$ and $\Phi\left(\mathcal{M}\right) = \underset{M' \in \mathcal{M}}{\sup} \left( D_{tgt} \left( M' \right) - D_{src} \left( M' \right)  \right)$, $\overset{(3)}{\leq}$ by using Lemma~\ref{lemma:5} for $\epsilon_{D_t}\left( h^M_t \right)$, $\overset{(4)}{\leq}$ by using Lemma~\ref{lemma:1} for $\epsilon_{D_t}\left( \widehat{h}^M_t \right)$, and $\overset{(5)}{\leq}$ by using Lemma~\ref{lemma:2} for $\epsilon_{D^M_t}\left( \widehat{h}^M_t \right)$.  Finally, using the union bound over the $K$ applications of Lemma~\ref{lemma:4} in $\overset{(1)}{\leq}$ and the $T$ applications of Lemma~\ref{lemma:2} in $\overset{(5)}{\leq}$, and denoting $\delta = (T+K) {\delta}'$, we have:
\begin{align}
E_{tgt}\left( \widehat{H}^{M} \right) \leq \widehat{E}^M_{src}\left( \widehat{H}^{M} \right) + 5\sqrt{2}C  \times D_{src}\left( M \right) + \Phi(\mathcal{M}) + 2\sqrt{2}C \times \Phi(\mathcal{M}, \mathcal{H}) + \frac{6B}{\sqrt{n}} + 3 \sqrt{\frac{\log ((T+K) / \delta)}{2n}} \label{eq:s4}
\end{align}
Note that the high-probability bounds in Lemma~\ref{lemma:2} and Lemma~\ref{lemma:4} relate to the hypothesis class $\mathcal{H}$ only. Therefore, Eq. (\ref{eq:s4}) still holds for $M$ depending on the dataset sequence $\left\{S_t\right\}_{t=1}^{T+K}$.

\subsubsection{Proof of Proposition~\ref{thm-3}}
$\forall y \in \mathcal{Y}$, we have the following $(\ast)$:
\begin{align*}
    P^{Y=y}_{D^{W}_{t-1}} &= \int_{\mathcal{X}} P^{X=x, Y=y}_{D^{W}_{t-1}} dx \\
    &= \int_{\mathcal{X}} w_y \times P^{X=x, Y=y}_{D_{t-1}} dx \\
    &= \int_{\mathcal{X}} \frac{P^{Y=y}_{D_t}}{P^{Y=y}_{D_{t-1}}} \times P^{X=x, Y=y}_{D_{t-1}} dx \\
    &= P^{Y=y}_{D_t} \int_{\mathcal{X}} P^{X=x|Y=y}_{D_{t-1}} dx \\
    &= P^{Y=y}_{D_t} 
\end{align*}

We have:
\begin{align}\label{eq:s5}
\mathcal{D}_{KL}\left(P_{D_{t}}^{X,Y} \parallel P_{D^{W,M}_{t}}^{X,Y} \right) &= \mathbb{E}_{P^{X,Y}_{D_t}}\left[\log P_{D_{t}}^{X,Y} - \log P_{D^{W,M}_{t}}^{X,Y} \right] \nonumber \\
&= \mathbb{E}_{P^{X,Y}_{D_t}}\left [\log{P_{D_{t}}^{Y}} + \log{P_{D_{t}}^{X|Y}} \right] - \mathbb{E}_{P^{X,Y}_{D_t}}\left[ \log{P_{D^{W,M}_{t}}^{Y}} + \log{P_{D^{W,M}_{t}}^{X|Y}} \right] \nonumber \\
&= \mathbb{E}_{P^{X,Y}_{D_t}}\left[\log{P_{D_{t}}^{Y}} - \log{P_{D^{W,M}_{t}}^{Y}}\right] + \mathbb{E}_{P^{X,Y}_{D_t}}\left[\log{P_{D_{t}}^{X|Y}} - \log{P_{D^{W,M}_{t}}^{X|Y}}\right] \nonumber \\
&\overset{(1)}{=} \mathbb{E}_{P_{D_{t}}^{Y}}\left[ \mathbb{E}_{ P_{D_{t}}^{X|Y}}\left[\log{P_{D_{t}}^{X|Y}} - \log{P_{D^{W,M}_{t}}^{X|Y}} \right] \right] \nonumber \\
&= \mathbb{E}_{P_{D_t}^{Y}}\left[\mathcal{D}_{KL}\left(P_{D_{t}}^{X|Y} \parallel P_{D^{W,M}_{t}}^{X|Y} \right) \right]
\end{align}
We have $\overset{(1)}{=}$ because $P^Y_{D^{W, M}_t} = m_{t-1} \sharp P^Y_{D^W_{t-1}} = P^Y_{D^W_{t-1}}$ for $m_{t-1}: \mathcal{X} \rightarrow \mathcal{X}$ and $P^Y_{D^W_{t-1}} = P^Y_{D_{t}}$ by $(\ast)$. For JS-divergence $\mathcal{D}_{JS}$, let $P^{X,Y}_{{D}'_t} = \frac{1}{2} \left ( P^{X,Y}_{D_t} + P^{X,Y}_{D^{W,M}_{t}} \right )$. Then, we have:
\begin{align*}
& \mathcal{D}_{JS}\left(P_{D_{t}}^{X,Y} \parallel P_{D^{W,M}_{t}}^{X,Y} \right)  \\
&= \frac{1}{2} \mathcal{D}_{KL}\left(P_{D_{t}}^{X,Y} \parallel P_{{D}'_{t}}^{X,Y} \right)  + \frac{1}{2}  \mathcal{D}_{KL}\left(P_{D^{W,M}_{t}}^{X,Y} \parallel P_{{D}'_{t}}^{X,Y} \right)  \\
&\overset{(2)}{=}   \frac{1}{2} \left( \mathbb{E}_{P^Y_{D_{t}}}\left[\mathcal{D}_{KL}\left(P_{D_{t}}^{X|Y} \parallel P_{{D}'_{t}}^{X|Y} \right) \right]  +  \mathbb{E}_{P^{Y}_{D^{W,M}_{t}}}\left[\mathcal{D}_{KL}\left(P_{D^{W,M}_{t}}^{X|Y} \parallel P_{{D}'_{t}}^{X|Y} \right) \right] \right)  \\
&= \mathbb{E}_{P^Y_{D_t}}\left[\frac{1}{2} \left( \mathcal{D}_{KL}\left(P_{D_{t}}^{X|Y} \parallel P_{{D}'_{t}}^{X|Y} \right) + \mathcal{D}_{KL}\left(P_{D^{W,M}_{t}}^{X|Y} \parallel P_{{D}'_{t}}^{X|Y} \right) \right) \right]  \\
&= \mathbb{E}_{P^Y_{D_t}}\left[ \mathcal{D}_{JS}\left(P_{D_{t}}^{X|Y} \parallel P_{D^{W,M}_{t}}^{X|Y} \right) \right]
\end{align*}
We have $\overset{(2)}{=}$ by applying Eq. (\ref{eq:s5}) for $\mathcal{D}_{KL}\left(P_{D_{t}}^{X,Y} \parallel P_{{D}'_{t}}^{X,Y} \right)$ and $\mathcal{D}_{KL}\left(P_{D^{W,M}_{t}}^{X,Y} \parallel P_{{D}'_{t}}^{X,Y} \right)$.

\subsubsection{Proof of Proposition~\ref{thm:4}}

First, we show that for any $t \in [1,\cdots,T]$, we have:

\begin{align}
    \mathbb{E} \left[\mathcal{D}_{JS}\left(P_{\widehat{\alpha}_t}^{Z} \parallel P_{\widehat{\beta}_t}^{Z} \right) \right] \leq \mathcal{D}_{JS}\left(P_{\alpha^{\ast}_t}^{Z} \parallel P_{\beta^{\ast}_t}^{Z} \right) + \mathcal{O} \left( \left(\frac{1}{\sqrt{n}} \right) \times C(\mathcal{A}, \mathcal{B}, \Gamma) \right) \label{eq:s6}
\end{align}

Proposition~\ref{thm:4} is then obtained by applying Eq.~(\ref{eq:s6}) for all $t \in [1,\cdots,T]$, followed by averaging over $t$.

\paragraph{Proof of Eq.~(\ref{eq:s6}).}
To simplify the mathematical notation, we omit the index $t$ in the following. Our proof is based on the proof provided for GAN model by~\citet{biau2020some}. Let $L(\alpha, \beta, \gamma) = \int_{\mathcal{Z}} \left(\log \left ( D_{\gamma}(z) \right ) P_{\alpha}^{z} + \log \left ( 1 - D_{\gamma}(z) \right ) P_{\beta}^{z}\right) dz $ and $\widehat{L}(\alpha, \beta, \gamma)$ is the corresponding empirical error, we have:
\begin{align*}
    2 \mathcal{D}_{JS} \left( P_{\widehat{\alpha}}^{Z} \parallel P_{\widehat{\beta}}^{Z} \right) &= L(\widehat{\alpha}, \widehat{\beta}, \widehat{\gamma}) + \log(4) \\
    &\leq \sup_{\gamma} L(\widehat{\alpha}, \widehat{\beta}, \gamma) + \log(4) \\
    &\leq \sup_{\gamma} \left(  \widehat{L}(\widehat{\alpha}, \widehat{\beta}, \gamma) + \left| \widehat{L}(\widehat{\alpha}, \widehat{\beta}, \gamma) - L(\widehat{\alpha}, \widehat{\beta}, \gamma) \right| \right) + \log(4) \\
    &\leq \sup_{\gamma}   \widehat{L}(\widehat{\alpha}, \widehat{\beta}, \gamma) + \sup_{\gamma}  \left| \widehat{L}(\widehat{\alpha}, \widehat{\beta}, \gamma) - L(\widehat{\alpha}, \widehat{\beta}, \gamma) \right|  + \log(4) \\
    &\leq \inf_{\alpha,\beta}\sup_{\gamma}   \widehat{L}(\alpha, \beta, \gamma) + \sup_{\alpha,\beta,\gamma}  \left| \widehat{L}(\alpha, \beta, \gamma) - L(\alpha, \beta, \gamma) \right|  + \log(4) \\
    &\leq \inf_{\alpha,\beta}\sup_{\gamma}   L(\alpha, \beta, \gamma) + \left| \inf_{\alpha,\beta}\sup_{\gamma}   \widehat{L}(\alpha, \beta, \gamma) - \inf_{\alpha,\beta}\sup_{\gamma}   L(\alpha, \beta, \gamma) \right| \\
    &+ \sup_{\alpha,\beta,\gamma}  \left| \widehat{L}(\alpha, \beta, \gamma) - L(\alpha, \beta, \gamma) \right|  + \log(4) \\
    &\overset{(1)}{\leq} \inf_{\alpha,\beta}\sup_{\gamma}   L(\alpha, \beta, \gamma) + \sup_{\alpha,\beta}\left| \sup_{\gamma}   \widehat{L}(\alpha, \beta, \gamma) - \sup_{\gamma}   L(\alpha, \beta, \gamma) \right| \\
    &+ \sup_{\alpha,\beta,\gamma}  \left| \widehat{L}(\alpha, \beta, \gamma) - L(\alpha, \beta, \gamma) \right|  + \log(4) \\
    &\overset{(2)}{\leq} \inf_{\alpha,\beta}\sup_{\gamma}   L(\alpha, \beta, \gamma) + 2 \sup_{\alpha,\beta,\gamma}  \left| \widehat{L}(\alpha, \beta, \gamma) - L(\alpha, \beta, \gamma) \right|  + \log(4) \\
    &= 2 \mathcal{D}_{JS}\left(P_{\alpha^{\ast}}^{Z} \parallel P_{\beta^{\ast}}^{Z} \right) + 2 \sup_{\alpha,\beta,\gamma}  \left| \widehat{L}(\alpha, \beta, \gamma) - L(\alpha, \beta, \gamma) \right| 
\end{align*}
We have $\overset{(1)}{\leq}$ by using inequality $|\inf A - \inf B| \leq \sup |A-B|$, $\overset{(2)}{\leq}$ by using inequality $|\sup A - \sup B| \leq \sup |A-B|$.
Taking the expectation of both sides and rearranging, we have:

\begin{align*}
    &\mathbb{E}\left[\mathcal{D}_{JS} \left( P_{\widehat{\alpha}}^{Z} \parallel P_{\widehat{\beta}}^{Z} \right) \right] - \mathcal{D}_{JS}\left(P_{\alpha^{\ast}}^{Z} \parallel P_{\beta^{\ast}}^{Z} \right) \\
    &\leq \mathbb{E}\left[\sup_{\alpha,\beta,\gamma}  \left| \widehat{L}(\alpha, \beta, \gamma) - L(\alpha, \beta, \gamma) \right| \right] \\
    &= \mathbb{E}\left[\sup_{\alpha,\beta,\gamma}  \left| \frac{1}{n} \sum_{i=1}^{n} \log \left ( D_{\gamma}(z_t^i) \right ) + \frac{1}{n} \sum_{i=1}^{n} \log \left ( 1 - D_{\gamma}(z_{t+1}^i) \right ) \right. \right. \\
    &\left. \left. - \int_{\mathcal{Z}} \left(\log \left ( D_{\gamma}(z) \right ) P_{\alpha}^{z} + \log \left ( 1 - D_{\gamma}(z) \right ) P_{\beta}^{z}\right) dz \right| \right] \\
    &\leq \mathbb{E}\left[\sup_{\alpha,\beta,\gamma}  \left| \underbrace{\frac{1}{n} \sum_{i=1}^{n} \log \left ( D_{\gamma}(z_t^i) \right ) - \int_{\mathcal{Z}} \left(\log \left ( D_{\gamma}(z) \right ) P_{\alpha}^{z} \right) dz}_{A_s(\alpha, \beta, \gamma)} \right| \right] \\
    &+ \mathbb{E}\left[\sup_{\alpha,\beta,\gamma}  \left| \underbrace{\frac{1}{n} \sum_{i=1}^{n} \log \left ( 1 - D_{\gamma}(z_{t+1}^i) \right ) - \int_{\mathcal{Z}} \left(\log \left ( 1 - D_{\gamma}(z) \right ) P_{\beta}^{z} \right) dz}_{A_t(\alpha, \beta, \gamma)} \right| \right]
\end{align*}
Note that $\left(A_s\left(\alpha, \beta, \gamma \right)\right)_{\alpha \in \mathcal{A}, \beta \in \mathcal{B}, \gamma \in \Gamma}$ and $\left(A_t\left(\alpha, \beta, \gamma \right)\right)_{\alpha \in \mathcal{A}, \beta \in \mathcal{B}, \gamma \in \Gamma}$ are the subgaussian processes in the metric spaces $\left(\mathcal{A} \times \mathcal{B} \times \Gamma, C_1 \left \| \cdot \right \| / \sqrt{n}\right)$ and $\left(\mathcal{A} \times \mathcal{B} \times \Gamma, C_1 \left \| \cdot \right \| / \sqrt{n}\right)$ where $C_1$  is a constant and $\left \| \cdot \right \|$ is the Euclidean norm on $\mathcal{A} \times \mathcal{B} \times \Gamma$. Then using Dudley's entropy integral, we have:
\begin{align*}
    &\mathbb{E}\left[\mathcal{D}_{JS} \left( P_{\widehat{\alpha}}^{Z} \parallel P_{\widehat{\beta}}^{Z} \right) \right] - \mathcal{D}_{JS}\left(P_{\alpha^{\ast}}^{Z} \parallel P_{\beta^{\ast}}^{Z} \right) \\
    &\leq  \mathbb{E}\left[\sup_{\alpha,\beta,\gamma} \left| A_s\left(\alpha, \beta, \gamma \right) \right| \right] + \mathbb{E}\left[\sup_{\alpha,\beta,\gamma} \left| A_t\left(\alpha, \beta, \gamma \right) \right| \right] \\
    &\leq 12 \int_{0}^{\infty} \left(\sqrt{\log N(\mathcal{A} \times \mathcal{B} \times \Gamma,  C_1 \left \| \cdot \right \| / \sqrt{n}, \epsilon)} + \sqrt{\log N(\mathcal{A} \times \mathcal{B} \times \Gamma,  C_1 \left \| \cdot \right \| / \sqrt{n}, \epsilon)} \right) d\epsilon \\
    &=  \frac{24C_1}{\sqrt{n}} \int_{0}^{\infty}\sqrt{\log N(\mathcal{A} \times \mathcal{B} \times \Gamma, \left \| \cdot \right \|, \epsilon)} d\epsilon \\
    &\overset{(3)}{=}  \frac{24C_1}{\sqrt{n}} \int_{0}^{\diam(\mathcal{A} \times \mathcal{B} \times \Gamma)}\sqrt{\log N(\mathcal{A} \times \mathcal{B} \times \Gamma, \left \| \cdot \right \|, \epsilon)} d\epsilon \\
    &\overset{(4)}{\leq}  \frac{24C_1}{\sqrt{n}} \int_{0}^{\diam(\mathcal{A} \times \mathcal{B} \times \Gamma)}\sqrt{\log \left( \left( \frac{2C_2 \sqrt{\dim(\mathcal{A} \times \mathcal{B} \times \Gamma)}}{\epsilon} \right) ^{\dim(\mathcal{A} \times \mathcal{B} \times \Gamma)} \right)} d\epsilon \\
    &= \mathcal{O}\left( \left( \frac{1}{\sqrt{n}} \right) \times C(\mathcal{A}, \mathcal{B}, \Gamma)\right)
\end{align*}
where $\diam(\cdot)$ and $\dim(\cdot)$ are the diameter and the dimension of the metric space, and $C(\mathcal{A}, \mathcal{B}, \Gamma)$ is a function of $\diam(\mathcal{A} \times \mathcal{B} \times \Gamma)$ and $\dim(\mathcal{A} \times \mathcal{B} \times \Gamma)$. We have $\overset{(3)}{=}$ because $N(\mathcal{A} \times \mathcal{B} \times \Gamma, \left \| \cdot \right \|, \epsilon) = 1$ for $\epsilon > \diam(\mathcal{A} \times \mathcal{B} \times \Gamma)$, and $\overset{(4)}{\leq}$ by using the inequality $N(\mathcal{T}, \| \cdot \|, \epsilon) \leq \left(\frac{2C_2\sqrt{d}}{\epsilon}\right)^{d}$, where $\mathcal{T}$, lying in the Euclidean space $\mathbb{R}^d$, is a set of vectors whose length is at most $C_2$.