\section{Theoretical Results}\label{sec:theory}
In this section, we aim to understand how a model sequence $H$ learned from source domain data would perform when deployed in target domains under non-stationary distribution shifts. Specifically, we will develop theoretical upper bounds of the model sequence's errors at target domains. These theoretical findings will provide guidance for the algorithm design in Section~\ref{sec:method}. All proofs are in Appendix~\ref{app:proof}. 

To start, we adopt two assumptions commonly used in DA/DG literature~\citep{nguyen2021kl,kumar2020understanding}.
\begin{assumption}[Bounded loss] We assume loss function $L$ is upper bounded by a constant $C$, i.e., $\forall x \in \mathcal{X}, y \in \mathcal{Y}$, $h \in \mathcal{H}$, we have $L(h(x), y) \leq C$.\label{ass:1}
\end{assumption}
\begin{assumption}[Bounded model complexity] We assume Rademacher complexity~\citep{bartlett2002rademacher} of function class $\mathcal{L}_{\mathcal{H}}$ computed from all samples with size $n$ is bounded for any distribution $P$ considered in this paper. That is, for some constant $B > 0$, we have:
    \begin{equation*}
        \mathcal{R}_n \left( \mathcal{L}_{\mathcal{H}} \right) = \mathbb{E}\left[ \underset{f \in \mathcal{L}_{\mathcal{H}}}{\sup} \frac{1}{n} \sum_{i=1}^n \sigma_i f\left(x_i\right) \right] \leq \frac{B}{\sqrt{n}}
    \end{equation*}
    where the expectation is with respect to $x_i \sim P$ and $\sigma_i \sim P_{\mathcal{R}}$, and $P_{\mathcal{R}}$ is Rademacher distribution.\label{ass:2}
\end{assumption}
We note that these two assumptions are actually reasonable and not strong. For instance, although Assumption~\ref{ass:1} does not hold for cross-entropy loss used in classification, we can modify this loss to make it satisfy Assumption~\ref{ass:1}. In particular, it can be bounded by $C$ by modifying softmax output from $\left(p_1,\cdots,p_{|\mathcal{Y}|}\right)$ to $\left(\hat{p}_1,\cdots,\hat{p}_{|\mathcal{Y}|}\right)$ where $\hat{p}_i=p_i \left(1-\exp\left(-C \right)\left|\mathcal{Y}\right|\right)+\exp\left(-C\right)$. In addition, according to~\cite{liang2016statistical} (Theorem 11 page 82), Assumption~\ref{ass:2} holds when input space is compact and bounded in unit $L_2$ ball and function $f$ in $\mathcal{L}_{\mathcal{H}}$ is linear and Lipschitz continuous in $l_2$ norm.

To learn a model sequence $H$ that performs well on unseen target domains, we need to account for the non-stationary patterns across domains. However, these patterns are governed by mechanism $\mathbbm{M}$ which is unknown and must be estimated from source domains. Therefore, we need to learn a mechanism $M \in \mathcal{M}$ that can well estimate ground-truth $\mathbbm{M}$ and learn $H$ by leveraging $M$. Because the target data is inaccessible, we expect that the model performance on the target highly depends on the accuracy of $M\in \mathcal{M}$. To formally characterize the complexity of learning non-stationary pattern leveraging hypothesis classes $\mathcal{M}$, $\mathcal{H}$ and source domains, we introduce two complexity terms as follows.
\begin{definition}
[\textbf{Non-stationary complexity}]\label{def-1} Given a sequence of domains $\{D_t\}_{t=1}^{T+K}$, hypothesis classes $\mathcal{M}$ and $\mathcal{H}$, the $\mathcal{M},\mathcal{H}$-complexity term $\Phi\left(\mathcal{M}, \mathcal{H}\right)$ and $\mathcal{M}$-complexity term $\Phi\left(\mathcal{M}\right)$ are defined as
\begin{align*}
    \Phi\left(\mathcal{M}, \mathcal{H}\right) = &\underset{M' \in \mathcal{M}}{\sup} \left( E_{tgt} \left( H^{M'} \right) - E_{src} \left( H^{M'} \right)  \right) \\
    \Phi\left(\mathcal{M}\right) = &\underset{M' \in \mathcal{M}}{\sup} \left( D_{tgt} \left( M' \right) - D_{src} \left( M' \right)  \right)
\end{align*}
where $D_{tgt} \left( M' \right) = \frac{1}{K}\sum_{t=T+1}^{T+K} \left( \mathcal{D}_{JS}\left( P^{X,Y}_{D_t} \parallel  P^{X,Y}_{D_t^{M'}}\right) \right)^{1/2}$, $D_{src} \left( M' \right) = \frac{1}{T}\sum_{t=1}^T \left( \mathcal{D}_{JS}\left( P^{X,Y}_{D_t} \parallel  P^{X,Y}_{D_t^{M'}}\right) \right)^{1/2}$, and $\mathcal{D}_{JS}\left( \cdot \parallel \cdot \right)$ is the JS-divergence between two distributions.
\end{definition}
In essence, $\Phi\left(\mathcal{M}, \mathcal{H}\right)$ quantifies the gap between the source and target domains in terms of prediction errors of model sequence $H^{M}$. Meanwhile, $\Phi\left(\mathcal{M}\right)$ evaluates the disparity in performance of $M$ regarding its ability to estimate non-stationary patterns in source and in target domain sequences. Performance is measured by the statistical distance between the ground-truth distributions and the distributions induced by $M$. Inspired by discrepancy measures used to quantify the differences between distributions~\citep{mansour2009domain,mohri2012new}, $\Phi\left( \mathcal{M}, \mathcal{H} \right)$ and $\Phi\left( \mathcal{M} \right)$ explicitly take into account the hypothesis classes $\mathcal{M}$ and $\mathcal{H}$, and loss function $L$. This ensures that the bound constructed from these terms is directly related to the learning problem at hand. Next, we present a guarantee on target domains for $M$-empirical optimal model sequence $\widehat{H}^M$ as follows.

\begin{theorem}\label{thm-1}
Given domain sequence $\{D_t\}_{t=1}^{T+K}$, dataset sequence $\{S_t\}_{t=1}^{T+K}$ sampled from $\{D_t\}_{t=1}^{T+K}$, for any $M \in \mathcal{M}$ ($M$ can depend on $\{S_t\}_{t=1}^{T+K}$) and any $0 < \delta < 1$, with probability at least $1 - \delta$ over the choice of dataset sequence $\{S_t\}_{t=1}^{T+K}$, we have:
\begin{align*}
E_{tgt}\left( \widehat{H}^{M} \right) \leq &~~\widehat{E}^M_{src}\left( \widehat{H}^{M} \right) + 5\sqrt{2}C  \times D_{src}\left( M \right) \\
&+ \Phi(\mathcal{M}) + 2\sqrt{2}C \times \Phi(\mathcal{M}, \mathcal{H}) \\
&+ \frac{6B}{\sqrt{n}} + 3 \sqrt{\frac{\log ((T+K) / \delta)}{2n}}
\end{align*}
\end{theorem}
It states that the expected error of $\widehat{H}^{M}$ on target domains $E_{tgt}\left( \widehat{H}^{M} \right)$ is upper bounded by four parts: (i) empirical error of $\widehat{H}^{M}$ on $M$-generated source datasets $\widehat{E}^M_{src}\left( \widehat{H}^{M} \right)$, (ii) the average distance between source datasets and $M$-generated source datasets $D_{src}\left( M \right)$, (iii) non-stationary complexity terms $\Phi(\mathcal{M})$ and $\Phi(\mathcal{M}, \mathcal{H})$, (iv) sample complexity term $\frac{6B}{\sqrt{n}} + 3 \sqrt{\frac{\log ((T+K) / \delta)}{2n}}$. We note that both the third and fourth parts %terms
remain fixed given hypothesis classes $\mathcal{M}$ and $\mathcal{H}$, and the sample size $n$ for each dataset in the sequence. It is also noteworthy that this bound still holds when $M$ depends on dataset sequence $\{S_t\}_{t=1}^{T+K}$, thereby allowing us to apply this bound for $M$ learned from $\{S_t\}_{t=1}^{T+K}$. In addition, $\widehat{E}^M_{src}\left( \widehat{H}^{M} \right) = %\arg
\min_{H} \widehat{E}^M_{src}\left( H \right)$ by definition. Therefore, to minimize the expected error of $\widehat{H}^{M}$ on target domains,  %$\widehat{H}^{M}$, 
Theorem~\ref{thm-1} suggests finding a mechanism $M^{\ast} = \arg\min_{M \in \mathcal{M}} D_{src}(M)$ from source datasets $\{S_t\}_{t=1}^{T}$, and then learning a model sequence $\widehat{H}^{M^{\ast}}$ that minimizes
the empirical error on the $M^{\ast}$-generated dataset sequence.

% Note that we can construct the upper bound for the expected error of $\widehat{H}^{M}$ on target domains $E_{tgt}\left( \widehat{H}^{M} \right)$ based on the empirical error on source domains $\widehat{E}_{src}\left( \widehat{H}^{M} \right)$ instead of the one on $M$-generated source domains $\widehat{E}^M_{src}\left( \widehat{H}^{M} \right)$. However, that bound is not useful because $\widehat{H}^{M}$ is not the minimizer for source domains.

Learning $M^{\ast}$ requires the model to find the optimal mapping $m^{\ast}_{t-1}: \mathcal{X} \times \mathcal{Y} \rightarrow \mathcal{X} \times \mathcal{Y}$ that minimizes the distance of the joint distributions $\mathcal{D}_{JS} \left( P^{X,Y}_{D_t} \parallel P^{X,Y}_{D^{M^{\ast}}_t} \right)$ for all $t \in \{1,\cdots, T\}$. To this end, we first minimize the distance of output distribution between the two domains $D_t, D_{t-1}$, then find an optimal mapping function in input space $\mathcal{X}$. That is, minimizing the distance of joint distributions in output and input space separately. This approach is formally stated in Proposition~\ref{thm-3} below.  
\begin{proposition}\label{thm-3}
Let $P^{X,Y}_{D^{W}_{t-1}}$ be the distribution induced from $P^{X,Y}_{D_{t-1}}$ by importance weighting with factors $\{w_y\}_{y \in \mathcal{Y}}$ where $w_y = P^{Y=y}_{D_t} / P^{Y=y}_{D_{t-1}}$ (i.e., $P^{X=x,Y=y}_{D^{W}_{t-1}} = w_y \times P^{X=x,Y=y}_{D_{t-1}}$). Then for any mechanism $M$ that generates  $\left \{ m_t: \mathcal{X} \to \mathcal{X} \right \}_{t \in \mathbb{N}}$, we have the following:
\begin{align*}
    \mathcal{D}_{JS} \left( P^{X,Y}_{D_t} \parallel P^{X,Y}_{D_{t}^{W,M}} \right) = \mathbb{E}_{y \sim P^Y_{D_t}} \left [ \mathcal{D}_{JS} \left( P^{X|Y}_{D_t} \parallel P^{X|Y}_{D^{W,M}_{t}} \right) \right ]
\end{align*}
where $P^{X,Y}_{D^{W,M}_{t}} = m_{t-1} \sharp P^{X,Y}_{D^W_{t-1}}$ is a push-forward distribution induced from $P^{X,Y}_{D^W_{t-1}}$ using $m_{t-1}$.
\end{proposition}

\begin{figure}
  \begin{center}
    \includegraphics[width=0.4\textwidth]{figs/input_vs_representation.png}
  \end{center}
  \caption{Visualization of learning non-stationary mapping between two domains $D^W_t$ (i.e., generated from $D_{t}$ by importance weighting) and $D_{t+1}$. (a) Learning in input space $\mathcal{X}$. (b) Learning in representation space $\mathcal{Z}$.}
  \label{fig:1}
\end{figure}

Proposition~\ref{thm-3} suggests a two-step approach to learn $m_{t-1}: \mathcal{X} \times \mathcal{Y} \rightarrow \mathcal{X} \times \mathcal{Y}$: (i) reweight $P^{X,Y}_{D_{t-1}}$ with factors $\{w_y\}_{y \in \mathcal{Y}}$ (i.e., to minimize the distance of output distribution between $D_t, D_{t-1}$); (ii) learn $m_{t-1}: \mathcal{X} \rightarrow \mathcal{X}$ that minimizes the distance of conditional distribution $\mathcal{D}_{JS} \left ( P^{X|Y}_{D_t} \parallel P^{X|Y}_{D^{W,M}_{t}} \right )$.

We note that while the non-stationary complexity terms $\Phi(\mathcal{M})$ and $\Phi(\mathcal{M}, \mathcal{H})$ are fixed given hypothesis classes $\mathcal{M}$ and $\mathcal{H}$, a good design of $\mathcal{M}$ and $\mathcal{H}$ will make these terms small. Since the input space $\mathcal{X}$ may be of high dimension, constructing these hypothesis classes in high-dimensional space can be challenging in practice. To tackle this issue, we leverage the \textit{representation learning} approach to first map inputs to a representation space $\mathcal{Z}$, which often has a lower dimension than $\mathcal{X}$. In particular, instead of using $m^{\ast}_t: \mathcal{X} \rightarrow \mathcal{X}$ to map $P^X_{D^W_t}$ to $P^X_{D^{W, M^{\ast}}_{t+1}}$ in input space $\mathcal{X}$, we use $f^{\ast}_t:  \mathcal{X} \rightarrow \mathcal{Z}$ and $g^{\ast}_t: \mathcal{X} \rightarrow \mathcal{Z}$ to map $P^X_{D^W_t}$ and $P^X_{D_{t+1}}$ to $f^{\ast}_t \sharp P^Z_{D^W_t}$ and $g^{\ast}_t \sharp P^Z_{D_{t+1}}$ in representation space $\mathcal{Z}$ such that $\mathbb{E}\left [ \mathcal{D}_{JS}\left( g^{\ast}_{t} \sharp P^{Z|Y}_{D_{t+1}} \parallel f^{\ast}_{t} \sharp P^{Z|Y}_{D^W_{t}}\right) \right ]$ is minimal. Then, we learn a sequence of classifiers $H^{\ast}$ from representation to output spaces that minimizes empirical errors on source domains. This representation learning-based method is visualized in Figure~\ref{fig:1} and is summarized below.
\begin{remark}[Representation learning]\label{thm-5} 
Given the sequence of $T$ source domains, we estimate:
\newline
(i) Non-stationary mechanisms $F^{\ast}$ and $G^{\ast}$ that generate two sequences of representation mappings $\left\{f^{\ast}_{t}: \mathcal{X} \rightarrow \mathcal{Z}\right\}$ and $\left\{g^{\ast}_{t}: \mathcal{X} \rightarrow \mathcal{Z}\right\}$ with $F^{\ast},G^{\ast}$ defined as:
\begin{equation*}
  \displaystyle \underset{F \in \mathcal{F}, G \in \mathcal{G}} {\argmin}~~\frac{1}{T}\sum_{t=1}^T \underset{y \sim P^Y_{D_t}}{\mathbb{E}}\left[\mathcal{D}_{JS}\left( g_{t-1} \sharp P^{Z|Y}_{D_t} \parallel f_{t-1} \sharp P^{Z|Y}_{D^W_{t-1}}\right) \right]  
\end{equation*} where $F$ and $G$ generate sequences of representation mappings $\left\{f_{t}: \mathcal{X} \rightarrow \mathcal{Z}\right\}$ and $\left\{g_{t}: \mathcal{X} \rightarrow \mathcal{Z}\right\}$, and $\mathcal{F}$ and $\mathcal{G}$ are the hypothesis classes of $F$ and $G$.
\newline
(ii) Sequence of classifiers $H^{\ast} = \left\{h^{\ast}_{t}: \mathcal{Z} \rightarrow \mathcal{Y}^{\Delta}\right\}$ where each $h^{\ast}_{t}$ minimizes the empirical errors with respect to distributions $f^{\ast}_t \sharp P^{Z,Y}_{D^W_{t}}$ and $g^{\ast}_t \sharp P^{Z,Y}_{D_{t+1}}$.
\end{remark}
\begin{remark}[Comparison with conventional DG]
    A key property of non-stationary DG is that the model needs to evolve over the domain sequence to capture non-stationary patterns (i.e., learn invariant representations between two consecutive domains but adaptive across domain sequence). This differs from the conventional DG~\citep{ganin2016domain,phung2021learning} which (implicitly) assumes that target domains lie on or are near the mixture of source domains, then enforcing fixed invariant representations across all source domains can help generalize the model to target. We argue that this assumption does not hold in non-stationary DG where the target domains may be far from the mixture of source domains. Thus, the existing methods developed for conventional DG often fail in non-stationary DG.  We further validate this empirically in Appendix~\ref{app:exp2}.
\end{remark}

According to Remark~\ref{thm-5}, the JS-divergence between two distributions $P_{D^W_t}$ and $P_{D_{t+1}}$ can be minimized through invariant representation learning. However, in practice, models only have access to finite datasets $S^W_t$ and $S_{t+1}$. Moreover,~\cite{goodfellow2014generative} have shown that minimizing JS-divergence is aligned with the objective of adversarial learning in the setting of infinite data. Therefore, evaluating the performance of minimizing JS-divergence via adversarial learning in the case of finite data is important. First, the definition of adversarial learning is given below.
\begin{definition}
    \textbf{Adversarial learning for invariant representation.} Given two datasets $S^W_t=\left\{x_t^i\right\}_{i=1}^{n}$ and $S_{t+1}=\left\{x_{t+1}^i\right\}_{i=1}^{n}$, the goal of the adversarial learning approach for invariant representation with respect to these two datasets is to achieve $\widehat{L}^t_{adv}=\inf_{\alpha_t,\beta_t} \sup_{\gamma_t} \left( \frac{1}{n} \sum_{i=1}^{n} \log \left ( D_{\gamma_t}(F_{\alpha_t}(x_t^i)) \right ) \right.$ $\left.+ \frac{1}{n} \sum_{i=1}^{n} \log \left ( 1 - D_{\gamma_t}(G_{\beta_t}(x_{t+1}^i)) \right ) \right) $ where $F_{\alpha_t}: \mathcal{X} \rightarrow \mathcal{Z}$ and $G_{\beta_t}: \mathcal{X} \rightarrow \mathcal{Z}$ are the representation networks parameterized by $\alpha_t \in \mathcal{A}$ and $\beta_t \in \mathcal{B}$, and $D_{\gamma_t}: \mathcal{Z} \rightarrow [0,1]$ is the discriminator parameterized by $\gamma_t \in \Gamma$ that tries to predict which domain the representation comes from.
\end{definition}
Then, Proposition~\ref{thm:4} shows that the error of minimizing JS-divergences using adversarial learning on the sequence of source datasets of size $n$ is up to $\mathcal{O}  \left(\frac{1}{\sqrt{n}} \right) $.
\begin{proposition}\label{thm:4}
    Let $\alpha^{\ast}_t, \beta^{\ast}_t, \gamma^{\ast}_t$ be the parameters learned from infinite data and $\widehat{\alpha}_t, \widehat{\beta}_t, \widehat{\gamma}_t$ be the parameters learned by optimizing $\widehat{L}_{adv}^t$. Then we have:
    % \begin{align*}
    %     \mathbb{E} \left[\mathcal{D}_{JS}\left(P_{\widehat{\alpha}}^{Z} \parallel P_{\widehat{\beta}}^{Z} \right) \right] &\leq \mathcal{D}_{JS}\left(P_{\alpha^{\ast}}^{Z} \parallel P_{\beta^{\ast}}^{Z} \right) \\ 
    %     &+ \mathcal{O} \left( \left(\frac{1}{\sqrt{n}} \right) \times C(\mathcal{A}, \mathcal{B}, \Gamma) \right)
    % \end{align*}
    \begin{align*}
        \mathbb{E} \left[ D_{src} \left( \widehat{\alpha}, \widehat{\beta} \right) \right] &\leq D_{src} \left( \alpha^{\ast}, \beta^{\ast} \right) \\
        &+ \mathcal{O} \left( \left(\frac{1}{\sqrt{n}} \right) \times C(\mathcal{A}, \mathcal{B}, \Gamma) \right)
    \end{align*}
    where $D_{src} \left( \widehat{\alpha}, \widehat{\beta} \right) = \frac{1}{T}\sum_{t=1}^T \mathcal{D}_{JS}\left(P_{\widehat{\alpha}_t}^{Z} \parallel P_{\widehat{\beta}_t}^{Z} \right)$ and $D_{src} \left( \alpha^{\ast}, \beta^{\ast} \right) = \frac{1}{T}\sum_{t=1}^T \mathcal{D}_{JS}\left(P_{\alpha^{\ast}_t}^{Z} \parallel P_{\beta^{\ast}_t}^{Z} \right)$, $P^Z_{\widehat{\alpha}_t}$, $P^Z_{\widehat{\beta}_t}$, $P^Z_{\alpha^{\ast}_t}$, $P^Z_{\beta^{\ast}_t}$ are distributions induced by representation networks parameterized by $\widehat{\alpha}_t, \widehat{\beta}_t, \alpha^{\ast}_t, \beta^{\ast}_t$, respectively, and $C(\mathcal{A}, \mathcal{B}, \Gamma)$ is a constant specified by the parameter spaces $\mathcal{A}, \mathcal{B}, \Gamma$.
\end{proposition}