\section{Problem Formulation}\label{sec:problem}

We first introduce the notations used throughout the paper and then formulate the problem. These notations and their descriptions are also summarized in Table~\ref{tab:notation}.

\textbf{Notations.}
Let $\mathcal{X}$ and $\mathcal{Y}$ denote the input and output spaces, respectively. We use capital letters $X, Y$ to denote random variables that take values in $\mathcal{X}, \mathcal{Y}$ and lowercase letters $x,y$ to denote their realizations. A \textit{domain} $D$ is specified by a distribution $P^{X,Y}_{D}:\mathcal{X}\times \mathcal{Y}\to[0,1]$ and a labeling function $\mathbbm{h}_{D}: \mathcal{X} \to \mathcal{Y}^{\Delta}$, where $\mathcal{Y}^{\Delta}$ denotes the probability simplex over $\mathcal{Y}$. For simplicity, we also use $P_D^V$ (or $P_D^{V|U}$) to denote the induced marginal (or conditional) distribution of random variable $V$ (given $U$) in the domain $D$.

\textbf{Non-stationary domain generalization setup.} We consider a problem where a learning algorithm has access to a sequence of source datasets $\left\{ S_t \right\}_{t=1}^{T}$, where $S_t$ consists of $n$ instances sampled i.i.d.\ from source domain $D_{t}$. In non-stationary DG, we assume there exists a mechanism $\mathbbm{M}$ that captures non-stationary patterns in the data. Specifically, $\mathbbm{M}$ can generate a sequence of mapping functions $\{\mathbbm{m}_{t}\}_{t \in \mathbb{N}}$ in which $\mathbbm{m}_{t}: \mathcal{X} \times \mathcal{Y} \rightarrow \mathcal{X} \times \mathcal{Y}$ captures the transition from domain $D_{t}$ to domain $D_{t+1}$. In other words, we can regard $P^{X,Y}_{D_{t}}$ as the push-forward distribution induced from $P^{X,Y}_{D_{t-1}}$ using the mapping function $\mathbbm{m}_{t-1}$ (i.e., $P^{X,Y}_{D_{t}} := \mathbbm{m}_{t-1} \sharp P^{X,Y}_{D_{t-1}}$). Note that this setup is different from conventional DG, where domains are sampled independently from a meta-distribution. In non-stationary DG, domains are related to each other via the mechanism $\mathbbm{M}$ (i.e., $\mathbbm{m}_{t}$ depends on the previous mappings $\mathbbm{m}_1, \mathbbm{m}_2, \cdots, \mathbbm{m}_{t-1}$).

Given a sequence of $T$ source domains, our goal is to learn a sequence of models $H = \left\{h_t\right\}_{t=T+1}^{T+K}$, where $h_t: \mathcal{X} \to \mathcal{Y}^{\Delta}$ in a hypothesis class $\mathcal{H}$ is the model corresponding to domain $D_t$, such that these models perform well on the $K$ (unseen) target domains $\{D_{t}\}_{t=T+1}^{T+K}$. We aim to investigate under what conditions and by what algorithms we can ensure that models learned from source domains attain high accuracy on unknown target domains $\{D_{t}\}_{t=T+1}^{T+K}$ in a non-stationary environment. Formally, we measure the accuracy using an error metric defined below.

\textbf{Error metric.} For a model $h: \mathcal{X} \to \mathcal{Y}^{\Delta}$ in a hypothesis class $\mathcal{H}$, we denote by $h(x)_y$ the element in the $y$-th dimension, which predicts $\Pr(Y=y|X=x)$. Then the \textit{expected error} of $h$ under domain $D$ for some loss function $L:  \mathcal{Y}^{\Delta} \times \mathcal{Y} \rightarrow \mathbb{R}_+$ (e.g., 0-1 or cross-entropy loss) is defined as
$\epsilon_{D}\left(h\right) = \mathbb{E}_{(X,Y) \sim P^{X,Y}_{D}}\left[L\left(h(X), Y\right)\right].$  Similarly, the \textit{empirical error} of $h$ over a set $S$ of $n$ samples drawn i.i.d.\ from $P^{X,Y}_D$ is defined as $\epsilon_{S}\left(h\right) = \frac{1}{n}\sum_{(x,y) \in S} L\left(h(x),y\right).$ We also denote the family of functions $\mathcal{L}_{\mathcal{H}} $ associated with loss function $L$ and hypothesis class $\mathcal{H}$ as $\mathcal{L}_{\mathcal{H}} = \left\{ (x,y) \rightarrow L\left(h\left(x\right),y\right): h \in \mathcal{H} \right\}.$

The non-stationary mechanism $\mathbbm{M}$ is a key component in non-stationary DG. Since $\mathbbm{M}$ is unknown, we need to learn it from the source domains. Let $M$ be a learned mechanism in a hypothesis class $\mathcal{M}$ that generates a sequence of mapping functions $\{m_{t}\}_{t \in \mathbb{N}}$. To support the theoretical analysis of $M$, we define the following:
\begin{itemize}[noitemsep,topsep=0pt,parsep=1pt,partopsep=1pt,leftmargin=*]
    \item \textbf{$M$-generated domain sequence} $\left\{D^{M}_{1},\cdots,D^{M}_{T+K}\right\}$ where domain %dataset 
    $D^{M}_{t}$ is associated with the distribution $P^{X,Y}_{D^{M}_{t}} = m_{t-1}\sharp P^{X,Y}_{D_{t-1}}$ and $D^{M}_{1} = D_1$.
    \item \textbf{$M$-generated dataset sequence} $\left\{S^{M}_{1},\cdots,S^{M}_{T+K}\right\}$ where dataset $S^{M}_{t}$ is generated from dataset $S_{t-1}$ by using mapping $m_{t-1}$.
    \item \textbf{$M$-optimal model sequence} $H^{M} = \left\{ h_{1}^{M}, \cdots, h_{T+K}^{M} \right\}$ where $h_{t}^{M} = \arg\min_{h \in \mathcal{H}} \epsilon_{D_{t}^{M}}\left(h\right)$.
    \item \textbf{$M$-empirical optimal model sequence} $\widehat{H}^{M} = \left\{ \widehat{h}_{1}^{M}, \cdots, \widehat{h}_{T+K}^{M} \right\}$ where $\widehat{h}_{t}^{M} = \arg\min_{h \in \mathcal{H}} \epsilon_{S_{t}^{M}}\left(h\right)$.
\end{itemize}
We also denote errors of model sequence $H$ on source and target domains as $E_{src} \left( H \right)=$ $\frac{1}{T}\sum_{t=1}^T  \epsilon_{D_t}\left( h_{t} \right) $ and $E_{tgt} \left( H \right)=$ $\frac{1}{K}\sum_{t=T+1}^{T+K}  \epsilon_{D_t}\left( h_{t} \right)$, respectively, and on $M$-generated source and target domains as $E^M_{src} \left( H \right)=$ $\frac{1}{T}\sum_{t=1}^T  \epsilon_{D^M_t}\left( h_{t} \right)$ and $E^M_{tgt} \left( H \right)=$ $\frac{1}{K}\sum_{t=T+1}^{T+K}  \epsilon_{D^M_t}\left( h_{t} \right)$. Empirical errors of $H$ on source and target datasets ($\widehat{E}_{src}$ and $\widehat{E}_{tgt}$), and on $M$-generated source and target datasets ($\widehat{E}^M_{src}$ and $\widehat{E}^M_{tgt}$) are defined similarly (Table~\ref{tab:notation}).

\renewcommand*{\arraystretch}{1.2} 
\begin{table}
    \centering
    \caption{Notations used in this paper.}\label{tab:notation}
    \resizebox{\linewidth}{!}{
    \begin{tabular}{ll} \hline
    Notation & Description \\ \hline
    $\mathcal{X}, \mathcal{Y}, \mathcal{Z}$ & input, output, representation spaces \\ 
    $\mathcal{M}, \mathcal{H}$ & mechanism and model hypothesis classes \\
    $X, Y, Z$ (resp. $x,y,z$) & random variables (resp. realizations) in $\mathcal{X}, \mathcal{Y}, \mathcal{Z}$ \\
    $D_t$ & $t$-th domain in domain sequence  \\
    $S_t$ & $t$-th dataset sampled from domain $D_t$ \\
    $\left\{D_t \right\}_{t=1}^T$ & source domains \\
    $\left\{D_t \right\}_{t=T+1}^{T+K}$ & target domains \\
    $P_{D_t}^{X,Y}$ & distribution associated with domain $D_t$ \\
    $\mathbbm{h}_{D_t}: \mathcal{X} \rightarrow \mathcal{Y}^{\Delta}$ & labeling function of domain $D_t$ \\
    $\mathbbm{M}$ & ground-truth mechanism that generates $\{\mathbbm{m}_t\}_{t \in \mathbb{N}}$ \\
    $\mathbbm{m}_t$ & ground-truth mapping from $D_t$ to $D_{t+1}$: $P^{X,Y}_{D_{t+1}} = \mathbbm{m}_t \sharp P^{X,Y}_{D_{t}}$ \\
    $M \in \mathcal{M}$ & hypothesis mechanism that generates  $\{m_t\}_{t \in \mathbb{N}}$ \\
    $h_t \in \mathcal{H}$ & hypothesis classifier for domain $D_t$ \\
    $L: \mathcal{Y}^{\Delta} \times \mathcal{Y} \rightarrow \mathbb{R}_+$ & loss function \\
    $\mathcal{L}_{\mathcal{H}}$ & family of functions $\left\{ (x,y) \rightarrow L\left( h(x), y \right) : h \in \mathcal{H} \right\}$ \\
    $\epsilon_D\left( h \right)$ & expected error of classifier $h$ on domain $D$ \\
    $\epsilon_S\left( h \right)$ & empirical error of classifier $h$ on dataset $S$ \\
    $D_{t}^{M}$ & domain associated with distribution $P^{X,Y}_{D^{M}_{t}} = m_{t-1}\sharp P^{X,Y}_{D_{t-1}}$ \\
    $S_{t}^{M}$ & dataset associated with domain $D_{t}^{M}$ \\
    $h_{t}^{M}$ & $\arg\min_{h \in \mathcal{H}} \epsilon_{D_{t}^{M}}\left(h\right)$ \\
    $\widehat{h}_{t}^{M}$ & $\arg\min_{h \in \mathcal{H}} \epsilon_{S_{t}^{M}}\left(h\right)$ \\
    $\mathcal{D}_{JS}$ & JS-divergence \\
    $E_{src} \left( H \right)$ (resp. $E_{tgt} \left( H \right)$) & $\frac{1}{T}\sum_{t=1}^T  \epsilon_{D_t}\left( h_{t} \right)$ (resp. $\frac{1}{K}\sum_{t=T+1}^{T+K}  \epsilon_{D_t}\left( h_{t} \right)$) \\
    % $E_{tgt} \left( H \right)$ & $\frac{1}{K}\sum_{t=T+1}^{T+K}  \epsilon_{D_t}\left( h_{t} \right)$ \\
    $E^M_{src} \left( H \right)$ (resp. $E^M_{tgt} \left( H \right)$) & $\frac{1}{T}\sum_{t=1}^T  \epsilon_{D^M_t}\left( h_{t} \right)$ (resp. $\frac{1}{K}\sum_{t=T+1}^{T+K}  \epsilon_{D^M_t}\left( h_{t} \right)$) \\ 
    % $E^M_{tgt} \left( H \right)$ & $\frac{1}{K}\sum_{t=T+1}^{T+K}  \epsilon_{D^M_t}\left( h_{t} \right)$ \\
    $\widehat{E}_{src} \left( H \right)$ (resp. $\widehat{E}_{tgt} \left( H \right)$) & $\frac{1}{T}\sum_{t=1}^T  \epsilon_{S_t}\left( h_{t} \right)$ (resp. $\frac{1}{K}\sum_{t=T+1}^{T+K}  \epsilon_{S_t}\left( h_{t} \right)$) \\
    $\widehat{E}^M_{src} \left( H \right)$ (resp. $\widehat{E}^M_{tgt} \left( H \right)$) & $\frac{1}{T}\sum_{t=1}^T  \epsilon_{S^M_t}\left( h_{t} \right)$ (resp. $\frac{1}{K}\sum_{t=T+1}^{T+K}  \epsilon_{S^M_t}\left( h_{t} \right)$) \\ 
    $D_{src} \left( M \right)$ & $\frac{1}{T}\sum_{t=1}^T \left( \mathcal{D}_{JS}\left( P^{X,Y}_{D_t} \parallel  P^{X,Y}_{D_t^{M}}\right) \right)^{1/2}$ \\
    $D_{tgt} \left( M \right)$ & $\frac{1}{K}\sum_{t=T+1}^{T+K} \left( \mathcal{D}_{JS}\left( P^{X,Y}_{D_t} \parallel  P^{X,Y}_{D_t^{M}}\right) \right)^{1/2}$ \\
    $ \Phi\left(\mathcal{M}, \mathcal{H}\right)$ & $\underset{M' \in \mathcal{M}}{\sup} \left( E_{tgt} \left( H^{M'} \right) - E_{src} \left( H^{M'} \right)  \right)$ \\
    $ \Phi\left(\mathcal{M}\right)$ & $\underset{M' \in \mathcal{M}}{\sup} \left( D_{tgt} \left( M' \right) - D_{src} \left( M' \right)  \right)$ \\ \hline
    \end{tabular}}
\end{table}
\renewcommand*{\arraystretch}{1.} 