\section{Details of Experimental Setup and Additional Results}\label{app:exp}
\subsection{Experimental Setup}

\paragraph{Datasets.}
Our experiments are conducted on two synthetic, one semi-synthetic, and two real-world datasets. The data statistics of these datasets are presented in Table~\ref{tab:s1}. For the \textsf{Eval-S} scenario, the first half of the domains in each domain sequence is used for training and the following domains are used for testing. For the \textsf{Eval-D} scenario, we vary the size of the training set, starting from the first half of the domains and sequentially adding new domains to this set. In both scenarios, we split the training set into smaller subsets with a ratio of $81:9:10$; these subsets are used as training, validation, and in-distribution testing sets. The data descriptions are given as follows: 
\begin{itemize}
    \item \textbf{Circle} \citep{pesaranghader2016fast}: A synthetic dataset containing 30 domains. Features $X:=[X_1,X_2]^T$ in domain $t$ are two-dimensional and Gaussian distributed with mean $\Bar{X}^t = [r\cos(\pi t / 30), r\sin(\pi t / 30)]$, where $r$ is the radius of the semicircle; the distributions of different domains have the same covariance matrix but different means that uniformly evolve from right to left on a semicircle. Binary labels $Y$ are generated based on the labeling function $Y=\mathbbm{1}\left[ (X_1-x^o_1)^2 + (X_2-x^o_2)^2 \leq r \right]$, where $(x^o_1, x^o_2)$ is the center of the semicircle. Models trained on the right part are evaluated on the left part of the semicircle.
    \item \textbf{Circle-Hard}: A synthetic dataset adapted from \textbf{Circle} dataset, where mean $\Bar{X}^t$ does not uniformly evolve. Instead, $\Bar{X}^t = [r\cos(\theta_t), r\sin(\theta_t)]$ where $\theta_t = \theta_{t-1} + \pi (t - 1) / 180$ and $\theta_{1} = 0 \, \text{rad}$.
    \item \textbf{RMNIST}: A dataset constructed from MNIST \citep{lecun1998gradient} by $R$-degree counterclockwise rotation. We evenly select 30 rotation angles $R$ from $0^{\circ}$ to $180^{\circ}$ with step size $6^{\circ}$; each angle corresponds to a domain. Given a threshold angle $r$, the domains with $R \leq r$ are considered source domains, and those with $R > r$ are the target domains used for evaluation. In this dataset, the goal is to train a multi-class classifier on source domains that predicts the digits of images in the target domains.    %We consider a multicalss classification problem in which models predict digit from the corresponding images.
    \item \textbf{Yearbook} \citep{ginosar2015century}: A real dataset consisting of frontal-facing American high school yearbook photos from 1930--2013. Due to the evolution of fashion, social norms, and population demographics, the distribution of facial images changes over time. In this dataset, we aim to train a binary classifier using historical data to predict the genders of images in the future.
    %models trained on past years are evaluated in the future. We consider a binary classification problem in which models predict gender of people from the corresponding images.
    \item  \textbf{CLEAR} \citep{lin2021clear}: A real dataset built from existing large-scale image collections (YFCC100M), which captures the natural temporal evolution of visual concepts in the real world over a decade (2004--2014). In this dataset, we aim to train a multi-class classifier using historical data to predict 10 object types in future images.
\end{itemize}
\begin{table}[H]
\centering
\caption{Data statistics.}\label{tab:s1}
%\vspace{1em}
\begin{tabular}{lllll}
\hline
         & Data type  & Label type & \#instance & \#domain \\ \hline
Circle   & Synthetic  & Binary     & 30000        & 30       \\ 
Circle-Hard & Synthetic  & Binary     & 30000        & 30       \\ 
RMNIST   & Semi-synthetic & Multi      & 30000       & 30       \\ 
Yearbook & Real-world & Binary     & 33431       & 84       \\
CLEAR & Real-world & Multi     & 29747       & 10       \\ \hline
\end{tabular}
\end{table}

\paragraph{Non-stationary mechanisms in synthetic datasets.} We note that in synthetic datasets, we precisely know the non-stationary mappings that generate the domain sequences.

\begin{itemize}
    \item \textbf{Circle}: A synthetic dataset containing 30 domains. Features $X:=[X_1,X_2]^T$ in domain $t$ are two-dimensional and Gaussian distributed with mean $\Bar{X}^t = [r\cos(\pi t / 30), r\sin(\pi t / 30)]$, where $r$ is the radius of the semicircle; the distributions of different domains have the same covariance matrix but different means that uniformly evolve from right to left on a semicircle. Binary labels $Y$ are generated based on the labeling function $Y=\mathbbm{1}\left[ (X_1-x^o_1)^2 + (X_2-x^o_2)^2 \leq r \right]$, where $(x^o_1, x^o_2)$ is the center of the semicircle. \newline
    $\Rightarrow \mathbbm{m}_t = \begin{bmatrix}
    \cos(\pi / 30) & -\sin(\pi / 30)\\ 
    \sin(\pi / 30) & \cos(\pi / 30)
    \end{bmatrix} \forall t \in [1, \cdots, 29]$
    \item \textbf{Circle-Hard}: A synthetic dataset adapted from \textbf{Circle} dataset, where mean $\Bar{X}^t$ does not uniformly evolve. Instead, $\Bar{X}^t = [r\cos(\theta_t), r\sin(\theta_t)]$ where $\theta_t = \theta_{t-1} + \pi (t - 1) / 180$ and $\theta_{1} = 0 \, \text{rad}$. \newline
    $\Rightarrow \mathbbm{m}_t = \begin{bmatrix}
    \cos(\pi t / 180) & -\sin(\pi t / 180)\\ 
    \sin(\pi t / 180) & \cos(\pi t / 180)
    \end{bmatrix} \forall t \in [1, \cdots, 19]$
    \item \textbf{RMNIST}: A dataset constructed from MNIST by $R$-degree counterclockwise rotation. We evenly select 30 rotation angles $R$ from $0^{\circ}$ to $180^{\circ}$ with step size $6^{\circ}$; each angle corresponds to a domain. \newline
    $\Rightarrow \mathbbm{m}_t = \begin{bmatrix}
    \cos(6^\circ) & -\sin(6^\circ)\\ 
    \sin(6^\circ) & \cos(6^\circ)
    \end{bmatrix} \forall t \in [1, \cdots, 29]$ 
\end{itemize}

\paragraph{Baseline methods.}
We compare the proposed \texttt{AIRL} with existing methods from related areas, including the following:
\begin{itemize}
    \item Empirical risk minimization (\texttt{ERM}): A simple method that considers all source domains as one domain.
    \item Last domain (\texttt{LD}): A method that only trains model using the most recent source domain.
    \item Fine tuning (\texttt{FT}): The baseline trained on all source domains in a sequential manner.
    \item Domain invariant representation learning:  Methods that learn the invariant representations across source domains and train a model based on the representations. We experiment with \texttt{G2DM} \citep{albuquerque2019generalizing}, \texttt{DANN} \citep{ganin2016domain}, \texttt{CDANN} \citep{li2018deep}, \texttt{CORAL} \citep{sun2016deep}, \texttt{IRM} \citep{arjovsky2019invariant}.
    \item Data augmentation: We experiment with \texttt{MIXUP} \citep{zhang2018mixup} that generates new data using convex combinations of source domains to enhance the generalization capability of models.
    \item Continual learning: We experiment with \texttt{EWC} \citep{kirkpatrick2017overcoming}, a method that learns a model from data streams while overcoming the catastrophic forgetting issue.
    \item Continuous domain adaptation:  We experiment with \texttt{CIDA} \citep{wang2020continuously}, an adversarial learning method designed for DA with continuous domain labels.
    \item Distributionally robust optimization:  We experiment with \texttt{GROUPDRO} \citep{sagawa2019distributionally} that minimizes the worst-case training loss over pre-defined groups through regularization.
    \item Gradient-based DG: We experiment with \texttt{FISH} \citep{shi2021gradient} that targets domain generalization by maximizing the inner product between gradients from different domains.
    \item Contrastive learning-based DG:  We experiment with \texttt{SELFREG} \citep{kim2021selfreg} that utilizes the self-supervised contrastive losses to learn domain-invariant representation by mapping the latent representation of the same-class samples close together.
    \item Non-stationary environment DG:  We experiment with \texttt{DRAIN} \citep{bai2022temporal}, \texttt{TKNets} \citep{zeng2024generalizing}, \texttt{LSSAE} \citep{qin2022generalizing}, and \texttt{DDA} \citep{zeng2023foresee}. \texttt{DRAIN}, \texttt{TKNets}, and \texttt{DDA} focus on domain $D_{T+1}$ only, so we use the same model when making predictions for all target domains $\{D_{t}\}_{t>T}$.
\end{itemize}

\paragraph{Evaluation method.} 
In the experiments, models are trained on a sequence of source domains $\mathcal{D}_{src}$, and their performance is evaluated on target domains $\mathcal{D}_{tgt}$ under two different scenarios: \textsf{Eval-S} and \textsf{Eval-D}. 

In the scenario \textsf{Eval-S}, models are trained one time on the first half of the domain sequence $\mathcal{D}_{src} = [D_1, D_2, \cdots, D_T]$ and are then deployed to make predictions on the next $K$ domains in the second half of the domain sequence $\mathcal{D}_{tgt} = [D_{T+1}, D_{T+2}, \cdots, D_{T+K}]$ $(1 \leq K \leq T)$. The average and worst-case performances can be evaluated using two metrics $\oodavg$ and $\oodwrt$ defined below.
\begin{equation*}
  \displaystyle  \oodavg = \frac{1}{K} \sum_{k=1}^K \acc_{T+k};~~~ \oodwrt = \underset{k \in [K]}{\min} \acc_{T+k} 
\end{equation*}
where $\acc_{T+k}$ denotes the accuracy of model on target domain $D_{T+k}$. 

In the scenario \textsf{Eval-D}, source and target domains are not static but are updated periodically as new data/domain becomes available. This allows us to update models based on new source domains.
%data from different domains comes sequentially, which allows the model to update multiple times. 
Specifically, at time step $t \in [T, 2T - K]$, models are updated on source domains $\mathcal{D}_{src} = [D_1, D_2, \cdots, D_t]$ and are used to predict target domains  $\mathcal{D}_{tgt} = [D_{t+1}, D_{t+2}, \cdots, D_{t+K}]$. The average and worst-case performances of models in this scenario can be defined as follows.
\begin{align*}
  & \textstyle \oodavg = \frac{1}{(T-K+1)K} \sum_{t=T}^{2T-K}\sum_{k=1}^K \acc_{t+k} \\
  & \textstyle  \oodwrt = \underset{t \in [T, 2T-K]}{\min}  \frac{1}{K} \sum_{k=1}^{K} \acc_{t+k}
\end{align*}
In our experiment, the time step $t$ starts from the index denoting half of the domain sequence. 

\paragraph{Implementation and training details.}
Data, model implementation, and training scripts are included in the supplementary material. We train each model on each setting with 5 different random seeds and report the average prediction performances. All experiments are conducted on a machine with a 24-core CPU, 4 RTX A4000 GPUs, and 128\,GB of RAM.

\subsection{Additional experiment results}\label{app:exp2}
\paragraph{Performance gap between in-distribution and out-of-distribution predictions.}
This study is motivated by the assumption that the environment changes over time and that there exist distribution shifts between training and test data. To verify this assumption in our datasets, we compare the performances of \texttt{ERM} on in-distribution and out-of-distribution testing sets. Specifically, we show the gaps between the performances of \texttt{ERM} measured on the in-distribution (i.e., $\idavg$) and out-of-distribution (i.e., $\oodavg$) testing sets under the \textsf{Eval-D} scenario (with $K=5$) in Figure~\ref{fig:s1}.

\paragraph{Performance of fixed invariant representation learning in conventional and non-stationary DG settings.}
A key distinction of non-stationary DG is that the model evolves over the domain sequence to capture non-stationary patterns (i.e., it learns representations that are invariant between two consecutive domains but adaptive across the domain sequence). This stands in contrast to conventional DG~\citep{ganin2016domain,phung2021learning}, which relies on the assumption that target domains lie on or near the mixture of source domains, so that enforcing fixed invariant representations across all source domains helps the model generalize to target domains. We argue that this assumption may not hold in non-stationary DG, where the target domains may be far from the mixture of source domains, resulting in the failure of existing methods. 
 
To verify this argument, we conduct an experiment on the \textbf{RMNIST} dataset with \texttt{DANN} \citep{ganin2016domain}---a model that learns fixed invariant representations across all domains. Specifically, we create 5 domains by rotating images by $0$, $15$, $30$, $45$, and $60$ degrees, respectively, and follow leave-one-out evaluation (i.e., one domain is the target while the remaining domains are sources). Clearly, the settings where the target domain consists of images rotated by $0$ or $60$ degrees can be considered non-stationary domain generalization, while the other settings can be considered conventional domain generalization. The performances of \texttt{DANN} with different target domains are shown in Table~\ref{tab:s6}. As we can see, the accuracy drops significantly when the target domain consists of images rotated by $0$ or $60$ degrees. This result demonstrates that learning fixed invariant representations across all domains is not suitable for non-stationary DG.

\renewcommand*{\arraystretch}{1.5} 
 \begin{table}[t]
 \caption{Performances of \texttt{DANN} on \textbf{RMNIST} dataset.}\label{tab:s6}
     \centering
     \begin{tabular}{lccccc} \hline  
          Target Domain&  $0^{\circ}$-rotated&  $15^{\circ}$-rotated&  $30^{\circ}$-rotated&  $45^{\circ}$-rotated& $60^{\circ}$-rotated\\ \hline  
          Model Performance&  51.2&  59.1&  70.0&  69.2& 53.9\\ \hline 
     \end{tabular}
\end{table}
\renewcommand*{\arraystretch}{1.} 

\paragraph{Computational complexity of non-stationary DG methods.}
Compared to existing works for non-stationary DG, our method also shows better computational efficiency. This is due to our efficient design for capturing non-stationary patterns. Specifically, \texttt{LSSAE} and \texttt{DRAIN} have more complex architectures and objective functions, resulting in much longer training times than our method. While \texttt{TKNets} has a slightly shorter training time than ours, this model requires storing previous data to make predictions and does not generalize to multiple target domains. To further support our claim, the average training times (in seconds) of these methods on different datasets are shown in Table~\ref{tab:s7}.

\renewcommand*{\arraystretch}{1.5} 
\begin{table}[]
\caption{The average training times (in seconds) of non-stationary DG methods for \textbf{Circle}, \textbf{Circle-Hard}, \textbf{RMNIST}, \textbf{Yearbook}, and \textbf{CLEAR} datasets.}\label{tab:s7}
\centering
\begin{tabular}{lccccc}
\hline
\multicolumn{1}{c}{\textbf{}} & \multicolumn{1}{c}{\textbf{Circle}} & \multicolumn{1}{c}{\textbf{Circle-Hard}} & \multicolumn{1}{c}{\textbf{RMNIST}} & \multicolumn{1}{c}{\textbf{Yearbook}} & \multicolumn{1}{c}{\textbf{CLEAR}} \\ \hline
\texttt{AIRL}                            & 32                                   & 25                                        & 382                                  & 749                                    & 1504                                \\ 
\texttt{LSSAE}                           & 184                                  & 175                                       & 1727                                 & 1850                                   & 13287                               \\ 
\texttt{DRAIN}                           & 460                                  & 230                                       & 2227                                 & 5538                                   & 1920                                \\ 
\texttt{TKNets}                           & 18                                   & 13                                        & 208                                  & 448                                    & 1542                                \\ \hline
\end{tabular}
\end{table}
\renewcommand*{\arraystretch}{1.} 

\paragraph{Experimental results for \textsf{Eval-S} scenario.}
The prediction performances of \texttt{AIRL} and baselines on synthetic (i.e., Circle, Circle-Hard) and real-world (i.e., RMNIST, Yearbook) data under \textsf{Eval-S} scenario are presented in Figure \ref{fig:s2} below. In this scenario, the training set is fixed as the first half of domains while the testing set is varied from the five subsequent domains to the second half of domains in the domain sequences. We report averaged results with error bars (std) for training over 5 different random seeds.

We can see that \texttt{AIRL} consistently outperforms baselines in most datasets. We also observe that the prediction performances decrease when predictions are made for more distant target domains (i.e., as the number of testing domains increases) for all models on the \textbf{Circle}, \textbf{Circle-Hard}, and \textbf{RMNIST} datasets. This pattern is reasonable because domains in these datasets are generated monotonically. For the \textbf{Yearbook} dataset, the performance curves are U-shaped: they first decrease and then increase. This dataset is from a real-world environment, so we expect the shapes of the curves to be more complex than those in the other datasets.

\begin{figure}[t]
\centering
\begin{subfigure}[b]{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{figs/in_out_mismatch_circle_large_stream_5.png}
\caption{Circle}
\label{fig:s1-1}
\end{subfigure}
%\hfill
\hspace{1cm}
\begin{subfigure}[b]{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{figs/in_out_mismatch_circle_hard_stream_5.png}
\caption{Circle-Hard}
\label{fig:s1-2}
\end{subfigure}
\\
\begin{subfigure}[b]{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{figs/in_out_mismatch_rmnist_stream_5.png}
\caption{RMNIST}
\label{fig:s1-3}
\end{subfigure}
%\hfill
\hspace{1cm}
\begin{subfigure}[b]{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{figs/in_out_mismatch_yearbook_stream_5.png}
\caption{Yearbook}
\label{fig:s1-4}
\end{subfigure}
\caption{Gaps between the performances of \texttt{ERM} measured on the in-distribution and out-of-distribution testing sets (i.e., $\idavg - \oodavg$) under \textsf{Eval-D} scenario (i.e., $K=5$). This experiment is conducted on \textbf{Circle}, \textbf{Circle-Hard}, \textbf{RMNIST}, and \textbf{Yearbook} datasets.} 
\label{fig:s1}
\end{figure}

\begin{figure}[t]
\centering
\begin{subfigure}[b]{0.48\textwidth}
\centering
\includegraphics[width=\textwidth]{figs/eval_s_avg_circle_large.png}
\caption{Circle}
\label{fig:s2-1}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.48\textwidth}
\centering
\includegraphics[width=\textwidth]{figs/eval_s_avg_circle_hard.png}
\caption{Circle-Hard}
\label{fig:s2-2}
\end{subfigure}
\vspace{1em}
\begin{subfigure}[b]{0.48\textwidth}
\centering
\includegraphics[width=\textwidth]{figs/eval_s_avg_rmnist.png}
\caption{RMNIST}
\label{fig:s2-3}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.48\textwidth}
\centering
\includegraphics[width=\textwidth]{figs/eval_s_avg_yearbook.png}
\caption{Yearbook}
\label{fig:s2-4}
\end{subfigure}
\caption{Prediction performances (i.e., $\oodavg$) of AIRL and baselines under \textsf{Eval-S} scenario. The training set is fixed as the first half of domains while the testing set is varied from the five subsequent domains to the second half of domains in the domain sequences. We report average results for training over 5 different random seeds. This experiment is conducted on \textbf{Circle}, \textbf{Circle-Hard}, \textbf{RMNIST}, and \textbf{Yearbook} datasets.} 
\label{fig:s2}
\end{figure}