\section{Experiments}\label{sec:experiment}

\renewcommand*{\arraystretch}{1.2} 
\begin{table*}[t]
\caption{Prediction performances (i.e., $\oodavg$ and $\oodwrt$) of \texttt{AIRL} and baselines under \textsf{Eval-D} scenario $(K=5)$. We report average results (with standard deviations in parentheses) over 5 random seeds. For the \textbf{CLEAR} dataset, there is only one split between train and test sets, so $\oodavg$ and $\oodwrt$ are reported as a single value.}\label{tab:1}
\centering
\resizebox{\textwidth}{!}{
\begin{tabular}{lccccccccc}
\hline
\multirow{2}{*}{\textbf{Algorithm}} & \multicolumn{2}{c}{\textbf{Circle}}                      & \multicolumn{2}{c}{\textbf{Circle-Hard}}                      & \multicolumn{2}{c}{\textbf{RMNIST}}                      & \multicolumn{2}{c}{\textbf{Yearbook}}                      & \multicolumn{1}{c}{\textbf{CLEAR}}                  \\  
                  & \multicolumn{1}{c}{$\oodavg$}     & $\oodwrt$     & \multicolumn{1}{c}{$\oodavg$}      & $\oodwrt$      & \multicolumn{1}{c}{$\oodavg$}     & $\oodwrt$      & \multicolumn{1}{c}{$\oodavg$}     & $\oodwrt$      & \multicolumn{1}{c}{$\oodavg / \oodwrt$}          \\ \hline
\texttt{ERM}               & \multicolumn{1}{c}{89.63 (0.89)} & 79.84 (1.84) & \multicolumn{1}{c}{66.94 (1.69) }  & 58.43 (0.05)  & \multicolumn{1}{c}{56.61 (1.83)} & 51.85 (4.15)  & \multicolumn{1}{c}{90.79 (0.16)} & 71.03 (1.74)  & \multicolumn{1}{c}{69.04 (0.18)}   \\ 
\texttt{LD}                & \multicolumn{1}{c}{76.60 (6.45)} & 56.88 (3.74)  & \multicolumn{1}{c}{58.13 (1.67)}  & 51.58 (1.87)  & \multicolumn{1}{c}{37.54 (2.77)} & 25.80 (4.12)  & \multicolumn{1}{c}{77.10 (0.30)} & 57.97 (0.88)  & \multicolumn{1}{c}{57.01 (2.15)}   \\ 
\texttt{FT}                & \multicolumn{1}{c}{85.57 (1.82)} & 71.99 (4.11) & \multicolumn{1}{c}{59.02 (5.20)}  & 50.80 (2.79)  & \multicolumn{1}{c}{60.73 (0.87)} & 47.30 (3.77)  & \multicolumn{1}{c}{87.04 (0.58)} & 66.83 (2.22)  & \multicolumn{1}{c}{66.71 (0.46)}   \\ \hdashline 
\texttt{DANN}              & \multicolumn{1}{c}{88.80 (1.17)} & 78.32 (3.23) & \multicolumn{1}{c}{65.10 (0.93)}  & 56.68 (0.59)  & \multicolumn{1}{c}{58.25 (1.15)} & 53.61 (1.61)  & \multicolumn{1}{c}{90.57 (0.22)} & 69.58 (1.38)  & \multicolumn{1}{c}{67.48 (1.19)}   \\ 
\texttt{CDANN}             & \multicolumn{1}{c}{89.75 (0.14)} & 80.75 (2.97) & \multicolumn{1}{c}{64.05 (1.33)}  & 58.68 (0.22)  & \multicolumn{1}{c}{58.19 (0.93)} & 54.45 (1.40)  & \multicolumn{1}{c}{90.46 (0.30)} & 70.37 (1.44)  & \multicolumn{1}{c}{66.12 (0.37)}   \\ 
\texttt{G2DM}              & \multicolumn{1}{c}{89.40 (2.27)} & 79.61 (2.94) & \multicolumn{1}{c}{67.75 (2.69)}  & 59.65 (1.61)  & \multicolumn{1}{c}{57.62 (0.39)} & 53.93 (0.31)  & \multicolumn{1}{c}{87.57 (0.37)} & 66.69 (1.15)  & \multicolumn{1}{c}{56.98 (2.77)}   \\ 
\texttt{CORAL}             & \multicolumn{1}{c}{90.13 (0.52)} & 83.14 (1.27) & \multicolumn{1}{c}{66.12 (1.48)}  & 59.62 (1.17)  & \multicolumn{1}{c}{51.41 (2.63)} & 44.95 (3.64)  & \multicolumn{1}{c}{90.41 (0.20)} & 69.53 (2.00)  & \multicolumn{1}{c}{70.96 (1.06)}   \\ 
\texttt{GROUPDRO}          & \multicolumn{1}{c}{90.50 (1.75)} & 81.07 (6.12) & \multicolumn{1}{c}{67.08 (1.67)}  & 58.51 (0.12)  & \multicolumn{1}{c}{54.37 (2.98)} & 46.21 (5.69)  & \multicolumn{1}{c}{90.65 (0.20)} & 71.21 (1.51)  & \multicolumn{1}{c}{70.63 (0.04)}   \\ 
\texttt{MIXUP}             & \multicolumn{1}{c}{88.49 (0.86)} & 76.78 (2.49) & \multicolumn{1}{c}{63.03 (1.53)}  & 56.21 (1.20)  & \multicolumn{1}{c}{52.13 (2.54)} & 34.60 (16.81)  & \multicolumn{1}{c}{89.75 (0.05)} & 68.73 (1.36)  & \multicolumn{1}{c}{69.58 (0.99)}   \\ 
\texttt{IRM}               & \multicolumn{1}{c}{85.78 (1.11)} & 74.80 (1.73) & \multicolumn{1}{c}{62.43 (2.70)}  & 54.96 (1.78)  & \multicolumn{1}{c}{26.96 (1.11)} & 16.25 (1.87)  & \multicolumn{1}{c}{84.65 (0.31)} & 64.30 (2.44)  & \multicolumn{1}{c}{49.54 (1.08)}   \\
\texttt{SELFREG}               & \multicolumn{1}{c}{90.33 (0.14)} & 82.20 (0.93) & \multicolumn{1}{c}{68.23 (2.47)}  & 60.28 (0.90)  & \multicolumn{1}{c}{50.58 (2.35)} & 42.15 (4.63)  & \multicolumn{1}{c}{91.47 (0.12)} & 73.88 (0.37)  & \multicolumn{1}{c}{69.18 (0.68)}   \\
\texttt{FISH}               & \multicolumn{1}{c}{90.65 (0.25)} & 79.09 (2.46) & \multicolumn{1}{c}{62.69 (0.63)}  & 56.97 (0.49)  & \multicolumn{1}{c}{56.53 (1.32)} & 52.23 (1.47)  & \multicolumn{1}{c}{89.92 (0.20)} & 70.58 (0.90)  & \multicolumn{1}{c}{69.46 (0.47)}   \\ \hdashline
\texttt{EWC}               & \multicolumn{1}{c}{89.18 (1.72)} & 79.59 (4.63) & \multicolumn{1}{c}{68.31 (3.31)}  & 61.34 (2.18)  & \multicolumn{1}{c}{66.53 (1.26)} & 50.63 (5.35)  & \multicolumn{1}{c}{89.47 (0.17)} & 59.09 (7.70)  & \multicolumn{1}{c}{45.58 (4.92)}   \\ \hdashline 
\texttt{CIDA}              & \multicolumn{1}{c}{87.25 (0.88)} & 77.91 (0.23) & \multicolumn{1}{c}{65.38 (2.77)}  & 58.15 (0.88)  & \multicolumn{1}{c}{53.42 (4.35)} & 35.21 (17.85)  & \multicolumn{1}{c}{91.29 (0.16)} & 70.19 (1.45)  & \multicolumn{1}{c}{65.10 (0.12)}   \\ 
\texttt{DRAIN}             & \multicolumn{1}{c}{86.78 (0.65)} & 74.57 (1.82) & \multicolumn{1}{c}{67.44 (4.65)}  & 57.76 (3.42)  & \multicolumn{1}{c}{67.09 (4.06)} & 59.49 (8.31)  & \multicolumn{1}{c}{89.62 (0.39)} & 70.36 (2.32)  & \multicolumn{1}{c}{64.67 (0.65)}   \\
\texttt{TKNets}             & \multicolumn{1}{c}{91.76 (0.16)} & \textbf{83.35 (1.32)}  & \multicolumn{1}{c}{64.19 (0.95)}  & 59.94 (0.18)  & \multicolumn{1}{c}{74.39 (0.23)} & 71.03 (0.37)  & \multicolumn{1}{c}{92.11 (0.26)} & 75.04 (1.16)  & \multicolumn{1}{c}{64.05 (0.64)}   \\
\texttt{LSSAE}\tablefootnote{We have observed that the training process of \texttt{LSSAE} with our image encoders on image datasets fails to converge.}             & \multicolumn{1}{c}{90.21 (1.95)} & 80.92 (3.53) & \multicolumn{1}{c}{66.43 (0.81)}  & 61.22 (0.71)  & \multicolumn{1}{c}{33.30 (2.14)} & 18.83 (3.85)  & \multicolumn{1}{c}{60.48 (4.99)} & 50.35 (4.67)  & \multicolumn{1}{c}{22.61 (0.25)}   \\
\texttt{DDA}             & \multicolumn{1}{c}{72.06 (4.51)} & 48.81 (0.97) & \multicolumn{1}{c}{65.26 (3.20)}  & 56.16 (2.45)  & \multicolumn{1}{c}{\textbf{78.18 (0.88)}} & 73.70 (0.31)  & \multicolumn{1}{c}{86.72 (0.56)} & 67.60 (2.66)  & \multicolumn{1}{c}{70.12 (1.10)}   \\ \hdashline 
\texttt{AIRL}              & \multicolumn{1}{c}{\textbf{92.28 (0.27)}} & 82.81 (2.70) & \multicolumn{1}{c}{\textbf{73.50 (2.21)}}  & \textbf{63.29 (1.26)}  & \multicolumn{1}{c}{77.49 (0.86)} & \textbf{74.99 (0.57)}  & \multicolumn{1}{c}{\textbf{93.10 (0.21)}} & \textbf{78.22 (0.92)}   & \multicolumn{1}{c}{\textbf{73.04 (0.67)}}  \\ \hline
\end{tabular}}
\end{table*}
\renewcommand*{\arraystretch}{1.} 


In this section, we present the experimental results of the proposed \texttt{AIRL} and compare it with a wide range of existing algorithms on synthetic and real-world datasets. We first introduce the experimental setup and then present the empirical results.

\textbf{Experimental setup.}
Datasets and baselines used in the experiments are briefly introduced below. Their details are shown in Appendix~\ref{app:exp}.

\underline{\textit{Datasets.}} We consider five datasets: \textbf{Circle} \citep{pesaranghader2016fast} (a synthetic dataset containing 30 domains where each instance is sampled from 30 two-dimensional Gaussian distributions), \textbf{Circle-Hard} (a synthetic dataset adapted from the \textbf{Circle} dataset such that domains do not uniformly evolve), \textbf{RMNIST} (a semi-synthetic dataset constructed from MNIST \citep{lecun1998gradient} by $R$-degree counterclockwise rotation), \textbf{Yearbook} \citep{ginosar2015century} (a real dataset consisting of frontal-facing American high school yearbook photos from 1930--2013), and \textbf{CLEAR} \citep{lin2021clear} (a real dataset capturing the natural temporal evolution of visual concepts that spans a decade).

\underline{\textit{Baselines.}}
We compare the proposed \texttt{AIRL} with %Besides our proposed model, we also approach the evolving domain generalization problem by 
existing methods from related areas, including the following: empirical risk minimization (\texttt{ERM}), last domain training (\texttt{LD}), fine tuning (\texttt{FT}), domain invariant representation learning (\texttt{G2DM} \citep{albuquerque2019generalizing}, \texttt{DANN} \citep{ganin2016domain}, \texttt{CDANN} \citep{li2018deep}, \texttt{CORAL} \citep{sun2016deep}, \texttt{IRM} \citep{arjovsky2019invariant}), data augmentation (\texttt{MIXUP} \citep{zhang2018mixup}), continual learning (\texttt{EWC} \citep{kirkpatrick2017overcoming}), continuous DA (\texttt{CIDA} \citep{wang2020continuously}), distributionally robust optimization (\texttt{GroupDRO} \citep{sagawa2019distributionally}), gradient-based DG (\texttt{Fish} \citep{shi2021gradient}), contrastive learning-based DG (\texttt{SelfReg} \citep{kim2021selfreg}), and non-stationary DG (\texttt{DRAIN} \citep{bai2022temporal}, \texttt{TKNets} \citep{zeng2024generalizing}, \texttt{LSSAE} \citep{qin2022generalizing}, and \texttt{DDA} \citep{zeng2023foresee}).
To ensure a fair comparison, we adopt similar architectures for \texttt{AIRL} and baselines, including both representation mapping and classifier. The implementation details are in Appendix \ref{app:model}.

\begin{table}
\caption{ Ablation study for \texttt{AIRL} on \textbf{Circle-Hard} dataset under \textsf{Eval-D} scenario $(K=5)$.}\label{tab:2}
\centering
\resizebox{0.41\textwidth}{!}{
\begin{tabular}{ccccc}
\hline
$\lstm$ & $\trans$ & $\mathcal{L}_{inv}$ & $\oodavg$     & $\oodwrt$      \\ \hline
\xmark   & \cmark                  & \cmark                    & 69.06 & 61.05  \\ 
\cmark   & \xmark                  & \cmark                    & 65.51 & 58.69  \\ 
\cmark   & \cmark                  & \xmark                    & 68.33 & 60.16 \\ 
\cmark   & \cmark                  & \cmark                    & \textbf{73.50} & \textbf{63.29}  \\ \hline
\end{tabular}}
\end{table}

\underline{\textit{Evaluation method.}} In the experiments, models are trained on a sequence of source domains $\mathcal{D}_{src}$, and their performance is evaluated on target domains $\mathcal{D}_{tgt}$ under two different scenarios: \textsf{Eval-S} and \textsf{Eval-D}. In the scenario \textsf{Eval-S}, models are trained one time on the first half of domain sequence $\mathcal{D}_{src} = [D_1, D_2, \cdots, D_T]$ and are then deployed to make predictions on the second half of domain sequence $\mathcal{D}_{tgt} = [D_{T+1}, D_{T+2}, \cdots, D_{2T}] (K=T)$. In the scenario \textsf{Eval-D}, source and target domains are not static but are updated periodically as new data/domain becomes available. For each of these two scenarios, we use two accuracy measures, $\oodavg$ and $\oodwrt$, to evaluate the average- and worst-case performances. Their details are shown in Appendix~\ref{app:exp}. We train each model with 5 different random seeds and report the average prediction performances.

\textbf{Results.}
Next, we evaluate the model performance under the \textsf{Eval-D} scenario (results for \textsf{Eval-S} are in Appendix~\ref{app:exp}).

\underline{\textit{Non-stationary DG results.}} 
Performance of \texttt{AIRL} and baselines on synthetic (i.e., \textbf{Circle}, \textbf{Circle-Hard}) and real-world (i.e., \textbf{RMNIST}, \textbf{Yearbook}, \textbf{CLEAR}) data is presented in Table~\ref{tab:1}. We observe that \texttt{AIRL} achieves the best performance in most settings: it is the top method on every dataset and metric except $\oodwrt$ on \textbf{Circle} (where \texttt{TKNets} and \texttt{CORAL} are slightly better) and $\oodavg$ on \textbf{RMNIST} (where \texttt{DDA} is slightly better). These results indicate that \texttt{AIRL} can effectively capture non-stationary patterns across domains, and such patterns can be leveraged to learn models that generalize better on target domains compared to the baselines. Among baselines, methods designed specifically for non-stationary DG (i.e., \texttt{DRAIN}, \texttt{TKNets}, \texttt{LSSAE}) and the continual learning method (i.e., \texttt{EWC}) achieve better performance than other methods. However, such improvement is inconsistent across datasets. %(i.e., \texttt{EWC} and \texttt{LSSAE} for \textbf{Circle}; \texttt{EWC}, \texttt{DRAIN} and \texttt{LSSAE} for \textbf{Circle-Hard}; \texttt{EWC}, \texttt{DRAIN} and \texttt{DPNET} for \textbf{RMNIST}; \texttt{DPNET} for \textbf{Circle}). 

\underline{\textit{Comparison with non-stationary DG methods.}} \texttt{TKNets} assumes that the evolving pattern between two consecutive domains is constant and the distances between them are small. Thus, this method does not achieve good performance on the \textbf{Circle-Hard} dataset, where the distance between two consecutive domains is proportional to the domain index. \texttt{DRAIN} utilizes a Bayesian framework and generates the whole model at every domain. This method, however, is only applicable to small neural networks and does not scale well to real-world applications. Moreover, \texttt{TKNets}, \texttt{DRAIN}, and \texttt{DDA} can only generalize to a single subsequent target domain. \texttt{LSSAE} leverages a sequential variational auto-encoder~\citep{li2018disentangled} to learn non-stationary patterns. However, this model assumes the availability of aligned data across the domain sequence, which may pose challenges to its performance in non-stationary DG. In contrast, \texttt{AIRL} is not limited to constantly evolving patterns. It is also scalable to large neural networks and can handle multiple target domains. In particular, compared to the base model (\texttt{ERM}), our method has only one extra Transformer layer and one extra LSTM layer. Note that these layers are used during training only. In the inference stage, predictions are made by $\enc$ and the classifier pre-generated by $\lstm$, which results in an inference time similar to that of \texttt{ERM}.

\underline{\textit{Decision boundary visualization.}} We conduct a qualitative analysis of our method by visualizing its predictions on the \textbf{Circle-Hard} dataset. We train models (i.e., \texttt{AIRL} and \texttt{ERM}) on the first 10 domains (right half) and evaluate on the remaining 10 domains (left half). As depicted in Figure~\ref{fig:6}, our method, designed to capture non-stationary patterns across domains, generates more accurate predictions for target domains compared to \texttt{ERM}.

\underline{\textit{Ablation studies.}}
We conduct experiments to investigate the role of each component in \texttt{AIRL}. In particular, we compare \texttt{AIRL} with its variants; each variant is constructed by removing $\lstm$ (i.e., use fixed classifier instead), $\trans$ (i.e., use fixed representation instead), or $\mathcal{L}_{inv}$ (i.e., without invariant constraint) from the model. As shown in Table~\ref{tab:2}, model performance deteriorates when removing any of them. These results validate our theorems and demonstrate the effectiveness of each component. 

\begin{figure}
  \begin{center}
    \includegraphics[width=0.45\textwidth]{figs/uai2024.png}
  \end{center}
  \caption{Visualization of predictions on \textbf{Circle-Hard} dataset generated by \texttt{ERM} and \texttt{AIRL}.}
  \label{fig:6}
\end{figure}

\underline{\textit{Limitations.}} While \texttt{AIRL} outperforms existing methods on most datasets and metrics, we acknowledge certain limitations in our work. Regarding theoretical analysis, we presently lack an effective method to estimate non-stationary complexity from finite data. Concerning algorithm design, our method is unable to address scenarios where data from all source domains are not simultaneously available during training (i.e., online learning). Moreover, it may not generalize to every non-stationary environment, because its performance relies on the selection of the hypothesis classes $\mathcal{F}$ and $\mathcal{G}$.