\section{Model Details}\label{app:model}

% \subsection{Pseudo codes for \texttt{AIRL}'s learning and inference processes}

% \begin{algorithm2e}[H]
% \SetAlgoLined
% \KwIn{Training datasets from $T$ source domains $\{D_t\}_{t=1}^T$, \textit{representation network} = $\{\enc$, $\trans\}$, \textit{classification network} = $\{ \lstm, h_1 \}$, $\alpha$, $n$}
% \KwOut{Trained $\enc, \trans, \lstm, h^{\ast}_1$}

% \setcounter{AlgoLine}{0}
% $L_{inv} = 0, L_{cls} = 0$ \\
% \tcc{Estimate $\{w^t_y\}_{y \in \mathcal{Y}, t < T}$  for important weighting}
% \For{$t=1:T-1$}{
% \For{$y \in \mathcal{Y}$}{
% $w^t_y = P^{Y=y}_{D_{t+1}} / P^{Y=y}_{D_{t}}$
% }
% }
% \tcc{Learn weights for $\enc, \trans, \lstm$}
% \While{learning is not end}{
% Sample batch $\mathcal{B} = \left\{ x_t, y_t \right\}_{t=1}^T \sim \{D_t\}_{t=1}^T$ where $\left\{ x_t, y_t \right\} = \left\{ x^j_t, y^j_t \right\}_{j=1}^n$\\
% $z_{1} = \enc\left( x_{1} \right)$ \\
% \For{$t=1:T-1$}{
%     $z_{t+1} = \enc\left( x_{t+1} \right)$ \\
%     $\widehat{z}_t = \trans\left( z_{\leq t} \right) $ \\
%     $\{\widehat{z}_t(w), y_t(w)\} =$ Reweight $\{\widehat{z}_t, y_t\}$ with $w^t = \{w^t_y\}_{y \in \mathcal{Y}}$ \\
%     % \For{$y=1:|\mathcal{Y}|$}{
%     % $\widehat{z}_t(y) = \{\widehat{z}^j_t: y^j_t = y  \}$ \\
%     % Reweight $\widehat{z}_t(y)$ with $w_y$ \\
%     % }
%     Calculate $L_{inv}^t$ from $\widehat{z}_t(w), z_{t+1}$ by Eq. (\ref{eq:inv}) \\
%     $L_{inv} = L_{inv} + L_{inv}^t$ \\
%     \If{$t>1$}{
%     $h_t = \lstm \left( h_{<t} \right)$ \\
%     }
%     Calculate $L_{cls}^t$ from $y_t(w), y_{t+1}, h_t\left( \widehat{z}_t(w) \right), h_t\left( z_{t+1} \right)$ by Eq. (\ref{eq:cls}) \\
%     $L_{cls} = L_{cls} + L_{cls}^t$ \\
% }
% Update $\enc, \trans, \lstm, \widehat{h}_1$ by optimizing $L_{inv} + \alpha L_{cls}$ \\
% }
% \caption{Learning process for \texttt{AIRL}}
% \label{alg:train}
% \end{algorithm2e}

% \begin{algorithm2e}[H]
% \label{alg:fatdm}
% \SetAlgoLined
% \KwIn{Testing dataset from target domain $D_t (t \in \{ T+1, \cdots, T+K \})$, trained $\enc, \lstm, h^{\ast}_1$}
% \KwOut{Predictions for testing dataset}

% \setcounter{AlgoLine}{0}

% \For{$t'=2:(t-1)$}{
% $h^{\ast}_{t'} = \lstm \left ( h^{\ast}_{< t'} \right )$ 
% }
% \While{inference is not end}{
% Sample batch $\mathcal{B} = x_{t} \sim D_{t}$ \\
% $z_{t} = \enc \left( x_{t} \right)$ \\
% Generate predictions $h^{\ast}_{t-1} \left ( z_{t} \right )$ \\
% }

% \caption{Inference process for \texttt{AIRL}}
% \label{alg:test}
% \end{algorithm2e}

% \subsection{Details of model architectures}

Our proposed model \texttt{AIRL} consists of three components: (i) an encoder $\enc$ that maps inputs to representations (i.e., equivalent to $g_t$ in our theoretical results), (ii) a transformer layer $\trans$ that helps to enforce the invariant representation (i.e., $\enc$ + $\trans$ is equivalent to $f_t$ in our theoretical results), and (iii) a classification network $\lstm$ that generates classifiers mapping representations to the output space. At each target domain, the $\lstm$ layer is used to generate the new classifier based on the sequence of previous classifiers. The detailed architectures of these networks used in our experiments are presented in Tables~\ref{tab:s3} and~\ref{tab:s4} below.

\renewcommand{\arraystretch}{1.3}
\begin{table}[H]
\centering
\caption{Detailed architecture of \texttt{AIRL} for \textbf{RMNIST} (\textbf{n\_channel} = 1, \textbf{n\_output} = 10), \textbf{Yearbook} (\textbf{n\_channel} = 3, \textbf{n\_output} = 1), and \textbf{CLEAR} (\textbf{n\_channel} = 3, \textbf{n\_output} = 10) datasets. }\label{tab:s3}
\resizebox{\linewidth}{!}{%
\begin{tabular}{ll}
\hline
Networks                                  & Layers                              \\ \hline
\multirow{16}{*}{Encoder $\enc$} & Conv2d(input channel = \textbf{n\_channel}, output channel = 32, kernel = 3, padding = 1) \\
                                            & BatchNorm2d                                                             \\
                                            & ReLU                                                                    \\
                                            & MaxPool2d                                                               \\
                                            & Conv2d(input channel = 32, output channel = 32, kernel = 3, padding = 1)\\  
                                            & BatchNorm2d                                                             \\
                                            & ReLU                                                                    \\
                                            & MaxPool2d                                                               \\
                                            & Conv2d(input channel = 32, output channel = 32, kernel = 3, padding = 1)\\  
                                            & BatchNorm2d                                                             \\
                                            & ReLU                                                                    \\
                                            & MaxPool2d                                                               \\
                                            & Conv2d(input channel = 32, output channel = 32, kernel = 3, padding = 1)\\  
                                            & BatchNorm2d                                                             \\
                                            & ReLU                                                                    \\
                                            & MaxPool2d                                                               \\ \hline
\multirow{7}{*}{Transformer $\trans$} & $Q$: Linear(input dim = 32, output dim = 32) \\  
                                          & $K$: Linear(input dim = 32, output dim = 32)                                \\  
                                          & $V$: Linear(input dim = 32, output dim = 32) \\ 
                                          & $U$: Linear(input dim = 32, output dim = 32) \\ 
                                          & Linear(input dim = 32, output dim = 32) \\ 
                                          & BatchNorm1d                                \\ 
                                          & LeakyReLU \\ \hline
\multirow{3}{*}{Classification Network $\lstm$}          & Linear(input dim = (32 * 32 + 32) + (32 * \textbf{n\_output} + \textbf{n\_output}), output dim = 128) \\ 
                                          & LSTM(input dim = 128, output dim = 128)                                \\ 
                                          & Linear(input dim = 128, output dim = (32 * 32 + 32) + (32 * \textbf{n\_output} + \textbf{n\_output})) \\ \hline
\multirow{3}{*}{$\widehat{h}_t$ (Output of $\lstm$)}          & Linear(input dim = 32, output dim = 32) \\ 
                                          & ReLU                                \\ 
                                          & Linear(input dim = 32, output dim = \textbf{n\_output}) \\ \hline
\end{tabular}}
\end{table}
\renewcommand{\arraystretch}{1.}
\newpage
\renewcommand{\arraystretch}{1.3}
\begin{table}[H]
\centering
\caption{Detailed architecture of \texttt{AIRL} for \textbf{Circle} and \textbf{Circle-Hard} datasets.}\label{tab:s4}
\resizebox{\linewidth}{!}{%
\begin{tabular}{ll}
\hline
Networks                                  & Layers                              \\ \hline
\multirow{7}{*}{Encoder $\enc$}                   & Linear(input dim = 2, output dim = 32) \\  
                                          & ReLU                                \\  
                                          & Linear(input dim = 32, output dim = 32) \\ 
                                          & ReLU                                \\ 
                                          & Linear(input dim = 32, output dim = 32) \\ 
                                          & ReLU                                \\ 
                                          & Linear(input dim = 32, output dim = 32) \\ \hline
\multirow{7}{*}{Transformer $\trans$} & $Q$: Linear(input dim = 32, output dim = 32) \\  
                                          & $K$: Linear(input dim = 32, output dim = 32)                                \\  
                                          & $V$: Linear(input dim = 32, output dim = 32) \\ 
                                          & $U$: Linear(input dim = 32, output dim = 32) \\ 
                                          & Linear(input dim = 32, output dim = 32) \\ 
                                          & BatchNorm1d                                \\ 
                                          & LeakyReLU \\ \hline
\multirow{3}{*}{Classification Network $\lstm$}          & Linear(input dim = (32 * 32 + 32) + (32 * 1 + 1), output dim = 128) \\ 
                                          & LSTM(input dim = 128, output dim = 128)                                \\ 
                                          & Linear(input dim = 128, output dim = (32 * 32 + 32) + (32 * 1 + 1)) \\ \hline
\multirow{3}{*}{$\widehat{h}_t$ (Output of $\lstm$)}          & Linear(input dim = 32, output dim = 32) \\ 
                                          & ReLU                                \\ 
                                          & Linear(input dim = 32, output dim = 1) \\ \hline
\end{tabular}}
\end{table}
\renewcommand{\arraystretch}{1.}

% \renewcommand{\arraystretch}{1.2}
% \begin{table}[h]
% \centering
% \caption{Detailed architecture of \texttt{AIRL} for \textbf{RMNIST} dataset.}\label{tab:s4}
% \begin{tabular}{ll}
% \hline
% Networks                                  & Layers                              \\ \hline
% Evolving Network $E$                        & LSTM(input dim = 800, output dim = 800) \\ \hline
% \multirow{7}{*}{Representation Mapping $G$} & Conv2d(input channel = 1, output channel = 32, kernel = 3, padding = 1) \\  
%                                           & BatchNorm2d                                \\
%                                           & ReLU                                \\
%                                           & MaxPool2d                                \\
%                                           & Conv2d(input channel = 32, output channel = 32, padding = 1) \\  
%                                           & BatchNorm2d                                \\
%                                           & ReLU                                \\
%                                           & MaxPool2d                                \\
%                                           & Linear(input dim = 32, output dim = 16) \\ 
%                                           & ReLU                                \\ 
%                                           & Linear(input dim = 16, output dim = 16) \\ \hline
% \multirow{3}{*}{Classifier $H$}             & Linear(input dim = 16, output dim = 16) \\ 
%                                           & ReLU                                \\ 
%                                           & Linear(input dim = 16, output dim = 10) \\ \hline
% \end{tabular}
% \end{table}
% \renewcommand{\arraystretch}{1.}

% \renewcommand{\arraystretch}{1.2}
% \begin{table}[h!]
% \centering
% \caption{Detailed architecture of \texttt{AIRL} for \textbf{Yearbook} dataset.}\label{tab:s5}
% \begin{tabular}{ll}
% \hline
% Networks                                    & Layers                                                                  \\ \hline
% Evolving Network $E$                        & LSTM(input dim = 2112, output dim = 2112)                               \\ \hline
% \multirow{7}{*}{Representation Mapping $G$} & Conv2d(input channel = 3, output channel = 32, kernel = 3, padding = 1) \\  
%                                             & BatchNorm2d                                                             \\
%                                             & ReLU                                                                    \\
%                                             & MaxPool2d                                                               \\
%                                             & Conv2d(input channel = 32, output channel = 32, kernel = 3, padding = 1)\\  
%                                             & BatchNorm2d                                                             \\
%                                             & ReLU                                                                    \\
%                                             & MaxPool2d                                                               \\
%                                             & Conv2d(input channel = 32, output channel = 32, kernel = 3, padding = 1)\\  
%                                             & BatchNorm2d                                                             \\
%                                             & ReLU                                                                    \\
%                                             & MaxPool2d                                                               \\
%                                             & Conv2d(input channel = 32, output channel = 32, kernel = 3, padding = 1)\\  
%                                             & BatchNorm2d                                                             \\
%                                             & ReLU                                                                    \\
%                                             & MaxPool2d                                                               \\
%                                             & Linear(input dim = 32, output dim = 32)                                 \\ 
%                                             & ReLU                                                                    \\ 
%                                             & Linear(input dim = 32, output dim = 32)                                 \\ \hline
% \multirow{3}{*}{Classifier $H$}             & Linear(input dim = 32, output dim = 32)                                 \\ 
%                                             & ReLU                                                                    \\ 
%                                             & Linear(input dim = 32, output dim = 1)                                  \\ \hline
% \end{tabular}
% \end{table}
% \renewcommand{\arraystretch}{1.}

\newpage