\textbf{Notation.}
We denote the integer set $\{1,2,\dots,b\}$ by $[b]$. Moreover,  $\|\cdot\|$ denotes the $\ell_2$ vector norm, $\|\cdot\|_1$ denotes the $\ell_1$ vector norm, and $S_b=\{a\in\mathbb{R}^{b}:a_{i}\ge0,\ \|a\|_{1}=1\}$ is the probability simplex.


\begin{figure}
\begin{centering}
\includegraphics[scale=0.16]{fig/tdg.pdf}
\par\end{centering}
\vspace{-0.1cm}
\caption{Illustration of the temporal domain generalization problem where the distribution of the training set of $f_{\th_t}$ mismatches the distribution of its test set $\D_t$ at time $t$.}\label{fig:tgd}
\vspace{-0.2cm}
\end{figure}

\section{Problem and Background}
\paragraph{Temporal Domain Generalization.} Consider an online classification problem with the feature space $\mathcal{X}$ and the label space $\mathcal{Y}$.    
Our goal is to learn an accurate prediction model $f_\th:\mathcal{X} \rightarrow \mathcal{Y}$ parameterized by $\th \in \Theta$ from a stream of datasets $\D_1,\dots,\D_T$ in $T$ consecutive rounds.  
% We consider the situation that the data distribution (i.e., domain) is gradually shifting along time. 
Specifically,  
at round $t$, we choose a model parameter $\th_t \in \Theta$ and deploy our prediction model $f_{\theta_t}$.  
Then we observe the dataset $\D_t$ with $n_t$ labeled examples $\D_t = \{(x_t^{(i)},y_t^{(i)})\}_{i=1}^{n_t}$ drawn from a certain data distribution $\P_{t}$, where $x_t^{(i)}$ are the input features and $y_t^{(i)}$ is the associated label. 
% We assume that $\P_{t}$ changes gradually with time $t$. 
% And at each time $t$ we want to deploy a machine learning model $f_{\th_{t}}$ that is able to give accurate predictions on the incoming (testing) data $\D_t$ from domain $\P_{t}$. 
Thus, for a given loss function $\ell:\mathcal{Y}\times \mathcal{Y} \rightarrow \mathbb{R}_+$, the empirical loss of our prediction model at time $t$ is given by $r_{t}(\th)=\E_{(x,y)\sim\D_{t}}\ell(f_{\th}(x),y)$. 
% Define the loss of model $f_\th$ over a data $\D_{t}$ at time $t$ as $r_{t}[\th]:=\E_{(x,y)\sim\D_{t}}\ell[f_{\th}(x),y]$, our objective is thus {\color{red} change data to dataset}
Moreover, we consider the situation where the data distribution $\P_t$ (i.e., domain) is gradually changing over time. 
A natural performance metric for our learning algorithm is the temporal average of the test loss suffered by the prediction model: 
% We are mainly interested in the following temporal average of test loss
\begin{align} 
   \label{eq:dynamic_regret}
   \frac{1}{T}\sum_{t=1}^{T}r_{t}(\th_{t}).
\end{align}  
% To measure the performance of the online learning algorithm in such a changing environment, a standard metric used in the literature is the average \emph{dynamic regret} \citep{zinkevich2003online}:
% \begin{equation} \label{eq:dynamic_regret}
%   \frac{1}{T}\sum_{t=1}^{T}[r_{t}(\th_{t})-r_t(\th_{t}^*)], 
% \end{equation}
% where $\th_{t}^* = \argmin_{\th} r_t(\th)$ is the optimal  model parameter at round $t$. Intuitively, it measures the performance gap between our deployed model and the optimal model averaged over $T$ rounds. 
We remark that $T$, which denotes the total number of rounds in the online process, is typically large in practice. 
% \ruichen{Here I introduced the dynamic regret}

The key challenge is temporal domain generalization. Indeed, at time $t$ we train our prediction model $f_{\th_t}$ using the previously observed examples $\cup_{i\in\{1,\dots,t-1\}}\D_{i}$, and, due to the temporal shift of the domain, the distribution of the test set does not match the distribution of the training set. Such a mismatch between the training and testing domains results in domain generalization error. See Fig.~\ref{fig:tgd} for an illustration.


Our formulation is motivated by online recommendation
systems that aim to advertise items to users given user features. The domain is gradually changing because of the flux in the content that gets continuously added/removed from the system \citep{he2014practical,ye2020adaptive}. As the recommendation model needs to be deployed for serving, it is hard to update its parameters in real time \citep{cervantes2018evaluating,wang2020practical,peng2021learning}. The training process is thus discretized: the model parameters are updated periodically, with the hope that the model generalizes well to its test domain.



\begin{algorithm}[t!]
\caption{Batch and Incremental Update}\label{alg:buiu}
\begin{algorithmic}
\State \textbf{Input:} The learning rate $\eta$ for updating the parameter $\th$, the time window size $b$, and the stopping threshold $\delta$.
\For{$t\in [T]$}
    \State{Deploy the prediction model $f_{\th_{t}}$ with parameter $\th_t$.}
    \State{Collect the new dataset $\D_t$.}
    \State{Initialize $\th_{t+1}$.}
    \While{$\|\frac{1}{b}\sum_{i=0}^{b-1}\nabla r_{t-i}(\th_{t+1})\|\ge\delta$}
        \State{$\th_{t+1} \leftarrow \th_{t+1}-\eta\frac{1}{b}\sum_{i=0}^{b-1}\nabla r_{t-i}(\th_{t+1}).$}
    \EndWhile
\EndFor
\end{algorithmic}
\end{algorithm}
\begin{figure}[t!]
\begin{centering}
\includegraphics[scale=0.2]{fig/arch.pdf}
\par\end{centering}
\caption{Comparing (1) the ideal update where the future information $\nabla r_t$ can be accessed at training time; (2) the batch update; and (3) our proposed approach.} \label{fig:compare}
\vspace{-0.5cm}
\end{figure}


\begin{algorithm*}[t]
\caption{Future Gradient Descent}\label{alg:main}
\begin{algorithmic}
% \Require $n \geq 0$
% \Ensure $y = x^n$
\State \textbf{Input:} The learning rates $\eta$ and $\eta_\phi$ for updating the model parameter $\th$ and the MFGG parameter $\phi$, respectively. The initial trajectory buffer $B$.
\For{$t\in [T]$}
    \State{Deploy the prediction model $f_{\th_{t}}$ with parameter $\th_t$. Then collect the new dataset $\D_t$.}
    \State{Initialize the parameter of MFGG $\phi_{t+1}$.} \Comment{Initialization of $\phi_{t+1}$ is user-specified.}
    \For{Inner loop iteration $k\in [K]$} \Comment{Update the meta network.}
        \State{$\phi_{t+1} \leftarrow \phi_{t+1}-\eta_{\phi}\sum_{\th\in B}\nabla_{\phi}\|m(\th;\phi_{t+1},t)-\nabla r_{t}(\th)\|^{2}$.} \Comment{May replace with the mini-batch version.}
    \EndFor
    \State{Initialize the trajectory buffer $B = \emptyset$ and model parameter $\th_{t+1}$.} \Comment{Initialization scheme of $\th_{t+1}$ is specified by user.}
    \While{$\|m(\th_{t+1};\phi_{t+1},t+1)\|\ge\delta$} \Comment{Alternatively, we may run gradient descent with a fixed number of iterations.}
        \State{$\th_{t+1} \leftarrow \th_{t+1}-\eta m(\th_{t+1};\phi_{t+1},t+1)$.} \Comment{May replace with the mini-batch version.}
        \State{$B \leftarrow B \cup \{ \th_{t+1} \}$} \Comment{Alternatively, we may update the trajectory buffer $B$ only every few iterations.}
    \EndWhile
\EndFor
\end{algorithmic}
\end{algorithm*}
\paragraph{Batch and Incremental Update.}
Batch Update (BU) \citep{hazan2017efficient,wang2020practical} is a widely used updating pipeline for training the recommendation model in temporally shifting domains. At each time $t$, the model parameters are updated using the gradient of the average of the losses $r_{t},\dots, r_{t-b+1}$, where $b$ is the time window size indicating how many of the most recently observed datasets are used. BU with $b=1$ is also known as Incremental Update (IU). We summarize the pipeline in Algorithm~\ref{alg:buiu}, where $\nabla r_{s}$ with $s \le 0$ is defined as $0$. See also the illustration of BU with $b=2$ in the second plot of Fig.~\ref{fig:compare}. It is noteworthy that the initialization scheme of $\th_{t+1}$ for the update at each time is problem-dependent and user-specified. For example, we can initialize $\th_{t+1}$ as $\th_{t}$ or as $\th_{t-b+1}$ if we consider the one-pass training setting \citep{zheng2020shadowsync,ye2020adaptive,du2021alternate}.









