\title{Appendix: Future Gradient Descent for Adapting the Temporal Shifting Data Distribution in Online Recommendation Systems}
\maketitle

\paragraph{Extra Notation}
We introduce several new notations for the appendix. We use $\left\langle \cdot,\cdot\right\rangle$ to denote the inner product between two vectors and use $\circ$ to denote the entrywise product.


\section{Proof of Theorem \ref{thm:fixed_meta_regret}} \label{apx:generalized_fgd}
\begin{proof}
We start with a simple decomposition using the triangle inequality: 
\[
\|u_{w,t}(\th_{t})\|\le\|u_{w,t}(\th_{t})-\bar{m}(\th_{t};t)\|+\|\bar{m}(\th_{t};t)\|.
\]
By the termination condition of Algorithm \ref{alg:main_generalized}, we have $\|\bar{m}(\th_{t};t)\|\le\delta$.
Furthermore, it follows from \eqref{eq:GG_diff} that 
\begin{align*}
 \|u_{w,t}(\th_{t})-\bar{m}(\th_{t};t)\|
= & \frac{1}{w}\|\nabla r_{t}(\th_{t})-m(\th_{t};t)\|.
\end{align*}
Hence, we obtain 
\begin{equation}\label{eq:one_step_local_regret}
    \|u_{w,t}(\th_{t})\|^2 \le \left(\delta + \frac{1}{w}\|\nabla r_{t}(\th_{t})-m(\th_{t};t)\|\right)^2 
    \leq 2\delta^2 + \frac{2}{w^2}\|\nabla r_{t}(\th_{t})-m(\th_{t};t)\|^2. 
\end{equation}
This further implies that 
\begin{equation}\label{eq:regret_key}
\R_{w}(T)=\frac{1}{T}\sum_{t=1}^{T}\|u_{w,t}(\th_{t})\|^{2}  \le\frac{2}{w^{2}T}\sum_{t=1}^{T}\|\nabla r_{t}(\th_{t})-m(\th_{t};t)\|^{2}+2\delta^{2}, % \\
% & \le\frac{2}{w^{2}T}\sum_{t=1}^{T}\sup_{\th}\|\nabla r_{t}(\th)-m(\th;\phi,t)\|^{2}+2\delta^{2}.
\end{equation}
and the main result follows from the fact that $\|\nabla r_{t}(\th_{t})-m(\th_{t};t)\|^{2} \leq \sup_{\th}\|\nabla r_{t}(\th)-m(\th;t)\|^{2}$ for all $t\in [T]$. Furthermore, under the boundedness assumption, we have for all $t\in [T]$ 
\begin{equation}
    \|\nabla r_{t}(\th_{t})-m(\th_{t};t)\|^{2} \leq \left(\|\nabla r_{t}(\th_{t})\|+\|m(\th_{t};t)\|\right)^2 \le 4M^2.  
\end{equation}
Hence, \eqref{eq:regret_key} also implies $\R_w(T) \leq {8M^2}/{w^2}+2\delta^2$, which leads to $\R_w(T)=O(1/w^2)$ when $\delta = 1/w$. 
\end{proof}

\section{Details of the Result in Section \ref{sec:opt_meta}} \label{apx:simple_fgd}
\paragraph{Algorithm.}
% We focus on the case when the MFGG is given by 
% \begin{equation*}
%     m(\theta; \phi,t) = \sum_{i=1}^b a_{i} \nabla r_{t-i}(\th).
% \end{equation*}
% Given $\th_{t}$, let $h_{t}(a_t)=\|\nabla r_{t}(\th_{t})-m(\th_{t};\phi, t)\|^{2}$
% where we view $\th_{t}$ as a constant. We can see that the regret bound in Theorem~\ref{thm:fixed_meta_regret}
% \begin{equation}
%     \R_w(T) \leq 2\delta^2 +  \frac{2}{w^2T} \sum_{t=1}^T h_t(\phi).
% \end{equation}
% The challenge here is that $h_t$ can only be accessed after $\phi$ is chosen. 
% The high-level idea is to treat this as another online learning problem. 
% The algorithm is presented in Algorithm~\ref{alg:main_simple}.
Given $\th_{t}$, define $h_{t}(\phi)=\|\nabla r_{t}(\th_{t})-m(\th_{t};\phi, t)\|^{2}$ as a function of $\phi$, where we view $\th_{t}$ as a constant. Thus, it follows from \eqref{eq:regret_key} that 
\begin{equation}\label{eq:regret_in_ht}
    \R_w(T) \leq \frac{2}{w^2T} \sum_{t=1}^T h_t(\phi_t) + 2 \delta^2.
\end{equation}
Thus, our goal is to minimize $\sum_{t=1}^T h_t(\phi_t)$ in an \emph{online} manner, since we can only access $h_t(\phi_t)$ after $\phi_t$ is chosen. To achieve this, 
we use the classic exponentiated gradient method to update $\phi_t$. Specifically, for any $\phi = [a_1,\dots,a_b]\in S_{b}$, define the negative-entropy potential function $\psi(\phi)=\sum_{i=1}^{b}a_{i}\log a_{i}$
and its Bregman divergence
% the Kullback-Leibler (KL) divergence 
\begin{equation*}
    \B_{\psi}(\phi;\phi') = \psi(\phi)-\psi(\phi')-\left\langle \nabla\psi(\phi'),\phi-\phi'\right\rangle = \sum_{i=1}^b a_i \log \frac{a_i}{a_i'}.
\end{equation*}
% \[
% \B_{\psi}(\phi_{1};\phi_{2})=\psi(\phi_{1})-\psi(\phi_{2})-\left\langle \nabla\psi(\phi_{2}),\phi_{1}-\phi_{2}\right\rangle .
% \]
% $\phi_{t+1}$ is obtained via Exponentiated gradient descent from $\phi_t$ with step size $\eta_\phi$. 
Then $\phi_{t+1}$ is given by
\begin{align*}
\phi_{t+1} =\argmin_{\phi\in S_{b}}\left(\langle \nabla h_{t}(\phi_{t}),\phi\rangle +\frac{1}{\eta_\phi}\B_{\psi}(\phi;\phi_{t})\right)
 =\frac{\phi_{t}\circ\exp(-\eta_\phi\nabla h_{t}(\phi_{t}))}{\|\phi_{t}\circ\exp(-\eta_\phi\nabla h_{t}(\phi_{t}))\|_{1}},
\end{align*}
where $\eta_{\phi}$ is the learning rate. 
See Section 6.6 in \citet{orabona2019modern} for the derivation of the last equality. Intuitively, $\frac{1}{\eta_\phi}\B_{\psi}(\phi;\phi_{t})$ stabilizes the algorithm by ensuring that $\phi_{t+1}$ remains close to $\phi_t$.
% such that the first-order Taylor approximation of $h_t$ is accurate.

This simplified version of FGD is summarized in Algorithm \ref{alg:main_simple}. Note that when updating $\phi$, we only use the last recommendation model $\th_t$.

\begin{algorithm*}[t]
\caption{Generalized Future Gradient Descent for Smoothed Regret (simplified version for the theoretical study)}\label{alg:main_simple}
\begin{algorithmic}
% \Require $n \geq 0$
% \Ensure $y = x^n$
\State \textbf{Input:} The learning rates $\eta$ and $\eta_\phi$ for updating $\th$ and $\phi$, respectively.
\State{Initialize $\phi_{1}=[1/b,\dots,1/b]$.}
\For{$t\in [T]$}
    \State{Deploy the prediction model $f_{\th_{t}}$ with the parameter $\th_t$ and collect the new dataset $\D_t$.}
    \State{Construct the function $h_t(\phi)=\|\nabla r_{t}(\th_{t})-m(\th_{t};\phi, t)\|^{2}$}
    % \State{Initialize $\phi_{t+1}=\phi_{t}$.}
    % \State{$\phi_{t+1}=\arg\min_{\phi\in S_{b}}\left\langle \nabla_{\phi}\|\nabla r_{t}(\th_{t})-m(\th_{t};\phi_{t},t)\|^{2},\phi\right\rangle +\frac{1}{\eta_{\phi}}\B_{\psi}(\phi;\phi_{t})$.} \Comment{Update $\phi$ by one step of mirror descent.}
    \State{$\phi_{t+1}=\frac{\phi_{t}\circ\exp(-\eta_\phi\nabla h_{t}(\phi_{t}))}{\|\phi_{t}\circ\exp(-\eta_\phi\nabla h_{t}(\phi_{t}))\|_{1}}$.} \Comment{One step of Exponentiated gradient descent from $\phi_t$}
    % \State{$a_{t+1,i} \leftarrow a_{t,i} \exp(-\eta'\nabla h_t(a_t)),\;\forall i \in[b]$.} \Comment{One step of Exponentiated gradient descent}
    % \State{$a_{t+1,i} \leftarrow {a_{t+1,i}}/{\sum_{i=1}^b} a_{t+1,i},\;\;\forall i \in[b]$.} 
    % \State{$\phi_{t+1}=\phi_{t+1}/||\phi_{t+1}||_{1}$.}
    % \Comment{Normalization}
    \State{Initialize the model parameter $\th_{t+1}$.}
    \While{$\|{\color{black}\bar{m}(\th_{t+1};\phi_{t+1},t+1)}\|\ge\delta$}
        \State{$\th_{t+1} = \th_{t+1}-\eta {\color{black}\bar{m}(\th_{t+1};\phi_{t+1}, t+1)}$.}
    \EndWhile
    % \State{Set $\th_{t+1}=\th'_{t+1}$}.
\EndFor
\end{algorithmic}
\end{algorithm*}

\begin{lemma}\label{lem:bounded_gradient}
    Suppose that we have $\|\nabla r_t(\th)\|\leq M$ for all $\theta \in \Theta$ and $t$. Then $\|\nabla h_t(\phi)\|_{\infty} \leq 8 M^2$ for all $\phi\in S_b$. 
\end{lemma}
\begin{proof}
    By definition, we have 
    \begin{equation*}
        h_t(\phi) = \|\nabla r_{t}(\th_{t})-\sum_{i=1}^b a_{i} \nabla r_{t-i}(\th_t)\|^2 = \|\sum_{i=1}^b a_{i} (\nabla r_{t}(\th_{t})- \nabla r_{t-i}(\th_t))\|^2, 
    \end{equation*}
    where we used the fact that $\sum_{i=1}^b a_{i}=1$. Direct computation shows that  
    \begin{align}
        \biggl|\frac{\partial h_t}{\partial a_{i}}(\phi)\biggr| &= 2\Bigl|\Bigl\langle \nabla r_{t}(\th_{t})- \nabla r_{t-i}(\th_t), \sum_{j=1}^b a_{j} (\nabla r_{t}(\th_{t})- \nabla r_{t-j}(\th_t))\Bigr\rangle\Bigr| \\
        & \leq 2\|\nabla r_{t}(\th_{t})- \nabla r_{t-i}(\th_t)\| \biggl\|\sum_{j=1}^b a_{j} (\nabla r_{t}(\th_{t})- \nabla r_{t-j}(\th_t))\biggr\| \label{eq:cauchy_schwarz}\\
        & \leq 2(\| \nabla r_{t}(\th_{t})\|+\| \nabla r_{t-i}(\th_t)\|) \biggl(\sum_{j=1}^b a_{j} (\|\nabla r_{t}(\th_{t})\|+\|\nabla r_{t-j}(\th_{t})\|)\biggr) \label{eq:triangle}\\
        & \leq 8M^2, \label{eq:bounded_gradient}
    \end{align}
    where we used the Cauchy--Schwarz inequality in \eqref{eq:cauchy_schwarz}, the triangle inequality in \eqref{eq:triangle} and the boundedness of the gradients in \eqref{eq:bounded_gradient}. Hence, we conclude that $\|\nabla h_t(\phi)\|_{\infty} \leq 8M^2$.  
\end{proof}

\paragraph{Proof of Theorem \ref{thm:opt_meta_regret}.}

Now we proceed to the proof of Theorem \ref{thm:opt_meta_regret}. This is a standard result in the online learning literature (see, e.g., \citet{orabona2019modern}). For completeness, we present the proof below.

\begin{proof}
As $\psi$ is $\lambda$-strongly convex with respect to $\|\cdot\|_1$ on $S_b$ with $\lambda=1$ (a consequence of Pinsker's inequality), we have
\begin{equation}
    \B_{\psi}(\phi;\phi') \geq \frac{1}{2}\|\phi-\phi'\|_1^2.
\end{equation}
% {\color{gray} It is not hard to show that $\psi$ is $\lambda$-strongly convex
% with $\lambda=1$. By Taylor theorem, we have 
% \[
% \psi(\phi_{1})=\psi(\phi_{2})-\left\langle \nabla\psi(\phi_{2}),\phi_{1}-\phi_{2}\right\rangle +\frac{1}{2}\left(\phi_{1}-\phi_{2}\right)^{\top}\nabla^{2}\psi(\phi_{3})\left(\phi_{1}-\phi_{2}\right),
% \]
% where $\phi_{3}$ is a convex combination of $\phi_{1}$ and $\phi_{2}$.
% We thus conclude that 
% \begin{align} \label{eq:ber_lb}
% \B_{\psi}(\phi_{1};\phi_{2}) =\frac{1}{2}\left(\phi_{1}-\phi_{2}\right)^{\top}\nabla^{2}\psi(\phi_{3})\left(\phi_{1}-\phi_{2}\right) \ge\frac{\lambda}{2}\|\phi_{1}-\phi_{2}\|^{2},
% \end{align}
% where the last inequality is by the $\lambda$-strongly convexity
% of $\psi$. }

% For any $\phi\in S_{b}$, define $\psi(\phi)=\sum_{i=1}^{b}a_{i}\log a_{i}$
% and its Bregman Divergence 
% \[
% \B_{\psi}(\phi_{1};\phi_{2})=\psi(\phi_{1})-\psi(\phi_{2})-\left\langle \nabla\psi(\phi_{2}),\phi_{1}-\phi_{2}\right\rangle .
% \]
% Suppose that $\phi_{t+1}$ is obtained via one-step of mirror descent with step size $\eta$
% \begin{align*}
%     \phi_{t+1} & =\arg\min_{\phi\in S_{b}}\left\langle \nabla h_{t},\phi\right\rangle +\frac{1}{\eta}\B_{\psi}(\phi;\phi_{t})\\
%      & =\frac{\phi_{t}\circ\exp(-\eta\nabla h_{t}(\phi_{t}))}{\|\phi_{t}\circ\exp(-\eta\nabla h_{t}(\phi_{t}))\|_{1}}.
% \end{align*}

Throughout the proof, we slightly abuse the notation by writing $\eta_{\phi}=\eta$ and $\nabla h_{t}=\nabla h_{t}(\phi_{t})$ for simplicity. Notice that by our update rule $\phi_{t+1}$ is given by
\begin{align*}
\phi_{t+1} & =\argmin_{\phi\in S_{b}}\left(\eta\langle \nabla h_{t},\phi\rangle + \B_{\psi}(\phi;\phi_{t})\right).% \\
 % & =\arg\min_{\phi\in S_{b}}\left\langle \eta\nabla h_{t},\phi\right\rangle +\psi(\phi)-\psi(\phi_{t})-\left\langle \nabla\psi(\phi_{t}),\phi-\phi_{t}\right\rangle \\
 % & =\arg\min_{\phi\in S_{b}}\left\langle \eta\nabla h_{t}-\nabla\psi(\phi_{t}),\phi\right\rangle +\psi(\phi),
\end{align*}
From the first-order optimality condition, we get for any $\phi\in S_{b}$,
\begin{align*}
    &\langle \eta \nabla h_t +\nabla\psi( \phi_{t+1})-\nabla\psi(\phi_t), \phi_{t+1}-\phi \rangle \leq 0 \\
    \Leftrightarrow \qquad &\eta \langle \nabla h_t, \phi_t-\phi \rangle \leq \eta  \langle \nabla h_t, \phi_t-\phi_{t+1} \rangle + \langle \nabla\psi( \phi_{t+1})-\nabla\psi(\phi_t), \phi-\phi_{t+1}\rangle \\
    \Leftrightarrow \qquad &\eta \langle \nabla h_t, \phi_t-\phi \rangle \leq \eta  \langle \nabla h_t, \phi_t-\phi_{t+1} \rangle -\B_{\psi}(\phi;\phi_{t+1})+ \B_{\psi}(\phi;\phi_{t})-\B_{\psi}(\phi_{t+1};\phi_{t}),
\end{align*}
where we used the three-point identity \citep{chen1993convergence} in the last step. 
% \begin{align*}
% 0 & \le\left\langle \eta\nabla h_{t}-\nabla\psi(\phi_{t}),v\right\rangle +\psi(v)-\left\langle \eta\nabla h_{t}-\nabla\psi(\phi_{t}),\phi_{t+1}\right\rangle -\psi(\phi_{t+1})\\
%  & =\left\langle \eta\nabla h_{t}-\nabla\psi(\phi_{t}),v-\phi_{t+1}\right\rangle +(\psi(v)-\psi(\phi_{t+1}))\\
%  & \le\left\langle \eta\nabla h_{t}-\nabla\psi(\phi_{t}),v-\phi_{t+1}\right\rangle +\left\langle \nabla\psi(\phi_{t+1}),v-\phi_{t+1}\right\rangle \\
%  & =\left\langle \eta\nabla h_{t}-\nabla\psi(\phi_{t}),v-\phi_{t+1}\right\rangle +\left\langle \nabla\psi(\phi_{t+1}),v-\phi\right\rangle \\
%  & =\left\langle \eta\nabla h_{t}+\nabla\psi(\phi_{t+1})-\nabla\psi(\phi_{t}),v-\phi_{t+1}\right\rangle ,
% \end{align*}
% where the last inequality is by the convexity of $\psi$. We also
% have, for any $v\in S_{b}$,
% \[
% \left\langle \eta\nabla h_{t},\phi_{t}-v\right\rangle =\left\langle \eta\nabla h_{t},\phi_{t}-\phi_{t+1}\right\rangle +\left\langle \eta\nabla h_{t},v-\phi_{t}\right\rangle .
% \]
Furthermore, 
\begin{align*}
    \eta \langle \nabla h_t, \phi_t-\phi_{t+1} \rangle-\B_{\psi}(\phi_{t+1};\phi_{t}) &\leq 
    \eta \|\nabla h_t\|_{\infty} \|\phi_t-\phi_{t+1}\|_1 -\frac{1}{2}\|\phi_t-\phi_{t+1}\|_1^2 \\
    &\leq \frac{\eta^2}{2}\|\nabla h_t\|_{\infty}^2 +\frac{1}{2}\|\phi_t-\phi_{t+1}\|_1^2-\frac{1}{2}\|\phi_t-\phi_{t+1}\|_1^2 \\
    &= \frac{\eta^2}{2}\|\nabla h_t\|_{\infty}^2.
\end{align*}
% Using $\left\langle x,y\right\rangle \le\frac{\|x\|^{2}}{2}+\frac{\|y\|^{2}}{2}$,
% we have
% \begin{align*}
% \left\langle \eta\nabla h_{t},\phi_{t}-\phi_{t+1}\right\rangle  & \le\frac{\eta^{2}}{2}\|\nabla h_{t}\|^{2}+\frac{1}{2}\|\phi_{t}-\phi_{t+1}\|^{2}\\
%  & \le\frac{\eta^{2}}{2}\|\nabla h_{t}\|^{2}+\B_{\psi}(\phi_{t+1};\phi_{t}).
% \end{align*}
% Here the last inequality is by (\ref{eq:ber_lb}). Also 
% \begin{align*}
% \left\langle \eta\nabla h_{t},\phi_{t+1}-v\right\rangle  & =\left\langle \eta\nabla h_{t}+\nabla\psi(\phi_{t+1})-\nabla\psi(\phi_{t}),\phi_{t+1}-v\right\rangle -\left\langle \nabla\psi(\phi_{t+1})-\nabla\psi(\phi_{t}),\phi_{t+1}-v\right\rangle \\
%  & \le\left\langle \nabla\psi(\phi_{t+1})-\nabla\psi(\phi_{t}),v-\phi_{t+1}\right\rangle \\
%  & =\B_{\psi}(v;\phi_{t})-\B_{\psi}(v;\phi_{t+1})-\B_{\psi}(\phi_{t+1};\phi_{t}),
% \end{align*}
% where the last equality can be obtained by some algebra (see in \citet{chen1993convergence} for derivation).
Combining these two bounds, we have 
\begin{align*}
\eta\left\langle \nabla h_{t},\phi_{t}-\phi\right\rangle  % & \le\B_{\psi}(v;\phi_{t})-\B_{\psi}(v;\phi_{t+1})-\B_{\psi}(\phi_{t+1};\phi_{t})+\frac{\eta^{2}}{2}\|\nabla h_{t}\|^{2}+\B_{\psi}(\phi_{t+1};\phi_{t})\\
  \leq \B_{\psi}(\phi;\phi_{t})-\B_{\psi}(\phi;\phi_{t+1})+\frac{\eta^{2}}{2}\|\nabla h_{t}\|_{\infty}^{2}.
\end{align*}
Since $h_{t}(\phi)$ is convex in $\phi$, we have $h_{t}(\phi_{t})-h_{t}(\phi)\le\left\langle \nabla h_{t},\phi_{t}- \phi\right\rangle $ for any $ \phi\in S_{b}$. 
By telescoping, we obtain  
\begin{align*}
\sum_{t=1}^{T}(h_{t}(\phi_{t})-h_{t}(\phi)) & \le\sum_{t=1}^{T}\left\langle \nabla h_{t},\phi_{t}-\phi\right\rangle \\
 & \le\frac{1}{\eta}\sum_{t=1}^{T}\left[\B_{\psi}(\phi;\phi_{t})-\B_{\psi}(\phi;\phi_{t+1})+\frac{\eta^2}{2}\|\nabla h_{t}\|_{\infty}^{2}\right]\\
 & =\frac{1}{\eta}(\B_{\psi}(\phi;\phi_{1})-\B_{\psi}(\phi;\phi_{T+1}))+\frac{\eta}{2}\sum_{t=1}^{T}\|\nabla h_{t}\|_{\infty}^{2}\\
 & \le\frac{1}{\eta}\log b+{32\eta}M^4 T.
\end{align*}
% Since $\phi_{1}=[1/b,...,1/b]$, 
where we used Lemma~\ref{lem:bounded_gradient}, $\B_{\psi}(\phi;\phi_{T+1})\geq 0$ and $\B_{\psi}(\phi;\phi_{1})=\psi(\phi)+\log b\le\log b$ in the last inequality. 
% Also 
% \begin{align*}
% \|\nabla h_{t}\| & =\|\nabla r_{t}(\th_{t})-m(\th_{t};\phi,t)\|\\
%  & \le\|\nabla r_{t}(\th_{t})\|+\sum_{i=1}^{b}\alpha_{i}\|\nabla r_{t-i}(\th_{t})\|\\
%  & \le2M.
% \end{align*}
Choosing $\eta=c\sqrt{(\log b)/(TM^{4})}$ with some constant $c>0$
leads to  
\begin{align} \label{eq:meta_diff}
\sum_{t=1}^{T}[h_{t}(\phi_{t})-h_{t}(\phi)]\le O(M^2\sqrt{T\log b}).
\end{align}
Note that \eqref{eq:meta_diff} holds for any $\phi \in S_b$. In particular, we can set $\phi=\phi^*$ defined by $\phi^{*}=\argmin_{\phi\in S_{b}} \sum_{t=1}^{T} h_t(\phi)$. Therefore, 
\begin{align*}
    \sum_{t=1}^{T} h_t(\phi_t) & \le\sum_{t=1}^{T}h_t(\phi^*)+O(M^2\sqrt{T\log b})\\
     & = \min_{\phi\in S_b}\sum_{t=1}^{T}\|\nabla r_{t}(\th_t)-m(\th_t;\phi,t)\|^{2}+O(M^2\sqrt{T\log b}) \\
     & \leq \min_{\phi\in S_b}\sum_{t=1}^{T} \sup_{\th }\|\nabla r_{t}(\th)-m(\th;\phi,t)\|^{2}+O(M^2\sqrt{T\log b})
     = \min_{m\in\mathcal{M}}Q[T;m]+O(M^2\sqrt{T\log b}).
\end{align*}
% \[
% \phi^{*}=\argmin_{\phi\in S_{b}} \sum_{t=1}^{T} h_t(\phi_t)\leq \argmin_{\phi\in S_{b}}\sum_{t=1}^{T}\sup_{\th}||\nabla r_{t}(\th)-m(\th;\phi,t)||^{2}/T
% \]  
% Now, following the argument of the proof of Theorem \ref{thm:fixed_meta_regret}, we
% have 
% \[
% \R_{w}(T)\le\frac{2}{w^{2}T}\sum_{t=1}^{T}\|\nabla r_{t}(\th_{t})-m(\th_{t};\phi_{t},t)\|^{2}+2\delta^{2}.
% \]

% As $S_{b}$ is compact, we define 
% \[
% \phi^{*}=\arg\min_{\phi\in S_{b}}\sum_{t=1}^{T}\sup_{\th}||\nabla r_{t}(\th)-m(\th;\phi,t)||^{2}/T
% \]
% and thus $m(\cdot;\phi^{*},\cdot)=\arg\min_{m\in\mathcal{M}}\sum_{t=1}^{T}\sup_{\th}||\nabla r_{t}(\th)-m(\th;\phi,t)||^{2}/T$.
% % As $\mathcal{M}$ is compact, we define
% % \[
% % m(\th;\phi^{*},t)=\arg\min_{m\in\mathcal{M}}\sum_{t=1}^{T}\sup_{\th}\|\nabla r_{t}(\th)-m(\th;\phi,t)\|^{2}/T.
% % \]
% Choosing $\phi=\phi^*$, \eqref{eq:meta_diff} gives that
% \begin{align*}
% \sum_{t=1}^{T}\|\nabla r_{t}(\th_{t})-m(\th_{t};\phi_{t},t)\|^{2} & \le\sum_{t=1}^{T}\|\nabla r_{t}(\th_{t})-m(\th_{t};\phi^{*},t)\|^{2}+O(M^2\sqrt{T\log b})\\
%  & = \min_{m\in\mathcal{M}}\sum_{t=1}^{T}\sup_{\th}\|\nabla r_{t}(\th)-m(\th;\phi,t)\|^{2}+O(M^2\sqrt{T\log b}).
% \end{align*}
We thus conclude from \eqref{eq:regret_in_ht} that 
\[
\R_{w}(T)\le \frac{2}{w^{2}T}(\min_{m\in\mathcal{M}}Q[T;m]+O(M^2\sqrt{T\log b})) + 2\delta^{2}.
\]
\end{proof}

\section{A Practical Generalized FGD Algorithm} \label{apx:generalize_fgd}

\begin{algorithm*}[t]
\caption{Generalized Future Gradient Descent for Smoothed Loss}\label{alg:main_generalized}
\begin{algorithmic}
\State \textbf{Input:} The learning rates $\eta$ and $\eta_\phi$ for updating $\th$ and $\phi$, respectively. The initial trajectory buffer $B$.
\For{$t\in [T]$}
    \State{Deploy the prediction model $f_{\th_{t}}$ with parameter $\th_t$. Then collect the new dataset $\D_t$.}
    \State{Initialize the parameter of MFGG $\phi_{t+1}$.} \Comment{Initialization of $\phi_{t+1}$ is user-specific.}       
    \For{Inner loop iteration $k\in [K]$} \Comment{Update the meta network.}
        \State{$\phi_{t+1} \leftarrow \phi_{t+1}-\eta_{\phi}\sum_{\th\in B}\nabla_{\phi}\|m(\th;\phi_{t+1},t)-\nabla r_{t}(\th)\|^{2}$.} \Comment{May replace with the mini-batch version.}
    \EndFor
    \State{Initialize the trajectory buffer $B = \emptyset$ and model parameter $\th_{t+1}$.} \Comment{Initialization scheme of $\th_{t+1}$ is specified by user.}
    \While{$\|{\bar{m}(\th_{t+1};\phi_{t+1},t+1)}\|\ge\delta$} \Comment{Alternatively, we may run gradient descent with a fixed number of iterations.}
        \State{$\th_{t+1} \leftarrow \th_{t+1}-\eta {\bar{m}(\th_{t+1};\phi_{t+1},t+1)}$.} \Comment{May replace with the mini-batch version.}
        \State{$B \leftarrow B \cup \{ \th_{t+1} \}$} \Comment{Alternatively, we may update the trajectory buffer $B$ every a few iterations.}
    \EndWhile
\EndFor
\end{algorithmic}
\end{algorithm*}

Compared with FGD in Algorithm \ref{alg:main}, we use a smoothed version of MFGG $\bar{m}$ for training, which is due to the consideration of minimizing a smoothed loss in (\ref{eq:dynamic_regret_smooth}). For completeness, we also summarize the practical algorithm of the generalized version of FGD in Algorithm \ref{alg:main_generalized}.

% \section{Comparison Gradient Variation With Other Measure of Domain Divergence} \label{apx:compare_gv}
% In Proposition \ref{prop:BU}, the domain discrepancy is characterized by the gradient variation term (i.e,m how the gradient differs). There are some other discrepancy measure between the domains. Examples include the $\mathcal{H}$-divergence \citep{kifer2004detecting} between $\D$ and $\D'$ defined
% as $d_{\mathcal{H}}(\D,\D')=\sup_{\th}\|\mathbb{E}_{\mathcal{D}}\ell(f_{\th}(x),y)-\mathbb{E}_{\mathcal{D}'}\ell(f_{\th}(x),y)\|$
% and the $\mathcal{H}\Delta\mathcal{H}$ divergence \citep{ben2010theory} defined as $d_{\mathcal{H}\Delta\mathcal{H}}(\D,\D')=\sup_{\th,\th}\|\mathbb{E}_{\D}\ell(f_{\th},f_{\th'})-\E_{\D'}\ell(f_{\th},f_{\th'})\|$.
% Overall, the commonly used divergence measure has the general form
% of $\sup_{\th}\|\mathbb{E}_{\mathcal{D}}g_{\th}-\mathbb{E}_{\mathcal{D}'}g_{\th}\|$, where $g_{\th}$ is a test function parameterized by $\th$. The $\mathcal{H}$-divergence chooses $g_{\th}=\ell(f_{\th}(x),y)$
% and the $\mathcal{H}\Delta\mathcal{H}$ divergence first extends the
% parameter space $\Theta$ to the product space $\Theta\otimes\Theta$
% and let $g_{(\th,\th')}=\ell(f_{\th}(x),f_{\th'}(x))$ for any $(\th,\th')\in\Theta\otimes\Theta$.
% The gradient variation chooses $g_{\th}=\nabla\ell(f_{\th},y)$. As
% we consider the local regret for non-convex problem where
% the closeness to the first-order stationary is the main concern,
% using gradient as the test function is a natural fit.