\section{Theory}
In this section, we study the advantage of the proposed FGD over BU and IU theoretically using recent advances in non-convex online learning. Specifically, we show that FGD is able to perform better than BU and IU in terms of the so-called \textit{local regret} \citep{hazan2017efficient,hallak2021regret}, which measures the algorithm's performance by comparing it with the best one can achieve in hindsight.

\subsection{Local Regret}
To upper bound the average loss in \eqref{eq:dynamic_regret} in a changing environment, one standard approach is to study the average dynamic regret \citep{zinkevich2003online}: 
\begin{align} \label{eq:convex_regret}
    \frac{1}{T}\sum_{t=1}^{T}[r_{t}(\th_{t})-\min_{\th\in\Theta}r_{t}(\th)],
\end{align}
which uses the global minimum of $r_{t}$ as a benchmark when evaluating the performance at time $t$.
% To minimize the average dynamic regret in \eqref{eq:dynamic_regret}, the majority of existing works focus on \emph{convex} loss functions and many efficient methods have been proposed \ruichen{Could cite a few papers here}.   
However, in modern recommendation systems the prediction model $f_\th$ is given by a deep neural network, and thus the resulting loss function $r_{t}(\theta)$ is highly non-convex. This means finding an approximate global minimum of $r_{t}$ is computationally intractable, making it hopeless to derive any meaningful bound on the average dynamic regret in \eqref{eq:convex_regret}.  
% Instead of using the suboptimal gap to measure the model parameter $\theta_t$ as in \eqref{eq:dynamic_regret}, approximate stationary point. 
% This leads to the local dynamic regret first proposed by \citep{hazan2017efficient}. 
To remedy this issue, we adopt the notion of \emph{local regret} proposed by \cite{hazan2017efficient}. 
Specifically, given $\{\th_{t}\}_{t=1}^{T}$ generated
by an online learning algorithm, the average \emph{local regret} is defined as % 
\begin{equation}\label{eq:local_regret}
\R(T):=\frac{1}{T}\sum_{t=1}^{T}\|\nabla r_{t}(\th_{t})\|^{2}.
\end{equation}
% \ruichen{Why is there a factor of $1/T$?}
Compared with \eqref{eq:convex_regret}, in \eqref{eq:local_regret} we evaluate the model parameters in terms of the first-order stationarity, and thus it can be viewed as the non-convex counterpart of the dynamic regret in \eqref{eq:convex_regret}. 
% Intuitively, $\R(T)$ measures how $\th_{t}$ is close to the first order stationary point of $r_{t}$,
% the idea updating using future information. 
In particular, a small value of $\R(T)$ implies a small gradient on average, 
%$\|\nabla r_t (\th_t)\|$, 
suggesting that the algorithm achieves near-optimal performance locally in the long run. % similar convergence property as the ideal update. 
% \ruichen{We can relate the local regret in \eqref{eq:local_regret} with the dynamic regret in \eqref{eq:dynamic_regret} via KL property?}

More generally, when the smoothed loss \eqref{eq:dynamic_regret_smooth} is considered, one can use the average \emph{$w$-local regret} accordingly as in \citep{hazan2017efficient}:
\[
\R_{w}(T):=\frac{1}{T}\sum_{t=1}^{T}\|\nabla u_{w,t}(\th_{t})\|^{2}, 
\]
where we evaluate $\th_t$ using the smoothed loss function $u_{w,t}(\th):=\frac{1}{w}\sum_{i=0}^{w-1}r_{t-i}(\th)$. 
% The overall interpretation of the $w$-local dynamic regret $\R_{w}(T)$ is the same as $\R(T)$, but it uses a sliding window with width $w$ over the previous datasets when calculating the gradient. 
In the following, we will focus our analysis on $\R_{w}(T)$, as choosing $w=1$ also covers the standard local regret in \eqref{eq:local_regret}. 
% We also note that the smoothed local regret corresponds to evaluating the model parameter $\th_t$ over the datasets $\cup_{i=0}^{w-1} D_{t-i}$, which is observed as a more robust evaluation metric in practice \citep{he2014practical}.
% It corresponds to the local dynamic regret of a smoothed objective $\frac{1}{T}\sum_{t=1}^{T}u_{w,t}(\th_{t})$ serving as a more robust evaluation metric in practice \citep{he2014practical}.


% It is noteworthy that $\R_{w}(T)$
% is a dynamic regret in the sense that a sequence of instantaneous
% minimizers are used as references:
% \[
% \R_{w}(T)=\frac{1}{T}\sum_{t=1}^{T}\Big[\|\nabla u_{w,t}(\th_{t})\|^{2}-\underset{\text{Reference}}{\underbrace{\min_{\th}\|\nabla u_{w,t}(\th)\|^{2}}}\Big].
% \]

\subsection{Regret of Batch Update}
\citet{hazan2017efficient} analyzed the average $w$-local regret $\R_w(T)$ for BU. We recall their result below but offer a different interpretation from the domain generalization perspective. % {\color{red} show we interpret the bound from a different perspective.}
% \begin{assumption} \label{asm:regular}
% Assuming that there exists $L,M<\infty$ such that for any $t\in[M]$
% and $\th,\th'$, $|r_{t}(\th)|\le M$ and $\|\nabla r_{t}(\th)-\nabla r_{t}(\th')\|\le L\|\th-\th'\|$. 
% \end{assumption}
\begin{proposition} [\citep{hazan2017efficient,hallak2021regret}]\label{prop:BU}
% Consider the regret $\R_{w}(T)$,
With the choice of the window size $b=w$, 
the $w$-local regret incurred by BU in Algorithm~\ref{alg:buiu} satisfies
\begin{equation*}%\label{eq:BU_regret}
    \begin{aligned}
        \R_{w}(T) & \le\underset{\text{optimization error}}{\underbrace{2{\textstyle \sum_{t=1}^{T}}
        \| \nabla u_{w,t-1}(\theta_t)
        % \frac{1}{w}{\textstyle \sum_{i=0}^{w-1}}\nabla r_{t-1-i}(\th_{t})
        \|^{2}/T}
        }+\underset{\text{domain generalization}}{\underbrace{2V_{w}(T)/w^{2}}}\\
         & \le2\delta^{2}+\frac{2}{w^{2}}V_{w}(T), \\
        \text{where\  } & V_{w}(T)=\frac{1}{T}\sum_{t=1}^{T}\sup_{\th}\|\nabla r_{t}(\th)-\nabla r_{t-w}(\th)\|^{2}.
        \end{aligned}
\end{equation*} 
% where $V_{w}(T):=\frac{1}{T}\sum_{t=1}^{T}\sup_{\th}\|\nabla r_{t}(\th)-\nabla r_{t-w}(\th)\|^{2}$. 
Furthermore, if $\|\nabla r_t(\th)\| \leq M<\infty$ for all $\th\in \Theta$ and $t \geq 0$, choosing $\delta = O(1/w)$ gives $\R_w(T)=O(1/w^2)$,
which is minimax optimal.
\end{proposition}
% \paragraph{Remark: interpretation of the bound. }

The previous works \citep{hazan2017efficient,hallak2021regret} are  interested in the worst-case guarantee of the BU algorithm, and the result in Proposition \ref{prop:BU} only serves as an intermediate result. However, we observe that this regret bound also offers interesting insights from the perspective of domain generalization. To be specific, we can decompose it into two terms: 

\emph{The optimization error}: this is due to the fact that we only seek a $\delta$-approximate stationary point of the smoothed training loss function $u_{w,t-1}(\theta)$ at round $t$.  It is controllable in the sense that $\delta$ can be made arbitrarily small by running more iterations of gradient descent. Indeed, under a standard smoothness assumption on $r_i$, we can achieve $\|\nabla u_{w,t-1}(\theta_t)\|\le\delta$ within $O(\delta^{-2})$ iterations. The optimization error term thus corresponds to how well we train the recommendation model in each round.

\emph{The domain generalization error}: this is due to the fact that the test set $\cup_{i=0}^{w-1} D_{t-i}$ for evaluating $\th_t$ is different from the training set $\cup_{i=1}^{w} D_{t-i}$. It is typically the dominant term in the regret bound and will not vanish even when $\delta=0$. 
In some sense, it captures the level of variability in the data distributions, similar to the gradient variation term in \citep{chiang2012online,rakhlin2013online}. We also note that the domain generalization error decreases w.r.t. $w$. This is because when $w$ increases, the overlap between the
training set and the test set becomes larger (i.e., the training set and the test set deviate less)\footnote{Such an overlapping mechanism is the key to defending against adversaries in non-convex games and we refer to Section 2.3 in \citep{hazan2017efficient} for more details.}.
 

In summary, the optimization error term characterizes how well our model performs on the training set, while the domain generalization error term characterizes how much the test set deviates from the training set.
% how well the model performs on the test set, in which the mismatch between the training and test set matters.
% {\color{gray} The regret bound consists of two terms. The optimization error is
% upper bounded by $2\delta^{2}$. It corresponds to how well
% we optimize the recommendation such that it is close to the first
% order stationary. Since the optimization error is calculated based
% on the examples used for training, it can be viewed as how well we
% train the model. The gradient variation term \citep{chiang2012online,rakhlin2013online} uses gradient information
% to characterize how the future domain deviates from the past domains.
% As the training set is constructed from the past domains (i.e., for
% BU, the training set is constructed by mixing the latest two past
% domains) and the future domain gives the distribution of the test
% set, the gradient variation term is hence a domain generalization
% error term. For example, take $w=1$, $V_{w}(T)$ is then
% a temporal average of $\sup_{\th}\|\nabla r_{t}(\th)-\nabla r_{t-1}(\th)\|^{2}$.
% Since $r_{t-1}$ is used for training $f_{\th_{t}}$ and $r_{t}$
% is the loss when $f_{\th_{t}}$ is deployed, $V_{w}(T)$ is hence
% a measure of the difference between training/testing set. Intuitively,
% the more $\P_{t}$ is close to $\P_{t-1}$, the more $\nabla r_{t}$
% is close to $\nabla r_{t-1}$ and the smaller $V_{w}(T)$ is. In summary, the optimization error characterizes how well we train the model and the gradient variation term characterizes how well the model performs in the testing set, in which how the training and testing set differs matters. }

% \paragraph{Remark: the smoothing effect.}
% Note that the gradient variation term is decreasing w.r.t. $w$.
% This is because when $w$ is larger, the overlap between the
% training set ($\frac{1}{w}{\textstyle \sum_{i=0}^{w-1}}r_{t-1-i}$
% for BU) and the test set ($\frac{1}{w}{\textstyle \sum_{i=0}^{w-1}}r_{t-i}$
% for BU) is larger and hence the training/testing set deviates less.
% Such overlapping mechanism is also the key to defending
% adversaries in non-convex games and we refer to Section 2.3 in \citet{hazan2017efficient} for more details. 

\paragraph{Comparison with other measures of domain divergence.}
% In Proposition \ref{prop:BU}, the domain discrepancy is characterized by the gradient variation term (i.e,m how the gradient differs). There are some other discrepancy measure between the domains such as $\mathcal{H}$-divergence \citep{kifer2004detecting} and the $\mathcal{H}\Delta\mathcal{H}$ divergence \citep{ben2010theory}. We draw the connection between the gradient variation and the other measure in Appendix~\ref{apx:compare_gv}.
In Proposition \ref{prop:BU}, the domain discrepancy is characterized in terms of the gradient variation (i.e., how much the gradients of the loss functions differ). Some other domain discrepancy measures have also been proposed. Examples include the $\mathcal{H}$-divergence \citep{kifer2004detecting} between $\D$ and $\D'$ defined
as $d_{\mathcal{H}}(\D,\D')=\sup_{\th}\|\mathbb{E}_{\mathcal{D}}\ell(f_{\th}(x),y)-\mathbb{E}_{\mathcal{D}'}\ell(f_{\th}(x),y)\|$
and the $\mathcal{H}\Delta\mathcal{H}$ divergence \citep{ben2010theory} defined as $d_{\mathcal{H}\Delta\mathcal{H}}(\D,\D')=\sup_{\th,\th'}\|\mathbb{E}_{\D}\ell(f_{\th},f_{\th'})-\E_{\D'}\ell(f_{\th},f_{\th'})\|$.
Overall, the commonly used divergence measures share the general form
of $\sup_{\th}\|\mathbb{E}_{\mathcal{D}}g_{\th}-\mathbb{E}_{\mathcal{D}'}g_{\th}\|$, where $g_{\th}$ is a test function parameterized by $\th$. The $\mathcal{H}$-divergence uses $g_{\th}=\ell(f_{\th}(x),y)$
and the $\mathcal{H}\Delta\mathcal{H}$ divergence first extends the
parameter space $\Theta$ to the product space $\Theta\otimes\Theta$
and then lets $g_{(\th,\th')}=\ell(f_{\th}(x),f_{\th'}(x))$ for any $(\th,\th')\in\Theta\otimes\Theta$.
The gradient variation uses $g_{\th}=\nabla_{\th}\ell(f_{\th}(x),y)$. As
we consider the local regret for non-convex problems where the goal is to find a first-order stationary point,
using the gradient as the test function is a natural fit.


\subsection{The Headroom of Batch Update} \label{sec:headroom}
In the previous subsection, we saw that BU achieves the minimax regret, so at first sight it seems there is no room for further improvement. However, we note that this only implies that BU is optimal in the \emph{worst-case sense}, i.e., when the future data distribution is completely uncorrelated with the previous ones. 
This is hardly the case in reality: the drift in the data distribution normally happens in a gradual manner, and the data distribution in the past should be informative of the future.  
Hence, the natural question is: can we do better than BU in a gradually changing environment?

The discussion after Proposition~\ref{prop:BU} suggests that the only hope for improvement lies in reducing the domain generalization error $V_{w}(T)$. To illustrate the headroom, we start with Meta Gradient Descent (MGD), a `helper algorithm' that extends BU and serves as an intermediate step towards the proposed FGD. Assume that we are given a sequence of gradient generators $\{m(\cdot;t)\}_{t=1}^T$. 
%To accommodate the smoothed loss, 
Then MGD uses a smoothed gradient generator given by 
\[
\bar{m}(\th;t)=\frac{1}{w}\left(m(\th;t)+\sum_{i=1}^{w-1}\nabla r_{t-i}(\th)\right),
\]
for updating, yielding Algorithm \ref{alg:fgd_wo_meta}. 
\begin{algorithm}[t!]
\caption{Meta Gradient Descent: a helper algorithm}\label{alg:fgd_wo_meta}
\begin{algorithmic}
\State \textbf{Input:} The learning rate $\eta$ for updating the parameter $\th$.
\For{$t\in [T]$}
    \State{Deploy the prediction model $f_{\th_{t}}$ with parameter $\th_t$.}
    \State{Collect the new dataset $\D_t$.}
    \State{Construct the smoothed gradient generator $\bar{m}(\cdot;t+1)$.}
    \State{Initialize $\th_{t+1}$.}
    \While{$\|\bar{m}(\th_{t+1};t+1)\|\ge\delta$}
        \State{$\th_{t+1} \leftarrow \th_{t+1}-\eta \bar{m}(\th_{t+1};t+1)$}
    \EndWhile
\EndFor
\end{algorithmic}
\end{algorithm}

By substituting $\nabla r_{t-w}(\cdot)$ for $m(\cdot;t)$, MGD reduces to BU with $b=w$. Comparing $\bar{m}(\cdot;t)$ with $\nabla u_{w,t}$, the true gradient on the test set, we see that 
\begin{equation}\label{eq:GG_diff}
\bar{m}(\th;t)-\nabla u_{w,t}(\th)=\frac{1}{w}(m(\th;t)-\nabla r_{t}(\th)),
\end{equation}
suggesting that MGD introduces a general gradient generator $m(\th;t)$ as a proxy for $\nabla r_{t}(\th)$, similar to FGD. On the other hand, we note that the gradient generator in MGD is pre-specified, while FGD parametrizes the gradient generator $m$ with $\phi$ and optimizes it on the fly.

From this perspective, BU in Algorithm \ref{alg:buiu} in fact implicitly uses $m(\cdot;t) = \nabla r_{t-w}$ to approximate $\nabla r_t$, which explains why $V_w(T)$ depends on the difference between these two terms. While such a design makes sense in the very limited case where the sequence of domains is known to have a period of $w$, it might not be a savvy choice in general. More generally, one can construct $m$ from the observed datasets $\D_{t-1},\dots,\D_{t-b}$ based on some mapping parameterized by $\phi \in \Phi$. For instance, such a mapping can be given by a deep neural network as described in Section \ref{sec:method}. In this way, MGD enables a mechanism that utilizes the past domains more flexibly to predict the future gradient information $\nabla r_t$ when it can be forecast in a more general form.

% On closer inspection, we can see that BU in fact implicitly uses the dataset $D_{t-w}$ as the proxy for the unseen dataset $D_t$, which explains why $V_{w}(T)$ depends on the difference between $\nabla r_t$ and $\nabla r_{t-w}$. 
% While such design makes sense when the sequence of data distributions is known to have a period of $w$, this may not always be a savvy choice:   \ruichen{TODO: argue why this is not a good idea in general}

% In general, $\delta$ can be set to be very small: under standard smoothness assumption on $r_i$, for any $\delta$, we can find $\th_t$ such that $\|\sum_{i=1}^{w-1}\nabla r_{t-i}(\th_t)/w\|\le\delta$ within $O(\delta^{-1})$ iterations. It implies that the gradient variation $V_{w}(T)$ is in general a dominating term in the regret bound. Reducing the gradient variation error is the key to improve the regret.

% {\color{gray} Notably, the gradient variation term depends on the algorithm and the problem,
% and it is highly related to whether we are able to utilize the past domains
% to predict the future domain. Here are two extreme cases:
% \begin{itemize}
% \item \emph{Periodical environment.} When the domains are $w$-periodical,
% i.e., $r_{t+w}=r_{t}$ for all $t$, and we are interested in
% a smoothed regret $\R_{w}(T)$ with $w$ window size, choosing
% BU with $b=w$ gives $V_{w}[T]=0$.
% \item \emph{Adversarial environment.} When the future domain is unpredictable using
% the past domains, for any $w$, we have $V_{w}[T]=O(1)$.
% This is also when IU/BU matches the lower bound.
% \end{itemize}
% In real world recommendation systems, we shall not expect the adversarial environment as the future domain should be predictable to a certain extent using the information of past domains. However, the BU is only able to reduce the regret in an very limited case with a periodical environment. It fails to provide a mechanism that utilizes the past domains when the gradient of loss of the future domain can be forecasted with a more general form.
% Thus, a natural question is: 
% \emph{Can we modify the algorithm to improve the gradient variation term?}} 

% A simplified version of FGD that does not optimize the MFGG gives a positive answer. \ruichen{``does not optimize the MFGG'' sounds weird...} Let 
% As the first step towards our FGD algorithm, we consider a meta FGD algorithm where we explicitly introduce a general gradient generator $m_{t}(\th)$ as the proxy for $\nabla r_t(\th)$.{\color{violet} We note that this is in the same spirit as the optimistic methods widely used in convex online learning \citep{chiang2012online,rakhlin2013online}.}% \ruichen{Note that we use $m_t$ as an approx. of $r_t$ rather than of its gradient, which seems more natural to me}
% To accommodate the $w$-local regret, we further consider a smoothed gradient generator given by 
% \begin{equation}\label{eq:proxy_loss}
% \bar{m}_{t+1}(\th)=\frac{1}{w}\left(m_{t+1}(\theta)+\sum_{i=0}^{w-2} \nabla r_{t-i}(\th)\right),
% \end{equation}
% which is a weighted average of $m_t(\theta)$ and the gradient of the loss function at $\th$ over the observed datasets $D_{t-w+2},\dots,D_{t}$. 
% At round $t$, we update the model parameter by running gradient descent with the simulated gradient $\bar{m}_{t+1}$
% % by running gradient descent on $\bar{m}_{t+1}$
% until reaching the desired accuracy $\delta$. 
% The algorithm is presented in Algorithm~\ref{alg:fgd_wo_meta}. In particular, we note that it exactly recovers BU in Algorithm~\ref{alg:buiu} when we set $m_{t}=\nabla r_{t-w}$.  
% be the updating direction of the recommendation model at time $t$, where $\sum_{i=1}^{0}r_{t-i}$ is defined as 0 by convention.
% We introduce this `intermediate' algorithm in Algorithm \ref{alg:fgd_wo_meta}.

% Slightly different from the MFGG in  Algorithm \ref{alg:main}, here we add a smooth term $\sum_{i=1}^{w-1}r_{t-i}(\th)$ because we consider a more general smoothed regret $\R_w$ and when $w=1$, it reduces to the FGD in Algorithm \ref{alg:main} with MFGG that is fixed and not optimized (i.e., $K=0$).
% and this motivates
% us to actively optimize our training algorithm of the recommendation model
% to reduce the generalization error by forecasting the future domain.
% However, the current IU/BU algorithm fail to provide such mechanism
% and is only able to reduce the regret bound  It is thus naturally to consider
% a more powerful algorithm to improve IU/BU when the future domain
% can be predicted by a specific combination of past domains, which
% serves as the main motivation of our proposed FGD algorithm.
% \subsubsection{Fixed Meta Network} \label{sec:fix_meta}

\begin{theorem} \label{thm:fixed_meta_regret}
The $w$-local regret incurred by Algorithm \ref{alg:fgd_wo_meta} satisfies
\begin{equation*}
 \R_{w}(T)\le 2\delta^{2}+\frac{2}{w^{2}}Q(T;m),% \\
% \text{where}\ \  & Q_{m}(T):=\frac{1}{T}\sum_{t=1}^{T}\|\nabla r_{t}(\th_t)-m_t(\th_t)\|^{2}.
\end{equation*}
where $Q(T;m):=\frac{1}{T}\sum_{t=1}^{T}\sup_{\th}\|\nabla r_{t}(\th)-m(\th;t)\|^{2}$. Furthermore, if  both $\|\nabla r_t\|$ and $\| m(\cdot;t)\|$ are upper bounded by $M<\infty$ for all $\th\in \Theta$ and $t\geq 0$, we recover the minimax regret $\R_w(T) = O(1/w^2)$ when $\delta=O(1/w)$.
\end{theorem}
% Theorem~\ref{thm:fixed_meta_regret} can be viewed as a generalization of Proposition~\ref{prop:BU}. 
Theorem~\ref{thm:fixed_meta_regret} shows that we can greatly improve the regret of BU by reducing the domain generalization error $Q(T;m)$ if $m$ is properly chosen. Specifically, suppose that $\mathcal{M}$---the hypothesis class of $m$---is rich enough to model the dynamics of the data distribution, in the sense that there exists $m^*\in \mathcal{M}$ satisfying 
\begin{equation*}
    Q(T;m^*) := \frac{1}{T}\sum_{t=1}^{T}\sup_{\th}\|\nabla r_{t}(\th)- m^*(\th;t)\|^{2} = O\left(\frac{1}{T}\right).
\end{equation*}
Then the domain generalization error of MGD equipped with $m^*$ tends to zero at the rate of $1/T$, in contrast to being a non-vanishing dominant term in BU.  
On the other hand, we can still maintain essentially the same regret bound as BU in the worst case, and thus the improvement almost comes for free. 

% Compared with BU, once $\mathcal{M}$, the hypothesis class of $m$, is rich enough and $m$ is properly chosen, we can expect to greatly improve the regret of BU (i.e., $\min_{m\in\mathcal{M}}Q(T;m) \le V_{w}(T)$). On the other hand, we can still maintain essentially the same regret bound as BU in the worst case, and thus the improvement almost comes for free.   

% When $\mathcal{M}$ is rich enough to model the dynamic of the data distribution, in the sense that $\exists m^*\in \mathcal{M}$ satisfying 
% \begin{equation*}
%     Q(T;m^*) := \frac{1}{T}\sum_{t=1}^{T}\sup_{\th}\|\nabla r_{t}(\th)- m^*(\th;t)\|^{2} = O\left(\frac{1}{T}\right),
% \end{equation*}
% the domain generalization term of MGD equipped with $m^*$ tends to zero at the rate of $1/T$, in contrast to being a non-vanishing dominant term in BU.  

In the following subsection, we show that it is indeed possible for FGD to achieve a local regret bound comparable to the one given by MGD with the optimal gradient generator $m^*$ in $\mathcal{M}$.

% it fails to provide a mechanism that utilizes the past domains when the gradient information of the future domain can be forecasted with a more general form. Indeed, MGD with a proper design of $m$ improves over BU.

% {\color{gray} Since $\min_{m\in\mathcal{M}}Q_{m}(T)\le V_{w}[T]$ where $\mathcal{M}$ denotes the hypothesis class of the meta network $m$, when $m$ is properly chosen, we are able to give smaller gradient variation term than $V_{w}(T)$. Specially, we are able to achieve 0 gradient variation in the following case:
% \begin{itemize}
% \item \emph{Predictable environment}. When there exists a map $\gamma$ such that $\nabla r_{t}=\gamma(\D_{t-1},...,\D_{t-b})$ for any $t$, choosing $m=\gamma$ gives $Q_{m}(T)=0$.
% \end{itemize}
% The regret is reduced only when a good $m$ is chosen. In the following section, we show that FGD with optimized MFGG is able to achieve similar regret as that of Algorithm \ref{alg:fgd_wo_meta} with the static oracle $m^* = \arg\min_{m\in\mathcal{M}}Q_m(T)$ chosen.}

\subsection{Regret Bound of FGD} \label{sec:opt_meta}
% We again use an online learning algorithm to update the parameters in the MFGG component.  

% Now we proceed to analyze the regret of FGD when the MFGG is optimized during the training. 
To simplify the analysis, we consider the case
where the gradient generator at round $t$ is given by a linear model: 
\begin{equation}\label{eq:linear_model}
    m(\th;\phi,t)=\sum_{i=1}^{b}a_{i}\nabla r_{t-i}(\th),
\end{equation}
where $\phi=[a_{1},\dots,a_{b}]\in S_b$ is the parameter. The hypothesis class $\mathcal{M}$ is thus $\mathcal{M}=\{\{m(\cdot;\phi,t)\}_{t=1}^{T}: m(\cdot;\phi,t)=\sum_{i=1}^{b}a_{i}\nabla r_{t-i}(\cdot),\ \phi\in S_{b}\}$. This family of FGD algorithms covers the BU algorithm, which corresponds to setting $a_{b}=1$ and $a_{i}=0$ otherwise. For this toy example, we use the classic exponentiated gradient descent method \citep{kivinen97exponentiated} to update $\phi$, which ensures that $\phi \in S_b$. The detailed algorithm is summarized in Algorithm \ref{alg:main_simple} in Appendix \ref{apx:simple_fgd}.

% we also consider a slightly simplified version of FGD where the projected gradient descent type of algorithm (see Appendix \ref{apx:simple_fgd}) is used to updating $m$ to ensure that $\phi\in S_{b}$.
\begin{theorem} \label{thm:opt_meta_regret}
Assume that for any $t$, $\|\nabla r_{t}\|$ is bounded by $M<\infty$. Let $\mathcal{M}$ be the hypothesis class of $m$ given in \eqref{eq:linear_model}. For any given constant $c>0$, if we set the learning rate for updating $m$ as $\eta_\phi=c\sqrt{(\log b)/(TM^{4})}$, the $w$-local regret incurred by Algorithm \ref{alg:main_simple} in Appendix \ref{apx:simple_fgd} satisfies 
% the solution returned by Algorithm \ref{alg:main_simple} in Appendix \ref{apx:simple_fgd} gives
\begin{align*}
\R_{w}(T) & \le 2\delta^{2}+\frac{2}{w^{2}}(Q(T;m^*)+O(M^2\sqrt{\log b/T})),
\\
\text{where } & Q(T;m^*) = \min_{m\in \mathcal{M}}\frac{1}{T}\sum_{t=1}^{T} \sup_{\th}\|\nabla r_{t}(\th)-m(\th;\phi,t)\|^{2}.
\end{align*}
\end{theorem}
Theorem \ref{thm:opt_meta_regret} suggests that FGD with optimized MFGG is able to achieve the regret of Algorithm \ref{alg:fgd_wo_meta} using $m^*$ with an $O(1/\sqrt{T})$ excess error. As $T$ is usually large, we can see that the excess error is small.