\balg[t]
\caption{\texttt{HotDoG}} \label{alg:NVDoG}
\balgc

\Require $\beta_1 = 0.9$, $\beta_2 = 0.999$, $\epsilon = 10^{-8}$, $r = 10^{-3}$\\
$\quad\quad\quad\quad T$, $\theta_0$, $w_0$
\State $v_0 \gets \bm{0}$, $m_0 \gets \bm{0}$, $d_0 \gets \bm{0}$, $c \gets 0$, $h \gets \texttt{false}$
\For{$t=1, \dots, T$}
    \If{$h$}
        \State $c\gets c+1$
        \State $\scS_{t} \gets \Unif\lt(S, [N]\rt)$ (without replacement)
        \State $\hat{g}_t \gets g(w_{t-1}, \theta_{t-1}, \scS_t)$ (\cref{eq:gradest})
        
        \State $v_t \gets \beta_2 v_{t-1} + (1-\beta_2) \hat{g}_t^2$
        \State $m_t \gets \beta_1 m_{t-1} + (1-\beta_1) \hat{g}_t$
        \State $d_t \!\gets\! \beta_1 d_{t-1} \!+\! (1\!-\!\beta_1) \max\left\{ \left| w_{t-1} \!-\! w_0 \right|, d_{t-1} \right\}$ 
        \State $\hat{v}_t \gets v_t / (1 - \beta_2^c)$
        \State $\hat{m}_t \gets m_t / (1 - \beta_1^c)$
        \State $\hat{d}_t \gets $ ( $r\mathbf{1}$ \algorithmicif\ $c = 1$ \algorithmicelse\ $d_t / (1 - \beta_1^{c-1})$ )
        \State $w_t \gets w_{t-1} \!-\! \hat{d}_t \left(\diag\left(\left(c \left(\hat{v}_t + \epsilon\right)\right)^{\frac{1}{2}}\right)\right)^{-1}\! \odot \hat{m}_t$ %\Comment{update step}
    \Else
        \State $w_t \!\gets\! w_{t-1}$, $v_t \!\gets\! v_{t-1}$, $m_t \!\gets\! m_{t-1}$, $d_t \!\gets\! d_{t-1}$
    \EndIf
    \For{$k=1, \dots, K$}
        \State $\theta_{tk} \dist \kappa_{w_{t}}(\cdot \mid \theta_{(t-1)k})$ \Comment{record $\ell_{tk}$}
    \EndFor
    \LineComment Hot-start test
    \State $h \!\gets\! $ (true \algorithmicif\ $h$ \algorithmicelse\ $\texttt{HotStartTest}\!\left(\!\left(\!\ell_{ik}\!\right)_{i=1,k=1}^{t,K}\!,t\!\right)$)
\EndFor

\State\Return $w_T$

\ealgc
\ealg



\section{Hot DoG}
\label{sec:nvdog}
In this section, we develop our novel Markovian optimization method, \emph{Hot-start DoG} (Hot DoG),
presented in \cref{alg:NVDoG}. Our method extends the original DoG optimizer in two ways: 
(1) we add a tuning-free hot-start test that automatically detects when the 
Markov chains have properly mixed and stochastic gradient estimates are stable, 
at which point we start coreset weight optimization;
and (2) we apply acceleration techniques to DoG.

\subsection{Hot-Start Test}\label{sec:hotstarttest}
Poorly initialized Markov chain states $\theta_0$ can be detrimental to 
the performance of learning-rate-free methods in Coreset MCMC. 
\cref{fig:burnintest}, especially \cref{fig:burnintest-linear,fig:burnintest-logistic,fig:burnintest-poiss},
shows that this is likely due to the bias of initial gradient estimates. %In particular,
When $\theta_0$ is initialized far away from high density regions of $\pi_{w_0}$, the initial gradient estimates 
can have norms that are orders of magnitude larger than those 
computed using \iid draws. This leads to a quickly vanishing learning rate in \cref{eq:learningrates}. 
Therefore, it is crucial to 
hot-start the Markov chains to ensure they are properly mixed before training the coreset weights.
There are MCMC convergence diagnostics
that could be used for this purpose (e.g., $\shR$ \citep{vehtari2021rank}), but many
work only with real-valued variables and are overly stringent for our application.
We require a test that works for general coreset posteriors of the form \cref{eq:coresetposterior}
and checks only that gradient estimates have stabilized reasonably.

To address this challenge, we propose keeping the weights fixed at their initialization
(i.e., $w_{t+1} \gets w_t$) until a hot-start test passes.
For the test, for each Markov chain $k\in [K]$, 
we split the iterates $i=1,\dots, t$ into 3 segments, each
of equal length $n = \ceil{t/3}$. 
We compute the average log-potentials for the two latter segments $m_{k1}$, $m_{k2}$,
and the standard deviations of residual errors $s_{k1}, s_{k2}$ from a linear fit 
\[
m_{ki} \!=\! \frac{\sum_{j=in\!+\!1}^{(i\!+\!1)n}\! \ell_{jk}}{n},
s_{ki}^2 \!=\! \frac{\min_{\substack{a,b}}\! \sum_{j=in\!+\!1}^{(i\!+\!1)n} (a\! +\! bj\! -\! \ell_{jk})^2}{n-2}.
\]
Here $\ell_{jk} = \sum_{m'=1}^M w_{0m'}\ell_{m'}(\theta_{jk})$ is the log-potential for chain $k$ at iteration $j$.
Our test monitors the difference between $m_{k1}$ and $m_{k2}$ relative to $s_{k1}$ and $s_{k2}$. 
A small difference in the averages indicates that the chains have stabilized. 
The residual standard errors allow us to remove trends from the noise computation.
We define, for each $k\in[K]$,
\[
    u_k = \frac{\lt| m_{k1} - m_{k2} \rt|}{\max\{s_{k1}, s_{k2}\}},
\]
and use the median of $\left( u_{k} \right)_{k=1}^K$ as our test statistic. This test statistic is checked against 
a threshold $c$; the test passes when the median test statistic is less than $c$. 
\cref{alg:burnintermination} shows the pseudocode for the hot-start test.
We find in practice setting $c=0.5$ works well in general.

\balg[t]
\caption{\texttt{HotStartTest}} \label{alg:burnintermination}
\balgc
\Require $\left(\ell_{ik}\right)_{i=1, k=1}^{t, K}$, $t$, $c=0.5$
\State $n = \texttt{ceil}(t/3)$
\For{$k=1, \dots, K$}
    \State $s^2_{k1} \gets \frac{1}{n-2}\min_{a,b\in\reals}\sum_{i=n+1}^{2n} \left( a+b i - \ell_{ik} \right)^2$
    \State $s^2_{k2} \gets \frac{1}{n-2}\min_{a,b\in\reals}\sum_{i=2n+1}^{t} \left( a+b i - \ell_{ik} \right)^2$
    \State $u_{k} \gets \frac{\left|\left(\frac{1}{n}\sum_{i=n+1}^{2n} \ell_{ik}\right) - \left(\frac{1}{n}\sum_{i=2n+1}^{t}\ell_{ik}\right)\right|}{\max\{s_{k1}, s_{k2}\}}$
\EndFor
\State \Return (true \algorithmicif\ \texttt{median}$\left( u_{1}, \dots, u_{K} \right) < c$ \algorithmicelse\  false)
\ealgc
\ealg

\subsection{Acceleration}
To accelerate DoG, we begin by noting that 
the denominator of the DoG learning rate in \cref{eq:learningrates} is similar to that of 
AdaGrad \citep{duchi2011adaptive} in that it is a cumulative sum of some function of the gradient. Therefore, 
we can leverage the idea used in RMSProp \citep{hinton2012neural} for accelerating AdaGrad to accelerate DoG. 
In particular, at iteration $t$, we can replace $\sum_{i\leq t} \|\hat{g}_i\|^2$ with 
$t\hat{v}_t$, where $\hat{v}_t$ is the bias-corrected exponential moving average of the squared gradient. 
This allows us to exponentially decrease the weights of past gradient norms. As 
a result, the effect of the early $\|\hat{g}_t\|^2$ terms on the learning rate diminishes over time, 
resulting in less conservative learning rates. 
To account for situations where the gradient estimates differ in scale across dimensions, we apply 
the above acceleration technique in a coordinate-wise fashion and obtain the following update rule for $\hat{v}_t$:
\[
    v_t = \beta_2 v_{t-1} + (1-\beta_2) \hat{g}_t^2, \quad \hat{v}_t = \frac{v_t}{1-\beta_2^t},
\]
where $\beta_2 \in (0,1)$ is the exponential decay rate, $v_0 = 0$, and $\hat{g}_t^2$ denotes the vector with each entry of $\hat{g}_t$ squared.
We further apply the same idea to $r_t$, the maximum distance traveled from $w_0$, and $\hat{g}_t$, the gradient estimate itself. 
We use $\beta_1 \in (0,1)$ to denote the exponential decay rate for these two quantities.
Our final proposed optimization procedure is outlined in \cref{alg:NVDoG}.
Note that in \cref{alg:NVDoG}, all computations are coordinate-wise.

In Hot DoG, we set the exponential decay rates, $\beta_1$ and $\beta_2$, to be the same as those in 
\citet{kingma2014adam}, and we set the initial learning rate $r$ to a small constant (default $10^{-3}$) following the 
recommendation of \citet{ivgi2023dog}. 

\subsection{Convergence Analysis}
In this subsection, we present a theoretical analysis of the convergence of the coreset weights produced by Hot DoG. 
We begin by stating the set of assumptions, under which our analysis is conducted.
These assumptions are stated formally in \cref{sec:assumptions}.
As required by \cref{alg:NVDoG}, we have that $|\beta_1|<1$, $|\beta_2|<1$, and $\epsilon, r > 0$. 
We further impose a set of assumptions about the feasible region $\mathcal{W}$ of the coreset weights.
Namely, we assume (1) the coreset weights are non-negative and their sum is bounded 
above by a constant $B$ (\cref{assump:constraint}), and (2) the existence of an exact coreset 
$w^\star\in\mathcal{W}$ in the sense that $\kl{\pi_{w^\star}}{\pi}=0$ (\cref{assump:exact}).
Both of these assumptions greatly simplify the analysis without sacrificing the representative nature of our assumed model.
A typical choice for the coreset weight bound is to set $B=N$, where $N$ is the total number of observations. 
In terms of the optimal coreset, past work has shown that it provides a near-exact approximation with high probability 
for the wide class of strongly log-concave models [\citealp[Thm.~4.2]{naik2022fast}].
Without \cref{assump:exact}, which is similar to Assumption 3.1 in \citet{chen2024coreset},
we do not expect the convergence result to change in a meaningful way, aside from there being a persistent error 
corresponding to the optimal coreset error. 

Finally, we state our assumptions regarding the stochastic gradient (\cref{eq:gradest}), which estimates \cref{eq:grad}. 
We assume that the stochastic gradients are uniformly bounded above by a constant $U$ (\cref{assump:grad_bound}).
Now note that in \cref{eq:gradest}, Monte Carlo error from the MCMC samples $\theta_t$ contributes to the stochasticity. 
We additionally impose a mixing condition on the Markov chains (\cref{assump:mixing}), 
and assume that the Monte Carlo error is controlled (\cref{assump:noise}).

We now present our main theorem in \cref{thm:convergence}. 
The proof of \cref{thm:convergence} can be found in \cref{sec:proof}.
Our result shows that Hot DoG produces coreset weights that converge to the optimum in expectation at 
a sublinear rate.
This convergence rate is consistent with ADAM and other 
learning-rate-free stochastic gradient methods discussed in the paper 
(see, for example, [\citealp[Thm.~3.10]{ivgi2023dog}] and [\citealp[Thm.~2]{mishchenko2023prodigy}]).

\begin{theorem}[Hot DoG convergence]\label{thm:convergence}
    Suppose \cref{assump:constraint,assump:exact,assump:grad_bound,assump:mixing,assump:noise} hold.
    As $t \to \infty$,
    \[
        \E\|w_t \!-\! w^\star\|^2 = O\left( \frac{1}{\sqrt{t}} \right).
    \]
\end{theorem}

It is worth noting that employing the hot-start test does not alter the convergence rate of Hot DoG given 
in \cref{thm:convergence}. Instead, the hot-start test can lead to a more favourable constant in the 
convergence rate.
As we discussed in \cref{sec:hotstarttest}, the hot-start test helps avoid updating the coreset weights using 
initial gradient estimates that may have unusually large norms. 
In terms of our analysis, by holding off optimizing $w$ until the hot-start test passes, 
we can obtain a tighter bound on the gradient norm (i.e., a smaller $U$ in \cref{assump:grad_bound}). 
This results in a smaller constant in the convergence rate given in \cref{thm:convergence}, 
ultimately leading to improved finite-time performance.
