\begin{figure}[t]
    \includegraphics[width=\columnwidth]{plots/fixed/adam_shrink_svg-tex.pdf}
    \includegraphics[width=\columnwidth]{plots/legend_solid-cropped.pdf}
    \caption{Coreset MCMC posterior approximation error (as defined in \cref{eq:avg_sq_z})
    		using ADAM with different learning rates for a variety of datasets, models, and coreset sizes.
    		The lines indicate median values after 200,000 optimization iterations across 10 trials.}
    \label{fig:ADAM}
\end{figure}


\begin{figure*}[t!]
    \centering{
    \begin{subfigure}{0.45\textwidth}
        \includegraphics[width=\columnwidth]{plots/fixed/linear_regression_coresetMCMC_metrics_no_mix_1000_DoG.png}
        \caption{DoG}\label{fig:DoG_nomix}
    \end{subfigure}
    \begin{subfigure}{0.45\textwidth}
        \includegraphics[width=\columnwidth]{plots/fixed/linear_regression_coresetMCMC_metrics_no_mix_1000_DoWG.png}
        \caption{DoWG}\label{fig:DoWG_nomix}
    \end{subfigure}}\\
    \centering{
    \begin{subfigure}{0.45\textwidth}
        \includegraphics[width=\columnwidth]{plots/fixed/linear_regression_coresetMCMC_metrics_no_mix_1000_dadaptSGD.png}
        \caption{D-Adaptation SGD}\label{fig:dadaptSGD_nomix}
    \end{subfigure}
    \begin{subfigure}{0.45\textwidth}
        \includegraphics[width=\columnwidth]{plots/fixed/linear_regression_coresetMCMC_metrics_no_mix_1000_ProdigyADAM_ca.png}
        \caption{prodigy ADAM}\label{fig:ProdigyADAM_nomix}
    \end{subfigure}}
    \caption{Traces of average squared coordinate-wise z-scores (defined in \cref{eq:avg_sq_z}) between the true and approximated posteriors for 
                a Bayesian linear regression example with $M=1{,}000$ coreset points. 
                We evaluate four learning-rate-free SGD methods: 
                DoG and DoWG (with varying initial learning rate parameter), and 
                D-Adaptation SGD and prodigy ADAM (with default initial lower bound $10^{-6}$).
                The optimally-tuned ADAM baseline is shown in green. Results display the median 
                after 200,000 optimization iterations across 10 trials.}
    \label{fig:direct_application}
\end{figure*}


\section{Tuning-Free Coreset MCMC}

A key design choice when using Coreset MCMC is to specify how gradient estimates are used to
optimize the weights. One can use ADAM \citep{kingma2014adam}, which is
used as the default optimizer for Coreset MCMC \citep{chen2024coreset}:
at iteration $t$, with $\gamma_t > 0$ being the user-specified learning rate, we set 
\[
    w_{t+1} \gets \proj_{\geq 0}\lt(w_t - \gamma_t \frac{\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon}\rt),
\]
where $\hat{m}_t$ and $\hat{v}_t$ are exponential averages of past gradients $(\hat{g}_i)_{i=0}^t$ and their 
element-wise squares, and $\epsilon$ is a small constant.
There is a wide range of other first-order stochastic methods available that could be used (e.g., vanilla stochastic
gradient descent or AdaGrad \citep{duchi2011adaptive}). However,
like ADAM, most of these algorithms require setting a learning rate $\gamma_t$. And as we show in \cref{fig:ADAM}, the quality 
of samples obtained from Coreset MCMC can be highly sensitive to the selected learning rate. In 
particular, \cref{fig:ADAM} shows that when using ADAM, no 
single learning rate performs well across all problems and coreset sizes; and for a given problem, the performance can vary
by orders of magnitude as one varies the learning rate.
Furthermore, the default ADAM learning rate of $10^{-3}$ \citep{kingma2014adam} 
provides poor results in most of the problems tested. As a result, 
careful tuning of the learning rate is required to obtain high quality posterior approximations.
This usually involves a search on a log-scaled grid, which is computationally wasteful as the results 
for all but one of the parameter values are thrown out. Moreover, in practice determining which learning rate
provides the best posterior approximation may not be straightforward, as we do not have access to estimates of the objective function.

A number of recent works in the literature propose learning-rate-free stochastic optimization methods to address this issue
\citep{carmon2022making,ivgi2023dog,khaled2023dowg,defazio2023learning,mishchenko2023prodigy}. Many of these methods 
are shown empirically to perform competitively with optimally-tuned SGD on a wide range of large-scale, 
non-convex deep learning problems. Although different at first glance, all of these methods 
arise from the same insight. Suppose one would like to solve the stochastic 
optimization problem
\[
    \min_{w\in\reals^d} \E\left[ f(w,\theta) \right],
\]
where for all $\theta$, $f(\cdot, \theta)$ is convex and we only have access to unbiased stochastic gradients $g_t = \partial f(w_t, \theta_t)$. 
Define the initial distance to the optimal solution $d_0 = \|w_0 - w^\star\|$ 
and the sum of all squared gradient norms $G_T = \sum_{t\leq T}\|g_t\|^2$.
By setting the SGD learning rate
% \[
    $\gamma^\star = \frac{d_0}{\sqrt{G_T}}$,
% \]
the average iterate $\bar{w} = \frac{1}{T}\sum_{t\leq T}w_t$ satisfies the optimal error bound 
\[
    \E\left[ f(\bar{w},\theta) \right] - \E\left[ f(w^\star,\theta) \right] \leq \frac{d_0\sqrt{G_T}}{T}
\]
after $T$ iterations \citep{carmon2022making,orabona2020icml}.
Learning-rate-free methods therefore essentially try to estimate or bound the initial distance to the optimal solution $d_0$,
which is unknown in practice. To the best of our knowledge, there are four state-of-the-art methods
that do this in a manner that does not require multiple optimization runs, knowledge of unknown constants,
or the ability to query the objective function:
DoG \citep{ivgi2023dog}, DoWG \citep{khaled2023dowg}, D-Adaptation \citep{defazio2023learning} and prodigy 
\citep{mishchenko2023prodigy}. 
DoG and DoWG run vanilla stochastic gradient descent (SGD),
\[
    w_{t+1} &\gets \proj_{\geq 0}\lt(w_t - \gamma_t g_t\rt),
\]
with learning rate schedules
\[
    \hspace{-.3cm}\gamma_t = \frac{r_t}{\sqrt{G_t}} \text{(DoG)},\,
    \gamma_t = \frac{r^2_t}{\sqrt{\sum_{i\leq t} r_{i}^2 \|g_{i}\|^2}} \text{(DoWG)}, \label{eq:learningrates}
\]
where $r_0$ is set to some small constant
and, for $t \ge 1$, 
\[
r_t = \max_{i\leq t} \|w_i - w_0\|.
\]
For D-Adaptation and prodigy, 
$r_t$ in \cref{eq:learningrates} is replaced with a lower bound $d_t$ on $d_0$,
which is updated using estimated correlations between the gradient $g_t$ and 
the step direction $w_0-w_t$:
\[
    d_{t+1} = \max\left\{ \frac{\sum_{i=0}^{t} d_i \left\langle g_i, w_0-w_i \right\rangle}
                                { \left\| \sum_{i=0}^t d_ig_i \right\|}, d_{t} \right\}.
\]
D-Adaptation replaces $r_t$ in \cref{eq:learningrates} (DoG) with $d_t$, while prodigy replaces $r_t$ in \cref{eq:learningrates} (DoWG)
with $d_t$. Both D-Adaptation and prodigy have SGD and ADAM-based variants.
All four methods have been shown empirically to match the performance of 
optimally-tuned SGD.

\cref{fig:direct_application} shows the results from direct applications of DoG, DoWG, D-Adaptation (SGD), and prodigy 
(ADAM) to Coreset MCMC. We see that the quality of the posterior approximations from all four methods is orders of 
magnitude worse than that of optimally-tuned ADAM. With $\theta_0$ initialized far away from high-density regions of 
$\pi_{w_0}$, the initial gradient estimates are large in magnitude, which leads to small learning rates. The 
accumulation of these large gradient norms in the learning rate denominator eventually causes the learning rate to 
vanish, halting the progress of coreset weight optimization. We address these problems in the 
next section.

% \cT{naitong fill. problem 1 -- bad initial (large) grad estimates cause super bad tiny steps. problem 2 -- denominator 
% accumulates too quickly. just point out the problems here and say that we will address in next sctn}

% I've left your fodder here. Don't use all of this. Keep it one paragraph long.
%\begin{itemize}
%    \item Intuitively, one may want to use one of these methods in Coreset MCMC. However, doing so does not directly 
%            lead to high quality posterior approximations.     
%    \item We use linear the linear regression example to illustrate this phenomenon. 
%            The same plots for other examples in appendix.
%    \item \cref{fig:direct_application} shows loss trace for DoG and DowG over different initial parameters $r$ 
%            compared to optimally tuned-ADAM.
%    \item d-dapt and prodigy SGD are not included as they would take the coreset weights to infinity.
%    \item We see that both DoG and DoWG are indeed robust to the initial parameter.
%    \item However, they don't match up with optimally-tuned ADAM's performance.
%    \item This difference comes from the later stage of weight optimization, where DoG and DoWG stop progressing towards 
%            the optimum.
%    \item this is due to reasons below.
%    \item (taken from earlier version of the draft) 
%            (1) the initial gradient estimates being much larger in magnitude than those obtained in later 
%            iterations and (2) the denominator of these learning rates accumulating all gradient estimate norms. Recall in 
%            Coreset MCMC, the gradient estimate $\hat{g}_t$ is an estimated covariance under $\pi_{w_t}$, obtained using $K$ 
%            arbitrarily initialized Markov chain states. These Markov chain states are simulated only one step forward every time 
%            the coreset weights are updated. Therefore, during the early iterations of Coreset MCMC, $\theta_t$ is likely far 
%            from high density regions of $\pi_{w_t}$, and so these states would be poor surrogates of \iid samples from 
%            $\pi_{w_t}$. This mismatch can cause the gradient estimates $\hat{g}_t$ to have unusually large norms. This undesirable 
%            behaviour is then compounded by the fact that the learning rate denominator from all four methods 
%            accumulate the sum of all gradient estimate norms up to the current iteration. As a result, the initial gradient 
%            estimates with unusually large norms results in overly conservative learning rates from all four of these methods. 
%    \item Therefore, to avoid having overly conservative gradient steps, it is necessary to introduce a burn-in phase to 
%            allow the Markov chains to mix before we start to optimize the coreset weights. 
%    \item Burn-in would take care of the problem of progress halting completely. 
%    \item However, even without the gradient mismatch problem unique to Coreset MCMC, both DoG and DoWG both claim to 
%            match the performance of optimally-tuned SGD, which is inferior to that of optimally-tuned ADAM.
%    \item Therefore, to further improve the performance of these learning-rate-free methods, we also need to apply 
%            acceleration techniques.
%\end{itemize}



Before concluding this section, we note that there are other approaches for making SGD free of learning rate 
tuning: some methods involve using stochastic versions of line search 
\citep{vaswani2019painless,paquette2020stochastic}, and others do the same for the Polyak step size 
\citep{loizou2021stochastic}. These methods are not applicable in our setting as they require evaluating the 
objective function. Recall that due to the unknown $Z(w)$ term in \cref{eq:coresetposterior}, we do not have access 
to estimates of the objective function.
%
%
%One method by \citet{carmon2022making} 
%
%proposes a bisection search algorithm that solves \cref{eq:implicitequation}. However, their method still requires 
%multiple optimization runs. Furthermore, setting $\alpha$ and $\beta$ in practice may not be straightforward as they 
%depend on the Lipschitz constant of the objective function.
%
%
%\citet{carmon2022making} show that, for some constants $\alpha, \beta$, 
%by setting
%\[
%    \gamma = \frac{r_T}{\sqrt{\alpha G_T + \beta}},
%    \label{eq:implicitequation}
%\]
%where $r_T = \max_{t\leq T} \|x_t - x_0\|$, the average SGD iterate converges to the optimal solution at the same 
%asymptotic rate as when the learning rate is set to $\gamma^\star$. 
%Instead of a bisection search, 

% One may want to use one of these methods in Coreset MCMC. 
% However, as we show in the next section, doing so does not directly lead to 
% high quality posterior approximations. We next investigate the poor quality of the resulting posterior approximations, 
% which motivates our proposed method.

% \cref{fig:direct_application} shows, through a Bayesian linear regression 
% problem, that doing so does not directly lead to high quality posterior approximations compared to those of 
% optimally-tuned ADAM. 
% Although it is worth noting that these methods are robust against different values of $r_0$. 
% The same trend can be found across other experiments and different coreset sizes, the plots for which are found in 
% \cref{sec:appendix_3}. We next investigate the poor quality of the resulting posterior approximation, which motivates 
% our proposed method.


