\section{Details of Experiments}
\label{sec:appendix_4}
\subsection{Model Specification}
In this subsection, we describe the six examples (two synthetic and four real data) that we used for our experiments. 
Processed versions of all datasets used for the experiments are available at 
\url{https://github.com/NaitongChen/automated-coreset-mcmc-experiments}.
For each of the regression models, we are given a set of points $(x_n,y_n)^N_{n=1}$, each consisting of features 
$x_n\in\reals^p$ and response $y_n$.

\textbf{Bayesian sparse linear regression:} This is based on Example 4.1 from \cite{george1993variable}. We use the model
\[
  \sigma^2 &\dist \InvGam\left(\nu/2, \nu\lambda/2\right),\\
  \forall i\in[p], \quad \gamma_i &\distiid \Bern(q), \\
  \beta_i \mid \gamma_i &\distind \Norm\left(0, \left(\mathds{1}(\gamma_i = 0)\tau + \mathds{1}(\gamma_i = 1)c\tau\right)^2 \right), \\ 
  \forall n\in[N], \quad y_n \mid x_n, \beta, \sigma^2 &\distind \Norm\left( x_n^\top\beta, \sigma^2 \right),
\]
where we set $\nu=0.1, \lambda=1, q=0.1, \tau=0.1$, and $c=10$. Here we model the variance $\sigma^2$, the vector of regression 
coefficients $\beta = \begin{bmatrix}\beta_1 & \dots & \beta_p \end{bmatrix}^\top\in\reals^p$ and  
the vector of binary variables $\gamma = \begin{bmatrix}\gamma_1 & \dots & \gamma_p \end{bmatrix}^\top\in\left\{0,1\right\}^p$, 
where $\gamma_i$ indicates the inclusion of the $i^\text{th}$ feature in the model.
We set $N=50{,}000$, $p=10$, $\beta^\star = \begin{bmatrix}0 & 0 & 0 & 0 & 0 & 5 & 5 & 5 & 5 & 5\end{bmatrix}^\top$, 
and generate a synthetic dataset by
\[
  \forall n \in [N], \quad x_n &\distiid \Norm\left( 0, I \right),\\
  \eps_n &\distiid \Norm\left( 0, 25^2 \right),\\
  y_n &= x_n^\top \beta^\star + \eps_n.
\]

\textbf{Bayesian linear regression:} We use the model
\[
  \begin{bmatrix} \beta^\top & \log\sigma^2 \end{bmatrix}^\top &\dist \Norm(0,I),\\ 
  \forall n\in[N], \quad
  y_n \mid x_n, \beta, \sigma^2 &\distind \Norm\left(\begin{bmatrix} 1 & x_n^\top \end{bmatrix}\beta, \sigma^2\right),
\]
where $\beta\in\reals^{p+1}$ is a vector of regression coefficients and $\sigma^2\in\reals_+$ is the noise variance. 
Note that the prior here is not conjugate for the likelihood.
The dataset consists of flight delay information from $N= 98{,}673$ observations and was constructed using flight 
delay data from \url{https://www.transtats.bts.gov/Homepage.asp} 
and historical weather information from \url{https://www.wunderground.com/}. We study 
the difference, in minutes, between the scheduled and actual departure times against $p=10$ features 
including flight-specific and meteorological information.

\textbf{Bayesian logistic regression:} We use the model
\[
    \forall i\in[p+1], \quad \beta_i &\distiid \Cauchy(0,1), \\
	\forall n\in[N], \quad y_n &\distind \Bern 
    \left(\left(1+\exp\left(-\begin{bmatrix} 1 & x_n^\top\end{bmatrix} \beta\right)\right)^{-1}\right),  
\]
where $\beta = \begin{bmatrix}\beta_1 & \dots & \beta_{p+1} \end{bmatrix}^\top\in\reals^{p+1}$ is a 
vector of regression coefficients. Here we use the same dataset as in the linear 
regression example, but instead model whether a flight is cancelled as a function of the same set of features. 
Note that of all flights included, only $0.058\%$ were cancelled.

\textbf{Bayesian Poisson regression:} We use the model
\[
  \beta &\dist \Norm(0, I),\\
  \forall n\in[N], \quad
  y_n \mid x_n, \beta &\distind 
  \Poiss\left( \log\left( 1 + e^{ \begin{bmatrix} 1 & x_n^\top \end{bmatrix}\beta } \right) \right),
\]
where $\beta\in\reals^{p+1}$ is a vector of regression coefficients. 
% Here we use a processed version of the 
% bikeshare dataset\footnote{The dataset is available at 
% \texttt{[removed for peer review]}.} 
The dataset consists of $N= 15{,}641$ observations, and
%  and was constructed using the bike share 
% consisting of $N= 15{,}641$
% data points. 
we model the hourly count of rental bikes against $p=8$ features (e.g., temperature, humidity at the 
time, and whether or not the day is a workday). The original bike share dataset is available at 
\url{https://archive.ics.uci.edu/dataset/275/bike+sharing+dataset}.

The remaining two non-regression models are specified as follows.

\textbf{Gaussian location:} We use the model
\[
\theta &\dist \Norm(0, I),\\
\forall n \in [N], \quad X_n &\distiid \Norm(\theta, I),
\]
where $\theta, X_n\in\reals^d$. Here we model the mean $\theta$. We set $N=10{,}000, d=20$ and generate a synthetic dataset by
\[
    \forall n\in[N], \quad X_n\distiid\Norm(0,I).
\] 


\textbf{Bradley-Terry model:} We use the model
\[
    \theta &\dist \Norm(0, I),\\
    \forall n\in[N], \quad y_{n} \mid h_n, v_n, \theta &\distind \Bern\left( \left( 1+\exp\left( (\theta_{v_n} - \theta_{h_n})/400 \right) \right)^{-1} \right),
\]
where $\theta \in \reals^d$. 
% is a vector of Elo ratings for each of the $d=30$ teams.
The dataset was constructed using game statistics from \url{https://www.nba.com/stats}
and consists of data from $N=26{,}651$ NBA games between the 2004 and 2022 seasons. 
$h_n$ and $v_n$ are the home team and visitor team IDs for the $n^\text{th}$ game in the dataset, and $y_n$ denotes 
the outcome of the game ($y_n=1$ if the home team won and $y_n=0$ if the visitor team won).
$\theta\in\reals^d$ represents the Elo ratings or relative skill levels [\citealp[Ch.~1]{elo1978rating}] for each of 
the $d=30$ teams. We model the Elo ratings using the game outcomes as pairwise comparisons between teams.

\subsection{Parameter Settings}
For full-data inference results of all examples except for the sparse linear regression model, we ran Stan 
\citep{carpenter2017stan} with 10 parallel chains,
each taking $100{,}000$ steps with the first $50{,}000$ discarded, for a combined $500{,}000$ draws. 
For the full-data inference results of the sparse linear regression example, we use the Gibbs sampler developed by \cite{george1993variable}
to generate $200{,}000$ draws, with the first half discarded as burn-in.

To account for changes in $w$, for all real data experiments, we use the hit-and-run slice sampler 
with doubling \citep{belisle1993hit,neal2003slice}; 
for the Gaussian location model, we use a kernel that directly samples from $\pi_w$ [\citealp[Sec.~3.4]{chen2024coreset}];
and for the sparse regression, we use the Gibbs sampler developed by \cite{george1993variable}.

We use Stan \citep{carpenter2017stan} to obtain full data inference results for real data experiments, 
and Gibbs sampling \citep{george1993variable} for the sparse regression model with discrete variables. 
The true posterior distribution for the Gaussian location model is available in closed form.

For ADAM, we test multiple learning rates over a log scale grid 
$\left( 10^k \right)$ for $k=-3,-2,\dots,1$.
For each experiment under each coreset size, the optimally-tuned ADAM is the one that obtained the lowest average 
squared z-score after $200{,}000$ iterations of weight optimization. 
For all learning-rate-free methods, we test different initial parameters (initial lower bound for prodigy ADAM 
and $r_0$ for Hot DoG, DoG, and DoWG) over a log scale grid $\left( 10^k \right)$ for $k=-3, -2, \dots, 1$.

For the logistic regression example, to account for the class imbalance problem, we include all observations from the 
rare positive class if the coreset size is more than twice as big as the total number of observations with 
positive labels. Otherwise we sample our coreset to have $50\%$ positive labels and $50\%$ negative labels. 
Coreset points are uniformly subsampled for all other models.

\section{Additional Results}
\label{sec:appendix_5}
Figs.~4 to~6 in the main text
show the traces of average squared coordinate-wise z-scores, 
as well as the gradient estimate norms and hot-start test statistics for Hot DoG when $M=1000$. In this section, we 
show the same sets of plots for $M=100$ and $M=500$. 
Similarly to \cref{fig:burnincomparison}, 
\cref{fig:burnincomparison100,fig:burnincomparison500} compare Hot DoG with and without hot-start test. 
Similarly to \cref{fig:burnintest}, 
\cref{fig:burnintest100,fig:burnintest500} show the gradient estimate norms and hot-start test statistics during burn-in.
Similarly to \cref{fig:tracecombined}, 
\cref{fig:tracecombined100,fig:tracecombined500} compare Hot DoG (with hot-start test) and optimally-tuned ADAM.
We see that all plots show the same trends as the ones in 
\cref{sec:experiments}, 
where $M=1000$.
As a result, we arrive at similar observations as in
\cref{sec:experiments}.
% Section 5. 
In particular, Hot DoG with burn-in leaves the plateau sooner than without burn-in; the hot-start test passes and 
thus burn-in terminates shortly after gradient norms are stabilized; Hot DoG is robust to the fixed parameter $r$.

\begin{figure}[h]
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/gaussian_location_coresetMCMC_metrics_combined_100_ADAMDoGCoord.png}
    \caption{Gaussian location}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/sparse_regression_coresetMCMC_metrics_combined_100_ADAMDoGCoord.png}
    \caption{Sparse regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/linear_regression_coresetMCMC_metrics_combined_100_ADAMDoGCoord.png}
    \caption{Linear regression}\label{fig:burnincomparison-linear100}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/logistic_regression_coresetMCMC_metrics_combined_100_ADAMDoGCoord.png}
    \caption{Logistic regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/poisson_regression_coresetMCMC_metrics_combined_100_ADAMDoGCoord.png}
    \caption{Poisson regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/bradley_terry_coresetMCMC_metrics_combined_100_ADAMDoGCoord.png}
    \caption{Bradley-Terry}
    \end{subfigure}
    \caption{Traces of average squared coordinate-wise z-scores between the true and approximated posterior 
    across all experiments, obtained using Hot DoG with and without hot-start test. 
    All figures share the legend in \cref{fig:burnincomparison-linear100}. The coreset size $M$ is $100$ and each line
    represents a different initial learning rate parameter. The lines indicate the median from $10$ runs.
    Orange lines indicate runs with hot-start test and blue lines without.}
    \label{fig:burnincomparison100}
\end{figure}

\begin{figure}[h]
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/gaussian_location_coresetMCMC_metrics_combined_500_ADAMDoGCoord.png}
    \caption{Gaussian location}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/sparse_regression_coresetMCMC_metrics_combined_500_ADAMDoGCoord.png}
    \caption{Sparse regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/linear_regression_coresetMCMC_metrics_combined_500_ADAMDoGCoord.png}
    \caption{Linear regression}\label{fig:burnincomparison-linear500}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/logistic_regression_coresetMCMC_metrics_combined_500_ADAMDoGCoord.png}
    \caption{Logistic regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/poisson_regression_coresetMCMC_metrics_combined_500_ADAMDoGCoord.png}
    \caption{Poisson regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/bradley_terry_coresetMCMC_metrics_combined_500_ADAMDoGCoord.png}
    \caption{Bradley-Terry}
    \end{subfigure}
    \caption{Traces of average squared coordinate-wise z-scores between the true and approximated posterior 
    across all experiments, obtained using Hot DoG with and without hot-start test. 
    All figures share the legend in \cref{fig:burnincomparison-linear500}. The coreset size $M$ is $500$ and each line
    represents a different initial learning rate parameter. The lines indicate the median from $10$ runs.
    Orange lines indicate runs with hot-start test and blue lines without.}
    \label{fig:burnincomparison500}
\end{figure}

\begin{figure}[h]
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\columnwidth]{plots/gaussian_location_burnin100.png}
        \caption{Gaussian location}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\columnwidth]{plots/sparse_reg_burnin100.png}
        \caption{Sparse regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\columnwidth]{plots/lin_reg_burnin100.png}
        \caption{Linear regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\columnwidth]{plots/log_reg_burnin100.png}
        \caption{Logistic regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\columnwidth]{plots/poi_reg_burnin100.png}
        \caption{Poisson regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\columnwidth]{plots/bradley_terry_burnin100.png}
        \caption{Bradley-Terry}
    \end{subfigure}
    \caption{Trace of gradient estimate norms (blue) and hot-start test statistics (green) before weight optimization
            across all experiments with $M=100$.
            The orange horizontal line is the test statistic threshold $c=0.5$.}
    \label{fig:burnintest100}
\end{figure}

\begin{figure}[h]
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\columnwidth]{plots/gaussian_location_burnin500.png}
        \caption{Gaussian location}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\columnwidth]{plots/sparse_reg_burnin500.png}
        \caption{Sparse regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\columnwidth]{plots/lin_reg_burnin500.png}
        \caption{Linear regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\columnwidth]{plots/log_reg_burnin500.png}
        \caption{Logistic regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\columnwidth]{plots/poi_reg_burnin500.png}
        \caption{Poisson regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
        \includegraphics[width=\columnwidth]{plots/bradley_terry_burnin500.png}
        \caption{Bradley-Terry}
    \end{subfigure}
    \caption{Trace of gradient estimate norms (blue) and hot-start test statistics (green) before weight optimization
            across all experiments with $M=500$.
            The orange horizontal line is the test statistic threshold $c=0.5$.}
    \label{fig:burnintest500}
\end{figure}

\begin{figure}[h]
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/gaussian_location_coresetMCMC_metrics_mix_100_ADAMDoGCoord.png}
    \caption{Gaussian location}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/sparse_regression_coresetMCMC_metrics_mix_100_ADAMDoGCoord.png}
    \caption{Sparse regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/linear_regression_coresetMCMC_metrics_mix_100_ADAMDoGCoord.png}
    \caption{Linear regression}\label{fig:tracecombined-linear100}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/logistic_regression_coresetMCMC_metrics_mix_100_ADAMDoGCoord.png}
    \caption{Logistic regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/poisson_regression_coresetMCMC_metrics_mix_100_ADAMDoGCoord.png}
    \caption{Poisson regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/bradley_terry_coresetMCMC_metrics_mix_100_ADAMDoGCoord.png}
    \caption{Bradley-Terry}
    \end{subfigure}
    \caption{Traces of average squared coordinate-wise z-scores between the true and approximated posterior 
    across all experiments, obtained from Hot DoG and optimally-tuned ADAM. 
    All figures share the legend in \cref{fig:tracecombined-linear100}.
    The coreset size is $M=100$ and each line represents a different initial learning rate parameter. 
    The lines indicate the median from $10$ runs.}
    \label{fig:tracecombined100}
\end{figure}

\begin{figure}[h]
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/gaussian_location_coresetMCMC_metrics_mix_500_ADAMDoGCoord.png}
    \caption{Gaussian location}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/sparse_regression_coresetMCMC_metrics_mix_500_ADAMDoGCoord.png}
    \caption{Sparse regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/linear_regression_coresetMCMC_metrics_mix_500_ADAMDoGCoord.png}
    \caption{Linear regression}\label{fig:tracecombined-linear500}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/logistic_regression_coresetMCMC_metrics_mix_500_ADAMDoGCoord.png}
    \caption{Logistic regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/poisson_regression_coresetMCMC_metrics_mix_500_ADAMDoGCoord.png}
    \caption{Poisson regression}
    \end{subfigure}
    \begin{subfigure}{0.33\textwidth}
    \includegraphics[width=\columnwidth]{plots/fixed/bradley_terry_coresetMCMC_metrics_mix_500_ADAMDoGCoord.png}
    \caption{Bradley-Terry}
    \end{subfigure}
    \caption{Traces of average squared coordinate-wise z-scores between the true and approximated posterior 
    across all experiments, obtained from Hot DoG and optimally-tuned ADAM. 
    All figures share the legend in \cref{fig:tracecombined-linear500}.
    The coreset size is $M=500$ and each line represents a different initial learning rate parameter. 
    The lines indicate the median from $10$ runs.}
    \label{fig:tracecombined500}
\end{figure}