\section{Background}
\label{sec:background}

\subsection{Bayesian Coresets}
We are given a data set $(X_n)_{n=1}^N$ of $N$ observations, 
a log-likelihood $\ell_n(\theta) \coloneqq \log p(X_n \mid \theta)$ 
for observation $n$ given $\theta \in \Theta$, and a prior density $\pi_0(\theta)$. 
We would like to sample from the  Bayesian posterior with density
\[
  \pi(\theta) \coloneqq \frac{1}{Z} \exp\left( \sum_{n=1}^N \ell_n(\theta) \right) \pi_0(\theta),
\]
where $Z$ is the unknown normalizing constant. A Bayesian coreset replaces the sum over $N$ log-likelihood terms with a 
weighted sum over a subset of size $M$, where $M\ll N$. Without loss of generality, we assume that the subset consists of the 
first $M$ points. The coreset posterior can then be written as
\[
  \pi_w(\theta) \coloneqq \frac{1}{Z(w)} \exp\left( \sum_{m=1}^M w_m \ell_m(\theta) \right) \pi_0(\theta), 
  \label{eq:coresetposterior}
\]
where $w \in \reals^M_{+}$ is a vector of coreset weights.
Recent coreset construction methods 
select $M$ points uniformly at random to include in the coreset \citep{naik2022fast,chen2022bayesian,chen2024coreset}, and 
then optimize the weights of those $M$ points by solving 
a variational inference problem \citep{campbell2019sparse},
\[
    w^\star = \argmin_{w\in\reals^M} \kl{\pi_w}{\pi} \quad \text{s.t.} \quad w \in \mathcal{W}\label{eq:coresetopt},
\]
with objective function gradient
\[
    \label{eq:grad}
    &\nabla_w \kl{\pi_w}{\pi} \\
    = &\Cov_{\pi_w}\lt( \bbmat \ell_1(\theta) \\ \vdots \\ \ell_M(\theta)\ebmat, 
        \sum_m w_m\ell_m(\theta) - \sum_n \ell_n(\theta) \rt).
\] 

\balg[t]
\caption{\texttt{CoresetMCMC}} \label{alg:coresetmcmc}
\balgc
\Require $\theta_0$, $\kappa_w$, $S$, $M$, $K$, $T$
\LineComment Initialize coreset weights
\State $w_{0m} = \frac{N}{M}, \quad m = 1,\dots,M$
\For{$t=0, \dots, T$}
    \LineComment Subsample the data
    \State $\scS_{t} \gets \Unif\lt(S, [N]\rt)$ (without replacement)
    \LineComment Compute gradient estimate
    \State $\hat{g}_{t} \gets g(w_{t}, \theta_{t}, \scS_{t})$ (\cref{eq:gradest})
    \State $w_{t+1} \gets $ \texttt{stochastic\_gradient\_step($w_{t}, \hat{g}_{t}$)}
    \LineComment Step each Markov chain
    \For{$k=1, \dots, K$}
        \State $\theta_{(t+1)k} \dist \kappa_{w_{t+1}}(\cdot \mid \theta_{tk})$
    \EndFor
\EndFor
\ealgc
\ealg

\subsection{Coreset MCMC}
The key challenge in solving \cref{eq:coresetopt} is that $\pi_w$ does not admit tractable \iid draws,
and so unbiased estimates of the gradient in \cref{eq:grad} are not readily available.
Coreset MCMC \citep{chen2024coreset} is an adaptive algorithm that addresses this issue.
The method first initializes weights $w_0 \in\reals^M$ and 
$K\geq 2$ samples $\theta_0 = \lt(\theta_{01}, \dots, \theta_{0K}\rt) \in \Theta^K$.
At iteration $t\in\nats$, given coreset weights $w_t$ and samples 
$\theta_t \in \Theta^K$,
it then updates the weights 
$w_t \to w_{t+1}$ using the stochastic gradient estimate based on the draws $\theta_t$,
\[
    \label{eq:gradest}
    &g(w_t, \theta_t, \scS_t) = \\
    &\frac{1}{K\!-\!1}\!\sum_{k=1}^K \!\!\bbmat \bar\ell_1(\theta_{tk})\\ \vdots \\ \bar\ell_M(\theta_{tk})\ebmat
    \!\!\lt(\!\sum_m w_{tm}\bar\ell_m(\theta_{tk}) \!-\! \frac{N}{S}\!\sum_{s\in \scS_t}\!\bar\ell_{s}(\theta_{tk}) \!\rt),
\]
where $\scS_t \subseteq [N]$ is a uniform subsample of indices of size $S$, 
and $\bar\ell_n(\theta_{tk}) = \ell_n(\theta_{tk}) - \frac{1}{K}\sum_{j=1}^K \ell_n(\theta_{tj})$.
To complete the iteration, the method updates the samples by independently drawing $\theta_{(t+1)k} \dist \kappa_{w_{t+1}}(\cdot \mid \theta_{tk})$ for each $k\in [K]$,
where $\kappa_w$ is a family of $\pi_w$-invariant Markov kernels. %with invariant distribution $\pi_w$.
The pseudocode for Coreset MCMC is outlined in \cref{alg:coresetmcmc}. 


