\subsection{Proof of Theorem 1}\label{subsec:proof_of_theorem_1}
\paragraph{Flatness Bound of BMA}
We first derive the flatness bound of BMA. We assume $M$ models $w_m$, $m=1, \dots, M$, whose Hessian matrices $H_{f_m}$ (defined in Eq.~\ref{eq:bma_hessian}) are Hermitian. Let $w_{\text{WA}} = \frac{1}{M} \sum_{m=1}^M w_m$ denote simple weight averaging; the Hessian of $w_{\text{WA}}$, $H_{f_{\text{WA}}}$, is also a Hermitian matrix.
Weyl’s inequality is known to bound the eigenvalues of Hermitian matrices.

\begin{proposition}
\label{proposition:wely_inequality}
\textbf{(Weyl's Inequality)} For Hermitian matrices $C_i, C_j \in \mathbb{C}^{p \times p}$ with eigenvalues ordered as $\lambda_1 \ge \dots \ge \lambda_p$, and $k, l = 1, \dots, p$ such that the indices below lie in $\{1, \dots, p\}$,
\begin{align}
\label{eq:wely_ineq}
\lambda_{k+l-1}(C_i + C_j) \le \lambda_k(C_i) + \lambda_l (C_j) \le  \lambda_{k+l-p}(C_i + C_j).
\end{align}
\end{proposition}

Let $k=1$ and $l=1$, then Eq.~\ref{eq:wely_ineq} can be written as:
\begin{align}
\lambda_1(C_i + C_j) \le \lambda_1(C_i) + \lambda_1(C_j). \nonumber
\end{align}

As we have $M$ Hermitian matrices, it can be expanded as:
\begin{align}
\label{eq:upper_bound}
    \lambda_1(\frac{1}{M}\sum_{m=1}^M H_{f_m}) \le \frac{1}{M}\sum_{m=1}^M \lambda_1 (H_{f_m}).
\end{align}
% We get upper bound of Theorem~\ref{theorem:bma_eign_bound} like Eq (\ref{eq:upper_bound}).

On the other hand, taking $(k,l) \in \{(1, p), (p, 1)\}$, we can rewrite Eq.~\ref{eq:wely_ineq} as:
\begin{align}
    \max \{\lambda_1(C_i) + \lambda_p(C_j), \lambda_p(C_i) + \lambda_1(C_j)\} \le \lambda_1 (C_i + C_j). \nonumber
\end{align}

Again, applying this to the $M$ Hermitian matrices we have, it can be expanded as:
\begin{equation}
\label{eq:lower_bound}
    \max \left( \Bigg{\{} \frac{1}{M} \bigg{(} \lambda_{1}(H_{f_m}) + \sum_{\substack{n=1 \\ n \neq m}}^{M} \lambda_{p}(H_{f_n}) \bigg{)} \Bigg{\}}_{m=1}^M \right) \le \lambda_{1}(\frac{1}{M}\sum_{m=1}^M H_{f_m}).
\end{equation}

By combining Eq.~\ref{eq:upper_bound} with Eq.~\ref{eq:lower_bound} and writing $\lambda_1$ as $\lambda_{\text{max}}$ and $\lambda_p$ as $\lambda_{\text{min}}$, the flatness of the averaged weight parameter is bounded as:
\begin{equation}
\label{eq:wa_flatness_bound}
    \max \left( \Bigg{\{} \frac{1}{M} \bigg{(} \lambda_{\text{max}}(H_{f_m}) + \sum_{\substack{n=1 \\ n \neq m}}^{M} \lambda_{\text{min}}(H_{f_n}) \bigg{)} \Bigg{\}}_{m=1}^M \right) \le \lambda_{\text{max}}(\frac{1}{M}\sum_{m=1}^M H_{f_m}) \le \frac{\sum_{m=1}^M \lambda_{\text{max}}(H_{f_m})}{M}.
\end{equation}

However, Eq.~\ref{eq:wa_flatness_bound}, as a bound for weight averaging (WA), cannot be directly applied to BMA, which marginalizes diverse predictions. To bridge this gap, we leverage Proposition~\ref{proposition:wa_and_ens}, which characterizes the close relation between WA and BMA~\citep{izmailov2018averaging, wortsman2022model, rame2022diverse}.

\begin{proposition}
\label{proposition:wa_and_ens}
(\citet{rame2022diverse}) Given predictions of model $f_m(\cdot)$ parameterized by $w_m$, those of weight averaged model $f_{\text{WA}}$ parameterized by $w_{\text{WA}}= \frac{1}{M} \sum_{m=1}^M w_m$, those of BMA $f_{\text{BMA}}$, and arbitrary twice differentiable loss function $\ell(\cdot)$, let $\Delta =\|f_{\text{BMA}}(x) - f_{\text{WA}}(x)\|_2$. Then, $\forall (x,y)$
\begin{align}
    \ell(f_{\text{WA}}(x), y) &= \ell(f_{\text{BMA}}(x), y) + O(\Delta). \nonumber
\end{align}
\end{proposition}

Proposition~\ref{proposition:wa_and_ens} shows that the predictions of BMA can be linearly approximated by those of WA. The error term is discarded in the process of obtaining the Hessian:
\begin{equation}
\label{eq:wa_bma_hessian}
H_{f_{\text{WA}}} \approx H_{f_{\text{BMA}}}
\end{equation}

Substituting Eq.~\ref{eq:wa_bma_hessian} into Eq.~\ref{eq:wa_flatness_bound} leads to Lemma~\ref{lemma:bma_eign_bound}.

\setcounter{lemma}{0}
\begin{lemma}
Let $\ell(\cdot)$ be a twice differentiable loss, $f_m(\cdot)$ the predictions of the model parameterized by $w_m$, and $f_{\text{BMA}}(\cdot)$ the predictions of BMA.
With $M$ model samples $\{w_m\}_{m=1}^M$, the maximal eigenvalue of the averaged Hessian of the loss, $\lambda_{\text{max}}(H_{f_{\text{BMA}}})$, is bounded as follows:
\begin{align}
\max \left( \Bigg{\{} \frac{1}{M} \bigg{(} \lambda_{\max}(H_{f_m}) + \sum_{\substack{n=1 \\ n \neq m}}^{M} \lambda_{\min}(H_{f_n}) \bigg{)} \Bigg{\}}_{m=1}^M \right) \le \lambda_{\max}(H_{f_{\text{BMA}}}) \le \frac{\sum_{m=1}^M \lambda_{\max}(H_{f_m})}{M}. \nonumber
\end{align}
\end{lemma}


\paragraph{Generalization Error Bound for BMA}
To further elucidate the theoretical connection between posterior flatness and generalization, we present the following proposition which is adapted from the generalization analysis of Eigen-SAM~\citep{luo2024explicit}:

\begin{proposition}[Adapted from Eigen-SAM]\label{proposition:sharpness_generalization}
Let $\ell : \mathbb{R}^p \to \mathbb{R}$ be a loss function upper bounded by $L$, and assume its third-order derivatives are uniformly bounded by a constant $C$. Suppose the following inequality holds for any parameter $\theta$:
\[
    \ell_{\mathcal{D}}(\theta) \leq \mathbb{E}_{\epsilon}[\ell_{\mathcal{D}}(\theta + \epsilon)] \qquad \text{where } \epsilon \sim \mathcal{N}(0, \sigma^2 I_p).
\]
Then, for any $\delta \in (0, 1)$ and $\sigma > 0$, with probability at least $1 - \delta$ over the training set $\mathcal{S} \sim \mathcal{D}^n$, we have:
\[
\begin{aligned}
    \ell_{\mathcal{D}}(\theta) \leq \ell_{\mathcal{S}}(\theta) + \frac{p \sigma^2}{2} \lambda_{\max}(\nabla^2 \ell_{\mathcal{S}}(\theta)) + \frac{C p^3 \sigma^3}{6}  + \frac{L}{2\sqrt{n}} \sqrt{p \log\left(1 + \frac{\|\theta\|^2}{p \sigma^2}\right) + O(1) + 2 \log\frac{1}{\delta} + 4 \log(n + p)}
\end{aligned}
\]
\end{proposition}

Building upon Proposition~\ref{proposition:wa_and_ens} and Proposition~\ref{proposition:sharpness_generalization}, we prove a new generalization bound that formally connects posterior flatness to the generalization performance of BMA:

\setcounter{theorem}{0}
\begin{theorem}[Generalization Bound for BMA]\label{theorem:bma_generalization_formal}
Let $w_m$ be samples from a variational posterior $q(w)$, and let $f_{\text{BMA}}$ denote the BMA predictor obtained by averaging predictions over these samples. Assume the conditions in Proposition~\ref{proposition:wa_and_ens} and Proposition~\ref{proposition:sharpness_generalization} hold for each sampled model $f_m$ with parameter $w_m$. Then, for any $\delta \in (0, 1)$ and $\sigma > 0$, with probability at least $1 - \delta$ over the training set $\mathcal{S} \sim \mathcal{D}^n$, the generalization error of the BMA predictor is upper bounded as:
\[
\ell_{\mathcal{D}}(f_{\text{BMA}}) \leq \ell_{\mathcal{S}}(f_{\text{BMA}}) + \frac{p \sigma^2}{2} \lambda_{\max}(H_{f_{\text{BMA}}}) + \frac{C p^3 \sigma^3}{6} + \frac{L}{2\sqrt{n}} \sqrt{p \log\left(1 + \frac{\|f_{\text{BMA}}\|^2}{p \sigma^2}\right) + O(1) + 2 \log \frac{1}{\delta} + 4 \log(n + p)}
\]
where $H_{f_{\text{BMA}}}$ denotes the Hessian of the loss evaluated at $f_{\text{BMA}}$.
\end{theorem}



\subsection{Derivation of Bayesian Flat-Seeking Optimizer}\label{subsec:derivation_odf_bayesian_flat_seeking_optimizer}
\subsubsection{Setting}\label{subsubsec:setting}
Let the model parameter be $w \in \mathbb{R}^p$ with $w \sim \mathcal{N}(\mu, \Sigma)$.
% We can use lower triangular matrix $L$ instead of $p \times p$ covariance matrix $\Sigma$, where $\Sigma = L L^T$ by Cholesky decomposition.
While fully-factorized or mean-field covariance is de facto in Bayesian Deep Learning, it cannot capitalize on strong points of Bayesian approach.
Inspired by SWAG, we approximate the covariance by combining a diagonal covariance $\sigma \in \mathbb{R}^p$ and a low-rank matrix $L \in \mathbb{R}^{p \times K}$ with $K$ low-rank components.
Then, we can simply sample $w = \mu + \frac{1}{\sqrt{2}}(\sigma z_1 + L z_2)$, where $z_1 \sim \mathcal{N}(0, I_p)$ and $z_2 \sim \mathcal{N}(0, I_K)$, and $p$ and $K$ denote the number of parameters and the number of low-rank components, respectively. We flatten $\mu$, $\sigma$, and $L$, and concatenate them as $\theta = \text{Concat}(\mu; \sigma; L)$.

\subsubsection{Objective function}\label{subsubsec:objective_function}
We compose our objective function with probabilistic weight, using KL Divergence as a metric to compare between two weights.

\begin{equation}
\label{eq:main_loss_app}
    \ell^\gamma_{\text{FP-BMA}}(\theta) = \max_{d|\theta+\Delta\theta, \theta| \leq \gamma^2} \ell(\theta + \Delta\theta) + \beta \textrm{D}_{\textrm{KL}}(p_\theta (w|\mathcal{D}) || p (w))
\end{equation}
\begin{equation}
\label{eq:divergence_app}
    \textrm{s.t.} \ \  d|\theta+\Delta\theta, \theta| = \textrm{D}_{\textrm{KL}} \big[ p_{\theta+\Delta\theta}(w|\mathcal{D})||p_\theta(w|\mathcal{D}) \big].
\end{equation}

\subsubsection{Optimization}\label{subsubsec:optimization}
% \subsubsection{Constraints}
% At first, we don't have any constraint on mean, $\mu$ and $\Delta \mu$:
% \begin{equation}
% \label{eq:mean_constraint}
%     \Tilde{\mu} = \mu + \Delta \mu \nonumber
% \end{equation}
% \\
% We have constraint on perturbation $\Delta L$ to maintain the covariance $\Sigma$ as positive semi-definite matrix:
% \begin{equation}
% \label{covariance_constraint}
%     \Tilde{L} = L + \Delta L , \ \  \Tilde{L}_{ii} \geq 0 \ \ \ \ \  \text{where} \ \ \  i = 1,2, ..., p \nonumber
%     % 2번 후보 :\Delta L , l_{ii} \geq 0
% \end{equation}


\paragraph{From KL Divergence to Fisher Information Matrix}
We can consider three options of perturbation on mean and covariance parameters of $w$: 1) Perturbation on mean, 2) perturbation on mean and diagonal variance, 3) Perturbation on mean and whole covariance. All of them can be approximated to Fisher Information Matrix.
Here, we show the relation between KLD and FIM considering perturbation option 3.

Following FSAM, we use the same notation for a distribution parameterized by $\theta$ and a distribution conditioned on $\theta$:
\begin{equation}
    p_{\theta+\Delta\theta}(w|\mathcal{D}) = p(w|\mathcal{D}, \theta+\Delta\theta). \nonumber
\end{equation}


By definition of KL divergence, we rewrite Eq.~\ref{eq:divergence_app} as:
\begin{equation}
\textrm{D}_{\textrm{KL}}[p(w|\mathcal{D}, \theta + \Delta \theta) || p(w|\mathcal{D}, \theta)] = \int_{w} p(w | \mathcal{D}, \theta + \Delta \theta) \ \log \frac{p(w|\mathcal{D}, \theta + \Delta \theta)}{p(w|\mathcal{D}, \theta)} dw.
\label{eq:def_KLD_app}
\end{equation}
\\
In Eq.~\ref{eq:def_KLD_app}, we apply first-order Taylor Expansion:
\begin{equation}
\begin{split}
    &p(w|\mathcal{D}, \theta + \Delta \theta) \approx p(w|\mathcal{D}, \theta) + \nabla_{\theta} p(w | \mathcal{D}, \theta)^{T} \Delta \theta. \\
    \label{eq:taylor_expansion_app}
    &\log p(w|\mathcal{D}, \theta + \Delta \theta) \approx \log p(w|\mathcal{D}, \theta) + \nabla_{\theta} \log p(w | \mathcal{D}, \theta)^{T} \Delta \theta. \\
\end{split}
\end{equation}
\\
Substitute right terms of Eq.~\ref{eq:def_KLD_app} with Eq.~\ref{eq:taylor_expansion_app}:
% \begin{equation}
% \begin{split}
\begin{align}
    &\int_{w} p(w | \mathcal{D}, \theta + \Delta \theta) \ \log \frac{p(w|\mathcal{D}, \theta + \Delta \theta)}{p(w|\mathcal{D}, \theta)} dw \nonumber \\ 
    &=\int_{w} \big( p(w|\mathcal{D}, \theta) + \Delta \theta^{T} \nabla_{\theta} p(w | \mathcal{D}, \theta) \big) \nabla_{\theta} \log p(w|\mathcal{D}, \theta)^{T}\Delta\theta  \ dw \nonumber \\ 
    &=\int_{w} p(w|\mathcal{D}, \theta) \nabla_{\theta}\log p(w|\mathcal{D}, \theta)^{T}\Delta\theta dw \nonumber \\ 
    & \ \ + \int_{w}\Delta\theta^{T}p(w|\mathcal{D}, \theta)\nabla_{\theta}\log p(w|\mathcal{D}, \theta)\nabla_{\theta}\log p(w|\mathcal{D}, \theta)^{T}\Delta\theta \ dw.
    \label{eq:sub_taylor_app}
% \end{split}
% \end{equation}
\end{align}
\\
First term of Eq.~\ref{eq:sub_taylor_app} is equal to 0:
\begin{equation}
\begin{split}
    &\int_{w}p(w|\mathcal{D}, \theta)\nabla_{\theta}\log p(w|\mathcal{D}, \theta) \ dw\\
    &=\int_{w}p(w|\mathcal{D}, \theta)\frac{\nabla_{\theta}p(w|\mathcal{D}, \theta)}{p(w|\mathcal{D}, \theta)} \ dw\\
    &=\int_{w} \nabla_{\theta} p(w|\mathcal{D}, \theta) \ dw \ = \nabla_{\theta} \int_{w} p(w|\mathcal{D}, \theta) \ dw = \nabla_{\theta} 1 = 0. \\
    \label{eq:pf_zero_term_app}
\end{split}
\end{equation}
\\
Using Eq.~\ref{eq:sub_taylor_app} and Eq.~\ref{eq:pf_zero_term_app}, Eq.~\ref{eq:def_KLD_app} can be rewritten as Fisher information matrix by the definition of expectation:
\begin{equation}
\begin{split}
    &\textrm{D}_{\textrm{KL}}[p(w|\mathcal{D}, \theta+\Delta\theta) ||p(w|\mathcal{D}, \theta)]  \\
    &\approx\int_{w} \Delta\theta^{T}p(w|\mathcal{D}, \theta)\nabla_{\theta}\log p(w|\mathcal{D}, \theta) \nabla_{\theta} \log p(w|\mathcal{D}, \theta)^{T}\Delta\theta \ dw \\
    &=\Delta\theta^{T}\mathbb{E}_{w} [\nabla_{\theta}\log p(w|\mathcal{D}, \theta) \nabla_{\theta} \log p(w|\mathcal{D}, \theta)^{T}]\Delta\theta \\
    &=\Delta \theta^T F_\theta(\theta)\Delta\theta, \\
\end{split}
\label{eq:KLD_to_FIM_app}
\end{equation}
\\
where $F_\theta(\theta)=\mathbb{E}_{w, \mathcal{D}}[\nabla_{\theta}\log p(w|\mathcal{D}, \theta) \nabla_{\theta} \log p(w|\mathcal{D}, \theta)^{T}]$.


It is too expensive to compute the Fisher information matrix $F_\theta(\theta)$ in practice.
% Thus, we compute the Fisher information matrix $F(\mu)$, $F(\sigma)$, and $F(L)$, instead of $F(\theta)$. For notation simplicity, we express $F(\mu)$, $F(\sigma)$, and $F(L)$ as $F(\theta)$ from now on.
We introduce a pseudo inverse for Fisher information matrix $F_\theta(\theta)^{-1}$ with Samelson inverse of a vector~\citep{gentle2007matrix, sidi2017vector, wynn1962acceleration} :
\begin{equation}
F_\theta(\theta)^{-1} = \frac{\nabla_{\theta} \log p(w|\mathcal{D}, \theta) \nabla_{\theta} \log p(w|\mathcal{D}, \theta)^{T}}{\| \nabla_{\theta} \log p(w|\mathcal{D}, \theta)  \|^{4}}.
%\in \mathbb{R}^{p \times 1}
\end{equation}

\paragraph{Lagrangian Dual Problem}

From the result of Eq.~\ref{eq:KLD_to_FIM_app}, we can rewrite the Eq.~\ref{eq:main_loss_app}:
\begin{equation}
\label{eq:main_loss_2_app}
    \ell^\gamma_{\text{FP-BMA}}(\theta) = \max_{\Delta\theta^T F_\theta(\theta) \Delta\theta \le \gamma^2} \ell(\theta + \Delta \theta).
\end{equation}

We can reach the optimal perturbation of FP-BMA $\Delta\theta^*$ by using Taylor Expansion on $\ell(\theta+\Delta\theta)$ of Eq.~\ref{eq:main_loss_app}:

\begin{equation}
\label{eq:approximation_l(w^star)_app}
    \ell(\theta + \Delta\theta) \approx \ell(\theta) + \nabla_\theta \ell(\theta)^T \Delta \theta.
\end{equation}
\\

Using Eq.~\ref{eq:approximation_l(w^star)_app}, we can rewrite Eq.~\ref{eq:main_loss_app} as Lagrangian dual problem:
\begin{equation}
\label{eq:lagrangian_loss_app}
     L(\Delta\theta, \lambda) =  \ell(\theta)  + \nabla_\theta \ell(\theta)^{T} \Delta\theta - \lambda(\Delta\theta^{T}F_\theta(\theta)\Delta\theta - \gamma^2).
\end{equation}
\\

Differentiating Eq.~\ref{eq:lagrangian_loss_app}, we get $\Delta\theta^*$:
\begin{align}
\label{eq:Delta_theta^*_app}
    &\frac{\partial L(\Delta\theta, \lambda)}{\partial \Delta\theta} = \nabla_\theta \ell(\theta)^T - 2\lambda \Delta\theta^T F_\theta(\theta) = 0 \notag \\
    & \therefore \ \Delta\theta^* = \frac{1}{2\lambda}F_\theta(\theta)^{-1} \nabla_\theta \ell(\theta).
\end{align}
\\

Putting $\Delta\theta^*$ of Eq.~\ref{eq:Delta_theta^*_app} into $\Delta\theta$ of Eq.~\ref{eq:lagrangian_loss_app}, we can rewrite Eq.~\ref{eq:lagrangian_loss_app}:
\begin{equation}
\label{eq:lagrangian_loss_with_Delta_theta_*_app}
\begin{split}
    L(\Delta\theta^*, \lambda) &= \ell(\theta) + \frac{1}{2\lambda}\nabla_\theta \ell(\theta)^T F_\theta(\theta)^{-1} \nabla_\theta \ell(\theta) \\
     &- \frac{1}{4\lambda}\nabla_\theta \ell(\theta)^T F_\theta(\theta)^{-1} \nabla_\theta \ell(\theta) + \lambda\gamma^2.
\end{split}
\end{equation}
\\

By taking derivative of Eq.~\ref{eq:lagrangian_loss_with_Delta_theta_*_app} w.r.t. $\lambda$, we can also get $\lambda^*$:
\begin{align}
\label{eq:lambda^*_app}
    &\frac{\partial L(\Delta\theta^*, \lambda)}{\partial \lambda} = -\frac{1}{2\lambda^2}\nabla_\theta \ell(\theta)^T F_\theta(\theta)^{-1} \nabla_\theta \ell(\theta)
    + \frac{1}{4\lambda^2}\nabla_\theta \ell(\theta)^T F_\theta(\theta)^{-1} \nabla_\theta \ell(\theta) +\gamma^2 = 0 \notag \\
    & 4\lambda^2\gamma^2 = \nabla_\theta \ell(\theta)^T F_\theta(\theta)^{-1}  \nabla_\theta \ell(\theta) \notag \\
    & \therefore \ \lambda^* = \frac{\sqrt{\nabla_\theta \ell(\theta)^T F_\theta(\theta)^{-1} \nabla_\theta \ell(\theta)}}{2\gamma}.
\end{align}
\\

Finally, we get our $\Delta \theta_{\text{FP-BMA}}$ by substituting Eq.~\ref{eq:lambda^*_app} into
Eq.~\ref{eq:Delta_theta^*_app}:
\begin{equation}
\label{eq:Delta_theta_BSAM_app}
    \Delta\theta_{\text{FP-BMA}} = \gamma\frac{F_\theta(\theta)^{-1} \nabla_\theta \ell(\theta)}{\sqrt{\nabla_\theta \ell(\theta)^T F_\theta(\theta)^{-1}\nabla_\theta \ell(\theta)}}.
\end{equation}





\subsection{Proof of Theorem 2}\label{subsec:proof_of_theorem2}
\subsubsection{FP-BMA to FSAM}
Theorem \ref{theorem:generalized_sabma} shows that FP-BMA is degenerated to FSAM under DNN and diagonal FIM setting. Deterministic parameters draw out the constant prior $p(w|x)=c$ and mean-only variational parameters $w = \theta$.

First, we can rewrite the log posterior $\log p_\theta (w | x, y)$ with Bayes rule:
\begin{equation}
\label{eq:posterior_bayes}
    \log p_\theta (w|x, y) = \log p_\theta (y|x,w) + \log p_\theta (w|x) - Z,
\end{equation}
where $Z$ is a constant independent of $w$. Note that the log posterior is divided into the log predictive distribution and the log prior. Also, note that the prior is conditioned on the data to align with a generalized notation. The prior can depend on the input; however, this dependence is often ignored in practice~\citep{marek2024can}.

By taking the derivative of Eq.~\ref{eq:posterior_bayes} with respect to $\theta$, the constant $Z$ vanishes:
\begin{equation}
    \nabla_\theta \log p_\theta (w|x,y) = \nabla_\theta \log p_\theta (y|x,w) + \nabla_\theta \log p_\theta (w|x). \nonumber
\end{equation}

We have a constant prior $p(w|x)=c$ in the deterministic setting, which makes the gradient of the log posterior equal to the gradient of the log predictive distribution:
\begin{equation}
\label{eq:same_output_weight_gradient}
    \nabla_\theta \log p_\theta (w|x,y) = \nabla_\theta \log p_\theta (y|x,w).
\end{equation}
Based on Eq.~\ref{eq:same_output_weight_gradient}, we can replace the gradient of the log posterior with the gradient of the log predictive distribution, so that the FIM over the posterior becomes the FIM over the predictive distribution:
\begin{align}
\label{eq:same_output_weight_fisher}
    F_\theta (\theta) = &\mathbb{E}_{w, \mathcal{D}} [ \nabla_\theta \log p_\theta (w|x,y) \nabla_\theta \log p_\theta (w|x, y)^T] \nonumber\\
    &= \mathbb{E}_{w, \mathcal{D}} [ \nabla_\theta \log p_\theta (y|x,w) \nabla_\theta \log p_\theta (y|x, w)^T].
\end{align}
Taking the diagonal of Eq.~\ref{eq:same_output_weight_fisher} yields $F_y(\theta)$. Then, using the fact that the variational parameters are mean-only, FP-BMA finally degenerates to FSAM with $F_y(\theta)$:
\begin{equation}
\label{eq:sabma_to_fsam}
    \Delta \theta_{\text{FP-BMA}} = \gamma\frac{F_y(\theta)^{-1}\nabla_\theta \ell(\theta)}{\sqrt{\nabla_\theta \ell(\theta)^T F_y(\theta)^{-1} \nabla_\theta \ell(\theta)}}.
\end{equation}



\subsubsection{FP-BMA to SAM}
It is simple to show that FP-BMA is an extended version of SAM: by setting the FIM over the output distribution $F_y(\theta)$ in Eq.~\ref{eq:sabma_to_fsam} to the identity matrix $I$, and recalling that $w = \theta$ in the deterministic setting, FP-BMA reduces to SAM.
\begin{equation}
    \Delta \theta_{\text{FP-BMA}} = \gamma\frac{\nabla_w \ell(w)}{\| \nabla_w \ell(w)\|_2}.
\end{equation}



\subsubsection{FP-BMA to NG}
Theorem~\ref{theorem:generalized_sabma} also states the NG can be approximated with FP-BMA under specific conditions.
The update rule of natural gradient and FP-BMA can be written as Eq.~\ref{eq:ng_update_rule} and Eq.~\ref{eq:sabma_update_rule}, respectively.
\begin{align}
    &\theta \leftarrow \theta + \eta_{\text{NG}} F_y (\theta)^{-1} \nabla_\theta \ell(\theta). \label{eq:ng_update_rule} \\
    &\theta \leftarrow \theta + \eta_{\text{FP-BMA}} \nabla_\theta \ell(\theta+\Delta\theta). \label{eq:sabma_update_rule}
\end{align}
where $\eta_{\text{NG}}$ and $\eta_{\text{FP-BMA}}$ denote the learning rates of NG and FP-BMA. Note that we assume the log-likelihood as the loss function.

The $\nabla_\theta \ell(\theta + \Delta\theta)$ in Eq.~\ref{eq:sabma_update_rule} can be approximated with Taylor Expansion, the connection between Hessian and FIM, and Eq.~\ref{eq:same_output_weight_fisher} in DNN setup:
\begin{align}
\label{eq:approx_perturbation}
    \nabla_\theta \ell(\theta + \Delta\theta) &\approx \nabla_\theta \ell(\theta) + \nabla^2_\theta \ell(\theta) \Delta\theta \nonumber\\
    & = \nabla_\theta \ell(\theta) + \nabla^2_\theta \ell(\theta) \cdot \gamma\frac{F_\theta (\theta)^{-1} \nabla_\theta \ell(\theta)}{\sqrt{\nabla_\theta \ell(\theta)^T F_\theta (\theta)^{-1} \nabla_\theta \ell(\theta)}} \nonumber\\
    &= \nabla_\theta \ell(\theta) + \gamma^\prime \nabla_\theta^2 \ell(\theta) F_\theta(\theta)^{-1} \nabla_\theta \ell(\theta) \ \bigg(\because \text{Let} \ \gamma^\prime = \frac{\gamma}{\sqrt{\nabla_\theta \ell(\theta)^T F_\theta(\theta)^{-1} \nabla_\theta \ell(\theta)}} \bigg) \nonumber\\
    &= [I + \gamma^\prime \nabla^2_\theta \ell(\theta) F_\theta(\theta)^{-1}]\nabla_\theta \ell(\theta) \nonumber\\
    &\approx (1+\gamma^\prime)\nabla_\theta \ell(\theta) \ (\because \nabla_\theta^2 \ell(\theta) \approx F_y (\theta), F_\theta(\theta) = F_y(\theta)).
\end{align}

By using the learning rate $\eta_{\text{FP-BMA}}=\frac{\eta_{\text{NG}}}{1 + \gamma^\prime}F_\theta (\theta)^{-1}$, together with Eq.~\ref{eq:same_output_weight_fisher} and Eq.~\ref{eq:approx_perturbation}, the update rule of FP-BMA approximates that of NG.

