\section{Additional results for SAA for VI}
\label{sec:appendix-saa}
Table~\ref{table:ratio-time-adam} shows the median time taken by SAA for VI to reach the maximum ELBO achieved by Adam.
In this section, we present the total time taken by SAA for VI until its completion (Table~\ref{table:times-saa}).
Notably, for some models such as \texttt{election88}, SAA reached an ELBO over 200 nats higher than Adam, which explains the discrepancies between Table~\ref{table:times-saa} and Table~\ref{table:ratio-time-adam}.
Additionally, we report the results of an ablation study on warm-start parameter initialization in Table~\ref{table:ratio-time-refresh-Q}.


\begin{figure*}[ht!]
  \renewcommand{\arraystretch}{1.2}
  \begin{minipage}{.5\textwidth}
    \centering\small
    \begin{tabular}{@{}lrrcrr@{}}
      \toprule
      {} &  \multicolumn{2}{c}{Diagonal} & \phantom{a} &  \multicolumn{2}{c}{Dense} \\
      \cmidrule{2-3} \cmidrule{5-6}
       & time & \begin{tabular}{@{}c@{}} max.\\size\end{tabular} && time & \begin{tabular}{@{}c@{}} max.\\size\end{tabular}\\
      \midrule
      \textbf{Bayesian log.\ regr.}\\ 
      \hspace{1em}a1a               &             0.46 &          $2^{8\phantom{0}}$ &&           52.99 &        $2^{18}$ \\
      \hspace{1em}australian        &             0.22 &          $2^{6\phantom{0}}$ &&            9.69 &        $2^{17}$ \\
      \hspace{1em}ionosphere        &             0.16 &          $2^{6\phantom{0}}$ &&            6.27 &        $2^{16}$ \\
      \hspace{1em}madelon           &             1.11 &         $2^{11}$ &&          100.19 &        $2^{18}$ \\
      \hspace{1em}mushrooms         &             0.42 &          $2^{8\phantom{0}}$ &&           90.65 &        $2^{17}$ \\
      \hspace{1em}sonar             &             0.29 &          $2^{8\phantom{0}}$ &&           19.24 &        $2^{18}$ \\
      \textbf{Stan models}\\
      \hspace{1em}congress          &             0.95 &          $2^{5\phantom{0}}$ &&            1.10 &         $2^{8\phantom{0}}$ \\
      \hspace{1em}election88        &            12.84 &          $2^{8\phantom{0}}$ &&          264.98 &        $2^{15}$ \\
      \hspace{1em}election88Exp     &            11.65 &         $2^{10}$ &&          351.63 &        $2^{12}$ \\
      \hspace{1em}electric          &             2.41 &         $2^{11}$ &&           70.07 &        $2^{18}$ \\
      \hspace{1em}electric-one-pred &             0.51 &          $2^{8\phantom{0}}$ &&            0.62 &         $2^{7\phantom{0}}$ \\
      \hspace{1em}hepatitis         &             3.49 &         $2^{12}$ &&          163.19 &        $2^{18}$ \\
      \hspace{1em}hiv-chr           &             2.68 &          $2^{9\phantom{0}}$ &&           64.87 &        $2^{18}$ \\
      \hspace{1em}irt               &            13.83 &         $2^{14}$ &&          473.77 &        $2^{18}$ \\
      \hspace{1em}mesquite          &             0.73 &          $2^{5\phantom{0}}$ &&            0.38 &         $2^{6\phantom{0}}$ \\
      \hspace{1em}radon             &             2.08 &         $2^{11}$ &&           53.62 &        $2^{18}$ \\
      \hspace{1em}wells             &             0.70 &          $2^{5\phantom{0}}$ &&            0.09 &         $2^{5\phantom{0}}$ \\
      \bottomrule
    \end{tabular}
    \captionof{table}{
      Median \textbf{running time} (in seconds) and corresponding median \textbf{sample size} at which convergence occurs for SAA for VI across runs.
      As described in Section~\ref{sec:experiments}, the sample size is limited to a maximum of $2^{18}$, which proved sufficient for all models.
      \vspace{13.9ex}
    }
    \label{table:times-saa}
  \end{minipage}\hfill %
  \begin{minipage}{.45\textwidth}
    \centering\small
    \begin{tabular}{@{}l 
      S[round-mode=places, round-precision=2]
      S[round-mode=places, round-precision=2]
      @{}}
      \toprule
        & \multicolumn{2}{c}{$\mathrm{Fresh}/ \mathrm{Warm}$} \\
       & \multicolumn{2}{c}{time ratio}\\
       \cmidrule{2-3}
       {} & \multicolumn{1}{c}{Diagonal} &  \multicolumn{1}{c}{Dense}\\
      \midrule
      \textbf{Bayesian log.\ regr.}\\
      \hspace{1em}a1a & 1.11 & 1.78 \\
      \hspace{1em}australian & 1.01 & 1.58 \\
      \hspace{1em}ionosphere & 0.94 & 1.26 \\
      \hspace{1em}madelon & 1.63 & 1.73 \\
      \hspace{1em}mushrooms & 1.31 & 2.04 \\
      \hspace{1em}sonar & 1.12 & 1.44 \\
      \textbf{Stan models}\\
      \hspace{1em}congress & 1.14 & 3.07 \\
      \hspace{1em}election88 & 3.11 & 20.63 \\
      \hspace{1em}election88Exp & 2.16 & 2.59 \\
      \hspace{1em}electric & 2.64 & 4.69 \\
      \hspace{1em}electric-one-pred & 1.05 & 0.75 \\
      \hspace{1em}hepatitis & 2.77 & 2.03 \\
      \hspace{1em}hiv-chr & 2.10 & 2.70 \\
      \hspace{1em}irt & 3.63 & 6.56 \\
      \hspace{1em}mesquite & 0.98 & 1.31 \\
      \hspace{1em}radon & 2.29 & 5.35 \\
      \hspace{1em}wells & 0.96 & 0.99 \\
      \bottomrule
    \end{tabular}
    \captionof{table}{\textbf{Time ratio} for the fresh start compared to the warm start. 
      Values greater than 1 indicate that using warm start is faster. 
      For the \textbf{ELBO}, significant differences (greater than $0.1$ in absolute value) were observed only for the \texttt{election88} and \texttt{election88Exp} models: $-1.77$ and $-3.46$, respectively, with diagonal covariance, and $1.66$ and $3.43$ with dense covariance. Our results suggest that warm start approaches often reduce optimization time.}
    \label{table:ratio-time-refresh-Q}
  \end{minipage}
\end{figure*}



% \begin{table}[ht!]
%   \renewcommand{\arraystretch}{1.2}
%   \begin{center}
%     {
% \begin{tabular}{@{}lrrcrr@{}}
%   \toprule
%   {} &  \multicolumn{2}{c}{Diagonal Covariance} & \phantom{aa} &  \multicolumn{2}{c}{Dense Covariance} \\
%   \cmidrule{2-3} \cmidrule{5-6}
%    & total time & \begin{tabular}{@{}c@{}} maximum\\sample size\end{tabular} && total time & \begin{tabular}{@{}c@{}} maximum\\sample size\end{tabular}\\
%   \midrule
%   \textbf{Bayesian log.\ regr.}\\ 
%   \hspace{1em}a1a               &             0.46 &          $2^{8\phantom{0}}$ &&           52.99 &        $2^{18}$ \\
%   \hspace{1em}australian        &             0.22 &          $2^{6\phantom{0}}$ &&            9.69 &        $2^{17}$ \\
%   \hspace{1em}ionosphere        &             0.16 &          $2^{6\phantom{0}}$ &&            6.27 &        $2^{16}$ \\
%   \hspace{1em}madelon           &             1.11 &         $2^{11}$ &&          100.19 &        $2^{18}$ \\
%   \hspace{1em}mushrooms         &             0.42 &          $2^{8\phantom{0}}$ &&           90.65 &        $2^{17}$ \\
%   \hspace{1em}sonar             &             0.29 &          $2^{8\phantom{0}}$ &&           19.24 &        $2^{18}$ \\
%   \textbf{Stan models}\\
%   \hspace{1em}congress          &             0.95 &          $2^{5\phantom{0}}$ &&            1.10 &         $2^{8\phantom{0}}$ \\
%   \hspace{1em}election88        &            12.84 &          $2^{8\phantom{0}}$ &&          264.98 &        $2^{15}$ \\
%   \hspace{1em}election88Exp     &            11.65 &         $2^{10}$ &&          351.63 &        $2^{12}$ \\
%   \hspace{1em}electric          &             2.41 &         $2^{11}$ &&           70.07 &        $2^{18}$ \\
%   \hspace{1em}electric-one-pred &             0.51 &          $2^{8\phantom{0}}$ &&            0.62 &         $2^{7\phantom{0}}$ \\
%   \hspace{1em}hepatitis         &             3.49 &         $2^{12}$ &&          163.19 &        $2^{18}$ \\
%   \hspace{1em}hiv-chr           &             2.68 &          $2^{9\phantom{0}}$ &&           64.87 &        $2^{18}$ \\
%   \hspace{1em}irt               &            13.83 &         $2^{14}$ &&          473.77 &        $2^{18}$ \\
%   \hspace{1em}mesquite          &             0.73 &          $2^{5\phantom{0}}$ &&            0.38 &         $2^{6\phantom{0}}$ \\
%   \hspace{1em}radon             &             2.08 &         $2^{11}$ &&           53.62 &        $2^{18}$ \\
%   \hspace{1em}wells             &             0.70 &          $2^{5\phantom{0}}$ &&            0.09 &         $2^{5\phantom{0}}$ \\
%   \bottomrule
% \end{tabular}
% }
% \caption{
%   Median \textbf{running time} (in seconds) and corresponding median \textbf{sample size} at which convergence occurs for SAA for VI across runs.
%   As described in Section~\ref{sec:experiments}, the sample size is limited to a maximum of $2^{18}$, which proved sufficient for all models.
% }

% \label{table:times-saa}
% \end{center}
% \end{table}


% \begin{table*}[h!]
%   \renewcommand{\arraystretch}{1.2}
% \begin{center}
% \begin{tabular}{@{}l 
%   S[round-mode=places, round-precision=2]
%   S[round-mode=places, round-precision=2]
%   @{}}
%   \toprule
%     & \multicolumn{2}{c}{$(\mathrm{Fresh\ start})/ (\mathrm{Warm\ start})$} \\
%    & \multicolumn{2}{c}{Time ratio}\\
%    \cmidrule{2-3}
%    {} & \multicolumn{1}{c}{Diagonal} &  \multicolumn{1}{c}{Dense}\\
%   \midrule
%   \textbf{Bayesian log.\ regr.}\\
%   \hspace{1em}a1a & 1.11 & 1.78 \\
%   \hspace{1em}australian & 1.01 & 1.58 \\
%   \hspace{1em}ionosphere & 0.94 & 1.26 \\
%   \hspace{1em}madelon & 1.63 & 1.73 \\
%   \hspace{1em}mushrooms & 1.31 & 2.04 \\
%   \hspace{1em}sonar & 1.12 & 1.44 \\
%   \textbf{Stan models}\\
%   \hspace{1em}congress & 1.14 & 3.07 \\
%   \hspace{1em}electric & 2.64 & 4.69 \\
%   \hspace{1em}electric-one-pred & 1.05 & 0.75 \\
%   \hspace{1em}hepatitis & 2.77 & 2.03 \\
%   \hspace{1em}hiv-chr & 2.10 & 2.70 \\
%   \hspace{1em}irt & 3.63 & 6.56 \\
%   \hspace{1em}mesquite & 0.98 & 1.31 \\
%   \hspace{1em}radon & 2.29 & 5.35 \\
%   \hspace{1em}wells & 0.96 & 0.99 \\
%   \bottomrule
% \end{tabular}
% \caption{\textbf{Time ratio} for the fresh start compared to the warm start. 
% Values greater than 1 indicate that using warm start is faster. 
% For the \textbf{ELBO}, significant differences (greater than $0.1$) were observed only for the \texttt{election88} and \texttt{election88Exp} models: $-1.77$ and $-3.46$, respectively, with diagonal covariance, and $1.66$ and $3.43$ with dense covariance. Our results suggest that warm start approaches often reduce optimization time.}

% \label{table:ratio-time-refresh-Q}
% \end{center}
% \end{table*}
