
\documentclass[accepted]{uai2023}

\usepackage[american]{babel}

\usepackage[usenames,dvipsnames]{xcolor}

%% Some suggested packages, as needed:
\usepackage{natbib}
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} %
\usepackage{booktabs}
\usepackage{tikz}
\usepackage{multirow}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% \usepackage{xr-hyper} 
% \externaldocument{uai2023-template}
% \usepackage[colorlinks,linktoc=all]{hyperref}       % hyperlinks
%\hypersetup{citecolor=MidnightBlue}
%\hypersetup{urlcolor=MidnightBlue}
%\hypersetup{linkcolor=black}

% Additional packages
\usepackage{amsfonts}
\usepackage{amsthm}
\usepackage[ruled, linesnumbered]{algorithm2e}
\usepackage{setspace}
\usepackage{colortbl}
\definecolor{Gray}{gray}{0.9}

\usepackage{floatrow}

\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{lemma}[theorem]{Lemma}
\newcommand{\red}[1]{\textcolor[rgb]{0.835,0.0780,0.1840}{#1}}

\title{The Shrinkage-Delinkage Trade-off: An Analysis of \\ Factorized Gaussian Approximations for Variational Inference\\(Supplementary Material)}

\author[1]{\href{mailto:<cmargossian@flatironinstitute.org>?Subject=Your UAI 2023 paper}{Charles C. Margossian}{}}
\author[1]{Lawrence K. Saul}

\affil[1]{%
    Center for Computational Mathematics\\
    Flatiron Institute\\
    New York, NY, USA
}


\begin{document}

\maketitle

In appendix~\ref{app:cov}, we provide further details on FG-VI when the covariance matrix has constant off-diagonal terms. Next, appendix~\ref{app:alg} provides an algorithm to efficiently compute bounds on the shrinkage and delinkage terms, using techniques developed in section~4.
In appendix~\ref{app:traceS}, we show how to derive bounds on the trace of the shrinkage matrix.
In appendix~\ref{app:KL}, we obtain an upper bound on the KL divergence---equivalently the entropy gap---between $q$ and~$p$.

% We first provide additional results for the case of a Gaussian whose covariance matrix has constant off-diagonal terms and then derive, in details, Algorithm 1, which bounds the shrinkage and delinkage terms.


\appendix
\section{FG-VI for covariance with constant off-diagonal terms}
\label{app:cov}

Let $p$ be a multivariate Gaussian distribution over $\mathbb{R}^n$ with mean $\boldsymbol \mu$ and covariance matrix $\boldsymbol\Sigma$.
The elements of the correlation matrix are related to those of the covariance matrix by
\begin{equation}
    C_{ij}=\frac{\Sigma_{ij}}{\sqrt{\Sigma_{ii}\Sigma_{jj}}}.
\label{eq:corr}
\end{equation} 
In this section we assume that the correlation matrix has constant off-diagonal terms, and we use $\varepsilon\in[0,1)$ to denote the value of these terms. Note that we require $\varepsilon\geq 0$ since three or more random variables cannot all be mutually anti-correlated. Also we require $\varepsilon\!<\!1$ since otherwise $\mathbf{C}$ (and hence $\boldsymbol\Sigma$) would not be positive-definite.


\subsection{Solution for factorized Gaussian variational inference}

Let $q$ be the solution of FG-VI with diagonal covariance matrix $\boldsymbol\Psi$, where
\begin{equation}
    \Psi_{ii} = \frac{1}{\Sigma^{-1}_{ii}}
\label{eq:psi}
\end{equation} as in eq.~(4).

We now prove Theorem~3.5, broken up into a statement about the estimated variance (Proposition~\ref{prop:asymptotic-shrinkage}) and a statement about the entropy gap (Proposition~\ref{prop:asymptotic-entropy}).

\begin{proposition} \label{prop:asymptotic-shrinkage}
    If the correlation matrix in eq.~(\ref{eq:corr}) has constant off-diagonal terms, then the solution for FG-VI in eq.~(\ref{eq:psi}) obeys the following limits:
\begin{eqnarray}
   \underset{n \to \infty}{\lim} \Psi_{ii} & = & (1\! -\! \varepsilon)\Sigma_{ii} \\
    \underset{\varepsilon \to 1}{\lim} \Psi_{ii} & = & 0 \\
    \underset{n \to \infty}{\lim} \frac{1}{n} \mathrm{trace}({\bf S}) & = & \frac{1}{1 - \varepsilon}
 \end{eqnarray}
 where $\varepsilon\in[0,1)$ denotes the value of $C_{ij}$ for $i\neq j$.
\end{proposition}

\begin{proof}
Let $\mathbf{1}\in\mathbb{R}^n$ denote the vector of all ones. Then the correlation matrix can be written as
 \begin{equation}
      \mathbf{C} = (1 - \varepsilon) \mathbf{I} + \varepsilon {\bf 1} {\bf 1}^\top.
      \label{eq:corr-epsilon}
  \end{equation}
  One can verify by direct substitution that the inverse correlation matrix has elements
  \begin{equation}
      \mathbf{C}^{-1} = \tfrac{1}{1 - \varepsilon}\left[{\bf I} - \tfrac{\varepsilon}{1 + (n - 1) \varepsilon} {\bf 1}{\bf 1}^\top\right].
   \label{eq:Cinv}
  \end{equation}
Recall from eq.~(15) that $\Psi_{ii} = \Sigma_{ii}/C_{ii}^{-1}$. With some algebra, it follows from eq.~(\ref{eq:Cinv}) that
\begin{equation}
    \Psi_{ii} = \left[\frac{(1-\varepsilon)(1+(n\!-\!1)\varepsilon)}{1+(n\!-\!2)\varepsilon}\right]\Sigma_{ii}.
    \label{eq:psi-epsilon}
\end{equation}
It is straightforward to take the limits of eq.~(\ref{eq:psi-epsilon}) as $n\rightarrow\infty$ or $\varepsilon\rightarrow 1$, and these limits yield the results of the proposition.

Finally, recalling that $\text{trace}({\bf S}) = \sum_{i = 1}^n \Sigma_{ii} / \Psi_{ii}$, we immediately get
\begin{equation}
    \underset{n \to \infty}{\lim} \frac{1}{n} \text{trace}({\bf S}) = \frac{1}{1 - \varepsilon}.
\end{equation}
\end{proof}

%%%%%%%%%%%%%%

\iffalse
We derive the variance shrinkage and the entropy gap in the special case where the off-diagonal terms of the target's correlation matrix are a constant, $\varepsilon$.
We show that the shrinkage increases when either the dimensionality, $n$, or the correlation constant $\varepsilon$ grow, as shown in Figure 1.
More precisely, as $\varepsilon \to 1$, the shrinkage becomes arbitrarily large
but it converges to a finite value for $n \to \infty$.
Similarly, as $\varepsilon \to 1$, the entropy gap diverges;
on the other hand, as $n \to \infty$, the per component entropy gap goes to 0.

Let $p$ be a Gaussian distribution with covariance matrix $\boldsymbol \Sigma$ over $\mathbb R^n$.
Assume the diagonal elements of $\boldsymbol \Sigma$ go to 1---we can always achieve this with a suitable change of units---and that its off-diagonal elements are a constant, $\varepsilon \in [0, 1)$.
Note that $\varepsilon \ge 0$, since three or more random variables cannot all be mutually anti-correlated.
Furthermore, we require $\varepsilon < 1$ to ensure the correlation is bounded by 1.

\begin{proposition}
  The solution for FG-VI has diagonal covariance matrix, $\bf \Psi$,
  with
  \begin{equation}
      \Psi_{ii} = \frac{(1 - \varepsilon) (1 + (n - 1) \varepsilon)}{1 + (n - 2) \varepsilon}.
  \end{equation}
\end{proposition}

\begin{proof}
  Let ${\bf 1} \in \mathbb R^n$ be the vector of 1's.
  The covariance matrix can be rewritten
  \begin{equation}
      \boldsymbol \Sigma = (1 - \varepsilon) \boldsymbol I + \varepsilon {\bf 1} {\bf 1}^T.
  \end{equation}
  One can then verify that the inverse is
  \begin{equation}
      {\bf \Sigma}^{-1} = \frac{1}{1 - \varepsilon} {\bf I} - \frac{\varepsilon}{(1 - \varepsilon)(1 + (n - 1) \varepsilon)} {\bf 1}{\bf 1}^T.
  \end{equation}
  %
  From Proposition 2.1, $\Psi_{ii} = 1 / \Sigma^{-1}_{ii}$, which gives us the wanted result. 
\end{proof}
 %
 We then have the following limits,
 \begin{eqnarray}
   \underset{n \to \infty}{\lim} \Psi_{ii} & = & (1 - \varepsilon) \\
    \underset{\varepsilon \to 1}{\lim} \Psi_{ii} & = & 0,
 \end{eqnarray}
 %
 as stated in Section 2.
 Given that $\Sigma_{ii} = 1$, these results immediately apply to the shrinkage matrix ${\bf S}$.
\fi

%%%%%%%%%%%

 \begin{proposition}  \label{prop:asymptotic-entropy}
     If the correlation matrix in eq.~(\ref{eq:corr}) has constant off-diagonal terms, then the per-component entropy gap from FG-VI vanishes in the limit $n\rightarrow\infty$; that is,
      \begin{equation}
          \underset{n \to \infty}{\lim} \ \frac{1}{n} \left [ \mathcal H(p) - \mathcal H(q) \right ] = 0.
      \end{equation}
  \end{proposition}

\begin{proof}
Recall from Theorem 3.2 of the main paper that the entropy gap for FG-VI is given by
\begin{equation}
    \mathcal{H}(p)-\mathcal{H}(q) = \tfrac{1}{2}\log|\mathbf{S}| + \tfrac{1}{2}\log|\mathbf{C}|,
    \label{eq:gap}
\end{equation}
where $\mathbf{S}$ is the diagonal shrinkage matrix with elements $S_{ii} = \Sigma_{ii}/\Psi_{ii}$. We consider each term on the right side of this equation in turn. It follows at once from eq.~(\ref{eq:psi-epsilon}) that
\begin{equation}
\log|\mathbf{S}| = n\left[\log\frac{1+(n\!-\!2)\varepsilon}{(1\!-\!\varepsilon)(1+(n\!-\!1)\varepsilon)}\right],
\label{eq:logS}
\end{equation}
where $\varepsilon\!>\!0$ denotes the amount of off-diagonal correlation. Next we show how to evaluate $\log|\mathbf{C}|$. From eq.~(\ref{eq:corr-epsilon}), we rewrite the correlation matrix as
   \begin{equation}
        \mathbf{C} = (1\! -\! \varepsilon) {\bf I} + n \varepsilon \left (\tfrac{1}{\sqrt n} {\bf 1} \right) \left (\tfrac{1}{\sqrt n} {\bf 1} \right)^T.
        \label{eq:eigC}
    \end{equation}
    %
    Note that the second term on the right side of eq.~(\ref{eq:eigC}) is a rank-one matrix whose one nonzero eigenvalue is equal to $n\varepsilon$. By adding the first term---which is a multiple of the identity matrix---we obtain a new matrix whose eigenvalues are shifted by a uniform amount. It follows that this new matrix (namely, $\mathbf{C}$) has $n\!-\!1$ eigenvalues at $1\!-\!\varepsilon$ and one eigenvalue at $1+(n\!-\!1)\varepsilon$, so that
    \begin{equation}
        \log|\mathbf{C}| = (n\!-\!1)\log(1\!-\!\varepsilon) + \log (1 + (n\!-\!1) \varepsilon).
        \label{eq:logC}
    \end{equation}
The entropy gap is related to the sum of $\log|\mathbf{S}|$ and $\log|\mathbf{C}|$ by eq.~(\ref{eq:gap}). Adding the results in eq.~(\ref{eq:logS}) and eq.~(\ref{eq:logC}), we find that
\begin{align}
\log|\mathbf{S}| + \log|\mathbf{C}|
    &=  -\log(1\! -\! \varepsilon) + \log (1 + (n\! -\! 1) \varepsilon) \nonumber \\
    &\mbox{\hspace{2ex}}\hspace{3ex} +\ n \log \left [ \tfrac{1 + (n - 2) \varepsilon}{1 + (n - 1) \varepsilon} \right].
    \label{eq:sum}
\end{align}
Note that the first term on the right side is $\mathcal O(1)$, the second term is $\mathcal O(\log n)$, and the third term can be 
written as
    \begin{equation}
        n \log \left [ \tfrac{1 + (n - 2) \varepsilon}{1 + (n -1) \varepsilon} \right]
          = n \log \left [ 1 - \tfrac{\varepsilon}{1 + (n - 1) \varepsilon}  \right].
    \end{equation}
    %
    For large $n$, the log term in this equation is $\mathcal O(\frac{1}{n})$ so that the entire expression is $\mathcal O(1)$. From eq.~(\ref{eq:sum}), it therefore follows that the entropy gap in eq.~(\ref{eq:gap}) is $\mathcal O(\log n)$. Dividing by $n$, we see that the per-component entropy gap vanishes in the limit $n\rightarrow\infty$, thereby completing the proof.
  \end{proof}

  \iffalse
  \begin{proposition}
      The per component entropy gap goes to 0 as $n \to \infty$;
      that is
      \begin{equation}
          \underset{n \to \infty}{\lim} \ \frac{1}{n} \left [ \mathcal H(p) - \mathcal H(q) \right ] = 0.
      \end{equation}
  \end{proposition}

  \begin{proof}
    Recall the entropy gap between two Gaussian distributions is
    \begin{equation}
        \mathcal H(p) - \mathcal H(q) = \frac{1}{2} \left [ \log |\boldsymbol \Sigma| - \log |\boldsymbol \Psi| \right].
    \end{equation}
    %
    Now we have
    \begin{equation}
        \boldsymbol \Sigma = (1 - \varepsilon) {\bf I} + n \varepsilon \left (\frac{1}{\sqrt n} {\bf 1} \right) \left (\frac{1}{\sqrt n} {\bf 1} \right)^T.
    \end{equation}
    %
    Ignoring the first term on the R.H.S, we have a rank-1 matrix, with $n$ times the eigenvalue, $n \varepsilon$.
    Adding the identity term then shifts the eigenvalues, leaving us with $n - 1$ eigenvalues at $(1 - \varepsilon)$ and one eigenvalue at $1 + (n - 1) \varepsilon$.
    Then
    \begin{equation}
        \log | \boldsymbol \Sigma | = (n - 1) \log (1 - \varepsilon) + \log (1 + (n - 1) \varepsilon).
    \end{equation}
    %
    Next 
    \begin{equation}
        \log | \boldsymbol \Psi | = n \log \Psi_{11}
          = n \log (1 - \varepsilon) + n \log \left [ \frac{1 + (n - 1) \varepsilon}{1 + (n - 2) \varepsilon} \right].
    \end{equation}
    %
    Thus
    %
    \begin{eqnarray}  \label{eq:gap-constant}
        \log |\boldsymbol \Sigma| - \log |\boldsymbol \Psi|
         = & - \log(1 - \varepsilon) + \log (1 + (n - 1) \varepsilon) \nonumber \\
           & - n \log \left [ \frac{1 + (n - 1) \varepsilon}{1 + (n - 2) \varepsilon} \right].
    \end{eqnarray}
    %
    The first term is $\mathcal O(1)$ and the second term $\mathcal O(\log n)$.
    The third term can be written
    \begin{equation}
        n \log \left [ \frac{1 + (n - 1) \varepsilon}{1 + (n - 2) \varepsilon} \right]
          = n \log \left [ 1 - \frac{1}{1 + (n - 1) \varepsilon}  \right].
    \end{equation}
    %
    For large $n$, the log term is $\mathcal O(1 / n)$ and the entire expression $\mathcal O(1)$.
    Hence $\mathcal H(p) - \mathcal H(q) = \mathcal O(\log n)$.
    Diving the entropy gap by $n$ completes the proof.
  \end{proof}
  %
  \fi
  
  From \eqref{eq:sum}, we also see that the entropy gap becomes infinite as $\varepsilon \to 1$ (for fixed $n$).
  Additional limits can be considered with respect to both $\varepsilon$ and $n$, but we do not pursue those here.
%   When the limit is taken with respect to both $\varepsilon$ and $n$, we need to examine the rate at which $\log(1 - \varepsilon) / n$ changes to obtain a limit of for the per component entropy gap.

  \subsection{Solution when minimizing the reverse KL-divergence}

  A factorized approximation cannot both match the marginal variances and the entropy of the target distribution.
  In the example at hand, minimizing $\text{KL}(q||p)$ leads to good estimates of the entropy but not of the marginal variances.
  We now show that when minimizing the reverse KL-divergence, $\text{KL}(p||q)$, the opposite behavior occurs.

  We first state the solution obtained when minimizing the reverse KL-divergence.
  The following is the counterpart to Proposition~2.1 %~\ref{prop:solution}
  in the main body of the paper.
  
  \begin{proposition} \label{prop:solution-reverse}
    Let $q({\bf z})$ be a multivariate Gaussian with mean $\tilde{\boldsymbol \nu}$ and diagonal covariance $\tilde{\boldsymbol \Psi}$.
    Then the variational parameters minimizing the reverse KL-divergence, $\text{KL}(p||q)$ are given by $\tilde{\boldsymbol \nu} = \boldsymbol \mu$ and
    \begin{equation}
        \tilde{\Psi}_{ii} = \Sigma_{ii}.
    \end{equation}
  \end{proposition}

  \begin{proof}
      The variational parameters $\tilde{\boldsymbol \nu}$ and $\tilde{\boldsymbol \Psi}$ are estimated by minimizing the reverse KL-divergence
      \begin{equation}
          \text{KL}(p||q) = \mathbb E_p [\log p({\bf z})] - \mathbb E_p [\log q({\bf z})],
      \end{equation}
      where each expectation is taken with respect to the measure $p$.
      The first term on the R.H.S does not depend on the variational parameters.
      Up to an additive constant that does not depend on the variational parameters, the second term is
      %
  \begin{eqnarray*}
    - \mathbb E_p [\log q({\bf z})] = \frac{1}{2} \log |\tilde{\boldsymbol \Psi}| + \frac{1}{2}\mathbb E_p ({\bf z} - \tilde{\boldsymbol \nu})^T \tilde{\boldsymbol \Psi}^{-1} ({\bf z} - \tilde{\boldsymbol \nu}) \\
    = \frac{1}{2} \sum_{i = 1}^n \left[ \log \tilde \Psi_{ii} +
      \frac{1}{\tilde \Psi_{ii}} \left (\Sigma_{ii} + (\mu_i - \tilde \nu_i)^2 \right) \right].
  \end{eqnarray*}
  With respect to the mean, this expression is minimized by setting $\tilde \nu_i = \mu_i$ for each $i$; that is, $\tilde{\boldsymbol \nu} = \boldsymbol \mu$.
  Differentiating with respect to $\tilde \Psi_{ii}$ and solving at a stationary point, we then have $\tilde \Psi_{ii} = \Sigma_{ii}$.
  \end{proof}
  % %
  % When minimizing the reverse KL-divergence, we obtain exact estimates of the marginal variances.
  % In contrast to the results on the entropy gap in the previous section, this is true even for finite $n$.
  % Note furthermore that this result does not depend on the details of the example at hand (i.e. the target is has a covariance matrix with constant off-diagonal terms).
 
  This next theorem, obtained when minimizing the reverse KL-divergence, is the counterpart to Theorem~3.6.%\ref{thm:constant-off-diag}.

  \begin{theorem}  \label{thm:constant-off-diag2}
    Suppose ${\bf C}$ has constant off-diagonal terms, $\varepsilon \in [0, 1)$.
    When minimizing the reverse KL-divergence, the per-component entropy gap goes to a constant in the limit $n\rightarrow\infty$, whereas the variance is correctly estimated, that is
        \begin{align}
            \underset{n \to \infty} \lim \ \tfrac{1}{n} \left(\mathcal H(p) - \mathcal H(q)\right) &= \tfrac{1}{2} \log(1 - \varepsilon), \\
            \tilde \Psi_{ii} &= \Sigma_{ii}.
        \end{align}
      \end{theorem}

  \begin{proof}
    The second equality is already stated in Proposition~\ref{prop:solution-reverse}.
    Note that this result does not depend on the specifics of the example at hand and applies to any covariance matrix.
  
    To obtain the first equality, we start with the shrinkage-delinkage decomposition,
    \begin{equation}
        \mathcal H(p) - \mathcal H(q) = \frac{1}{2} \log |{\bf S}| + \frac{1}{2} \log |{\bf C}|.
    \end{equation}
    %
    Since there is no shrinkage, $\log |{\bf S}| = 0$.
    Next, recall from \eqref{eq:logC} the expression for $\log |{\bf C}|$.
    Dividing $\log |{\bf C}|$ by $n$ and taking the limit in $n$, we obtain the desired result.
  \end{proof}
  %
  Naturally, the entropy gap can be arbitrarily large, with $q$ having a larger entropy than $p$, notably as $\varepsilon$ goes to 1.

  For problems where the marginal variances are of interest, rather than the entropy, we would ideally minimize the reverse KL-divergence.
  Unfortunately, there is usually no efficient way to optimize $\text{KL}(p||q)$, due to the difficulty in evaluating expectation values with respect to $p$.
  
  
  \section{Solutions for the bounds on the shrinkage and delinkage terms}
  \label{app:alg}

  By exploiting the symmetries that we proved in Section 4.2, we can efficiently compute bounds on the terms $\log |\mathbf{S}|$ and $\log|\mathbf{C}|$; these are the terms that arise, respectively, from the effects of shrinkage and delinkage.

  

  \subsection{Upper bound on $\log |{\bf S}|$} 
  
  First we show how to compute the upper bound on $\log |\mathbf{S}|$. Recall that to do so, we must solve the optimization problem
  %
  \begin{equation}
      \max_{\boldsymbol\lambda\in\Lambda_R} \sum_{i=1}^n \lambda_i^{-1}.
      \label{eq:optS}
  \end{equation}
  %
 From Lemma 4.2, we know that all the elements of the solution assume the edge values of $\lambda_1$ or $\lambda_n$
  save for at most one which we denote $\lambda_k$. At a high level, we solve the optimization by
 exhaustively computing the optimal solution for each candidate value of $k \in \{1,\ldots, n\}$,
  then choosing the particular value of $k$ whose solution maximizes the overall objective function.

  It remains only to show how to compute the solution for a particular candidate value of $k$.
  Recall the constraints that $\sum_{i = 1}^n \lambda_i = n$ and $\lambda_1 = R \lambda_n$.
 It follows that
  \begin{equation}
      \lambda_k = n - \left [ (k - 1) R + n - k \right] \lambda_n.
  \end{equation}
  %
  Using the constraints to eliminate $\lambda_1$ and $\lambda_k$, we can write the
 objective function entirely in terms of $\lambda_n$. In this way we find
  \begin{equation} \label{eq:objective-simple}
      \sum_{i = 1}^n \lambda_i^{-1} = \frac{1}{n - \left [ R (k - 1) + n - k \right] \lambda_n} + \frac{(k - 1)}{R \lambda_n} + \frac{n - k}{\lambda_n}.
  \end{equation}
  %
Crucially, we also need to enforce the boundary conditions $\lambda_n \le \lambda_k \le \lambda_1$, or equivalently
  %
  \begin{equation} \label{eq:boundary}
      \frac{n}{R k + n - k} \le \lambda_n \le \frac{n}{R (k - 1) + n - k + 1}.
  \end{equation}
  %
  Note that the simplified objective in \eqref{eq:objective-simple} for fixed $k$ is convex in $\lambda_n$; hence the maximizer must lie at one of the boundary values in \eqref{eq:boundary}. By computing the objective for each boundary value of $\lambda_n$, we find the optimal solution for this candidate value of~$k$. Finally, we obtain the overall solution to eq.~(\ref{eq:optS}) by considering all $n$ candidate values of $k$ and choosing the best one.

\subsection{Upper Bound on $\log |{\bf C}|$}

Next we show how to compute the upper bound on $\log|\mathbf{C}|$. Recall that to do so,
we must solve the optimization problem
  \begin{equation}
      \max_{\boldsymbol\lambda\in\Lambda_R}\left[\sum_{i=1}^n \log \lambda_i\right].
  \end{equation}
  %
  From Lemma~4.3, we know that all eigenvalues other than~$\lambda_1$ and $\lambda_n$ must have the same value; we denote this value  by~$\lambda_k$.
  From the constraint $\sum_{i = 1}^n \lambda_i = n$, it follows that
  \begin{equation}
      \lambda_k = \frac{n - (1 + R) \lambda_n}{n - 2}.
  \end{equation}
  Again, using the constraints to eliminate $\lambda_1$ and $\lambda_k$, we can write the objective function entirely in terms of $\lambda_n$. In this way we find
  \begin{equation}
      \sum_{i=1}^n \log \lambda_i = (n\! -\! 2) \log \frac{n - (1 + R) \lambda_n}{n - 2} + \log R \lambda_n + \log \lambda_n.
      \label{eq:concave}
  \end{equation}
  %
This objective is concave in $\lambda_n$, so we can locate the maximum by setting its derivative with respect to $\lambda_n$ equal to zero. Some straightforward algebra shows that this derivative vanishes when
  \begin{equation}
     % - \tfrac{(1 + R)(n - 2)}{n - (1 + R) \lambda_n} + \tfrac{2}{\lambda_n} = 0\quad\iff\quad
      \lambda_n = \frac{2}{1+R}.
  \end{equation}
  %
Finally we need to check that this solution does not violate the boundary conditions of the problem; in particular, we require that \mbox{$\lambda_n \le \lambda_k \le R \lambda_n$}, or equivalently that
  \begin{equation}
      \frac{n}{1 + R (n - 1)} \le \lambda_n \le \frac{n}{n - 1 + R}.
  \end{equation}
  %
These conditions are always satisfied for $n\geq 3$.
  Hence we obtain an analytical solution for the upper bound on $\log|\mathbf{C}|$.
  Finally, note that while the solution for $\lambda_n$ does not depend on $n$,
  the optimized objective function does depend on $n$ through eq.~(\ref{eq:concave}).

 Algorithm~\ref{alg:bounds} provides an implementation of the above-described method.

\begin{algorithm}[!b]
    \DontPrintSemicolon
    % \caption{Computing the upper bound on $\mathcal{H}_p-\mathcal{H}_q$.}
    \caption{Upper bounds on $\log|\mathbf{S}|$ and $\log|\mathbf{C}|$}
    \label{alg:bounds}
    \setstretch{1}
    {\bf Input:} $R, n$ \;\;

    \SetKwFunction{Fh}{ObjF}
    \SetKwProg{Fn}{Function}{:}{}
    \Fn{\Fh{$\lambda_n$, $k$}}{
        \KwRet $ \left (n - k + \frac{k - 1}{R} \right)\frac{1}{\lambda_n} + \frac{1}{n - [R(k - 1) + n - k] \lambda_n}$
    } \;

    \For {$k$ in $\{2, \cdots, n - 1\}$} {
      $\lambda_a \leftarrow \frac{n}{Rk + n - k}$ \;
      $\lambda_b \leftarrow \frac{n}{R(k-1) + n-k + 1}$ \;
      $F_k \leftarrow \text{max}(\text{\texttt{ObjF}} \;(\lambda_a, k), \text{\texttt{ObjF}}(\lambda_b, k))$ \;
      {\bf if} ($k = 2$) $F \leftarrow F_k$ \;
      {\bf else} $F \leftarrow \text{max}(F, F_k)$\;
    }
    $U_s \leftarrow n\log(F/n)$\;\;
    
    $\lambda_n \leftarrow \frac{2}{1+R}$ \;
    $U_c \leftarrow  \log \lambda_n + \log (R \lambda_n) + (n\!-\!2) \log \frac{n-(1+R)\lambda_n}{n-2}$\; \;

    {\bf Return:} $U_s$, $U_c$\;
    % $\frac{n}{2} \log \left(\frac{1}{n} F \right) - \frac{1}{2} G$ \;
  \end{algorithm}
  
%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Bounds on the average variance shrinkage}
\label{app:traceS}

We can also derive bounds on the {\it average} shrinkage in componentwise variance in terms of the problem dimensionality, $n$, and the condition number, $R$, of the correlation matrix. The average in this case is performed over the different components of $\mathbf{z}$. Recall that the shrinkage in each componentwise variance is given by $S_{ii} = \Sigma_{ii}/\Psi_{ii}$. Hence we can also express this bound in terms of the trace of the shrinkage matrix, $\text{trace}({\bf S})$.
\begin{proposition}
Suppose that the correlation matrix $\mathbf{C}$ has condition number $R$. Then the solution for FG-VI in section 2 satisfies
\begin{equation}
 \min_{\lambda\in\Lambda_R}\sum_{i=1}^n \lambda_i^{-1} \leq {\rm trace}(\mathbf{S}) \leq \max_{\lambda\in\Lambda_R}\sum_{i=1}^n \lambda_i^{-1},
\label{eq:shrink-eig-bound}
\end{equation}
where $\Lambda_R$ is the set defined in section 4.
\end{proposition}
\begin{proof}
We showed in the proof of Theorem 3.1 that 
\begin{equation}
    S_{ii} = \frac{\Sigma_{ii}}{\Psi_{ii}}= C_{ii}^{-1}.
\end{equation}
It follows that ${\rm trace}(\mathbf{S}) = {\rm trace}(\mathbf{C}^{-1}) = \sum_i \lambda_i^{-1}$, where $\lambda_1,\ldots,\lambda_n$ are the eigenvalues of $\mathbf{C}$. The bound then follows from the relaxation from the set $\mathcal{C}_R$ to the set $\Lambda_R$ in section~4.
\end{proof}

\subsection{Lower bound on $\text{trace}({\bf S})$}

The optimization implied by eq.~(\ref{eq:shrink-eig-bound}) is convex, since both the set $\Lambda_R$ and the objective function $\sum_i \lambda_i^{-1}$ are convex. In fact, this bound can be evaluated in closed form by using similar methods as in section 4.2.

  \begin{lemma} \label{lemma:trace-symmetry1}
  Let $\boldsymbol\lambda\in\Lambda_R$ be the solution that minimizes the left side of eq.~(\ref{eq:shrink-eig-bound}). Then $\lambda_i\!=\!\lambda_j$ whenever $1\!<\!i\!<\!j\!<\!n$.
\end{lemma}
\begin{proof}
This proof follows the same argument as the proof (by contradiction) for Lemma 4.3. 
Suppose there exists a solution with intermediate elements $\lambda_i$ and $\lambda_j$ that satisfy
%\begin{equation}
$\lambda_1\! \geq\! \lambda_i\! >\! \lambda_j\! \geq\! \lambda_n$.
% \end{equation}
Consider the effect on this solution of a perturbation that adds some small amount $\delta\!>\!0$ to~$\lambda_j$ and subtracts the same amount from~$\lambda_i$. For sufficiently small~$\delta$, this perturbation will not leave the set~$\Lambda_R$; however, it will {\it diminish} the separation of~$\lambda_i$ from~$\lambda_j$. As a result the objective $\sum_k (1/\lambda_k)$ experiences a change
\begin{equation}
  g(\delta) = \frac{1}{\lambda_i-\delta} -\frac{1}{\lambda_i} +\frac{1}{\lambda_j+\delta} - \frac{1}{\lambda_j}.
  \end{equation}
Evaluating the derivative, we find $g'(0) = \lambda_i^{-2}\! -\! \lambda_j^{-2} < 0$, so that the objective function is decreased for some $\delta>0$. As before this yields a contradiction, because any solution must be minimal, and hence stationary (i.e., $g'(0)\!=\!0$), with respect to small perturbations.
\end{proof}
With the above lemma, the $n$-dimensional optimization over $\Lambda_R$ can be reduced to a one-dimensional optimization that can be solved in closed form. The methods are identical to those in the previous appendix.

First we rewrite the constraint, $\lambda_n \le \lambda_k \le R \lambda_n$, as
\begin{equation}  \label{eq:trace-constraint}
    \frac{n}{R(n - 1) + 1} \le \lambda_n \le \frac{n}{R  + n - 1}.
\end{equation}
%
Since the minimization problem is convex, a minimum can be found at a stationary point of the objective function
%
\begin{equation}  \label{eq:trace-objective}
    \sum_{i = 1}^n \frac{1}{\lambda_i} = \frac{(n - 2)^2}{n - (1 + R) \lambda_n} + \frac{1}{\lambda_n} + \frac{1}{R \lambda_n},
\end{equation}
%
which now only depends on $\lambda_n$.
Differentiating and setting to 0, we obtain the root-finding problem,
\begin{equation}
    \left [R(n - 2)^2 - (1 + R)^2 \right] \lambda^2_n + 2n (1 + R) \lambda_n - n^2 = 0,
\end{equation}
%
which can be solved exactly.
It remains to check whether the roots violate the constraints in~\eqref{eq:trace-constraint}, and pick the non-offending root which minimizes the objective in~\eqref{eq:trace-objective}.
If both roots violate the constraints then, by convexity of the problem, the solution must lie at one of the boundary terms in~\eqref{eq:trace-constraint}.

\subsection{Upper bound on $\text{trace}({\bf S})$}

A similar approach gives us an upper bound on $\text{trace}({\bf S})$.
In fact, we have already solved the problem of maximizing the right side of~\eqref{eq:shrink-eig-bound} when upper-bounding $\log |{\bf S}|$.
It remains to apply the same strategy.


\section{Tighter upper bound on entropy gap}
\label{app:KL}
In Proposition 4.1 we derived separate upper bounds on the individual terms $\log|\mathbf{S}|$ and $\log|\mathbf{C}|$. One upper bound on the entropy gap (or equivalently, on ${\rm KL}(q,p)$) is obtained simply by adding these separate bounds. However, a tighter upper bound is obtained by replacing the separate optimizations in Proposition 4.1 by a single joint optimization:
\begin{equation}
{\rm KL}(q,p)\ \leq\ 
  \frac{1}{2}\max_{\boldsymbol\lambda\in\Lambda_R}\left[
    n \log \frac{1}{n}
  \sum_{i=1}^n \lambda_i^{-1} + \sum_{i=1}^n \log \lambda_i\right].
\label{eq:combined-opt}
\end{equation}
In this appendix we sketch how to solve this optimization and evaluate this bound in closed form. The first step is to make the change of variables,
\begin{equation}
    \omega_i = \frac{\lambda_i^{-1}}{\sum_{j=1}^n \lambda_j^{-1}},
\end{equation}
and to translate the domain of optimization accordingly. Under this change of variables, the original domain $\Lambda_R$ in section 4 is mapped onto the set
\begin{equation}
    \Omega_R = \left\{\boldsymbol\omega\in \mathbb R_+^n\, |\,\omega_n\geq\ldots\geq\omega_1 = \frac{1}{R} \omega_n, \sum_{i=1}^n\omega_i = 1\right\}.
\end{equation}
Likewise, a little algebra shows that the optimization in eq.~(\ref{eq:combined-opt}) is equivalent to the following:
\begin{equation}
{\rm KL}(q,p)\ \leq\ 
  \frac{1}{2}\max_{\boldsymbol\omega\in\Omega_R}\left[
    \sum_{i=1}^n \log\frac{1}{\omega_i} - n\log n\right].
\label{eq:omega-opt}
\end{equation}
Now we can make a similar argument as in the proof of Lemma 4.2 to simplify this optimization.
\begin{lemma} 
\label{lemma:symmetry-omega}
  Let $\boldsymbol\omega\in\Omega_R$ be the solution that maximizes the right side of eq.~(\ref{eq:omega-opt}). Then at most one $\omega_i$ is not equal to either $\omega_1$ or~$\omega_n$.
\end{lemma}
\begin{proof}
We prove the lemma by contradiction. Suppose there exists a solution with intermediate elements $\omega_i$ and $\omega_j$ that satisfy
%\begin{equation}
$\omega_n\! >\! \omega_i\! >\! \omega_j\! >\! \omega_1$.
%\end{equation}
Consider the effect on this solution of a perturbation that adds some small amount $\delta\!>\!0$ to $\omega_i$ and subtracts the same amount from $\omega_j$. Note that for sufficiently small~$\delta$, this perturbation will not leave the set~$\Omega_R$; however, it will {\it expand} the separation of $\omega_i$ from~$\omega_j$. As a result the objective in eq.~(\ref{eq:omega-opt}) changes by an amount
\begin{equation}
  f(\delta) = \frac{1}{2}\left[\log\frac{1}{\omega_i\!+\!\delta} - \log\frac{1}{\omega_i} + \log\frac{1}{\omega_j\!-\!\delta} - \log\frac{1}{\omega_j}\right].
\end{equation}
Next we evaluate the derivative $f'(\delta)$ at $\delta=0$; doing so we find $f'(0) = \tfrac{1}{2}\left(\omega_j^{-1}\! -\! \omega_i^{-1}\right) > 0$, so that the objective is increased for some $\delta>0$. But this yields a contradiction, because any solution must be maximal, and hence stationary (i.e., $f'(0)\!=\!0$), with respect to small perturbations.
\end{proof}
With the above lemma, we can reduce the $n$-dimensional optimization over $\Omega_R$ to a one-dimensional optimization that can be solved in closed form; the methods are identical to those in the previous appendix.

In detail, let $\omega_k$ be the one variable which (potentially) is not equal to $\omega_1$ or $\omega_n$.
Given $\sum_{i = 1}^n \omega_i = 1$,
\begin{equation}
    \omega_k = 1 - (k - 1 + R(n -k)) \omega_1.
\end{equation}
%
The objective is then
\begin{eqnarray}
    \sum_{i = 1}^n \log \frac{1}{\omega_i} =&  - (k - 1) \log \omega_1 - (n - k) \log R\omega_1 \nonumber \\ & - \log (1 - [k - 1 + R(n -k)] \omega_1). \ 
\end{eqnarray}
%
Since we are trying to maximize a convex function, the solution does not lie at a stationary point, rather at a boundary set by the constraint, $\omega_1 \le \omega_k \le \omega_n$, or equivalently
\begin{equation}
    \frac{1}{k - 1 + R(n - k + 1)} \le \omega_1 \le \frac{1}{k + R(n -k)}.
\end{equation}
%
It remains to test each candidate boundary for each choice of $k$ to obtain a maximizer.

\end{document}
