\section{\label{sec:mean}Mean-field equations}

\subsection{Replica partition function}

Our network, presented in \cref{sec:model}, is described by a Hamiltonian
\begin{align}
  H &= -\frac{1}{2N}\sum_{\mu\nu} \sum_{i \neq j} (\eta^i_{\mu\nu} + \zeta^i_{\mu\nu}) (\eta^j_{\mu\nu} + \zeta^j_{\mu\nu}) S_i S_j + \theta \sum_i S_i \nonumber\\
  &= -\frac{1}{2N}\sum_{\mu\nu} \biggl[\sum_i (\eta^i_{\mu\nu} + \zeta^i_{\mu\nu}) S_i\biggr]^2 + \frac{1}{2N}\sum_{\mu\nu} \sum_i \Bigl[(\eta^i_{\mu\nu} + \zeta^i_{\mu\nu}) S_i\Bigr]^2 + \theta \sum_i S_i.
\end{align}
We will forgo introducing external fields. Note that
\begin{equation}
  \frac{1}{2N}\sum_{\mu\nu} \sum_i \Bigl[(\eta^i_{\mu\nu} + \zeta^i_{\mu\nu}) S_i\Bigr]^2
  \approx \frac{\alpha s}{2} \Bigl[ (1-2\gamma)^2 a + \gamma^2 \Bigr] \sum_i S_i
\end{equation}
considering $a \ll 1$ and $N \rightarrow \infty$. If we define
\begin{equation}
  \Gamma^2 \equiv (1-2\gamma)^2 a + \gamma^2,
\end{equation}
we obtain
\begin{equation}
  H = -\frac{1}{2N}\sum_{\mu\nu} \biggl[\sum_i (\eta^i_{\mu\nu} + \zeta^i_{\mu\nu}) S_i\biggr]^2 + \biggl(\theta + \frac{\alpha s\Gamma^2}{2}\biggr) \sum_i S_i.
\end{equation}
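Here we used the fact that the activities are binary, so $S_i^2 = S_i$, together with the second moment of the pattern entries: since $\eta^i_{\mu\nu}$ and $\zeta^i_{\mu\nu}$ are independent with zero means,
\begin{equation}
  \bigl\langle (\eta^i_{\mu\nu} + \zeta^i_{\mu\nu})^2 \bigr\rangle
  = (1-2\gamma)^2 a(1-a) + \gamma^2
  \approx (1-2\gamma)^2 a + \gamma^2 = \Gamma^2,
\end{equation}
and summing over the $ps = \alpha Ns$ pattern indices $\mu\nu$ yields the coefficient $\alpha s\Gamma^2/2$ in the large-$N$ limit.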

Now we consider a set of replica networks $\rho = 1, \ldots, n$ with the same parameter values and stored patterns, but their neural activities $S^\rho_i$ may vary across replicas. The Hamiltonian of each replica is
\begin{equation}
  H^\rho = -\frac{1}{2N}\sum_{\mu\nu} \biggl[\sum_i (\eta^i_{\mu\nu} + \zeta^i_{\mu\nu}) S^\rho_i\biggr]^2 + \biggl(\theta + \frac{\alpha s \Gamma^2}{2}\biggr) \sum_i S^\rho_i,
\end{equation}
and the partition function across replicas is
\begin{equation}
  \langle Z^n \rangle = \left\langle \mathrm{Tr}_S \prod_\rho \exp \bigl[-\beta H^\rho\bigr] \right\rangle,
\end{equation}
where the average is taken over the patterns $\eta^i_{\mu\nu}$ and $\zeta^i_{\mu\nu}$. We invoke the standard Gaussian integral identity
\begin{equation}
  \int \mathrm{d} m\,\mathrm{e}^{-a m^2 + b m}
  = \sqrt{\frac{\pi}{a}} \mathrm{e}^{b^2/4a}
\end{equation}
to obtain
\begin{align}
  \langle Z^n \rangle = \Biggl\langle \mathrm{Tr}_S \prod_\rho \Biggl\{
    &\exp\biggl[-\beta \biggl(\theta + \frac{\alpha s \Gamma^2}{2}\biggr) \sum_i S^\rho_i\biggr] \nonumber\\
    &\times \prod_{\mu\nu} \int \mathrm{d} m^\rho_{\mu\nu} \biggl(\frac{\beta N}{2\pi}\biggr)^{\!\!\frac{1}{2}} \exp\biggl[ -\frac{\beta N}{2} (m^\rho_{\mu\nu})^2 + \beta m^\rho_{\mu\nu} \sum_i (\eta^i_{\mu\nu} + \zeta^i_{\mu\nu}) S^\rho_i \biggr] \Biggr\} \Biggr\rangle.
\end{align}


\subsection{Uncondensed patterns}

We first average over the uncondensed patterns $\mu > 1$ for which $m^\rho_{\mu\nu} \ll 1$. First,
\begin{equation}
  \left\langle \prod_{{\mu\nu}\rho} \exp\biggl[\beta m^\rho_{\mu\nu} \sum_i (\eta^i_{\mu\nu} + \zeta^i_{\mu\nu}) S^\rho_i\biggr] \right\rangle
  = \prod_{i\mu} \left\langle \prod_\nu \exp\biggl[\beta (\eta^i_{\mu\nu} + \zeta^i_{\mu\nu}) \sum_\rho m^\rho_{\mu\nu} S^\rho_i\biggr] \right\rangle.
\end{equation}
Using
\begin{equation}
  Y_\nu \equiv \beta \sum_\rho m^\rho_{\mu\nu} S^\rho_i,
\end{equation}
we note that
\begin{equation}
  \biggl\langle \prod_\nu \exp\bigl[(\eta^i_{\mu\nu} + \zeta^i_{\mu\nu}) Y_\nu\bigr] \biggr\rangle_{\!\!{\mu\nu}}
  = \biggl[ \prod_\nu \bigl\langle \exp[\eta^i_{\mu\nu} Y_\nu] \bigr\rangle_{\!\!{\mu\nu}} \biggr] \biggl\langle \prod_\nu \exp[\zeta^i_{\mu\nu} Y_\nu] \biggr\rangle_{\!\!{\mu\nu}}.
\end{equation}
First, using $Y_\nu \ll 1$,
\begin{equation}
  \bigl\langle \exp[\eta^i_{\mu\nu} Y_\nu] \bigr\rangle_{\!{\mu\nu}}
  \approx 1 + \frac{1}{2}(1-2\gamma)^2 a Y_\nu^2 
  \approx \exp \biggl[ \frac{1}{2}(1-2\gamma)^2 a Y_\nu^2 \biggr].
\end{equation}
For the dense entries, which are correlated within a concept through $\zeta_1$, the average is performed first over the example entries $\nu$ given the concept variable $\zeta_1 = \pm\gamma$ and then over $\zeta_1$. Continuing,
\begin{align}
  \biggl\langle \prod_\nu \exp[\zeta^i_{\mu\nu} Y_\nu] \biggr\rangle_{\!\!{\mu\nu}}
  &\approx \frac{1}{2}\prod_\nu \biggl\{1 + \gamma cY_\nu + \frac{\gamma^2}{2}Y_\nu^2\biggr\} + \frac{1}{2}\prod_\nu \biggl\{1 - \gamma cY_\nu + \frac{\gamma^2}{2}Y_\nu^2\biggr\} \nonumber\\
  &= 1 + \frac{1}{2} \sum_{\nu\omega} \Bigl[ \gamma^2(1-c^2)\delta_{\nu\omega} + \gamma^2c^2 \Bigr] Y_\nu Y_\omega \nonumber\\
  &\approx \exp\biggl\{ \frac{1}{2} \sum_{\nu\omega} \Bigl[ \gamma^2(1-c^2)\delta_{\nu\omega} + \gamma^2c^2 \Bigr] Y_\nu Y_\omega \biggr\}.
\end{align}
Thus,
\begin{align}
  \left\langle \prod_{{\mu\nu}\rho} \exp\biggl[\beta m^\rho_{\mu\nu} \sum_i (\eta^i_{\mu\nu} + \zeta^i_{\mu\nu}) S^\rho_i\biggr] \right\rangle 
  = \prod_\mu \exp\biggl\{ \frac{\beta N}{2} \beta\Gamma^2 \sum_{{\nu\omega}{\rho\sigma}} \bigl[(1-\kappa^2)\delta_{\nu\omega} + \kappa^2\bigr] q^{\rho\sigma} m^\rho_{\mu\nu} m^\sigma_{\mu\omega} \biggr\}
\end{align}
if we define
\begin{equation}
  \kappa^2 \equiv \frac{\gamma^2}{\Gamma^2}c^2
\end{equation}
and enforce 
\begin{equation}
  q^{\rho\sigma} = \frac{1}{N}\sum_i S^\rho_i S^\sigma_i.
\end{equation}
We will do so by introducing the following integrals over delta-function representations:
\begin{equation}
  \prod_{\rho\leq\sigma} \int \mathrm{d} q^{\rho\sigma}\,\delta\bigg(q^{\rho\sigma} - \frac{1}{N}\sum_i S^\rho_i S^\sigma_i\bigg) \propto \int \biggl[\prod_{\rho\leq\sigma} \mathrm{d} q^{\rho\sigma}\,\mathrm{d} r^{\rho\sigma}\biggr] \exp\biggl[-\beta^2\alpha N \sum_{\rho\sigma} q^{\rho\sigma} r^{\rho\sigma} + \beta^2\alpha \sum_{i{\rho\sigma}} r^{\rho\sigma} S^\rho_i S^\sigma_i\biggr],
\end{equation}
where $r^{\rho\sigma}$ are additional auxiliary variables and the factor of $\beta^2\alpha N$ is introduced for later convenience.

We can now integrate over the uncondensed overlaps $m^\rho_{\mu\nu}$:
\begin{alignat}{2}
  &\mathrlap{ \left\langle \prod_{{\mu\nu}\rho} \int \mathrm{d} m^\rho_{\mu\nu} \biggl(\frac{\beta N}{2\pi}\biggr)^{\!\frac{1}{2}} \exp\biggl[-\frac{\beta N}{2} (m^\rho_{\mu\nu})^2 + \beta m^\rho_{\mu\nu} \sum_i (\eta^i_{\mu\nu} + \zeta^i_{\mu\nu}) S^\rho_i\biggr] \right\rangle } \nonumber\\
  &\qquad &&{}\propto \prod_\mu \int \biggl[\prod_{\nu\rho} \mathrm{d} m^\rho_{\mu\nu} \biggl(\frac{\beta N}{2\pi}\biggr)^{\!\frac{1}{2}}\biggr] \exp\biggl[-\frac{\beta N}{2} \sum_{{\nu\omega}{\rho\sigma}} \bigl[ \delta_{\nu\omega}\delta^{\rho\sigma} - \beta\Gamma^2 \bigl( (1-\kappa^2)\delta_{\nu\omega} + \kappa^2 \bigr) q^{\rho\sigma} \bigr] m^\rho_{\mu\nu} m^\sigma_{\mu\omega}\biggr] \nonumber\\
  &&&{}= \biggl(\det\bigl[ \delta_{\nu\omega}\delta^{\rho\sigma} - \beta\Gamma^2 \bigl( (1-\kappa^2)\delta_{\nu\omega} + \kappa^2 \bigr) q^{\rho\sigma} \bigr]^{-\frac{1}{2}}\biggr)^{\!p} \nonumber\\
  &&&{}= \exp \biggl\{-\frac{p}{2} \mathrm{Tr}\log\Bigl[ \delta_{\nu\omega}\delta^{\rho\sigma} - \beta\Gamma^2 \bigl((1-\kappa^2)\delta_{\nu\omega} + \kappa^2\bigr) q^{\rho\sigma} \Bigr] \biggr\}.
\end{alignat}

Thus, so far, our partition function is
\begin{align}
  \langle Z^n \rangle \propto \int &\biggl[\prod_{\nu\rho} \mathrm{d} m^\rho_{1\nu} \biggl(\frac{\beta N}{2\pi}\biggr)^{\frac{1}{2}}\biggr] \biggl[\prod_{\rho\leq\sigma} \mathrm{d} q^{\rho\sigma}\,\mathrm{d} r^{\rho\sigma}\biggr] \nonumber\\
  &{}\times \exp\Biggl\{ -\beta N \biggl[\frac{1}{2} \sum_{\nu\rho} (m^\rho_{1\nu})^2 + \frac{\alpha}{2\beta} \mathrm{Tr}\log\bigl[ \delta_{\nu\omega}\delta^{\rho\sigma} - \beta\Gamma^2 \bigl((1-\kappa^2)\delta_{\nu\omega} + \kappa^2\bigr) q^{\rho\sigma} \bigr] + \frac{\beta\alpha}{2} \sum_{\rho\sigma} q^{\rho\sigma} r^{\rho\sigma} \biggr]\Biggr\} \nonumber\\
  &{}\times \biggl\langle \mathrm{Tr}_S \exp\biggl[\beta \sum_{i\nu\rho} m^\rho_{1\nu} (\eta^i_{1\nu} + \zeta^i_{1\nu}) S^\rho_i - \beta \biggl(\theta + \frac{\alpha s\Gamma^2}{2}\biggr) \sum_{i\rho}S^\rho_i + \frac{\beta^2\alpha}{2} \sum_{i{\rho\sigma}} r^{\rho\sigma} S^\rho_i S^\sigma_i\biggr] \biggr\rangle.
\end{align}
We will consider the possibility that the network overlaps significantly with either one sparse example $\eta_{11}$ or dense examples $\zeta_{1\nu}$ of one concept. Thus, we replace
\begin{equation}
\sum_{i\nu\rho} m^\rho_{1\nu} (\eta^i_{1\nu} + \zeta^i_{1\nu}) S^\rho_i \cond{by} \sum_{i\nu\rho} m^\rho_{1\nu} \chi^i_{1\nu} S^\rho_i,
\end{equation}
where $\chi^i_{1\nu} = \eta^i_{11}\delta_{1\nu}$ or $\zeta^i_{1\nu}$ depending on whether we are considering recovery of sparse or dense patterns.

\subsection{Condensed pattern}

We now take advantage of self-averaging over the $i$ indices. For any function $F\{\chi^i, S_i\}$ that only depends on one $i$ at a time,
\begin{align}
  \mathrm{Tr}_S \exp\biggl[\sum_i F\{\chi^i, S_i\}\biggr]
  &= \prod_i \mathrm{Tr}_{S_i} \exp F\{\chi^i, S_i\} \nonumber\\
  &= \exp\biggl[ \sum_i \log \mathrm{Tr}_{S_i} \exp F\{\chi^i, S_i\}\biggr] \nonumber\\
  &= \exp\biggl[ N \langle \log \mathrm{Tr}_S \exp F\{\chi, S\} \rangle_\chi \biggr].
\end{align}
Now $\chi$ and $S$ represent the pattern entry and activity of a single neuron. This single neuron is representative of the entire network because we now average over possible pattern entries $\chi$.

With this, our partition function becomes
\begin{equation}
  \langle Z^n \rangle \propto \int \biggl[\prod_{\nu\rho} \mathrm{d} m^\rho_{1\nu} \biggl(\frac{\beta N}{2\pi}\biggr)^{\!\frac{1}{2}}\biggr] \biggl[\prod_{\rho\leq\sigma} \mathrm{d} q^{\rho\sigma}\,\mathrm{d} r^{\rho\sigma}\biggr] \exp[-\beta N f],
\end{equation}
where
\begin{align}
  f ={}& \frac{1}{2} \sum_{\nu\rho} (m^\rho_{1\nu})^2 + \frac{\alpha}{2\beta} \mathrm{Tr}\log\bigl[ \delta_{\nu\omega}\delta^{\rho\sigma} - \beta\Gamma^2 \bigl((1-\kappa^2)\delta_{\nu\omega} + \kappa^2\bigr) q^{\rho\sigma} \bigr] + \frac{\beta\alpha}{2} \sum_{\rho\sigma} q^{\rho\sigma} r^{\rho\sigma} \nonumber\\
  &{}- \frac{1}{\beta} \left\langle \log\mathrm{Tr}_S \exp\Biggl\{ \beta \biggl[ \sum_{\nu\rho} m^\rho_{1\nu} \chi_{1\nu} S^\rho - \biggl(\theta + \frac{\alpha s \Gamma^2}{2}\biggr) \sum_\rho S^\rho + \frac{\beta\alpha}{2} \sum_{\rho\sigma} r^{\rho\sigma} S^\rho S^\sigma \biggr] \Biggr\} \right\rangle.
\end{align}

\subsection{Saddle-point approximation}

The free energy per neuron is
\begin{equation}
  \frac{F}{N} = -\frac{1}{\beta N} \langle \log Z \rangle = -\frac{1}{\beta N} \lim_{n\rightarrow 0} \frac{\langle Z^n \rangle - 1}{n} = -\frac{1}{\beta N} \lim_{n\rightarrow 0} \frac{1}{n} \log \langle Z^n \rangle = \lim_{n\rightarrow 0} \frac{1}{n} \min f\{m, q, r\},
\end{equation}
where we have replaced $\langle Z^n \rangle$ with its saddle point value. To determine the saddle point location, it is helpful to revert $f$ to its expression before integrating over the uncondensed overlaps $m^\rho_{\mu\nu}$: 
\begin{align}
  f ={}& \frac{1}{2} \sum_{\nu\rho} (m^\rho_{1\nu})^2 + \frac{1}{2} \sum_{\mu{\nu\omega}{\rho\sigma}} \bigl[ \delta_{\nu\omega}\delta^{\rho\sigma} - \beta\Gamma^2 \bigl((1-\kappa^2)\delta_{\nu\omega} + \kappa^2\bigr) q^{\rho\sigma} \bigr] m^\rho_{\mu\nu} m^\sigma_{\mu\omega} + \frac{\beta\alpha}{2} \sum_{\rho\sigma} q^{\rho\sigma} r^{\rho\sigma} \nonumber\\
  &{}- \frac{1}{\beta} \left\langle \log\mathrm{Tr}_S \exp\Biggl\{ \beta \biggl[ \sum_{\nu\rho} m^\rho_{1\nu} \chi_{1\nu} S^\rho - \biggl(\theta + \frac{\alpha s \Gamma^2}{2}\biggr) \sum_\rho S^\rho + \frac{\beta\alpha}{2} \sum_{\rho\sigma} r^{\rho\sigma} S^\rho S^\sigma \biggr] \Biggr\} \right\rangle.
\end{align}
The location of this point is determined by setting derivatives to zero. If we observe that the single-neuron Hamiltonian is
\begin{equation}
  \mathcal{H} \equiv - \sum_{\nu\rho} m^\rho_{1\nu} \chi_{1\nu} S^\rho + \biggl(\theta + \frac{\alpha s \Gamma^2}{2}\biggr) \sum_\rho S^\rho - \frac{\beta\alpha}{2} \sum_{\rho\sigma} r^{\rho\sigma} S^\rho S^\sigma,
\end{equation}
we obtain
\begin{align}
  0 &= \parpar{f}{m^\rho_{1\nu}} = m^\rho_{1\nu} - \left\langle \frac{ \mathrm{Tr}_S\,\chi_{1\nu} S^\rho \exp[-\beta\mathcal{H}] }{ \mathrm{Tr}_S \exp[-\beta\mathcal{H}] } \right\rangle \nonumber\\
  \Rightarrow m^\rho_{1\nu} &= \left\langle \chi_{1\nu} \overline{S^\rho} \right\rangle, \\
  \nonumber\\
  0 &= \parpar{f}{r^{\rho\sigma}} = \frac{\beta\alpha}{2} q^{\rho\sigma} - \frac{\beta\alpha}{2} \left\langle \frac{ \mathrm{Tr}_S\, S^\rho S^\sigma \exp[-\beta\mathcal{H}] }{ \mathrm{Tr}_S \exp[-\beta\mathcal{H}] } \right\rangle \nonumber\\
  \Rightarrow q^{\rho\sigma} &= \left\langle \overline{S^\rho S^\sigma} \right\rangle = \begin{cases} \left\langle \overline{S^\rho} \cdot \overline{S^\sigma} \right\rangle & \rho \neq \sigma \\ \left\langle \overline{S^\rho} \right\rangle & \rho = \sigma, \end{cases} \\
  \nonumber\\
  0 &= \parpar{f}{q^{\rho\sigma}} = \frac{\beta\alpha}{2} r^{\rho\sigma} - \frac{\beta}{2} \sum_{\mu{\nu\omega}} \Gamma^2\bigl((1-\kappa^2)\delta_{\nu\omega} + \kappa^2\bigr) m^\rho_{\mu\nu} m^\sigma_{\mu\omega} \nonumber\\
  \Rightarrow r^{\rho\sigma} &= \frac{1}{\alpha} \sum_{\mu{\nu\omega}} \Gamma^2\bigl((1-\kappa^2)\delta_{\nu\omega} + \kappa^2\bigr) m^\rho_{\mu\nu} m^\sigma_{\mu\omega}.
\end{align}
Bars over variables represent the thermodynamic ensemble average. Thus, $m$ is the overlap of the network with the condensed pattern to be recovered, $q$ is the overall neural activity, and $r$ is related to the overlap of the network with uncondensed patterns.

\subsection{Replica-symmetry ansatz}

We assume replica symmetry:
\begin{equation}
  m^\rho_{\mu\nu} = m_{\mu\nu}, \qquad q^{\rho\sigma} = q \quad (\rho \neq \sigma), \qquad q^{\rho\rho} = q_0, \qquad r^{\rho\sigma} = r \quad (\rho \neq \sigma), \qquad r^{\rho\rho} = r_0.
\end{equation}
Our expression for $f$ then becomes
\begin{align}
  f ={}& \frac{1}{2} n \sum_\nu (m_{1\nu})^2 + \frac{\alpha}{2\beta} \mathrm{Tr}\log\bigl[ \delta_{\nu\omega}\delta^{\rho\sigma} - \beta\Gamma^2 \bigl((1-\kappa^2)\delta_{\nu\omega} + \kappa^2\bigr) \bigl((q_0-q)\delta^{\rho\sigma} + q\bigr) \bigr] + \frac{\beta\alpha n}{2} q_0 r_0 + \frac{\beta\alpha n(n-1)}{2} qr \nonumber\\
  &{}- \frac{1}{\beta} \left\langle \log\mathrm{Tr}_S \exp\Biggl\{ \beta\biggl[ \biggl(\sum_\nu m_{1\nu} \chi_{1\nu}  - \theta - \frac{\alpha s\Gamma^2}{2} + \frac{\beta\alpha}{2} (r_0 - r)\biggr) \sum_\rho S^\rho + \frac{\beta\alpha}{2} r \biggl(\sum_\rho S^\rho\biggr)^{\!\!2} \biggr]\Biggr\} \right\rangle.
\end{align}

The eigenvalues of the $n \times n$ matrix whose entries are all $1$ are $n$ with multiplicity $1$ and $0$ with multiplicity $n-1$. Thus,
\begin{alignat}{3}
  &\mathrlap{ \lim_{n \rightarrow 0} \frac{1}{n} \mathrm{Tr}\log\bigl[ \delta_{\nu\omega}\delta^{\rho\sigma} - \beta\Gamma^2 \bigl((1-\kappa^2)\delta_{\nu\omega} + \kappa^2\bigr) \bigl((q_0-q)\delta^{\rho\sigma} + q\bigr) \bigr] } \nonumber\\
  &\qquad{}={} && \lim_{n \rightarrow 0} \frac{1}{n} \biggl\{ &&\log\bigl[ 1 - \beta\Gamma^2 (1+s\kappa^2-\kappa^2) (q_0-q+nq) \bigr] + (n-1) \log\bigl[ 1 - \beta\Gamma^2 (1+s\kappa^2-\kappa^2) (q_0-q) \bigr]\nonumber\\
  &&&&& + (s-1) \log\bigl[ 1 - \beta\Gamma^2 (1-\kappa^2) (q_0-q+nq) \bigr] + (s-1)(n-1) \log\bigl[ 1 - \beta\Gamma^2 (1-\kappa^2) (q_0-q) \bigr] \biggr\}\nonumber\\
  &\qquad{}={} && \mathrlap{ (s-1) \biggl[\log\bigl[1 - Q (1-\kappa^2)\bigr] - \frac{\beta q \Gamma^2 (1-\kappa^2)}{1 - Q (1-\kappa^2)}\biggr] + \log\bigl[1 - Q (1+s\kappa^2-\kappa^2)\bigr] - \frac{\beta q \Gamma^2 (1+s\kappa^2-\kappa^2)}{1 - Q (1+s\kappa^2-\kappa^2)}, }\nonumber\\
  &\qquad{}\equiv{} && \Lambda[q,q_0],
\end{alignat}
where
\begin{equation}
  Q \equiv \beta(q_0-q)\Gamma^2.
\end{equation}
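As a sanity check, the eigenvalue decomposition above (before taking $n \rightarrow 0$) can be compared against a direct evaluation for small $s$ and $n$. A minimal numerical sketch with illustrative parameter values, chosen so that all logarithms are real:
\begin{verbatim}
# Check the Tr log eigenvalue formula against direct evaluation.
import numpy as np

s, n = 3, 2                        # examples per concept, replicas
beta, Gamma2, kappa2 = 0.7, 0.9, 0.2
q, q0 = 0.3, 0.8

K = (1 - kappa2) * np.eye(s) + kappa2 * np.ones((s, s))
P = (q0 - q) * np.eye(n) + q * np.ones((n, n))
M = np.eye(s * n) - beta * Gamma2 * np.kron(K, P)
direct = np.linalg.slogdet(M)[1]   # Tr log M, computed directly

A, B = 1 - kappa2, 1 + (s - 1) * kappa2   # eigenvalues of K
formula = (np.log(1 - beta * Gamma2 * B * (q0 - q + n * q))
           + (n - 1) * np.log(1 - beta * Gamma2 * B * (q0 - q))
           + (s - 1) * np.log(1 - beta * Gamma2 * A * (q0 - q + n * q))
           + (s - 1) * (n - 1) * np.log(1 - beta * Gamma2 * A * (q0 - q)))
assert np.isclose(direct, formula)
\end{verbatim}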

Meanwhile, we can use another Gaussian integral to perform the trace over $S$ in the limit $n \rightarrow 0$:
\begin{align}
  &\left\langle \log\mathrm{Tr}_S \exp\Biggl\{ \beta \biggl[\biggl(\sum_\nu m_{1\nu} \chi_{1\nu} - \theta - \frac{\alpha s\Gamma^2}{2} + \frac{\beta\alpha}{2} (r_0-r)\biggr) \sum_\rho S^\rho + \frac{\beta\alpha}{2} r \biggl(\sum_\rho S^\rho\biggr)^{\!\!2} \biggr]\Biggr\} \right\rangle \nonumber\\
  &\qquad {}= \left\langle \log\mathrm{Tr}_S \int\!\!\frac{\mathrm{d} z}{\sqrt{2\pi}} \mathrm{e}^{-z^2/2} \exp\Biggl\{ \beta \biggl[\biggl(\sum_\nu m_{1\nu} \chi_{1\nu} - \theta + \frac{\beta\alpha}{2} (r_0-r) - \frac{\alpha s\Gamma^2}{2} + \sqrt{\alpha r} z\biggr) \sum_\rho S^\rho \biggr]\Biggr\} \right\rangle \nonumber\\
  &\qquad {}= \left\langle \log\int\!\!\frac{\mathrm{d} z}{\sqrt{2\pi}} \mathrm{e}^{-z^2/2} \Biggl\{1 + \exp\biggl[ \beta \biggl(\sum_\nu m_{1\nu} \chi_{1\nu} - \theta + \frac{\beta\alpha}{2} (r_0-r) - \frac{\alpha s\Gamma^2}{2} + \sqrt{\alpha r} z\biggr) \biggr] \Biggr\}^{\!n} \right\rangle \nonumber\\
  &\qquad {}\approx \left\langle \log\int\!\!\frac{\mathrm{d} z}{\sqrt{2\pi}} \mathrm{e}^{-z^2/2} \Biggl\{1 + n \log \biggl\{1 + \exp\biggl[ \beta \biggl(\sum_\nu m_{1\nu} \chi_{1\nu} - \theta + \frac{\beta\alpha}{2} (r_0-r) - \frac{\alpha s\Gamma^2}{2} + \sqrt{\alpha r} z\biggr) \biggr] \biggr\} \Biggr\} \right\rangle \nonumber\\
  &\qquad {}\approx n\,\left\langle \int\!\!\frac{\mathrm{d} z}{\sqrt{2\pi}} \mathrm{e}^{-z^2/2} \log \biggl\{1 + \exp\biggl[ \beta \biggl(\sum_\nu m_{1\nu} \chi_{1\nu} - \theta + \frac{\beta\alpha}{2} (r_0-r) - \frac{\alpha s\Gamma^2}{2} + \sqrt{\alpha r} z\biggr) \biggr] \biggr\} \right\rangle.
\end{align}

This gives us the free energy under replica symmetry
\begin{align}
  \frac{F}{N} ={}& \frac{1}{2} \sum_\nu (m_{1\nu})^2 + \frac{\alpha}{2\beta} \Lambda[q,q_0] + \frac{\beta\alpha}{2} (q_0 r_0 - qr) \nonumber\\
  &{}- \frac{1}{\beta} \left\llangle \log \biggl\{1 + \exp\biggl[ \beta \biggl(\sum_\nu m_{1\nu} \chi_{1\nu} - \theta + \frac{\beta\alpha}{2} (r_0-r) - \frac{\alpha s\Gamma^2}{2} + \sqrt{\alpha r} z\biggr) \biggr] \biggr\} \right\rrangle,
\end{align}
where now the double angle brackets also indicate an average over the gaussian variable $z$.

\subsection{Mean-field equations}

The mean-field equations arise from the saddle point approximation and are equivalent to setting derivatives of $F$ to zero. We first note that
\begin{equation}
  \parpar{\Lambda}{q} = (s-1) \frac{\beta^2 q\Gamma^4 (1-\kappa^2)^2}{\bigl(1 - Q(1-\kappa^2)\bigr)^{\!2}} + \frac{\beta^2 q\Gamma^4 (1+s\kappa^2-\kappa^2)^2}{\bigl(1 - Q(1+s\kappa^2-\kappa^2)\bigr)^{\!2}}. 
\end{equation}
The combined fraction has numerator $\beta^2 q\Gamma^4$ multiplied by
\begin{align}
  &(s-1)\Bigl[\bigl(1-\kappa^2\bigr) \bigl(1 - Q(1+s\kappa^2-\kappa^2)\bigr)\Bigr]^2 + \Bigl[\bigl(1+s\kappa^2-\kappa^2\bigr) \bigl(1 - Q(1-\kappa^2)\bigr)\Bigr]^2 \nonumber\\
  &\qquad{}= s \bigl(1 - Q(1-\kappa^2)(1+s\kappa^2-\kappa^2)\bigr)^{\!2} + s(s-1)\kappa^4.
\end{align}
Meanwhile,
\begin{align}
  \parpar{\Lambda}{q_0} &= (s-1)\biggl[ -\frac{\beta\Gamma^2 (1-\kappa^2)}{1 - Q(1-\kappa^2)} - \frac{\beta^2 q\Gamma^4 (1-\kappa^2)^2}{\bigl(1 - Q(1-\kappa^2)\bigr)^{\!2}} \biggr] - \frac{\beta\Gamma^2 (1+s\kappa^2-\kappa^2)}{1 - Q(1+s\kappa^2-\kappa^2)} - \frac{\beta^2 q\Gamma^4 (1+s\kappa^2-\kappa^2)^2}{\bigl(1 - Q(1+s\kappa^2-\kappa^2)\bigr)^{\!2}} \nonumber\\
  &= -\parpar{\Lambda}{q} - \beta\Gamma^2\biggl[ \frac{(s-1)(1-\kappa^2)}{1 - Q(1-\kappa^2)} + \frac{1+s\kappa^2-\kappa^2}{1 - Q(1+s\kappa^2-\kappa^2)} \biggr].
\end{align}
The combined fraction has numerator
\begin{align}
  (s-1)(1-\kappa^2)\bigl(1 - Q(1+s\kappa^2-\kappa^2)\bigr) + (1+s\kappa^2-\kappa^2)\bigl(1 - Q(1-\kappa^2)\bigr)= s\bigl(1 - Q(1-\kappa^2)(1+s\kappa^2-\kappa^2)\bigr).
\end{align}
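Both numerator identities above can be verified symbolically; a minimal sketch using \texttt{sympy}, with $k$ standing for $\kappa^2$:
\begin{verbatim}
# Symbolic check of the two numerator identities (k = kappa^2).
import sympy as sp

s, k, Q = sp.symbols('s k Q')
A, B = 1 - k, 1 + (s - 1) * k     # eigenvalues of the nu-space matrix

lhs1 = (s - 1) * (A * (1 - Q * B))**2 + (B * (1 - Q * A))**2
rhs1 = s * (1 - Q * A * B)**2 + s * (s - 1) * k**2
assert sp.simplify(lhs1 - rhs1) == 0

lhs2 = (s - 1) * A * (1 - Q * B) + B * (1 - Q * A)
rhs2 = s * (1 - Q * A * B)
assert sp.simplify(lhs2 - rhs2) == 0
\end{verbatim}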

Thus, the derivatives become
\begin{align}
  0 &= \parpar{F}{q} = \frac{\alpha}{2 \beta} \biggl[\beta^2 qs\Gamma^4 \frac{\bigl(1 - Q(1-\kappa^2)(1+s\kappa^2-\kappa^2)\bigr)^{\!2} + (s-1)\kappa^4}{\bigl(1 - Q(1-\kappa^2)\bigr)^{\!2}\bigl(1 - Q(1+s\kappa^2-\kappa^2)\bigr)^{\!2}} \biggr] - \frac{\beta\alpha}{2} r \nonumber\\
  \Rightarrow r &= qs\Gamma^4 \frac{\bigl(1 - Q(1-\kappa^2)(1+s\kappa^2-\kappa^2)\bigr)^{\!2} + (s-1)\kappa^4}{\bigl(1 - Q(1-\kappa^2)\bigr)^{\!2}\bigl(1 - Q(1+s\kappa^2-\kappa^2)\bigr)^{\!2}} \\
  \nonumber\\
  0 &= \parpar{F}{q_0} = \frac{\alpha}{2 \beta} \biggl[ -\parpar{\Lambda}{q} -\beta s\Gamma^2 \frac{1 - Q(1-\kappa^2)(1+s\kappa^2-\kappa^2)}{\bigl(1 - Q(1-\kappa^2)\bigr) \bigl(1 - Q(1+s\kappa^2-\kappa^2)\bigr)} \biggr] + \frac{\beta\alpha}{2} r_0 \nonumber\\
  \Rightarrow r_0 &= r + \frac{s\Gamma^2}{\beta} \frac{1 - Q(1-\kappa^2)(1+s\kappa^2-\kappa^2)}{\bigl(1 - Q(1-\kappa^2)\bigr) \bigl(1 - Q(1+s\kappa^2-\kappa^2)\bigr)} \\
  \nonumber\\
  0 &= \parpar{F}{m_{1\nu}} = m_{1\nu} - \left\llangle \chi_{1\nu} \mathop{\mathrm{sig}}\nolimits[\beta\mathcal{H}] \right\rrangle \nonumber\\
  \Rightarrow m_{1\nu} &= \left\llangle \chi_{1\nu} \mathop{\mathrm{sig}}\nolimits[\beta\mathcal{H}] \right\rrangle,
\end{align}
where $\mathop{\mathrm{sig}}\nolimits[x] \equiv 1/(1+\mathrm{e}^{-x})$ and
\begin{equation}
  \mathcal{H} \equiv \sum_\nu m_{1\nu} \chi_{1\nu} - \theta + \frac{\beta\alpha}{2} (r_0 - r) - \frac{\alpha s\Gamma^2}{2} + \sqrt{\alpha r} z.
\end{equation}
We can simplify
\begin{align}
  &\frac{\beta\alpha}{2} (r_0 - r) - \frac{\alpha s\Gamma^2}{2} \nonumber\\
  &\qquad{}= \frac{\alpha s\Gamma^2}{2} \cdot \frac{1 - Q(1-\kappa^2)(1+s\kappa^2-\kappa^2) - \bigl(1 - Q(1-\kappa^2)\bigr) \bigl(1 - Q(1+s\kappa^2-\kappa^2)\bigr)}{\bigl(1 - Q(1-\kappa^2)\bigr) \bigl(1 - Q(1+s\kappa^2-\kappa^2)\bigr)} \nonumber\\
  &\qquad{}= \frac{\alpha s\Gamma^2}{2} \cdot \frac{Q\kappa^2(1+s\kappa^2-\kappa^2) + Q(1-\kappa^2) - Q^2(1-\kappa^2)(1+s\kappa^2-\kappa^2)}{\bigl(1 - Q(1-\kappa^2)\bigr) \bigl(1 - Q(1+s\kappa^2-\kappa^2)\bigr)} \nonumber\\
  &\qquad{}= \frac{Q\alpha s\Gamma^2}{2} \cdot \frac{1+s\kappa^4-\kappa^4 - Q(1-\kappa^2)(1+s\kappa^2-\kappa^2)}{\bigl(1 - Q(1-\kappa^2)\bigr) \bigl(1 - Q(1+s\kappa^2-\kappa^2)\bigr)}.
\end{align}
Thus,
\begin{equation}
  \mathcal{H} \equiv \sum_\nu m_{1\nu} \chi_{1\nu} - \theta + \frac{\alpha s\Gamma^2}{2} Q \frac{1+s\kappa^4-\kappa^4 - Q(1-\kappa^2)(1+s\kappa^2-\kappa^2)}{\bigl(1 - Q(1-\kappa^2)\bigr) \bigl(1 - Q(1+s\kappa^2-\kappa^2)\bigr)} + \sqrt{\alpha r} z.
\end{equation}
Continuing, and using the identities $\int \mathrm{d} z\,\mathrm{e}^{-z^2/2} z f(z) = \int \mathrm{d} z\,\mathrm{e}^{-z^2/2}\ \mathrm{d} f(z)/\mathrm{d} z$ and $\mathrm{d}\mathop{\mathrm{sig}}\nolimits(x)/\mathrm{d} x = \mathop{\mathrm{sig}}\nolimits(x) \bigl(1-\mathop{\mathrm{sig}}\nolimits(x)\bigr)$,
\begin{align}
  0 &= \parpar{F}{r} = -\frac{\beta\alpha}{2} q - \frac{\sqrt{\alpha}}{2\sqrt{r}} \left\llangle z \mathop{\mathrm{sig}}\nolimits[\beta\mathcal{H}] \right\rrangle + \frac{\beta\alpha}{2} \left\llangle \mathop{\mathrm{sig}}\nolimits[\beta\mathcal{H}] \right\rrangle \nonumber\\
  \Rightarrow q &= \mathrlap{ \left\llangle \mathop{\mathrm{sig}}\nolimits[\beta\mathcal{H}]^2 \right\rrangle } \\
  \nonumber\\
  0 &= \parpar{F}{r_0} = \frac{\beta\alpha}{2} q_0 - \frac{\beta\alpha}{2} \left\llangle \mathop{\mathrm{sig}}\nolimits[\beta\mathcal{H}] \right\rrangle \nonumber\\
  \Rightarrow q_0 &= \mathrlap{ \left\llangle \mathop{\mathrm{sig}}\nolimits[\beta\mathcal{H}] \right\rrangle. } 
\end{align}

With the definitions
\begin{align}
  s_0 &\equiv s-1, \nonumber\\ 
  \phi &\equiv \theta - \frac{Q\alpha s\Gamma^2}{2} \cdot \frac{1+s_0\kappa^4 - Q(1-\kappa^2)(1+s_0\kappa^2)}{\bigl(1 - Q(1-\kappa^2)\bigr) \bigl(1 - Q(1+s_0\kappa^2)\bigr)},
\end{align}
we recover the mean-field equations presented in \cref{eq:overview-meanfield}.

\subsection{Zero-temperature limit}

In the limit $\beta\rightarrow\infty$, we will define the function $\mathop{\mathrm{dsig}}\nolimits(x) = \mathop{\mathrm{sig}}\nolimits(x)[1 - \mathop{\mathrm{sig}}\nolimits(x)]$ and use the identity
\begin{align}
  \int\!\!\frac{\mathrm{d} z}{\sqrt{2\pi}} \mathrm{e}^{-z^2/2} \mathop{\mathrm{dsig}}\nolimits[\beta(az+b)]
  = \int\!\!\frac{\mathrm{d} z}{\sqrt{2\pi}\beta a} \mathrm{e}^{-z^2/2} \parpar{}{z}\mathop{\mathrm{sig}}\nolimits[\beta(az+b)] 
  \approx \int\!\!\frac{\mathrm{d} z}{\sqrt{2\pi}\beta a} \mathrm{e}^{-z^2/2} \parpar{}{z}\Theta[az+b]
  = \frac{1}{\sqrt{2\pi}\beta|a|} \mathrm{e}^{-b^2/2a^2},
\end{align}
where $\Theta$ is the Heaviside step function. Noting that $q_0 - q = \llangle \mathop{\mathrm{sig}}\nolimits[\beta\mathcal{H}] \rrangle - \llangle \mathop{\mathrm{sig}}\nolimits[\beta\mathcal{H}]^2 \rrangle = \llangle \mathop{\mathrm{dsig}}\nolimits[\beta\mathcal{H}] \rrangle$, we calculate
\begin{equation}
  Q \approx \beta\Gamma^2 \biggl\llangle \mathop{\mathrm{dsig}}\nolimits\biggl[\beta \biggl(\sum_\nu m_{1\nu}\chi_{1\nu}-\phi+\sqrt{\alpha r} z\biggr)\biggr] \biggr\rrangle 
  = \frac{\Gamma^2}{\sqrt{2\pi{\alpha r}}} \Biggl\langle\exp\Biggl[ -\frac{\bigl(\sum_\nu m_{1\nu} \chi_{1\nu} - \phi\bigr)^{\!2}}{2{\alpha r}}\Biggr]\Biggr\rangle.
\end{equation}
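As a quick numerical check of the $\beta \rightarrow \infty$ identity used above (illustrative values; the integrand is sharply peaked at $z = -b/a$, so the quadrature routine is told where to look):
\begin{verbatim}
# Numerical check of the large-beta identity for dsig.
import numpy as np
from scipy.integrate import quad

def dsig(x):                       # sig(x) * (1 - sig(x)), overflow-safe
    e = np.exp(-np.abs(x))
    return e / (1 + e)**2

a, b, beta = 0.7, 0.4, 200.0
lhs = quad(lambda z: np.exp(-z**2 / 2) / np.sqrt(2 * np.pi)
           * dsig(beta * (a * z + b)), -10, 10, points=[-b / a])[0]
rhs = np.exp(-b**2 / (2 * a**2)) / (np.sqrt(2 * np.pi) * beta * abs(a))
print(lhs, rhs)                    # the two agree closely at large beta
\end{verbatim}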

We then use the identity
\begin{equation}
  \int\!\!\frac{\mathrm{d} z}{\sqrt{2\pi}} \mathrm{e}^{-z^2/2}\, \Theta[az+b]
  = \frac{1}{2} \biggl(1 + \mathop{\mathrm{erf}}\nolimits\frac{b}{\sqrt{2}a}\biggr).
\end{equation}
We also note that $\llangle \chi_{1\nu} \rrangle = 0$ for both sparse and dense patterns. Now we calculate
\begin{align}
  m_{1\nu} &\approx \biggl\llangle \chi_{1\nu} \,\Theta\biggl[\sum_\nu m_{1\nu}\chi_{1\nu}-\phi+\sqrt{\alpha r} z\biggr] \biggr\rrangle 
  = \frac{1}{2}\Biggl\langle \chi_{1\nu} \mathop{\mathrm{erf}}\nolimits\frac{\sum_\nu m_{1\nu} \chi_{1\nu} - \phi}{\sqrt{2{\alpha r}}} \Biggr\rangle, \\
  \nonumber\\
  q &\approx \biggl\llangle \Theta\biggl[\sum_\nu m_{1\nu}\chi_{1\nu}-\phi+\sqrt{\alpha r} z\biggr] \biggr\rrangle
  = \frac{1}{2} \Biggl[1 + \Biggl\langle \mathop{\mathrm{erf}}\nolimits\frac{\sum_\nu m_{1\nu} \chi_{1\nu} - \phi}{\sqrt{2{\alpha r}}} \Biggr\rangle \Biggr], \\
  \nonumber\\
  r &= qs\Gamma^4 \frac{\bigl(1 - Q(1-\kappa^2)(1+s_0\kappa^2)\bigr)^{\!2} + s_0\kappa^4}{\bigl(1 - Q(1-\kappa^2)\bigr)^{\!2}\bigl(1 - Q(1+s_0\kappa^2)\bigr)^{\!2}} \nonumber\\
  &= \frac{s\Gamma^4}{2} \cdot \frac{\bigl(1 - Q(1-\kappa^2)(1+s_0\kappa^2)\bigr)^{\!2} + s_0\kappa^4}{\bigl(1 - Q(1-\kappa^2)\bigr)^{\!2}\bigl(1 - Q(1+s_0\kappa^2)\bigr)^{\!2}} \Biggl[1 + \Biggl\langle \mathop{\mathrm{erf}}\nolimits\frac{\sum_\nu m_{1\nu} \chi_{1\nu} - \phi}{\sqrt{2{\alpha r}}} \Biggr\rangle \Biggr].
\end{align}
Here, the single angle brackets indicate an average over $\chi_{1\nu}$, the average over the Gaussian variable $z$ having already been performed.



\section{\label{sec:se}Capacity for sparse examples}

\subsection{Sparse mean-field equations}

Now we wish to recover the sparse pattern, so $\chi_{1\nu} = \eta_{11} \delta_{1\nu}$. We rename $m \equiv m_{11}$ and $\eta \equiv \eta_{11}$. We have 
\begin{equation}
  \eta =
      \begin{cases} (1-2\gamma)(1-a) & \textrm{with probability } a   \\
                    -(1-2\gamma)a    & \textrm{with probability } 1-a \end{cases}
      \approx \begin{cases} 1-2\gamma & \textrm{with probability } a   \\
                            0         & \textrm{with probability } 1-a. \end{cases}
\end{equation}
Then,
\begin{align}
  m &= \frac{1}{2}\biggl\langle \eta \mathop{\mathrm{erf}}\nolimits\frac{m\eta-\phi}{\sqrt{2{\alpha r}}} \biggr\rangle = \frac{(1-2\gamma)a}{2} \Biggl\{\mathop{\mathrm{erf}}\nolimits\frac{\phi}{\sqrt{2{\alpha r}}} + \mathop{\mathrm{erf}}\nolimits\frac{(1-2\gamma)m-\phi}{\sqrt{2{\alpha r}}} \Biggr\}, \\
  \nonumber\\
  r &= \frac{s\Gamma^4}{2} \cdot \frac{\bigl(1 - Q(1-\kappa^2)(1+s_0\kappa^2)\bigr)^{\!2} + s_0\kappa^4}{\bigl(1 - Q(1-\kappa^2)\bigr)^{\!2}\bigl(1 - Q(1+s_0\kappa^2)\bigr)^{\!2}} \biggl[1 + \biggl\langle \mathop{\mathrm{erf}}\nolimits\frac{m\eta-\phi}{\sqrt{2{\alpha r}}} \biggr\rangle \biggr] \nonumber\\
  &= \frac{s\Gamma^4}{2} \cdot \frac{\bigl(1 - Q(1-\kappa^2)(1+s_0\kappa^2)\bigr)^{\!2} + s_0\kappa^4}{\bigl(1 - Q(1-\kappa^2)\bigr)^{\!2}\bigl(1 - Q(1+s_0\kappa^2)\bigr)^{\!2}} \Biggl\{1 - \mathop{\mathrm{erf}}\nolimits\frac{\phi}{\sqrt{2{\alpha r}}} + a \mathop{\mathrm{erf}}\nolimits\frac{(1-2\gamma)m-\phi}{\sqrt{2{\alpha r}}} \Biggr\}, \\
  \nonumber\\
  Q &= \frac{\Gamma^2}{\sqrt{2\pi{\alpha r}}} \biggl\langle\exp\biggl[ -\frac{(m\eta - \phi)^2}{2{\alpha r}}\biggr]\biggr\rangle = \frac{\Gamma^2}{\sqrt{2\pi{\alpha r}}} \Biggl\{\exp\biggl[-\frac{\phi^2}{2{\alpha r}}\biggr] + a\exp\biggl[-\frac{((1-2\gamma)m - \phi)^2}{2{\alpha r}}\biggr] \Biggr\}.
\end{align}
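These coupled equations can be solved by straightforward fixed-point iteration. A minimal sketch with illustrative parameter values (not those used in the figures):
\begin{verbatim}
# Fixed-point iteration of the sparse-example mean-field equations.
import numpy as np
from scipy.special import erf

a, gamma, c, s, alpha, phi = 0.01, 0.1, 0.4, 10, 1e-4, 0.004
Gamma2 = (1 - 2 * gamma)**2 * a + gamma**2
kappa2 = gamma**2 * c**2 / Gamma2
s0 = s - 1

m, r, Q = (1 - 2 * gamma) * a, s * Gamma2**2 / 2, 0.0
for _ in range(2000):
    den = np.sqrt(2 * alpha * r)
    G = (((1 - Q * (1 - kappa2) * (1 + s0 * kappa2))**2 + s0 * kappa2**2)
         / ((1 - Q * (1 - kappa2))**2 * (1 - Q * (1 + s0 * kappa2))**2))
    e1 = erf(phi / den)
    e2 = erf(((1 - 2 * gamma) * m - phi) / den)
    m = (1 - 2 * gamma) * a / 2 * (e1 + e2)
    r = s * Gamma2**2 / 2 * G * (1 - e1 + a * e2)
    Q = Gamma2 / (np.sqrt(np.pi) * den) * (
        np.exp(-(phi / den)**2)
        + a * np.exp(-(((1 - 2 * gamma) * m - phi) / den)**2))

print(m / ((1 - 2 * gamma) * a), Q)  # m' near 1 and Q small: retrieval
\end{verbatim}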

We will see that $Q \ll 1$. In that case, the mean-field equations become
\begin{align}
  m &= \frac{(1-2\gamma)a}{2} \Biggl\{\mathop{\mathrm{erf}}\nolimits\frac{\phi}{\sqrt{2{\alpha r}}} + \mathop{\mathrm{erf}}\nolimits\frac{(1-2\gamma)m-\phi}{\sqrt{2{\alpha r}}} \Biggr\}, \nonumber\\
  r &= \frac{s \bigl(1+(s-1)\kappa^4\bigr) \Gamma^4}{2} \Biggl\{1 - \mathop{\mathrm{erf}}\nolimits\frac{\phi}{\sqrt{2{\alpha r}}} + a \mathop{\mathrm{erf}}\nolimits\frac{(1-2\gamma)m-\phi}{\sqrt{2{\alpha r}}} \Biggr\}.
\end{align}
These are presented in \cref{eq:overview-A}.

We can map our dual input equations onto the classic sparse Hopfield equations with the rescalings
\begin{align}
  m &= (1-2\gamma)a \cdot m', \nonumber\\
  \phi &= (1-2\gamma)^2a \cdot \theta', \nonumber\\
  r &= s \bigl(1+s_0\kappa^4\bigr) \Gamma^4 \cdot r', \nonumber\\
  \alpha &= \frac{{(1-2\gamma)^4}a^2}{s \bigl(1+s_0\kappa^4\bigr)\Gamma^4} \cdot \alpha'.
  \label{eq:A-rescale}
\end{align}
Then,
\begin{align}
  m' &= \frac{1}{2}\biggl[\mathop{\mathrm{erf}}\nolimits\frac{\theta'}{\sqrt{2{\alpha' r'}}} + \mathop{\mathrm{erf}}\nolimits\frac{m' - \theta'}{\sqrt{2{\alpha' r'}}}\biggr] \nonumber\\
  r' &= \frac{1}{2}\biggl[1 - \mathop{\mathrm{erf}}\nolimits\frac{\theta'}{\sqrt{2{\alpha' r'}}} + a \mathop{\mathrm{erf}}\nolimits\frac{m' - \theta'}{\sqrt{2{\alpha' r'}}}\biggr].
\end{align}

Successfully recovering the sparse example means that $m' \rightarrow 1$, which requires $\theta'/\sqrt{2{\alpha' r'}} \gg 1$ and $(m'-\theta')/\sqrt{2{\alpha' r'}} \gg 1$. Under these limits, $0 < \theta' < 1$ and
\begin{align}
  m' &= 1 - \frac{1}{\sqrt{2\pi}} \frac{\sqrt{\alpha' r'}}{\theta'}\mathrm{e}^{-\theta'^2/2{\alpha' r'}} - \frac{1}{\sqrt{2\pi}} \frac{\sqrt{\alpha' r'}}{m'-\theta'}\mathrm{e}^{-(m'-\theta')^2/2{\alpha' r'}} \nonumber\\
  r' &= \frac{1}{\sqrt{2\pi}} \frac{\sqrt{\alpha' r'}}{\theta'}\mathrm{e}^{-\theta'^2/2{\alpha' r'}} + \frac{a}{2} - \frac{a}{\sqrt{2\pi}} \frac{\sqrt{\alpha' r'}}{m'-\theta'}\mathrm{e}^{-(m'-\theta')^2/2{\alpha' r'}},
\end{align}
and $Q \ll 1$ indeed.



\subsection{\texorpdfstring{$\theta' \lesssim 0.5$}{θ' ≲ 0.5} limit}

For small $\theta'$, we can simplify
\begin{align}
  m' &\approx 1 - \frac{1}{\sqrt{2\pi}} \frac{\sqrt{\alpha' r'}}{\theta'}\mathrm{e}^{-\theta'^2/2{\alpha' r'}} - \frac{1}{\sqrt{2\pi}} \frac{\sqrt{\alpha' r'}}{1-\theta'}\mathrm{e}^{-(1-\theta')^2/2{\alpha' r'}} \nonumber\\
  r' &\approx \frac{1}{\sqrt{2\pi}} \frac{\sqrt{\alpha' r'}}{\theta'}\mathrm{e}^{-\theta'^2/2{\alpha' r'}} + \frac{a}{2}.
\end{align}
If we take $y \equiv \theta'/\sqrt{\alpha' r'} \gg 1$, the equation for $r'$ becomes
\begin{align}
  \frac{\theta'^2}{\alpha'} &= \frac{1}{\sqrt{2\pi}}y\mathrm{e}^{-y^2/2} + \frac{a}{2}y^2.
\end{align}
To maximize $\alpha'$, we want to minimize the right-hand side over $y$:
\begin{align}
  0 &= \frac{1}{\sqrt{2\pi}}(1-y^2)\mathrm{e}^{-y^2/2} + ay \nonumber\\
  y\mathrm{e}^{-y^2/2} &\approx \sqrt{2\pi}a \nonumber\\
  y &= \sqrt{-W_{-1}(-2\pi a^2)} \approx \sqrt{2|\!\log a|},
\end{align}
where $W_{-1}$ is the negative branch of the Lambert $W$ function, which is also known as the product logarithm. Thus
\begin{align}
  \frac{\theta'^2}{\alpha'_\textrm{c}} &= a \biggl(1 + \frac{1}{2}y^2\biggr)
  \approx a |\!\log a| \nonumber\\
  \alpha'_\textrm{c} &\sim \frac{\theta'^2}{a |\!\log a|}.
\end{align}

Then,
\begin{align}
  m'_\textrm{c} &\approx 1 - \frac{1}{2\sqrt{\pi|\!\log a|}} \frac{\theta'}{1-\theta'} a^{(1-\theta')^2/\theta'^2}, \nonumber\\
  r'_\textrm{c} &= \frac{\theta'^2}{\alpha'_\textrm{c} y^2} = \frac{a}{2}.
\end{align}
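The Lambert-$W$ step above is easy to check numerically; a small sketch at an illustrative sparsity:
\begin{verbatim}
# Check that y = sqrt(-W_{-1}(-2 pi a^2)) solves y exp(-y^2/2) = sqrt(2 pi) a.
import numpy as np
from scipy.special import lambertw

a = 0.01
y = np.sqrt(-lambertw(-2 * np.pi * a**2, -1).real)
print(y * np.exp(-y**2 / 2), np.sqrt(2 * np.pi) * a)  # equal
print(y, np.sqrt(2 * abs(np.log(a))))                 # y and its estimate
\end{verbatim}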



\subsection{\texorpdfstring{$\theta' \rightarrow 1$}{θ' → 1} limit}

In this case, we can simplify
\begin{align}
  m' &= 1 - \frac{1}{\sqrt{2\pi}} \frac{\sqrt{\alpha' r'}}{m'-\theta'}\mathrm{e}^{-(m'-\theta')^2/2{\alpha' r'}} \nonumber\\
  r' &= \frac{a}{2} - \frac{a}{\sqrt{2\pi}} \frac{\sqrt{\alpha' r'}}{m'-\theta'}\mathrm{e}^{-(m'-\theta')^2/2{\alpha' r'}}.
\end{align}
Thus,
\begin{equation}
  r' = \frac{a}{2} - a(1-m') \approx \frac{a}{2}.
\end{equation}
Then, with $y \equiv (m'-\theta')/\sqrt{\alpha' r'} \gg 1$,
\begin{align}
  m' - \theta' &= 1 - \theta' - \frac{1}{\sqrt{2\pi}y} \mathrm{e}^{-y^2/2} \nonumber\\
  \sqrt{\frac{a\alpha'}{2}} &= \frac{1 - \theta'}{y} - \frac{1}{\sqrt{2\pi}y^2} \mathrm{e}^{-y^2/2}.
\end{align}
To maximize $\alpha'$, we want to maximize the right-hand side with respect to $y$:
\begin{align}
  0 &= -\frac{1-\theta'}{y^2} + \frac{1}{\sqrt{2\pi}} \biggl(\frac{2}{y^3} + \frac{1}{y}\biggr) \mathrm{e}^{-y^2/2} \nonumber\\
  y\mathrm{e}^{-y^2/2} &\approx \sqrt{2\pi}(1-\theta') \nonumber\\
  y &\approx \sqrt{-W_{-1}\bigl[-2\pi (1-\theta')^2\bigr]} \approx \sqrt{2|\!\log (1-\theta')|}.
\end{align}
Thus
\begin{align}
  \sqrt{\frac{a\alpha'_\textrm{c}}{2}} &= \frac{1-\theta'}{y} - \frac{1-\theta'}{y^3} \nonumber\\
  \alpha'_\textrm{c} &\sim \frac{(1-\theta')^2}{a |\!\log (1-\theta')|}.
\end{align}
And
\begin{equation}
  m'_\textrm{c} \approx 1 - \frac{1-\theta'}{y^2} 
      = 1 - \frac{1-\theta'}{2|\!\log (1-\theta')|}.
\end{equation}

\subsection{Maximizing capacity over \texorpdfstring{$\theta'$}{θ'}}

To approximately maximize $\alpha'_\textrm{c}$ over $\theta'$, we find where the following two expressions meet:
\begin{equation}
  \alpha'_\textrm{c} \sim
    \begin{cases}
      \dfrac{\theta'^2}{a |\!\log a|}               & \theta' \lesssim    0.5 \\
      \dfrac{(1-\theta')^2}{a |\!\log (1-\theta')|} & \theta' \rightarrow 1.
    \end{cases}
\end{equation}

We consider that $\theta'$ is sufficiently far from 1 such that $|\!\log (1-\theta')| \sim 1$. Then,
\begin{align}
  \frac{\theta'^2}{a |\!\log a|} &\approx \frac{(1-\theta')^2}{a} \nonumber\\
  \theta' &= \frac{\sqrt{|\!\log a|}}{1 + \sqrt{|\!\log a|}}.
\end{align}
Thus,
\begin{equation}
  \alpha'_\textrm{c} \sim \frac{1}{a |\!\log a|}.
\end{equation}
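For illustration, at $a = 0.01$ we have $|\!\log a| \approx 4.6$ and $\sqrt{|\!\log a|} \approx 2.1$, so the optimal threshold sits at $\theta' \approx 2.1/3.1 \approx 0.7$ and the capacity is $\alpha'_\textrm{c} \sim 1/(0.01 \times 4.6) \approx 20$.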
By converting $\alpha'$ back to $\alpha$ with \cref{eq:A-rescale}, we recover \cref{eq:results-A1}.



\section{\label{sec:de}Capacity for dense examples}

\subsection{Dense asymmetric mean-field equations}

For dense patterns,
\begin{align}
  \zeta_{1\nu} &=
      \begin{cases}  \zeta_1 & \textrm{with probability } \frac{1+c}{2} \\
                    -\zeta_1 & \textrm{with probability } \frac{1-c}{2} \end{cases} \nonumber\\
  \zeta_1 &=
      \begin{cases}  \gamma & \textrm{with probability } \frac{1}{2}  \\
                    -\gamma & \textrm{with probability } \frac{1}{2}. \end{cases}
\end{align}
To help us in our calculations, we note
\begin{align}
  \int_{-\infty}^\infty \mathrm{d} x\,\mathrm{e}^{-(x-a)^2/\rho^2} \mathrm{e}^{-(x-b)^2/\sigma^2} &= \sqrt\frac{\pi}{\rho^{-2}+\sigma^{-2}} \exp\biggl[ -\frac{(a-b)^2}{\rho^2+\sigma^2} \biggr], \nonumber\\
  \int_{-\infty}^\infty \mathrm{d} x\,\mathrm{e}^{-(x-a)^2/\rho^2} \mathop{\mathrm{erf}}\nolimits\biggl[ \frac{x-b}{\sigma} \biggr] &= \sqrt\pi \rho \mathop{\mathrm{erf}}\nolimits\biggl[ \frac{a-b}{\sqrt{\rho^2+\sigma^2}} \biggr], \nonumber\\
  \int_{-\infty}^\infty \mathrm{d} x\,\mathrm{e}^{-(x-a)^2/\rho^2} x \mathop{\mathrm{erf}}\nolimits\biggl[ \frac{x-b}{\sigma} \biggr] &= \rho\,\Biggl\{ \frac{\rho^2}{\sqrt{\rho^2+\sigma^2}} \exp\biggl[ -\frac{(a-b)^2}{\rho^2+\sigma^2} \biggr] + \sqrt\pi a \mathop{\mathrm{erf}}\nolimits\biggl[ \frac{a-b}{\sqrt{\rho^2+\sigma^2}} \biggr] \Biggr\}.
\end{align}
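These identities can be spot-checked by quadrature; a brief sketch with arbitrary parameter values:
\begin{verbatim}
# Quadrature spot-check of the Gaussian-erf integrals.
import numpy as np
from scipy.integrate import quad
from scipy.special import erf

a, b, rho, sigma = 0.3, -0.5, 0.8, 1.1
g = lambda x: np.exp(-(x - a)**2 / rho**2)
root = np.sqrt(rho**2 + sigma**2)

print(quad(lambda x: g(x) * erf((x - b) / sigma), -20, 20)[0],
      np.sqrt(np.pi) * rho * erf((a - b) / root))
print(quad(lambda x: g(x) * x * erf((x - b) / sigma), -20, 20)[0],
      rho * (rho**2 / root * np.exp(-(a - b)**2 / root**2)
             + np.sqrt(np.pi) * a * erf((a - b) / root)))
\end{verbatim}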

We first consider the possibility that the network overlaps significantly with one example pattern $\zeta_{11}$. In that case,
\begin{equation}
  \sum_\nu m_{1\nu}\zeta_{1\nu} = m_{11}\zeta_{11} + m_0 \sum_{\nu>1} \zeta_{1\nu} = m\zeta + s_0m_0x_0.
\end{equation}
Here, $m \equiv m_{11}$ and $\zeta \equiv \zeta_{11}$. $m_0$ indicates the average overlap with the other $s_0 \equiv s-1$ examples in concept $\mu = 1$. The average over these other examples $x_0$ follows a binomial distribution, which in the large $s$ limit can be described by a Gaussian random variable with mean $c\zeta_1$ and variance $\gamma^2(1-c^2)/s_0$. The concept pattern $\zeta_1$ has a corresponding overlap $m_1$. Thus,
\begin{align}
  m &= \frac{1}{2}\Biggl\langle \zeta \mathop{\mathrm{erf}}\nolimits\frac{m\zeta+s_0m_0x_0-\phi}{\sqrt{2{\alpha r}}} \Biggr\rangle, \nonumber\\
  m_0 &= \frac{1}{2}\Biggl\langle x_0 \mathop{\mathrm{erf}}\nolimits\frac{m\zeta+s_0m_0x_0-\phi}{\sqrt{2{\alpha r}}} \Biggr\rangle, \nonumber\\
  m_1 &= \frac{1}{2}\Biggl\langle \zeta_1 \mathop{\mathrm{erf}}\nolimits\frac{m\zeta+s_0m_0x_0-\phi}{\sqrt{2{\alpha r}}} \Biggr\rangle, \nonumber\\
  r &= \frac{s\Gamma^4}{2} \cdot \frac{\bigl(1 - Q(1-\kappa^2)(1+s_0\kappa^2)\bigr)^{\!2} + s_0\kappa^4}{\bigl(1 - Q(1-\kappa^2)\bigr)^{\!2}\bigl(1 - Q(1+s_0\kappa^2)\bigr)^{\!2}} \Biggl[1 + \Biggl\langle \mathop{\mathrm{erf}}\nolimits\frac{m\zeta+s_0m_0x_0-\phi}{\sqrt{2{\alpha r}}} \Biggr\rangle \Biggr], \nonumber\\
  Q &= \frac{\Gamma^2}{\sqrt{2\pi{\alpha r}}} \Biggl\langle\exp\Biggl[ -\frac{\bigl(m\zeta+s_0m_0x_0-\phi\bigr)^{\!2}}{2{\alpha r}}\Biggr]\Biggr\rangle.
\end{align}
To aid us later, we will define
\begin{align}
  \sigma_0^2 &\equiv s_0\gamma^2(1-c^2)m_0^2 + {\alpha r}, \nonumber\\
  Y_{\pm\pm} &\equiv \frac{\gamma m \pm s_0\gamma cm_0 \pm \phi}{\sqrt{2}\sigma_0} = \frac{\gamma m \pm s_0\gamma cm_0 \pm \phi}{\sqrt{2\bigl(s_0\gamma^2(1-c^2)m_0^2 + {\alpha r}\bigr)}},
\end{align}
where the first subscript tracks the sign of the $s_0\gamma cm_0$ term and the second that of $\phi$.

By performing the averages successively over $\zeta$, $x_0$, and $\zeta_1$, we find
\begin{equation}
  Q = \frac{\Gamma^2}{\sqrt{2\pi{\alpha r}}} \Biggl\{ \frac{1+c}{2} \Biggl\langle\exp\Biggl[ -\frac{\bigl(m\zeta_1+s_0m_0x_0-\phi\bigr)^{\!2}}{2{\alpha r}}\Biggr]\Biggr\rangle + \frac{1-c}{2} \Biggl\langle\exp\Biggl[ -\frac{\bigl(m\zeta_1-s_0m_0x_0+\phi\bigr)^{\!2}}{2{\alpha r}}\Biggr]\Biggr\rangle \Biggr\}.
\end{equation}
Then,
\begin{align}
  &\frac{1}{\sqrt{2\pi{\alpha r}}} \Biggl\langle\exp\Biggl[ -\frac{\bigl(m\zeta_1+s_0m_0 x_0-\phi\bigr)^{\!2}}{2{\alpha r}}\Biggr]\Biggr\rangle \nonumber\\
  &\qquad= \frac{1}{\sqrt{2\pi{\alpha r}}} \sqrt\frac{s_0}{2\pi\gamma^2(1-c^2)} \Biggl\langle \int\mathrm{d} x_0\,\mathrm{e}^{-s_0(x_0-c\zeta_1)^2/2\gamma^2(1-c^2)} \mathrm{e}^{-s_0^2m_0^2(x_0+(m\zeta_1-\phi)/s_0m_0)^2/2{\alpha r}} \Biggr\rangle \nonumber\\
  &\qquad= \frac{1}{\sqrt{2\pi\bigl(s_0\gamma^2(1-c^2)m_0^2 + {\alpha r}\bigr)}} \Biggl\langle \exp\Biggl[-\frac{(m\zeta_1+s_0cm_0\zeta_1-\phi)^2}{2\bigl(s_0\gamma^2(1-c^2)m_0^2 + {\alpha r}\bigr)}\Biggr] \Biggr\rangle \nonumber\\
  &\qquad= \frac{1}{\sqrt{2\pi}\sigma_0}\cdot\frac{1}{2} \biggl\{ \mathrm{e}^{-Y_{++}^2}+\mathrm{e}^{-Y_{+-}^2} \biggr\}.
\end{align}
Thus,
\begin{equation}
  Q = \frac{\Gamma^2}{\sqrt{2\pi}\sigma_0} \biggl\{ \frac{1+c}{4} \Bigl[\mathrm{e}^{-Y_{++}^2}+\mathrm{e}^{-Y_{+-}^2}\Bigr] + \frac{1-c}{4} \Bigl[\mathrm{e}^{-Y_{-+}^2}+\mathrm{e}^{-Y_{--}^2}\Bigr] \biggr\}.
\end{equation}

Next,
\begin{equation}
  \Biggl\langle \mathop{\mathrm{erf}}\nolimits\frac{m\zeta+s_0m_0x_0-\phi}{\sqrt{2{\alpha r}}} \Biggr\rangle = \Biggl\{ \frac{1+c}{2} \Biggl\langle\mathop{\mathrm{erf}}\nolimits\frac{m\zeta_1+s_0m_0x_0-\phi}{\sqrt{2{\alpha r}}}\Biggr\rangle - \frac{1-c}{2} \Biggl\langle\mathop{\mathrm{erf}}\nolimits\frac{m\zeta_1-s_0m_0x_0+\phi}{\sqrt{2{\alpha r}}}\Biggr\rangle \Biggr\}.
\end{equation}
Then,
\begin{align}
  &\Biggl\langle\mathop{\mathrm{erf}}\nolimits\frac{m\zeta_1+s_0m_0x_0-\phi}{\sqrt{2{\alpha r}}}\Biggr\rangle \nonumber\\
  &\qquad= \sqrt\frac{s_0}{2\pi\gamma^2(1-c^2)} \Biggl\langle \int\mathrm{d} x_0\,\mathrm{e}^{-s_0(x_0 - c\zeta_1)^2/2\gamma^2(1-c^2)} \mathop{\mathrm{erf}}\nolimits\Biggl[\frac{s_0m_0}{\sqrt{2{\alpha r}}} \biggl(x_0+\frac{m\zeta_1-\phi}{s_0m_0}\biggr)\Biggr] \Biggr\rangle \nonumber\\
  &\qquad= \Biggl\langle \mathop{\mathrm{erf}}\nolimits\frac{m\zeta_1+s_0cm_0\zeta_1-\phi}{\sqrt{2\bigl(s_0\gamma^2(1-c^2)m_0^2 + {\alpha r}\bigr)}} \Biggr\rangle \nonumber\\
  &\qquad= -\frac{1}{2} \biggl\{ \mathop{\mathrm{erf}}\nolimits Y_{++}-\mathop{\mathrm{erf}}\nolimits Y_{+-} \biggr\}.
\end{align}
Thus,
\begin{equation}
  r = \frac{s\Gamma^4}{2} \cdot \frac{\bigl(1 - Q(1-\kappa^2)(1+s_0\kappa^2)\bigr)^{\!2} + s_0\kappa^4}{\bigl(1 - Q(1-\kappa^2)\bigr)^{\!2}\bigl(1 - Q(1+s_0\kappa^2)\bigr)^{\!2}} \biggl\{1 - \frac{1+c}{4} \Bigl[\mathop{\mathrm{erf}}\nolimits Y_{++}-\mathop{\mathrm{erf}}\nolimits Y_{+-}\Bigr] - \frac{1-c}{4} \Bigl[\mathop{\mathrm{erf}}\nolimits Y_{-+}-\mathop{\mathrm{erf}}\nolimits Y_{--}\Bigr] \biggr\}.
\end{equation}

Next,
\begin{equation}
  m = \frac{1}{2} \Biggl\{ \frac{1+c}{2} \Biggl\langle \zeta_1 \mathop{\mathrm{erf}}\nolimits\frac{m\zeta_1+s_0m_0x_0-\phi}{\sqrt{2{\alpha r}}}\Biggr\rangle + \frac{1-c}{2} \Biggl\langle \zeta_1 \mathop{\mathrm{erf}}\nolimits\frac{m\zeta_1-s_0m_0x_0+\phi}{\sqrt{2{\alpha r}}}\Biggr\rangle \Biggr\}.
\end{equation}
Then,
\begin{align}
  &\Biggl\langle \zeta_1 \mathop{\mathrm{erf}}\nolimits\frac{m\zeta_1+s_0m_0x_0-\phi}{\sqrt{2{\alpha r}}}\Biggr\rangle \nonumber\\
  &\qquad= \sqrt\frac{s_0}{2\pi\gamma^2(1-c^2)} \Biggl\langle \zeta_1 \int\mathrm{d} x_0\,\mathrm{e}^{-s_0(x_0 - c\zeta_1)^2/2\gamma^2(1-c^2)} \mathop{\mathrm{erf}}\nolimits\Biggl[\frac{s_0m_0}{\sqrt{2{\alpha r}}} \biggl(x_0+\frac{m\zeta_1-\phi}{s_0m_0}\biggr)\Biggr] \Biggr\rangle \nonumber\\
  &\qquad= \Biggl\langle \zeta_1 \mathop{\mathrm{erf}}\nolimits\frac{m\zeta_1+s_0cm_0\zeta_1-\phi}{\sqrt{2\bigl(s_0\gamma^2(1-c^2)m_0^2 + {\alpha r}\bigr)}} \Biggr\rangle \nonumber\\
  &\qquad= \frac{\gamma}{2} \biggl\{ \mathop{\mathrm{erf}}\nolimits Y_{++}+\mathop{\mathrm{erf}}\nolimits Y_{+-} \biggr\}.
\end{align}
Thus,
\begin{equation}
  m = \frac{\gamma}{2} \biggl\{\frac{1+c}{4} \Bigl[\mathop{\mathrm{erf}}\nolimits Y_{++}+\mathop{\mathrm{erf}}\nolimits Y_{+-}\Bigr] + \frac{1-c}{4} \Bigl[\mathop{\mathrm{erf}}\nolimits Y_{-+}+\mathop{\mathrm{erf}}\nolimits Y_{--}\Bigr] \biggr\}.
\end{equation}
Similarly,
\begin{align}
  m_1 &= \frac{1}{2} \Biggl\{ \frac{1+c}{2} \Biggl\langle \zeta_1 \mathop{\mathrm{erf}}\nolimits\frac{m\zeta_1+s_0m_0x_0-\phi}{\sqrt{2{\alpha r}}}\Biggr\rangle - \frac{1-c}{2} \Biggl\langle \zeta_1 \mathop{\mathrm{erf}}\nolimits\frac{m\zeta_1-s_0m_0x_0+\phi}{\sqrt{2{\alpha r}}}\Biggr\rangle \Biggr\} \nonumber\\
  &= \frac{\gamma}{2} \biggl\{\frac{1+c}{4} \Bigl[\mathop{\mathrm{erf}}\nolimits Y_{++}+\mathop{\mathrm{erf}}\nolimits Y_{+-}\Bigr] - \frac{1-c}{4} \Bigl[\mathop{\mathrm{erf}}\nolimits Y_{-+}+\mathop{\mathrm{erf}}\nolimits Y_{--}\Bigr] \biggr\}.
\end{align}

Finally,
\begin{equation}
  m_0 = \frac{1}{2} \Biggl\{ \frac{1+c}{2} \Biggl\langle x_0 \mathop{\mathrm{erf}}\nolimits\frac{m\zeta_1+s_0m_0x_0-\phi}{\sqrt{2{\alpha r}}}\Biggr\rangle - \frac{1-c}{2} \Biggl\langle x_0 \mathop{\mathrm{erf}}\nolimits\frac{m\zeta_1-s_0m_0x_0+\phi}{\sqrt{2{\alpha r}}}\Biggr\rangle \Biggr\}.
\end{equation}
Then,
\begin{alignat}{2}
  &\mathrlap{ \Biggl\langle x_0 \mathop{\mathrm{erf}}\nolimits\frac{m\zeta_1+s_0m_0x_0-\phi}{\sqrt{2{\alpha r}}}\Biggr\rangle }\nonumber\\
  &\qquad={}&& \mathrlap{ \sqrt\frac{s_0}{2\pi\gamma^2(1-c^2)} \Biggl\langle \int\mathrm{d} x_0\,\mathrm{e}^{-s_0(x_0-c\zeta_1)^2/2\gamma^2(1-c^2)} x_0 \mathop{\mathrm{erf}}\nolimits\Biggl[\frac{s_0m_0}{\sqrt{2{\alpha r}}} \biggl(x_0+\frac{m\zeta_1-\phi}{s_0m_0}\biggr)\Biggr] \Biggr\rangle }\nonumber\\
  &\qquad={}&& \gamma^2(1-c^2)m_0 \sqrt\frac{2}{\pi\bigl(s_0\gamma^2(1-c^2)m_0^2 + {\alpha r}\bigr)} \Biggl\langle \exp\Biggl[-\frac{(m\zeta_1+s_0cm_0\zeta_1-\phi)^2}{2\bigl(s_0\gamma^2(1-c^2)m_0^2 + {\alpha r}\bigr)}\Biggr] \Biggr\rangle \nonumber\\*
  &&&{}+ c \Biggl\langle \zeta_1 \mathop{\mathrm{erf}}\nolimits\frac{m\zeta_1+s_0cm_0\zeta_1-\phi}{\sqrt{2\bigl(s_0\gamma^2(1-c^2)m_0^2 + {\alpha r}\bigr)}} \Biggr\rangle \nonumber\\
  &\qquad={}&& \mathrlap{ \frac{\gamma^2(1-c^2)m_0}{\sqrt{2\pi}\sigma_0} \biggl\{ \mathrm{e}^{-Y_{++}^2}+\mathrm{e}^{-Y_{+-}^2} \biggr\} + \frac{\gamma c}{2} \biggl\{ \mathop{\mathrm{erf}}\nolimits Y_{++}+\mathop{\mathrm{erf}}\nolimits Y_{+-} \biggr\}. }
\end{alignat}
Thus,
\begin{align}
  m_0 &= \frac{\gamma c}{2} \biggl\{ \frac{1+c}{4} \Bigl[\mathop{\mathrm{erf}}\nolimits Y_{++}+\mathop{\mathrm{erf}}\nolimits Y_{+-}\Bigr] - \frac{1-c}{4} \Bigl[\mathop{\mathrm{erf}}\nolimits Y_{-+}+\mathop{\mathrm{erf}}\nolimits Y_{--}\Bigr] \biggr\} + Q\tfrac{\gamma^2}{\Gamma^2}(1-c^2)m_0 \nonumber\\
  &= \frac{\gamma c}{2\Bigl(1-Q\frac{\gamma^2}{\Gamma^2}(1-c^2)\Bigr)} \biggl\{ \frac{1+c}{4} \Bigl[\mathop{\mathrm{erf}}\nolimits Y_{++}+\mathop{\mathrm{erf}}\nolimits Y_{+-}\Bigr] - \frac{1-c}{4} \Bigl[\mathop{\mathrm{erf}}\nolimits Y_{-+}+\mathop{\mathrm{erf}}\nolimits Y_{--}\Bigr] \biggr\}.
\end{align}

These mean-field equations are presented in \cref{eq:overview-0}.


\subsection{Simplified mean-field equations with \texorpdfstring{$\phi = 0$}{φ = 0} and \texorpdfstring{$c^2 \ll 1$}{c² ≪ 1}}

To further simplify the equations, we make two assumptions. First, we assume $c^2 \ll 1$, and correspondingly $\kappa^2 \ll 1$ because $\kappa \leq c$. Second, we assume that $\phi = 0$. Numerically, we find that the maximum load that still admits a retrieval solution is indeed obtained at $|\phi| < 10^{-6}$ over the entire range in \cref{fig:0}. Theoretically, we will see that $Q \ll 1$, which means $\phi \approx \theta$, the true threshold of the network. For dense patterns in the classic Hopfield network, retrieval is maximized at $\theta = 0$ \cite{Weisbuch.1985}. We also assume $s \gg 1$, so $s_0 \approx s$, and perform the rescalings
\begin{align}
  m &= \frac{\gamma}{2} \cdot m', \nonumber\\
  m_0 &= \frac{\gamma}{2} \cdot m'_0, \nonumber\\
  r &= \frac{s\Gamma^4}{2} \cdot r', \nonumber\\
  \alpha &= \frac{\gamma^4}{2\Gamma^4} \cdot \alpha'.
  \label{eq:0-rescale}
\end{align}

The mean-field equations then become
\begin{align}
  m' &= \frac{1+c}{2} \mathop{\mathrm{erf}}\nolimits Y'_+ + \frac{1-c}{2} \mathop{\mathrm{erf}}\nolimits Y'_-, \nonumber\\
  m'_0 &= \frac{c}{1-Q\frac{\gamma^2}{\Gamma^2}} \biggl\{ \frac{1+c}{2} \mathop{\mathrm{erf}}\nolimits Y'_+ - \frac{1-c}{2} \mathop{\mathrm{erf}}\nolimits Y'_- \biggr\}, \nonumber\\
  r' &= \frac{1}{(1 - Q)^2}, \nonumber\\
  Q &= \sqrt\frac{2}{\pi} \frac{\Gamma^2}{\sigma'_0\gamma^2} \biggl\{ \frac{1+c}{2} \mathrm{e}^{-(Y'_+)^2} + \frac{1-c}{2} \mathrm{e}^{-(Y'_-)^2} \biggr\},
\end{align}
where
\begin{align}
  {\sigma_0'}^{\!2} &\equiv s\bigl({m'_0}^{\!2} + {\alpha' r'}\bigr) \nonumber\\
  Y'_\pm &\equiv \frac{m'}{\sqrt{2}\sigma'_0} \biggl(1 \pm \frac{scm'_0}{m'}\biggr).
\end{align}

If we desire $m' \approx 1$, which corresponds to retrieval of the dense example, we need $Y'_\pm \gg 1$. This means that $m'_0 \approx c^2$, if $Q \ll 1$. So, we replace $m'_0/m'$ by $c^2$ in $Y'_\pm$. We also define
\begin{equation}
  y \equiv \frac{m'}{\sigma'_0}.
\end{equation}
Furthermore, for $m' \approx 1$, we need $sc^3$ to be small, so we expand in it. We also need $y \rightarrow \infty$, so we can expand asymptotically $\erf\frac{y}{\sqrt{2}} \approx 1 - \sqrt{\tfrac{2}{\pi}}\tfrac{1}{y}\ee^{-y^2/2}$. These simplifications yield
\begin{align}
  m' &= 1 - \frac{1}{y^2}\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy, \nonumber\\
  m'_0 &= \frac{c^2}{1-Q\frac{\gamma^2}{\Gamma^2}} \Biggl\{ 1 - \biggl(\frac{1}{y^2}-sc^2\biggr) \sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy \Biggr\}, \nonumber\\
  \alpha' &= \biggl(\frac{{m'}^2}{sy^2} - {m'_0}^{\!2}\biggr) (1 - Q)^2, \nonumber\\
  Q &= \frac{\Gamma^2}{\gamma^2m'} \biggl(1 - sc^4y^2 + \frac{1}{2}s^2c^6y^2(y^2-1)\biggr) \sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy.
  \label{eq:0-simple}
\end{align}


\subsection{Capacity formula}

In \cref{eq:0-simple}, we substitute formulas for $m'$, $m'_0$, and $Q$ into the equation for $\alpha'$ and keep only leading terms in $1/y$ and $c$. After much simplification, we obtain
\begin{equation}
  s(\alpha' + c^4) = \frac{1}{y^2} - \frac{\Gamma^2}{\gamma^2} \biggl(\frac{1}{y} + \frac{1}{2}s^2c^6y^3\biggr) \sqrt\frac{8}{\pi} \ee^{-y^2/2}.
  \label{eq:0-y}
\end{equation}

Now, we make major simplifications for mathematical tractability. They are justified by comparing the resulting formula with numerical analysis of the full mean-field equations [\cref{fig:0}]. At the critical value of $s$ above which this equation cannot be satisfied by any $y$, the derivative of \cref{eq:0-y} with respect to $y$ must also hold. We ignore the term proportional to $s^2$ and eliminate higher orders in $1/y$ to obtain
\begin{equation}
  0 = -\frac{2}{y^3} + \frac{\Gamma^2}{\gamma^2} \sqrt\frac{8}{\pi} \ee^{-y^2/2}.
  \label{eq:0-dy}
\end{equation}
Solving for $y$, we obtain
\begin{equation}
  y = \sqrt{-3 W_{-1}\Bigl[-\tfrac{1}{3} \bigl(\tfrac{\pi}{2}\bigr)^{\!1/3} \bigl(\tfrac{\gamma}{\Gamma}\bigr)^{\!4/3} \Bigr]} \approx \sqrt{3 \log \Bigl[3\bigl(\tfrac{2}{\pi}\bigr)^{\!1/3} \bigl(\tfrac{\Gamma}{\gamma}\bigr)^{\!4/3} \Bigr]},
\end{equation}
where $W_{-1}$ is the negative branch of the Lambert $W$ function, which is also known as the product logarithm. Since this function involves a logarithm, it varies very slowly as a function of $\gamma/\Gamma$. For $\gamma = 0.1$ and $a$ between $0.001$ and $0.1$, this expression for $y$ ranges from $1.7$ to $3.3$.
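The quoted range is easy to reproduce; a short sketch (near $a = 0.001$ the argument of $W_{-1}$ approaches the branch point $-1/\mathrm{e}$, where $y \rightarrow \sqrt{3} \approx 1.7$):
\begin{verbatim}
# Evaluate y across sparsities for gamma = 0.1.
import numpy as np
from scipy.special import lambertw

gamma = 0.1
for a in (0.003, 0.01, 0.03, 0.1):
    Gamma2 = (1 - 2 * gamma)**2 * a + gamma**2
    arg = -(np.pi / 2)**(1 / 3) * (gamma**2 / Gamma2)**(2 / 3) / 3
    print(a, np.sqrt(-3 * lambertw(arg, -1).real))  # y from about 2 to 3.3
\end{verbatim}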

We can use \cref{eq:0-dy} to simplify \cref{eq:0-y} and solve for the critical $s$:
\begin{equation}
  s_\textrm{c}(\alpha' + c^4) = \frac{1}{y^2} - s_\textrm{c}^2c^6.
\end{equation}
This gives
\begin{equation}
  s_\textrm{c} = \frac{\sqrt{(\alpha'+c^4)^2 + \frac{4}{y^2} c^6} - (\alpha'+c^4)}{2c^6}.
\end{equation}
To heuristically obtain a simpler equation, we note that $s_\textrm{c} \rightarrow 1/yc^3$ when $\alpha' \rightarrow 0$ and $s_\textrm{c} \rightarrow 1/y^2\alpha'$ when $\alpha' \rightarrow \infty$. We simply capture both these behaviors with 
\begin{equation}
  s_\textrm{c} = \frac{1}{yc^3 + y^2 \alpha'}.
\end{equation}
If we set $y = 3$ and convert $\alpha'$ back to $\alpha$ with \cref{eq:0-rescale}, we obtain \cref{eq:results-0}.
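The quality of this interpolation can be gauged against the exact root of the quadratic; a brief sketch with illustrative values:
\begin{verbatim}
# Exact quadratic root for s_c versus the interpolation formula.
import numpy as np

y, c = 3.0, 0.2
for ap in (1e-4, 1e-3, 1e-2, 1e-1):
    b = ap + c**4
    exact = (np.sqrt(b**2 + 4 * c**6 / y**2) - b) / (2 * c**6)
    interp = 1 / (y * c**3 + y**2 * ap)
    print(ap, exact, interp)       # same order of magnitude throughout
\end{verbatim}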



\section{\label{sec:dc}Critical load for dense concepts}

\subsection{Dense symmetric mean-field equations}

For dense patterns,
\begin{align}
  \zeta_{1\nu} &=
      \begin{cases}  \zeta_1 & \textrm{with probability } \frac{1+c}{2} \\
                    -\zeta_1 & \textrm{with probability } \frac{1-c}{2} \end{cases} \nonumber\\
  \zeta_1 &=
      \begin{cases}  \gamma & \textrm{with probability } \frac{1}{2}  \\
                    -\gamma & \textrm{with probability } \frac{1}{2}. \end{cases}
\end{align}
To help us in our calculations, we note
\begin{align}
  \int_{-\infty}^\infty \mathrm{d} x\,\mathrm{e}^{-(x-a)^2/\rho^2} \mathrm{e}^{-(x-b)^2/\sigma^2} &= \sqrt\frac{\pi}{\rho^{-2}+\sigma^{-2}} \exp\biggl[ -\frac{(a-b)^2}{\rho^2+\sigma^2} \biggr], \nonumber\\
  \int_{-\infty}^\infty \mathrm{d} x\,\mathrm{e}^{-(x-a)^2/\rho^2} \mathop{\mathrm{erf}}\nolimits\biggl[ \frac{x-b}{\sigma} \biggr] &= \sqrt\pi \rho \mathop{\mathrm{erf}}\nolimits\biggl[ \frac{a-b}{\sqrt{\rho^2+\sigma^2}} \biggr], \nonumber\\
  \int_{-\infty}^\infty \mathrm{d} x\,\mathrm{e}^{-(x-a)^2/\rho^2} x \mathop{\mathrm{erf}}\nolimits\biggl[ \frac{x-b}{\sigma} \biggr] &= \rho\,\Biggl\{ \frac{\rho^2}{\sqrt{\rho^2+\sigma^2}} \exp\biggl[ -\frac{(a-b)^2}{\rho^2+\sigma^2} \biggr] + \sqrt\pi a \mathop{\mathrm{erf}}\nolimits\biggl[ \frac{a-b}{\sqrt{\rho^2+\sigma^2}} \biggr] \Biggr\}.
\end{align}

We then consider the possibility that the network overlaps equally with all example patterns $\zeta_{1\nu}$. In that case,
\begin{equation}
  \sum_\nu m_{1\nu}\zeta_{1\nu} = sm_\textrm{s} x_\textrm{s}.
\end{equation}
The average over examples $ x_\textrm{s}$ follows a binomial distribution, which in the large $s$ limit can be described by a Gaussian random variable with mean $c\zeta_1$ and variance $\gamma^2(1-c^2)/s$. Thus,
\begin{align}
  m_\textrm{s} &= \frac{1}{2}\Biggl\langle  x_\textrm{s} \mathop{\mathrm{erf}}\nolimits\frac{sm_\textrm{s} x_\textrm{s}-\phi}{\sqrt{2{\alpha r}}}\Biggr\rangle, \nonumber\\
  m_1 &= \frac{1}{2}\Biggl\langle \zeta_1 \mathop{\mathrm{erf}}\nolimits\frac{sm_\textrm{s} x_\textrm{s}-\phi}{\sqrt{2{\alpha r}}}\Biggr\rangle, \nonumber\\
  r &= \frac{s\Gamma^4}{2} \cdot \frac{\bigl(1 - Q(1-\kappa^2)(1+s_0\kappa^2)\bigr)^{\!2} + s_0\kappa^4}{\bigl(1 - Q(1-\kappa^2)\bigr)^{\!2}\bigl(1 - Q(1+s_0\kappa^2)\bigr)^{\!2}} \Biggl[1 + \Biggl\langle \mathop{\mathrm{erf}}\nolimits\frac{sm_\textrm{s} x_\textrm{s}-\phi}{\sqrt{2{\alpha r}}} \Biggr\rangle \Biggr], \nonumber\\
  Q &= \frac{\Gamma^2}{\sqrt{2\pi{\alpha r}}} \Biggl\langle\exp\Biggl[ -\frac{\bigl(sm_\textrm{s} x_\textrm{s}-\phi\bigr)^{\!2}}{2{\alpha r}}\Biggr]\Biggr\rangle.
\end{align}
To aid us later, we will define
\begin{align}
  \sigma_\textrm{s}^2 &\equiv s\gamma^2(1-c^2)m_\textrm{s}^2 + {\alpha r}, \nonumber\\
  Y_\pm &\equiv \frac{s\gamma cm_\textrm{s} \pm \phi}{\sqrt{2}\sigma_\textrm{s}} = \frac{s\gamma cm_\textrm{s} \pm \phi}{\sqrt{2\bigl(s\gamma^2(1-c^2)m_\textrm{s}^2 + {\alpha r}\bigr)}}.
\end{align}

By performing the averages successively over $ x_\textrm{s}$ and $\zeta_1$, we find
\begin{align}
  Q &= \frac{\Gamma^2}{\sqrt{2\pi{\alpha r}}} \Biggl\langle\exp\Biggl[ -\frac{\bigl(sm_\textrm{s} x_\textrm{s}-\phi\bigr)^{\!2}}{2{\alpha r}}\Biggr]\Biggr\rangle \nonumber\\
  &= \frac{\Gamma^2}{\sqrt{2\pi{\alpha r}}} \sqrt\frac{s}{2\pi\gamma^2(1-c^2)} \Biggl\langle \int\mathrm{d} x_\textrm{s}\,\mathrm{e}^{-s( x_\textrm{s}-c\zeta_1)^2/2\gamma^2(1-c^2)} \mathrm{e}^{-s^2m_\textrm{s}^2( x_\textrm{s}-\phi/sm_\textrm{s})^2/2{\alpha r}} \Biggr\rangle \nonumber\\
  &= \frac{\Gamma^2}{\sqrt{2\pi\bigl(s\gamma^2(1-c^2)m_\textrm{s}^2 + {\alpha r}\bigr)}} \Biggl\langle \exp\Biggl[-\frac{(scm_\textrm{s}\zeta_1-\phi)^2}{2\bigl(s\gamma^2(1-c^2)m_\textrm{s}^2 + {\alpha r}\bigr)}\Biggr] \Biggr\rangle \nonumber\\
  &= \frac{\Gamma^2}{\sqrt{8\pi}\sigma_\textrm{s}} \biggl\{ \mathrm{e}^{-Y_+^2}+\mathrm{e}^{-Y_-^2} \biggr\}.
\end{align}

Next,
\begin{align}
  &\Biggl\langle \mathop{\mathrm{erf}}\nolimits\frac{sm_\textrm{s} x_\textrm{s}-\phi}{\sqrt{2{\alpha r}}} \Biggr\rangle \nonumber\\
  &\qquad= \sqrt\frac{s}{2\pi\gamma^2(1-c^2)} \Biggl\langle \int\mathrm{d} x_\textrm{s}\,\mathrm{e}^{-s( x_\textrm{s} - c\zeta_1)^2/2\gamma^2(1-c^2)} \mathop{\mathrm{erf}}\nolimits\Biggl[\frac{sm_\textrm{s}}{\sqrt{2{\alpha r}}} \biggl( x_\textrm{s}-\frac{\phi}{sm_\textrm{s}}\biggr)\Biggr] \Biggr\rangle \nonumber\\
  &\qquad= \Biggl\langle \mathop{\mathrm{erf}}\nolimits\frac{scm_\textrm{s}\zeta_1-\phi}{\sqrt{2\bigl(s\gamma^2(1-c^2)m_\textrm{s}^2 + {\alpha r}\bigr)}} \Biggr\rangle \nonumber\\
  &\qquad= -\frac{1}{2} \biggl\{ \mathop{\mathrm{erf}}\nolimits Y_+-\mathop{\mathrm{erf}}\nolimits Y_- \biggr\}.
\end{align}
Thus,
\begin{equation}
  r = \frac{s\Gamma^4}{2} \cdot \frac{\bigl(1 - Q(1-\kappa^2)(1+s_0\kappa^2)\bigr)^{\!2} + s_0\kappa^4}{\bigl(1 - Q(1-\kappa^2)\bigr)^{\!2}\bigl(1 - Q(1+s_0\kappa^2)\bigr)^{\!2}} \biggl\{ 1 - \frac{1}{2} \Bigl[\mathop{\mathrm{erf}}\nolimits Y_+-\mathop{\mathrm{erf}}\nolimits Y_-\Bigr] \biggr\}.
\end{equation}

Next,
\begin{align}
  m_1 &= \frac{1}{2} \Biggl\langle \zeta_1 \mathop{\mathrm{erf}}\nolimits\frac{sm_\textrm{s} x_\textrm{s}-\phi}{\sqrt{2{\alpha r}}}\Biggr\rangle \nonumber\\
  &= \frac{1}{2}\sqrt\frac{s}{2\pi\gamma^2(1-c^2)} \Biggl\langle \zeta_1 \int\mathrm{d} x_\textrm{s}\,\mathrm{e}^{-s( x_\textrm{s} - c\zeta_1)^2/2\gamma^2(1-c^2)} \mathop{\mathrm{erf}}\nolimits\Biggl[\frac{sm_\textrm{s}}{\sqrt{2{\alpha r}}} \biggl( x_\textrm{s}-\frac{\phi}{sm_\textrm{s}}\biggr)\Biggr] \Biggr\rangle \nonumber\\
  &= \frac{1}{2}\Biggl\langle \zeta_1 \mathop{\mathrm{erf}}\nolimits\frac{scm_\textrm{s}\zeta_1-\phi}{\sqrt{2\bigl(s\gamma^2(1-c^2)m_\textrm{s}^2 + {\alpha r}\bigr)}} \Biggr\rangle \nonumber\\
  &= \frac{\gamma}{4} \biggl\{ \mathop{\mathrm{erf}}\nolimits Y_++\mathop{\mathrm{erf}}\nolimits Y_- \biggr\}.
\end{align}

Finally,
\begin{alignat}{2}
m_\textrm{s} &={}&& \frac{1}{2} \Biggl\langle  x_\textrm{s} \mathop{\mathrm{erf}}\nolimits\frac{sm_\textrm{s} x_\textrm{s}-\phi}{\sqrt{2{\alpha r}}}\Biggr\rangle \nonumber\\
  &={}&& \frac{1}{2} \sqrt\frac{s}{2\pi\gamma^2(1-c^2)} \Biggl\langle \int\mathrm{d}  x_\textrm{s}\,\mathrm{e}^{-s( x_\textrm{s}-c\zeta_1)^2/2\gamma^2(1-c^2)}  x_\textrm{s} \mathop{\mathrm{erf}}\nolimits\Biggl[\frac{sm_\textrm{s}}{\sqrt{2{\alpha r}}} \biggl( x_\textrm{s}-\frac{\phi}{sm_\textrm{s}}\biggr)\Biggr] \Biggr\rangle \nonumber\\
  &={}&& \frac{\gamma^2(1-c^2)m_\textrm{s}}{2} \sqrt\frac{2}{\pi\bigl(s\gamma^2(1-c^2)m_\textrm{s}^2 + {\alpha r}\bigr)} \Biggl\langle \exp\Biggl[-\frac{(scm_\textrm{s}\zeta_1-\phi)^2}{2\bigl(s\gamma^2(1-c^2)m_\textrm{s}^2 + {\alpha r}\bigr)}\Biggr] \Biggr\rangle \nonumber\\*
  &&&{}+ \frac{c}{2} \Biggl\langle \zeta_1 \mathop{\mathrm{erf}}\nolimits\frac{scm_\textrm{s}\zeta_1-\phi}{\sqrt{2\bigl(s\gamma^2(1-c^2)m_\textrm{s}^2 + {\alpha r}\bigr)}} \Biggr\rangle \nonumber\\
  &={}&& Q\tfrac{\gamma^2}{\Gamma^2}(1-c^2)m_\textrm{s} + \frac{\gamma c}{4} \biggl\{ \mathop{\mathrm{erf}}\nolimits Y_++\mathop{\mathrm{erf}}\nolimits Y_- \biggr\} \nonumber\\
  &={}&& \frac{\gamma c}{4\Bigl(1-Q\frac{\gamma^2}{\Gamma^2}(1-c^2)\Bigr)} \biggl\{ \mathop{\mathrm{erf}}\nolimits Y_++\mathop{\mathrm{erf}}\nolimits Y_- \biggr\}.
\end{alignat}

These mean-field equations are presented in \cref{eq:overview-S}.


\subsection{Simplified mean-field equations with \texorpdfstring{$\phi = 0$}{φ = 0} and \texorpdfstring{$c^2 \ll 1$}{c² ≪ 1}}

To further simplify the equations, we make two assumptions. First, we assume $c^2 \ll 1$, and correspondingly $\kappa^2 \ll 1$ because $\kappa \leq c$. Second, we assume that $\phi = 0$. Empirically, as shown in \cref{fig:S-extra}(a), we find that the capacity obtained with $\phi = 0$ is very close to that obtained by maximizing over $\phi$.

\begin{figure*}[h!]
  \centering
  \includegraphics{S-extra}
  \caption{
    \label{fig:S-extra}
    (a) Capacity $s_\textrm{c}$ for dense concepts obtained through numerical analysis of \cref{eq:overview-S}. We either set $\phi = 0$ (dark, thin lines) or maximize over $\phi$ (light, thick lines).
    (b) Right-hand side of \cref{eq:S-y} and its terms plotted separately.
    (c) $y$ as a function of $\alpha$ for sparsity $a = 0$ obtained by numerically solving \cref{eq:Sa-alpha}.
  }
\end{figure*}

We also assume $s \gg 1$, so $s_0 \approx s$, and perform the rescalings
\begin{align}
  m_\textrm{s} &= \frac{\gamma c}{2} \cdot m'_\textrm{s}, \nonumber\\
  r &= \frac{s\Gamma^4}{2} \cdot m'_\textrm{s} r', \nonumber\\
  \alpha &= \frac{\gamma^4c^2}{2\Gamma^4} \cdot \alpha'.
  \label{eq:S-rescale}
\end{align}
The mean-field equations then become
\begin{align}
  m'_\textrm{s} - Qm'_\textrm{s} \frac{\gamma^2}{\Gamma^2} &= \mathop{\mathrm{erf}}\nolimits\sqrt\frac{sc^2}{2(1+{\alpha' r'})}, \nonumber\\
  r' &= \frac{\bigl(m'_\textrm{s} - Qm'_\textrm{s}(1+s\kappa^2)\bigr)^{\!2} + (m'_\textrm{s})^2s\kappa^4}{\bigl(m'_\textrm{s} - Qm'_\textrm{s}\bigr)^{\!2}\bigl(m'_\textrm{s} - Qm'_\textrm{s}(1+s\kappa^2)\bigr)^{\!2}}, \nonumber\\
  Qm'_\textrm{s} &= \frac{\Gamma^2}{\gamma^2} \sqrt\frac{2}{\pi sc^2(1+{\alpha' r'})} \exp\biggl[-\frac{sc^2}{2(1+{\alpha' r'})}\biggr].
\end{align}
We define
\begin{equation}
  y \equiv \sqrt\frac{sc^2}{1+{\alpha' r'}},
\end{equation}
so the equations are now
\begin{align}
  m'_\textrm{s} - Qm'_\textrm{s} \frac{\gamma^2}{\Gamma^2} &= \erf\frac{y}{\sqrt{2}}, \nonumber\\
  \frac{1}{\alpha'}\biggl(\frac{sc^2}{y^2} - 1\biggr) &= \frac{\bigl(m'_\textrm{s} - Qm'_\textrm{s}(1+s\kappa^2)\bigr)^{\!2} + (m'_\textrm{s})^2s\kappa^4}{\bigl(m'_\textrm{s} - Qm'_\textrm{s}\bigr)^{\!2}\bigl(m'_\textrm{s} - Qm'_\textrm{s}(1+s\kappa^2)\bigr)^{\!2}}, \nonumber\\
  Qm'_\textrm{s} &= \frac{\Gamma^2}{\gamma^2sc^2}\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy.
  \label{eq:S-simple}
\end{align}

We can now substitute expressions for $m'_\textrm{s}$ and $Qm'_\textrm{s}$ into the second equation of \cref{eq:S-simple} to obtain a single equation
\begin{equation}
  0 = \frac{1}{\alpha'}\biggl(\frac{sc^2}{y^2}-1\biggr) \biggl[{\textstyle\erf\frac{y}{\sqrt{2}}}-\frac{(1-2\gamma)^2a}{\gamma^2sc^2}{\textstyle\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy}\biggr]^2 - 1 - s\kappa^4 \frac{\biggl[\erf\frac{y}{\sqrt{2}}+\frac{1}{sc^2}\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy\biggr]^2}{\biggl[\erf\frac{y}{\sqrt{2}}-\Bigl(\frac{(1-2\gamma)^2a}{\gamma^2sc^2} + 1\Bigr)\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy\biggr]^2}.
  \label{eq:S-y}
\end{equation}
The capacity $s_\textrm{c}$ is obtained when this equation and its derivative with respect to $y$ are simultaneously satisfied.
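As an illustration, this tangency condition can be implemented numerically. The following Python sketch is ours, not the code used for the figures; the starting points passed to \texttt{fsolve} are assumed values.
\begin{verbatim}
# Minimal sketch: locate the capacity from Eq. (S-y) by requiring the
# equation and its y-derivative to vanish simultaneously.
# Starting points y0, s0 are illustrative assumptions.
import numpy as np
from scipy.optimize import fsolve
from scipy.special import erf

def F(y, s, alpha_p, a, gamma, c):
    g = np.sqrt(2/np.pi)*y*np.exp(-y**2/2)   # sqrt(2/pi) y exp(-y^2/2)
    e = erf(y/np.sqrt(2))
    kappa2 = gamma**2*c**2/((1 - 2*gamma)**2*a + gamma**2)
    u = (1 - 2*gamma)**2*a/(gamma**2*s*c**2)
    return ((s*c**2/y**2 - 1)*(e - u*g)**2/alpha_p - 1
            - s*kappa2**2*(e + g/(s*c**2))**2/(e - (u + 1)*g)**2)

def capacity(alpha_p, a, gamma, c, y0=1.0, s0=100.0, dy=1e-6):
    def eqs(v):
        y, s = v
        dF = (F(y + dy, s, alpha_p, a, gamma, c)
              - F(y - dy, s, alpha_p, a, gamma, c))/(2*dy)
        return [F(y, s, alpha_p, a, gamma, c), dF]
    y, s = fsolve(eqs, [y0, s0])
    return s
\end{verbatim}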


\subsection{Capacity formula for \texorpdfstring{$a \gtrsim \gamma^2$}{a ≳ γ²}}

We now perform major simplifications to obtain a formula for $s_\textrm{c}$. In \cref{fig:S-extra}(b), we plot the right-hand side (RHS) of \cref{eq:S-y}, along with its first two terms and its third term separately. We see that the first two terms generally capture the behavior of the RHS. The third term contributes a pole, which approximately sets the position of the local maximum of the RHS at the largest $y$.

Thus, we use the first two terms to satisfy \cref{eq:S-y} and the denominator of the third term to satisfy the derivative of the equation:
\begin{align}
  \alpha' &= \biggl(\frac{sc^2}{y^2}-1\biggr) \biggl[\erf\frac{y}{\sqrt{2}}-\frac{(1-2\gamma)^2a}{\gamma^2sc^2}\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy\biggr]^2 \nonumber\\
  0 &= \erf\frac{y}{\sqrt{2}} - \biggl(\frac{(1-2\gamma)^2a}{\gamma^2sc^2} + 1\biggr)\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy.
\end{align}
We can manipulate these equations to obtain \cref{eq:results-S1} if we convert $\alpha'$ back to $\alpha$ with \cref{eq:S-rescale}.


\subsection{Capacity formula for \texorpdfstring{$a \ll \gamma^2$}{a ≪ γ²}}

For $a \rightarrow 0$, the location of the pole in \cref{eq:S-y} approaches $y = 0$. This does not correspond to a retrieval solution according to \cref{eq:S-simple} because $m'_\textrm{s} \sim 1$ requires $y \gtrsim 1$. To obtain a capacity formula in this case, we set $a = 0$ and rearrange \cref{eq:S-y} to obtain
\begin{equation}
  0 = \frac{1}{\alpha'}\biggl(\frac{sc^2}{y^2}-1\biggr)\biggl[\erf\frac{y}{\sqrt{2}}\biggr]^2 \biggl[\erf\frac{y}{\sqrt{2}}-\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy\biggr]^2 - \biggl[\erf\frac{y}{\sqrt{2}}-\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy\biggr]^2 - sc^4 \biggl[\erf\frac{y}{\sqrt{2}} + \frac{1}{sc^2}\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy\biggr]^2.
\end{equation}
We then calculate its derivative with respect to $y$, which must also vanish when the network is at capacity, and manipulate these two equations to obtain
\begin{align}
  \alpha' &= \frac{\Bigl(\frac{sc^2}{y^2}-1\Bigr) \bigl[\erf\frac{y}{\sqrt{2}}\bigr]^2 \Bigl[\erf\frac{y}{\sqrt{2}}-\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy\Bigr]^2}{\Bigl[\erf\frac{y}{\sqrt{2}}-\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy\Bigr]^2 + sc^4 \Bigl[\erf\frac{y}{\sqrt{2}} + \frac{1}{sc^2}\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy\Bigr]^2} \nonumber\\
  \alpha'c^2 &= \frac{\erf\frac{y}{\sqrt{2}} \biggl\{\frac{sc^2}{y^2}\Bigl[\erf\frac{y}{\sqrt{2}}-\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy\Bigr] + \sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy\biggr\} \Bigl[\erf\frac{y}{\sqrt{2}}-\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy\Bigr]^3}{(sc^2+1)\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy \Bigl[\erf\frac{y}{\sqrt{2}} + \frac{1}{sc^2}\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy\Bigr] \Bigl[(y^2-1)\erf\frac{y}{\sqrt{2}} + \sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy\Bigr]}.
  \label{eq:Sa-alpha}
\end{align}

\Cref{eq:Sa-alpha} can be solved numerically, eliminating $s$, to obtain $y$ as a function of $\alpha'c^2/2$, which equals $\alpha$ when $a = 0$ [\cref{eq:S-rescale}]. We notice that for small $\alpha \sim 10^{-4}$, $y \sim 0.5$ [\cref{fig:S-extra}(c)]. To then find a formula for $s_\textrm{c}$, we boldly expand \cref{eq:Sa-alpha} in leading powers of $y$, but we preserve extra powers of $c^4$:
\begin{align}
  \alpha' &\approx \frac{2sc^2y^4 \bigl(3sc^2 - (3+sc^2)y^2\bigr)}{3\pi\bigl(sc^2y^4 + 9c^2(1+sc^2)^2\bigr)} \nonumber\\
  \alpha'c^2 &\approx \frac{sc^2 (3+sc^2) y^6}{27\pi(1+sc^2)^2}.
\end{align}
Equating these two expressions for $\alpha'$, we get
\begin{equation}
  sc^2(3+sc^2)y^6 + 27(1+sc^2)^2(3+sc^2)c^2y^2 = 54sc^2(1+sc^2)^2c^2.
\end{equation}
We can solve this equation for $y$ using the cubic formula to obtain
\begin{equation}
  y^2 = \bigl(\sqrt{A^3+B^2}+B\bigr)^{1/3} - \bigl(\sqrt{A^3+B^2}-B\bigr)^{1/3}, \quad\textrm{ where }\quad A = \frac{9(1+sc^2)^2c^2}{sc^2}, \quad B = \frac{27(1+sc^2)^2c^2}{3+sc^2}.
\end{equation}
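As a sanity check, the following short Python sketch (ours; the values of $s$ and $c$ are assumed for illustration) verifies that this closed-form root satisfies the cubic above:
\begin{verbatim}
# Sketch: check that the closed-form y^2 solves the cubic (assumed s, c).
import numpy as np

s, c = 100.0, 0.05
A = 9*(1 + s*c**2)**2*c**2/(s*c**2)
B = 27*(1 + s*c**2)**2*c**2/(3 + s*c**2)
D = np.sqrt(A**3 + B**2)
y2 = (D + B)**(1/3) - (D - B)**(1/3)     # y^2 from the cubic formula
res = (s*c**2*(3 + s*c**2)*y2**3
       + 27*(1 + s*c**2)**2*(3 + s*c**2)*c**2*y2
       - 54*s*c**2*(1 + s*c**2)**2*c**2)
print(y2, res)                           # res vanishes to machine precision
\end{verbatim}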

We then have an equation for $\alpha'$ in terms of $s$:
\begin{align}
  \alpha' &= \frac{sc^2 (3+sc^2) y^6}{27\pi(1+sc^2)^2c^2} \nonumber\\
  &= \frac{sc^2}{\pi B} \Bigl\{ 2B - 3A\Bigl[ \bigl(\sqrt{A^3+B^2}+B\bigr)^{1/3} - \bigl(\sqrt{A^3+B^2}-B\bigr)^{1/3} \Bigr] \Bigr\}.
\end{align}
Finally, we can solve for $s$ as a series in $\alpha'$. We keep only the leading term in $\alpha'$ and the leading term in $c$ to obtain
\begin{equation}
  s_\textrm{c} \approx 3 \biggl(\frac{3\pi}{8}\biggr)^{\!\!1/4} \frac{(\alpha')^{1/4}}{c^{3/2}} + \frac{3\pi}{8} \frac{\alpha'}{c^2}.
\end{equation}
This yields \cref{eq:results-S2} if we convert $\alpha'$ back to $\alpha$ with \cref{eq:S-rescale}.


\clearpage
\end{widetext}

\section{Introduction}

Autoassociation is the ability of a neural network to store patterns of activity and to retrieve complete patterns when presented with incomplete cues. Autoassociative networks are widely used as models for neural phenomena, such as episodic memory \cite{McNaughton.1987, O'Reilly.2001, Rolls.20069yq}, and also have applications in machine learning \cite{Hopfield.1985, Barra.2012}. It is well-known that properties of the stored patterns can influence the computational capabilities of the network. Sparse patterns, in which a small fraction of the neurons are active, can be stored at higher capacity than dense patterns \cite{Tsodyks.1988}. Correlated patterns can be merged by the network to represent shared features \cite{Fontanari.1990}. Previous autoassociative models have largely considered the storage of patterns with a single set of statistics.

We consider the possibility that an autoassociative network can store two types of patterns with different statistics, and thus, different computational roles. This idea is inspired by the architecture of the hippocampus in mammalian brains, in which CA3, the presumptive autoassociative network that stores memories of our daily experiences, receives sensory information from two parallel pathways with complementary properties \cite{Amaral.2006}. The first pathway, called the mossy fibers, presents sparser, decorrelated patterns, and the second, called the perforant path, presents denser, correlated patterns to CA3 for storage. Both pathways originate from the same upstream network, the entorhinal cortex, so they presumably represent the same sensory experiences.

We implement these ideas with a Hopfield-like model \cite{Hopfield.1982} that stores memories, each of which is an example $\nu$ of a concept $\mu$. Each example is encoded as both a sparse pattern $\ve\xi_{\mu\nu}$ and a dense pattern $\ve\psi_{\mu\nu}$. The former exhibits no correlation with other sparsely encoded examples. The latter is generated from a dense encoding $\ve\psi_\mu$ of the concept $\mu$, and correlations exist among examples within each concept. The model is defined in \cref{sec:model}, along with an outline of the derivation of its mean-field equations. In \cref{sec:capacities}, we present our major results regarding the capacity of each type of pattern. The network can retrieve both sparse and dense patterns by tuning its activity threshold between a high value for the former, which are also more strongly stored, and a low value for the latter. The network has a high capacity for sparse examples and a low capacity for dense examples. As the number of examples stored increases beyond the dense example capacity, a critical load is reached at which the network starts retrieving dense concepts, which were never directly presented to the network. This critical load can be smaller than the sparse example capacity, which means that the network can represent distinct memories and generalizations across them.

In \cref{sec:hetero}, we show that our simple Hebbian learning rule allows for heteroassociation between sparse and dense encodings. The ability to retrieve sparse examples from dense concepts and vice versa depends critically on the relative strength with which the dense patterns are stored. Adjusting this strength changes the relative energies of sparse examples and dense concepts, which can predict if heteroassociation is possible. We discuss our results and their significance in \cref{sec:discussion}. Finally, mean-field equations describing the network overlap with various target (condensed) patterns and off-target (uncondensed) patterns are explicitly derived in \cref{sec:mean}, drawing substantial inspiration from Refs.~\onlinecite{Tsodyks.1988} and \onlinecite{Fontanari.1990}. The subsequent derivations of capacity and overlap formulas are provided in \cref{sec:se,sec:de,sec:dc}.



\section{\label{sec:model}The model}

\subsection{Patterns and architecture}

Consider a Hopfield network with neurons $i = 1, \ldots, N$ that can be either inactive ($S_i = 0$) or active ($S_i = 1$). The network stores $\nu = 1, \ldots, s$ examples for each of $\mu = 1, \ldots, p = \alpha N$ concepts. Examples are encoded both sparsely as $\ve\xi_{\mu\nu}$ and densely as $\ve\psi_{\mu\nu}$. Following Ref.~\onlinecite{Tsodyks.1988}, sparse examples are generated independently with sparsity $a$:
\begin{equation}
  \xi^i_{\mu\nu} =
    \begin{cases} 0 & \textrm{with probability } 1-a \\
                  1 & \textrm{with probability } a.  \end{cases}
\end{equation}
Following Ref.~\onlinecite{Fontanari.1990}, dense examples within a concept are correlated in the following way. Each concept corresponds to a dense pattern $\ve\psi_\mu$, generated independently with sparsity $\frac{1}{2}$:
\begin{equation}
  \psi^i_\mu =
      \begin{cases} 0 & \textrm{with probability } \frac{1}{2}  \\
                    1 & \textrm{with probability } \frac{1}{2}. \end{cases}
\end{equation}
Dense examples are then generated from these concepts, with the correlation parameter $c > 0$ controlling the likelihood that example patterns match their concept:
\begin{equation}
  \psi^i_{\mu\nu} =
    \begin{cases}   \psi^i_\mu & \textrm{with probability } \frac{1+c}{2} \\
                  1-\psi^i_\mu & \textrm{with probability } \frac{1-c}{2}.  \end{cases}
\end{equation}
The average overlap, or product, between two random patterns of sparsity $\frac{1}{2}$ is $\frac{1}{4}$. The average overlap between a dense example and its corresponding dense concept exceeds this value by $\frac{c}{4}$, and the average overlap between two dense examples of the same concept exceeds it by $\frac{c^2}{4}$:
\begin{align}
  \bigl\langle \psi^i_{\mu\nu} \psi^i_\mu \bigr\rangle &= \tfrac{1}{4} + \tfrac{c}{4} \nonumber\\
  \bigl\langle \psi^i_{\mu\nu} \psi^i_{\mu\omega} \bigr\rangle &= \tfrac{1}{4} + \tfrac{c^2}{4}.
\end{align}
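These statistics are easy to verify empirically. The following Python sketch is ours; the values of $N$, $p$, $s$, $a$, and $c$ are assumed for illustration. It generates patterns exactly as defined above and estimates the overlaps:
\begin{verbatim}
# Sketch: generate sparse/dense patterns and check overlap statistics.
# All parameter values below are illustrative assumptions.
import numpy as np

rng = np.random.default_rng(0)
N, p, s, a, c = 10000, 5, 20, 0.01, 0.4

xi = (rng.random((p, s, N)) < a).astype(float)     # sparse examples
psi_mu = (rng.random((p, N)) < 0.5).astype(float)  # dense concepts
keep = rng.random((p, s, N)) < (1 + c)/2           # match concept w.p. (1+c)/2
psi = np.where(keep, psi_mu[:, None, :], 1 - psi_mu[:, None, :])

print((psi[:, 0]*psi_mu).mean())     # ~ 1/4 + c/4   (example-concept)
print((psi[:, 0]*psi[:, 1]).mean())  # ~ 1/4 + c^2/4 (example-example)
\end{verbatim}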

During storage, the parameter $\gamma$, with $2\gamma < \tfrac{1}{2}$, sets the relative strength of the dense encoding compared to the sparse one. The factor of 2 is for theoretical convenience. The synaptic weights are
\begin{alignat}{3}
  J_{ij} &={}& \frac{1}{N} \sum_{\mu\nu} & \Bigl[(1 - 2\gamma)\bigl(\xi^i_{\mu\nu} - a\bigr) + 2\gamma\bigl(\psi^i_{\mu\nu} - \tfrac{1}{2}\bigr) \Bigr] \nonumber\\
  &&& {}\times \Bigl[(1 - 2\gamma)\bigl(\xi^j_{\mu\nu} - a\bigr) + 2\gamma\bigl(\psi^j_{\mu\nu} - \tfrac{1}{2}\bigr) \Bigr] \nonumber\\
  &\mathrlap{ {}={} \frac{1}{N} \sum_{\mu\nu} (\eta^i_{\mu\nu} + \zeta^i_{\mu\nu}) (\eta^j_{\mu\nu} + \zeta^j_{\mu\nu}) }
  \label{eq:overview-J}
\end{alignat}
for $i \neq j$, and $J_{ii} = 0$. This expression uses rescaled patterns $\eta^i_{\mu\nu} \equiv (1 - 2\gamma)\bigl(\xi^i_{\mu\nu} - a\bigr)$ and $\zeta^i_{\mu\nu} \equiv 2\gamma\bigl(\psi^i_{\mu\nu} - \tfrac{1}{2}\bigr)$.
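Continuing the sketch above, the weight matrix of \cref{eq:overview-J} can be assembled directly from the rescaled patterns (with an assumed value of $\gamma$):
\begin{verbatim}
# Sketch: Hebbian weights from Eq. (overview-J), continuing the arrays
# xi and psi generated above; gamma is an assumed value.
gamma = 0.1
eta = (1 - 2*gamma)*(xi - a)          # rescaled sparse examples
zeta = 2*gamma*(psi - 0.5)            # rescaled dense examples
chi = (eta + zeta).reshape(p*s, N)    # one row per stored example
J = chi.T @ chi / N
np.fill_diagonal(J, 0.0)              # enforce J_ii = 0
\end{verbatim}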

After initializing the network with a cue, neurons are asynchronously and stochastically updated via Glauber dynamics \cite{Amit.1985mb}:
\begin{equation}
  P[S_i(t+1) = 1] = \frac{1}{1 + \exp\bigl\{-\beta \bigl[\sum_j J_{ij} S_j(t) - \theta\bigr]\bigr\}}.
\end{equation}
Here, $\beta = 1/T$ is the inverse temperature and $\theta$ is the activity threshold. We shall see that $\theta$ plays a key role in selecting between sparse and dense patterns.
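A single asynchronous update cycle can be sketched as follows (ours; one cycle updates every neuron once in random order, as in the simulations of \cref{sec:sim}):
\begin{verbatim}
# Sketch: one asynchronous Glauber update cycle.
import numpy as np

def glauber_cycle(S, J, theta, beta, rng):
    for i in rng.permutation(len(S)):    # every neuron once, random order
        h = J[i] @ S - theta             # local field minus threshold
        S[i] = float(rng.random() < 1/(1 + np.exp(-beta*h)))
    return S
\end{verbatim}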



\subsection{Overview of mean-field equations}

The macroscopic behavior of the network can be analyzed by introducing a series of state variables, including the overlap $m$ between the network activity and condensed (target) patterns, which serves as an order parameter for retrieval \cite{Amit.1985mb, Tsodyks.1988, Fontanari.1990}. The complete derivation of their relationships in the mean-field limit is provided in \cref{sec:mean}, but we will outline it here. The first task is calculating the replica partition function $\langle Z^n \rangle$, where the angle brackets indicate averaging over rescaled patterns $\ve\eta_{\mu\nu}$ and $\ve\zeta_{\mu\nu}$ and $n$ is the number of replica systems. By introducing auxiliary fields via Hubbard-Stratonovich transformations and self-averaging over uncondensed (off-target) patterns, we obtain
\begin{align}
  \langle Z^n \rangle \propto \int &\biggl[\prod_{\nu\rho} \mathrm{d} m^\rho_{1\nu} \biggl(\frac{\beta N}{2\pi}\biggr)^{\!\frac{1}{2}}\biggr] \biggl[\prod_{\rho\leq\sigma} \mathrm{d} q^{\rho\sigma}\,\mathrm{d} r^{\rho\sigma}\biggr] \nonumber\\
  & {}\times \exp[-\beta N f],
\end{align}
where
\begin{widetext}
\begin{align}
  f ={}& \frac{1}{2} \sum_{\nu\rho} (m^\rho_{1\nu})^2 + \frac{\alpha}{2\beta} \mathrm{Tr}\log\bigl[ \delta_{\nu\omega}\delta^{\rho\sigma} - \beta \Gamma^2\bigl((1-\kappa^2)\delta_{\nu\omega} + \kappa^2\bigr) q^{\rho\sigma} \bigr] + \frac{\beta\alpha}{2} \sum_{\rho\sigma} q^{\rho\sigma} r^{\rho\sigma} \nonumber\\
  &{}- \frac{1}{\beta} \Biggl\langle \log\mathrm{Tr}_S \exp\Biggl\{ \beta \biggl[ \sum_{\nu\rho} m^\rho_{1\nu} \chi_{1\nu} S^\rho - \biggl(\theta + \frac{\alpha s \Gamma^2}{2}\biggr) \sum_\rho S^\rho + \frac{\beta\alpha}{2} \sum_{\rho\sigma} r^{\rho\sigma} S^\rho S^\sigma \biggr] \Biggr\} \Biggr\rangle.
  \label{eq:overview-f}
\end{align}
\end{widetext}
$\rho$ and $\sigma$ are replica indices, and
\begin{align}
  \Gamma^2 &\equiv (1-2\gamma)^2a + \gamma^2, \nonumber\\
  \kappa^2 &\equiv \frac{\gamma^2 c^2}{(1-2\gamma)^2a + \gamma^2}.
\end{align}
\Cref{eq:overview-f} assumes a successful retrieval regime in which the network overlaps significantly with either one sparse example $\ve\eta_{11}$ or dense, correlated examples $\ve\zeta_{1\nu}$ of one concept. We capture these two possibilities by introducing $\ve\chi_{1\nu}$, where $\chi^i_{1\nu} = \eta^i_{11}\delta_{1\nu}$ or $\zeta^i_{1\nu}$ respectively for retrieval of sparse or dense patterns. Through self-averaging, we have replaced averages over neurons $i$ with averages over entries $\chi_{1\nu}$ at a single neuron. Thus, the index $i$ no longer appears in \cref{eq:overview-f}. The state variables are $m^\rho_{1\nu}$, which represents the network overlap with $\ve\chi_{1\nu}$; $r^{\rho\sigma}$, which represents noise due to overlap with uncondensed patterns; and $q^{\rho\sigma}$, which is related to the overall neural activity.

Then, we use the replica symmetry ansatz and saddle-point approximation to obtain the following mean-field equations, which justify the physical interpretations of the state variables given above:
\begin{align}
  m_{1\nu} &= \bigl\llangle \chi_{1\nu} \mathop{\mathrm{sig}}\nolimits[\beta\mathcal{H}] \bigr\rrangle, \nonumber\\
  q &= \bigl\llangle \mathop{\mathrm{sig}}\nolimits[\beta\mathcal{H}]^2 \bigr\rrangle, \nonumber\\
  q_0 &= \bigl\llangle \mathop{\mathrm{sig}}\nolimits[\beta\mathcal{H}] \bigr\rrangle, \nonumber\\
  Q &\equiv \beta (q-q_0) \Gamma^2, \nonumber\\
  r &= qs\Gamma^4 \frac{\bigl(1 - Q(1-\kappa^2)(1+s_0\kappa^2)\bigr)^{\!2} + s_0\kappa^4}{\bigl(1 - Q(1-\kappa^2)\bigr)^{\!2}\bigl(1 - Q(1+s_0\kappa^2)\bigr)^{\!2}}.
  \label{eq:overview-meanfield}
\end{align}
$q$ and $q_0$ are the replica-symmetric values of $q^{\rho\sigma}$ for $\rho\neq\sigma$ and $\rho = \sigma$, respectively. Also, $s_0 \equiv s-1$, $\mathop{\mathrm{sig}}\nolimits(x) \equiv 1/(1+\mathrm{e}^{-x})$, and
\begin{align}
  \mathcal{H} &\equiv \sum_\nu m_{1\nu} \chi_{1\nu} - \phi + \sqrt{\alpha r} z, \nonumber\\
  \phi &\equiv \theta - \frac{Q\alpha s\Gamma^2}{2} \cdot \frac{1+s_0\kappa^4 - Q(1-\kappa^2)(1+s_0\kappa^2)}{\bigl(1 - Q(1-\kappa^2)\bigr) \bigl(1 - Q(1+s_0\kappa^2)\bigr)}.
\end{align}
The double angle brackets indicate averages over $\chi_{1\nu}$ and $z$, an auxiliary random field with a standard normal distribution. Empirically, the shifted threshold $\phi$ is very similar to the original threshold $\theta$.

We take the $T = 0$ limit and attempt to recover three types of patterns, which yield different simplifications of the mean-field equations.
\begin{enumerate}

  \item Sparse example $\ve\eta_{11}$: The network approaches one sparsely encoded example with overlap $m \equiv m_{11} \approx (1-2\gamma) a$. The mean-field equations become
    \begin{alignat}{2}
      m &\mathrlap{ {}= \frac{(1-2\gamma)a}{2} \Biggl\{\mathop{\mathrm{erf}}\nolimits\frac{\phi}{\sqrt{2{\alpha r}}} + \mathop{\mathrm{erf}}\nolimits\frac{(1-2\gamma)m-\phi}{\sqrt{2{\alpha r}}} \Biggr\}, }\nonumber\\
      r &= \frac{s \bigl(1+s_0\kappa^4\bigr) \Gamma^4}{2} \Biggl\{ && 1 - \mathop{\mathrm{erf}}\nolimits\frac{\phi}{\sqrt{2{\alpha r}}} \nonumber\\
      &&&{}+ a \mathop{\mathrm{erf}}\nolimits\frac{(1-2\gamma)m-\phi}{\sqrt{2{\alpha r}}} \Biggr\}.
      \label{eq:overview-A}
    \end{alignat}

  \item Dense example $\ve\zeta_{11}$: The network approaches one densely encoded example with overlap $m \equiv m_{11} \approx \gamma/2$. Due to correlations, it also overlaps with other examples $\nu > 1$ of the same concept: $m_0 \equiv m_{1\nu} \approx \gamma c^2/2$.  The mean-field equations become
    \begin{align}
      m &= \frac{\gamma}{2} \biggl\{\frac{1+c}{4} \Bigl[\mathop{\mathrm{erf}}\nolimits Y_{++}+\mathop{\mathrm{erf}}\nolimits Y_{+-}\Bigr] \nonumber\\
      &\phantom{{}= \frac{\gamma}{2} \biggl\{} {}+ \frac{1-c}{4} \Bigl[\mathop{\mathrm{erf}}\nolimits Y_{-+}+\mathop{\mathrm{erf}}\nolimits Y_{--}\Bigr] \biggr\}, \nonumber\\
      m_0 &= \frac{\gamma c}{2\Bigl(1-Q\frac{\gamma^2}{\Gamma^2}(1-c^2)\Bigr)} \nonumber\\
      &\phantom{{}={}} {}\times \biggl\{ \frac{1+c}{4} \Bigl[\mathop{\mathrm{erf}}\nolimits Y_{++}+\mathop{\mathrm{erf}}\nolimits Y_{+-}\Bigr] \nonumber\\
      &\phantom{{}={} {}\times \biggl\{} {}- \frac{1-c}{4} \Bigl[\mathop{\mathrm{erf}}\nolimits Y_{-+}+\mathop{\mathrm{erf}}\nolimits Y_{--}\Bigr] \biggr\}, \nonumber\\
      r &= \frac{s\Gamma^4}{2} \cdot \frac{\bigl(1 - Q(1-\kappa^2)(1+s_0\kappa^2)\bigr)^{\!2} + s_0\kappa^4}{\bigl(1 - Q(1-\kappa^2)\bigr)^{\!2}\bigl(1 - Q(1+s_0\kappa^2)\bigr)^{\!2}}  \nonumber\\
      &\phantom{{}={}} {}\times \biggl\{1 - \frac{1+c}{4} \Bigl[\mathop{\mathrm{erf}}\nolimits Y_{++}-\mathop{\mathrm{erf}}\nolimits Y_{+-}\Bigr] \nonumber\\
      &\phantom{{}={} {}\times \biggl\{} - \frac{1-c}{4} \Bigl[\mathop{\mathrm{erf}}\nolimits Y_{-+}-\mathop{\mathrm{erf}}\nolimits Y_{--}\Bigr] \biggr\}, \nonumber\\
      Q &= \frac{\Gamma^2}{\sqrt{2\pi}\sigma_0} \biggl\{ \frac{1+c}{4} \Bigl[\mathrm{e}^{-Y_{++}^2}+\mathrm{e}^{-Y_{+-}^2}\Bigr] \nonumber\\
      &\phantom{{}= \frac{\Gamma^2}{\sqrt{2\pi}\sigma_0} \biggl\{} {}+ \frac{1-c}{4} \Bigl[\mathrm{e}^{-Y_{-+}^2}+\mathrm{e}^{-Y_{--}^2}\Bigr] \biggr\},
      \label{eq:overview-0}
    \end{align}
    where
    \begin{align}
      \sigma_0^2 &\equiv s_0\gamma^2(1-c^2)m_0^2 + {\alpha r} \nonumber\\
      Y_{\pm\pm} &\equiv \frac{\gamma m \pm s_0\gamma cm_0 \pm \phi}{\sqrt{2}\sigma_0}.
    \end{align}

  \item Dense concept $\ve\zeta_1$: The network approaches one densely encoded concept with overlap $m \equiv m_1 \approx \gamma/2$. Due to correlations, it will overlap with all examples $\nu$ of that concept: $m_\textrm{s} \equiv m_{1\nu} \approx \gamma c/2$. The mean-field equations (solved numerically in the sketch following this list) become
    \begin{align}
      m ={}& \frac{\gamma}{4} \biggl\{ \mathop{\mathrm{erf}}\nolimits Y_++\mathop{\mathrm{erf}}\nolimits Y_- \biggr\}, \nonumber\\ 
      m_\textrm{s} ={}& \frac{\gamma c}{4\Bigl(1-Q\frac{\gamma^2}{\Gamma^2}(1-c^2)\Bigr)} \biggl\{ \mathop{\mathrm{erf}}\nolimits Y_++\mathop{\mathrm{erf}}\nolimits Y_- \biggr\}, \nonumber\\
      r ={}& \frac{s\Gamma^4}{2} \cdot \frac{\bigl(1 - Q(1-\kappa^2)(1+s_0\kappa^2)\bigr)^{\!2} + s_0\kappa^4}{\bigl(1 - Q(1-\kappa^2)\bigr)^{\!2}\bigl(1 - Q(1+s_0\kappa^2)\bigr)^{\!2}} \nonumber\\
      &{}\times \biggl\{ 1 - \frac{1}{2} \Bigl[\mathop{\mathrm{erf}}\nolimits Y_+-\mathop{\mathrm{erf}}\nolimits Y_-\Bigr] \biggr\}, \nonumber\\
      Q ={}& \frac{\Gamma^2}{\sqrt{8\pi}\sigma_\textrm{s}} \biggl\{ \mathrm{e}^{-Y_+^2}+\mathrm{e}^{-Y_-^2} \biggr\},
      \label{eq:overview-S}
    \end{align}
    where
    \begin{align}
      \sigma_\textrm{s}^2 &\equiv s\gamma^2(1-c^2)m_\textrm{s}^2 + {\alpha r} \nonumber\\
      Y_\pm &\equiv \frac{s\gamma cm_\textrm{s} \pm \phi}{\sqrt{2}\sigma_\textrm{s}}.
    \end{align}

\end{enumerate}
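For the dense-concept case, the equations close among $m_\textrm{s}$, $r$, and $Q$, so they can be iterated to a fixed point. The following Python sketch is ours; the parameter defaults and the plain fixed-point scheme are illustrative assumptions, not the numerical method used for the figures.
\begin{verbatim}
# Sketch: fixed-point iteration of Eq. (overview-S) at T = 0.
# Parameter defaults and the iteration scheme are assumptions.
import numpy as np
from scipy.special import erf

def solve_S(alpha, s, a=0.01, c=0.4, gamma=0.1, phi=0.0, iters=2000):
    G2 = (1 - 2*gamma)**2*a + gamma**2          # Gamma^2
    k2 = gamma**2*c**2/G2                       # kappa^2
    s0 = s - 1
    ms, r = gamma*c/2, s*G2**2/2                # rough starting point
    for _ in range(iters):
        sig = np.sqrt(s*gamma**2*(1 - c**2)*ms**2 + alpha*r)
        Yp = (s*gamma*c*ms + phi)/(np.sqrt(2)*sig)
        Ym = (s*gamma*c*ms - phi)/(np.sqrt(2)*sig)
        Q = G2/(np.sqrt(8*np.pi)*sig)*(np.exp(-Yp**2) + np.exp(-Ym**2))
        ms = gamma*c*(erf(Yp) + erf(Ym))/(4*(1 - Q*gamma**2*(1 - c**2)/G2))
        num = (1 - Q*(1 - k2)*(1 + s0*k2))**2 + s0*k2**2
        den = (1 - Q*(1 - k2))**2*(1 - Q*(1 + s0*k2))**2
        r = s*G2**2/2*num/den*(1 - 0.5*(erf(Yp) - erf(Ym)))
    m = gamma*(erf(Yp) + erf(Ym))/4             # concept overlap
    return m, ms, r, Q
\end{verbatim}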



\section{\label{sec:capacities}\texorpdfstring{$T=0$}{T = 0} capacities}

\subsection{Overview of capacity formulas}

\begin{figure}[t!]
  \centering
  \includegraphics{combined}
  \caption{
    \label{fig:combined}
    Retrieval regimes for sparse examples, dense examples, and dense concepts. Capacities $s_\textrm{c}$ are obtained by numerically solving the mean-field equations (connected points). Shaded regions indicate successful retrieval.
    (a) More examples can be stored sparsely than densely.
    (b) For intermediate example loads $s$ and small enough concept loads $\alpha$, both the sparse example and the dense concept can be recovered. Sparse patterns have sparsity $a = 0.01$.
  }
\end{figure}

The mean-field equations can be solved numerically to determine regimes of successful retrieval with respect to the number of concepts per neuron $\alpha$ and the number of examples stored per concept $s$. Successful retrieval is determined by $m \approx (1-2\gamma)a$ for sparse examples [\cref{eq:overview-A}], $m \approx \gamma$ for dense examples [\cref{eq:overview-0}], and $m_\textrm{s} \approx \gamma c$ for dense concepts [\cref{eq:overview-S}]. These regimes change as a function of the sparse pattern sparsity $a$, the dense pattern correlation $c$, and the relative dense storage strength $\gamma$. We treat the activity threshold $\phi$ as a free parameter that can be adjusted to maximize $m$ and $m_\textrm{s}$.

\Cref{fig:combined}(a) shows that at any concept load $\alpha$, the network can retrieve sparse and dense examples below a critical example load $s_\textrm{c}$. To determine $s_\textrm{c}$, we fix $\alpha$ and increase $s$ until the mean-field equations no longer admit a solution at finite $m$. \Cref{fig:combined}(b) shows that the network can retrieve dense concepts above a critical $s_\textrm{c}$, which we find by decreasing $s$ until the non-trivial solution for $m_\textrm{s}$ vanishes. Optimization over the shifted threshold $\phi$ reveals that optimal retrieval of dense patterns occurs at $\phi = 0$ and of sparse patterns at $\phi/(1-2\gamma)^2a \approx 0.65$. These values match results for classic Hopfield networks that store only dense or only sparse patterns \cite{Weisbuch.1985, Tsodyks.1988}. Note that for low enough $\alpha$, the network can recover both sparse examples and dense concepts at intermediate values of $s$. Thus, our network is capable of retrieving both example and concept representations of the same memories by tuning an activity threshold.

\begin{figure}[t!]
  \centering
  \includegraphics{A}
  \caption{
    \label{fig:A}
    (a) Capacity $s_\textrm{c}$ for sparse examples. Connected points indicate numerical analysis of \cref{eq:overview-A}.
    (b) Collapse of $s_\textrm{c}$ curves under rescaled variables. Gray line indicates theoretical formula \cref{eq:results-A1}.
    (c) $s_\textrm{c}$ is maximized at intermediate values of sparsity $a$. Dense patterns have correlation $c = 0.1$.
  }
\end{figure}

\begin{figure}[t!]
  \centering
  \includegraphics{0}
  \caption{
    \label{fig:0}
    (a) Capacity $s_\textrm{c}$ for dense examples. Connected points indicate numerical analysis of \cref{eq:overview-0}.
    (b) Collapse of $s_\textrm{c}$ curves under rescaled variables. Gray lines indicate theoretical formula \cref{eq:results-0}.
  }
\end{figure}

\begin{figure*}[t!]
  \centering
  \includegraphics{S}
  \caption{
    \label{fig:S}
    (a) Capacity, or critical example load, $s_\textrm{c}$ for dense concepts. Connected points indicate numerical analysis of \cref{eq:overview-S}.
    (b)--(d) Collapse of $s_\textrm{c}$ curves under rescaled variables. Gray lines indicate theoretical formula \cref{eq:results-S1}.
    (e) For the sparsest patterns, $s_\textrm{c}$ curves exhibit better collapse under differently rescaled variables. Gray line indicates theoretical formula \cref{eq:results-S2}, which better matches the numerical results. It exhibits weak dependence on dense correlation $c$, and we only show its behavior for $c = 0.02$.
  }
\end{figure*}

Under various assumptions, we can simplify the mean-field equations \cref{eq:overview-A,eq:overview-0,eq:overview-S} to derive formulas for the capacity, or critical example load, of each type of pattern; a short numerical sketch evaluating the resulting formulas follows this list.
\begin{enumerate}

  \item For sparse examples, the capacity is given by
    \begin{equation}
      \frac{1}{\alpha} \sim s_\textrm{c} \bigl(1+s_\textrm{c}\kappa^4\bigr) \frac{\Gamma^4}{(1-2\gamma)^4} \frac{|\!\log a|}{a},
      \label{eq:results-A1}
    \end{equation}
    which means that
     \begin{equation}
       s_\textrm{c} \sim \sqrt{ \frac{1}{4\kappa^8} + \frac{(1-2\gamma)^4}{\gamma^4c^4} \cdot \frac{a}{|\!\log a|} \cdot \frac{1}{\alpha} } - \frac{1}{2\kappa^4}.
      \label{eq:results-A2}
    \end{equation}

  \item For dense examples, the capacity is given by
    \begin{equation}
      s_\textrm{c} \approx \frac{1}{3c^3+18\frac{\Gamma^4}{\gamma^4}\alpha}.
      \label{eq:results-0}
    \end{equation}

  \item For dense concepts, there are two cases. For larger sparsities $a$, the critical example load approximately collapses as a function of $s_\textrm{c} c^2$. It can be obtained by numerically inverting the following first equation for $y$ and substituting it into the second:
    \begin{align}
      \frac{2\Gamma^4}{\gamma^4c^2}\alpha ={}& \frac{(1-2\gamma)^2a}{\gamma^2} \frac{\Bigl[\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy\Bigr]^3}{y^2\Bigl[\erf\frac{y}{\sqrt{2}}-\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy\Bigr]} \nonumber\\
      & {}- \Bigl[\textstyle\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy\Bigr]^2 \nonumber\\
      s_\textrm{c} c^2 \approx{}& \frac{(1-2\gamma)^2a}{\gamma^2} \frac{\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy}{\erf\frac{y}{\sqrt{2}}-\sqrt\frac{2}{\pi}y\mspace{1.5mu}\expy}.
      \label{eq:results-S1}
    \end{align}
    For smaller $a$, the critical example load approximately collapses as a function of $s_\textrm{c} c^{3/2}$:
    \begin{equation}
      s_\textrm{c} c^{3/2} \approx 3 \bigl(\tfrac{3\pi}{4}\bigr)^{\!1/4} \Bigl(\tfrac{\Gamma^4}{\gamma^4c^2}\alpha\Bigr)^{\!1/4} + \tfrac{3\pi}{4} c^{-1/2} \Bigl(\tfrac{\Gamma^4}{\gamma^4c^2}\alpha\Bigr).
      \label{eq:results-S2}
    \end{equation}
    
\end{enumerate}
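The following Python sketch (ours; all parameter values are assumed for illustration) evaluates the closed-form estimates \cref{eq:results-A2,eq:results-0} directly:
\begin{verbatim}
# Sketch: evaluate the capacity estimates (results-A2) and (results-0)
# at assumed parameter values.
import numpy as np

a, c, gamma, alpha = 0.01, 0.1, 0.1, 1e-3
G2 = (1 - 2*gamma)**2*a + gamma**2              # Gamma^2
k4 = (gamma**2*c**2/G2)**2                      # kappa^4

# Sparse examples, Eq. (results-A2) (an asymptotic estimate)
s_sparse = (np.sqrt(1/(4*k4**2)
            + (1 - 2*gamma)**4/(gamma**4*c**4)*a/abs(np.log(a))/alpha)
            - 1/(2*k4))

# Dense examples, Eq. (results-0)
s_dense = 1/(3*c**3 + 18*(G2/gamma**2)**2*alpha)

print(s_sparse, s_dense)
\end{verbatim}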

\Cref{fig:A,fig:0,fig:S} show that these capacity formulas match well with numerical analysis of the mean-field equations over a wide range of parameters. One noteworthy property of this model is that the capacity for sparse examples is maximized at intermediate sparsities $a$ [\cref{fig:A}(c)]. In contrast, classic Hopfield networks with only sparse patterns have capacity that scales asymptotically like $1/a|\!\log a|$, which is always higher for sparser patterns \cite{Tsodyks.1988}. In our dual encoding model, sparser patterns interfere less with each other but are also more easily disrupted by dense patterns, whose sparsity is always 0.5. The tradeoff between these two factors favors intermediate sparsities.



\subsection{\label{sec:sim}Simulation results}

We perform simulations to verify our capacity calculations. For each simulation condition, we construct replicate networks that store different randomly generated patterns. When generating sparse patterns of sparsity $a$, we fix the number of active neurons to $Na$ to reduce finite-size effects. Neural dynamics proceed asynchronously in cycles wherein every neuron is updated once in random order. We use $N = \num{10000}$ neurons and dense strength $\gamma = 0.1$, unless otherwise noted. Retrieval is assessed by the following definition of overlap between target pattern $\ve\chi$ and network activity $\ve S$:
\begin{equation}
  \hat m = \frac{1}{N a_\chi (1-a_\chi)}\sum_i (\chi_i - a_\chi) S_i,
\end{equation}
where $a_\chi = a$ for sparse patterns and $a_\chi = 1/2$ for dense patterns. By comparing with retrieval values described in \cref{sec:capacities} and derived in \cref{sec:se,sec:de,sec:dc}, we expect $\hat m \approx 1$ to indicate successful retrieval of all pattern types $\ve\chi = \ve\xi_{\mu\nu}$, $\ve\psi_{\mu\nu}$, and $\ve\psi_\mu$.
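As a minimal sketch (ours; argument names are hypothetical), this measure can be computed as:
\begin{verbatim}
# Sketch: overlap between a 0/1 target pattern chi and activity S,
# per the definition above; a_chi is the target sparsity.
def overlap(chi, S, a_chi):
    N = len(S)
    return ((chi - a_chi)*S).sum()/(N*a_chi*(1 - a_chi))
\end{verbatim}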

\begin{figure}[t!]
  \centering
  \includegraphics{sim}
  \caption{
    \label{fig:sim}
    Capacities $s_\textrm{c}$ for (a) sparse examples, (b) dense examples, and (c) dense concepts obtained by numerical calculations (lines) and simulations (points). For simulation data, points indicate means over 8 replicate networks, and vertical bars indicate standard deviations, which are often obscured by the points. In each replicate network, 20 cues are tested with simulations lasting 10 update cycles.
  }
\end{figure}

In order to obtain capacities, we set a hard threshold with $\beta \rightarrow \infty$ and use the true target patterns as cues. Thus, these simulations are assessing the $T = 0$ stability of the target patterns. For sparse examples, we optimize over the threshold $\theta$ by numerical search. For dense patterns, we use $\theta = 0$. We define successful retrieval as $\hat m > (1 + \hat m_0)/2$, where $\hat m_0$ is the overlap expected for off-target patterns within the same concept. As listed in \cref{sec:capacities} and derived in \cref{sec:se,sec:de,sec:dc}, $\hat m_0 = 0$ for sparse examples, $\hat m_0 = c^2$ for dense examples, and $\hat m_0 = c$ for dense concepts. 

\Cref{fig:sim} reveals reasonable agreement between simulations and numerical analysis of the mean-field equations. This supports the validity of our derivations and the simplifications we invoked to perform them.



\section{\label{sec:hetero}Heteroassociation}

\subsection{Simulation results}

Our network stores linear combinations of sparse and dense patterns, which contain cross-terms between the two [\cref{eq:overview-J}]. Thus, we use simulations to test its ability to perform heteroassociation. We cue the network with a noisy version of the sparse example $\ve\xi_{\mu\nu}$, dense example $\ve\psi_{\mu\nu}$, or dense concept $\ve\psi_\mu$, and attempt to retrieve either of the three as the target pattern. For cases in which concepts are used as cues while examples are desired as targets, the highest overlap with any example within the cued concept is reported. We use $p = 10$ concepts and either $s = 30$ examples per concept during retrieval of sparse examples and dense concepts or $s = 3$ during retrieval of dense examples. Sparse patterns have sparsity $a = 0.01$ and dense patterns have correlation $c = 0.4$. With theoretical motivation in \cref{sec:se}, we define the rescaled parameters
\begin{equation}
  \theta' = \theta/(1-2\gamma)^2a \cond{and} \beta' = \beta\cdot(1-2\gamma)^2a,
  \label{eq:overview-rescaled}
\end{equation}
with rescaled temperature $T' = 1/\beta'$. To retrieve sparse examples, we apply a threshold $\theta' = 0.65$, and to retrieve dense examples and concepts, we apply $\theta' = 0$. We reintroduce noise into our simulations with inverse temperature $\beta' = 50$ and by randomly flipping a fraction 0.01 of the cue pattern between inactive and active during network initialization.
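Cue corruption can be sketched as follows (ours; \texttt{noisy\_cue} is a hypothetical helper, with $f$ the flipped fraction):
\begin{verbatim}
# Sketch: flip a fraction f of a 0/1 cue pattern between inactive/active.
import numpy as np

def noisy_cue(pattern, f, rng):
    flip = rng.random(pattern.shape) < f
    return np.where(flip, 1 - pattern, pattern)
\end{verbatim}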

\begin{figure}[t!]
  \centering
  \includegraphics{hetero}
  \caption{
    \label{fig:hetero}
    Auto- and heteroassociation among sparse and dense patterns demonstrated by network simulations. Networks are placed under various conditions compared to baseline.
    Higher $p$: the number of concepts is increased from $p = 10$ to $30$.
    Higher $T$: the rescaled temperature is increased from $T' = 1/50$ to $1/5$.
    Higher cue noise: the fraction of the cue pattern flipped is increased from 0.01 to 0.2.
    Lower $\gamma$: The dense pattern storage strength is decreased from $\gamma = 0.1$ to $0.05$. 
    For dense example and concept targets, we use rescaled threshold $\theta' = 0$. For sparse example targets, we use $\theta' = 0.65$. Overlaps $\hat m$ reported are averages over 10 replicate networks. In each, 10 cues are tested with simulations lasting 20 update cycles.
  }
\end{figure}

\Cref{fig:hetero} shows that the network is generally capable of heteroassociation using the parameters described above, which define the baseline condition. When the number of concepts is increased, heteroassociative performance is largely preserved, although the retrieval of dense concepts from sparse examples is impaired. We next amplify two different sources of noise by either raising the temperature or increasing the noise introduced to cues. Different pattern types are more robust to each source. For instance, if we consider the autoassociation of example patterns, sparse encodings are more robust to higher temperature, whereas dense encodings are more robust to cue noise.

\subsection{Dense pattern strength \texorpdfstring{$\gamma$}{γ}}

Notice in the baseline condition of \cref{fig:hetero} that while dense concepts can be easily retrieved from sparse examples, the reverse is more difficult. The ability to perform bidirectional heteroassociation between a concept and its examples would have computational significance, so we seek to find network parameters that achieve it. Intuitively, lowering the storage strength of dense patterns $\gamma$ should bias the network towards retrieving sparse patterns. \Cref{fig:hetero} shows that, indeed, lowering $\gamma$ improves retrieval of sparse examples from dense concepts. Moreover, the network is still capable of the opposite process, albeit with decreased performance.

\begin{figure}[t!]
  \centering
  \includegraphics{hamiltonian}
  \caption{
    \label{fig:hamiltonian}
    The dense pattern storage strength $\gamma$ controls the ability to retrieve sparse examples from dense concepts by changing their relative energies. For all results, we use rescaled threshold $\theta' = 0.65$.
    (a) Hamiltonian energy of sparse examples and dense concepts [\cref{eq:results-H}]. Inset shows sparse example energy in detail.
    (b) Critical dense strength $\gamma_\textrm{c}$ below which sparse examples can be successfully retrieved by dense concept cues. Theoretical predictions are the locations of energy crossovers in (a). 
    Sparse patterns have sparsity $a = 0.01$. For $c = 0.1$, we store $s = 150$ examples in each of $p = 3$ concepts. For $c = 0.4$, $s = 30$ and $p = 10$. For simulation data, points indicate means over 10 replicate networks, and vertical bars indicate standard deviations, which are obscured by the points. In each replicate network, 10 cues are tested with simulations lasting 20 update cycles.
  }
\end{figure}

The value of $\gamma$ appears critical to the ability of the network to retrieve sparse examples from dense concepts. We hypothesize that this is related to the relative energy of these pattern types. As described in \cref{sec:mean}, the Hamiltonian of our network is
\begin{equation}
  H = -\frac{1}{2N}\sum_{\mu\nu} \sum_{i \neq j} (\eta^i_{\mu\nu} + \zeta^i_{\mu\nu}) (\eta^j_{\mu\nu} + \zeta^j_{\mu\nu}) S_i S_j + \theta \sum_i S_i,
\end{equation}
where, again, $\ve\eta_{\mu\nu}$ and $\ve\zeta_{\mu\nu}$ are rescalings of sparse examples $\ve\xi_{\mu\nu}$ and dense examples $\ve\psi_{\mu\nu}$ [\cref{eq:overview-J}]. We set the network activity $\ve S$ to a sparse example $\ve\xi_{\mu\nu}$ or dense concept $\ve\psi_\mu$ and calculate the average over patterns $\langle H \rangle$:
\begin{equation}
  \frac{\langle H \rangle}{N} \approx \begin{cases}
    \displaystyle -\frac{1}{2} (1-2\gamma)^2 a^2 + \theta a & \textrm{sparse example}, \\
    \displaystyle -\frac{1}{2} s \biggl[\frac{\gamma c}{2}\biggr]^2 + \frac{\theta}{2} & \textrm{dense concept}.
  \end{cases}
  \label{eq:results-H}
\end{equation}
\Cref{fig:hamiltonian}(a) shows \cref{eq:results-H}. For both sparse examples and dense concepts, we use a high threshold $\theta' = 0.65$ intended to retrieve the former. First consider $c = 0.4$, which is used in \cref{fig:hetero}. For the baseline condition of $\gamma = 0.1$ in \cref{fig:hetero}, dense concepts have lower energy, even while applying a high threshold. This can explain why retrieval of sparse examples from dense concepts is difficult. For the low $\gamma$ condition of $\gamma = 0.05$ in \cref{fig:hetero}, the energy of dense concepts increases above that of sparse examples, which favors the retrieval of the latter. Thus, we predict that the energy crossover between these two pattern types defines a critical $\gamma_\textrm{c}$ above which sparse examples cannot be retrieved from dense concepts.
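The crossover can be located numerically. The following Python sketch is ours; the bracketing interval and parameter values are assumptions. It finds the $\gamma$ at which the two average energies of \cref{eq:results-H} are equal:
\begin{verbatim}
# Sketch: gamma_c as the root of the energy gap from Eq. (results-H).
# theta_p is the rescaled threshold; the bracket is an assumption.
import numpy as np
from scipy.optimize import brentq

def gamma_c(a, c, s, theta_p=0.65):
    def gap(gamma):
        theta = theta_p*(1 - 2*gamma)**2*a      # undo threshold rescaling
        E_sparse = -0.5*(1 - 2*gamma)**2*a**2 + theta*a
        E_concept = -0.5*s*(gamma*c/2)**2 + theta/2
        return E_concept - E_sparse
    return brentq(gap, 1e-3, 0.25)

print(gamma_c(a=0.01, c=0.4, s=30))
\end{verbatim}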

To test this prediction, we simulate networks at varying values of $\gamma$. We deem a retrieval regime successful when at least 10\% of the concept cues retrieve a corresponding example with overlap $\hat m > 0.5$. \Cref{fig:hamiltonian}(b) demonstrates that the energy crossover indeed predicts $\gamma_\textrm{c}$ for $c = 0.4$. The $c = 0.1$ case shows lower quantitative agreement between simulation and theory, although the qualitative effect of a higher $\gamma_\textrm{c}$ compared to that for $c = 0.4$ is captured. The comparison of the Hamiltonian energy between sparse examples and dense concepts ignores the contribution of entropy to the free energy. Nevertheless, simulations demonstrate that temperature exerts no significant effect on $\gamma_\textrm{c}$, so our $T = 0$ predictions are equally valid at finite $T$.

\begin{figure*}[t!]
  \centering
  \includegraphics{gamma}
  \caption{
    \label{fig:gamma}
    Capacities $s_\textrm{c}$ as a function of dense pattern storage strength $\gamma$.
    (a), (b) Retrieval regimes. Connected points indicate numerical analysis of the mean-field equations. Shaded regions indicate successful retrieval.
    Capacities $s_\textrm{c}$ for (c) sparse examples, (d) dense examples, and (e) dense concepts.
    Collapse of $s_\textrm{c}$ curves for (f) sparse examples, (g) dense examples, and (h) dense concepts under rescaled variables. Gray lines indicate theoretical formulas \cref{eq:results-A1,eq:results-0,eq:results-S1}, respectively. Concept load is $\alpha = 0.001$ concepts per neuron.
  }
\end{figure*}

Due to the importance of $\gamma$, we present additional capacity results in which it is varied [\cref{fig:gamma}]. For each set of parameters assessed, there is a range of intermediate $\gamma$ in which both sparse examples and dense concepts are stable [\cref{fig:gamma}(b)]. Within it should be a narrower range in which heteroassociation between the two is possible. \Cref{fig:gamma}(c)--(h) illustrates that our theoretical capacity formulas are still valid as functions over $\gamma$.


\section{\label{sec:discussion}Discussion}

In summary, we present a Hopfield-like model that stores pairs of sparse patterns with low correlation and dense patterns with high correlation. By adjusting the activity threshold, the network can retrieve patterns of either sparsity. The capacity for sparse patterns is large, so many distinct memories can be retrieved. As more dense patterns are stored, they merge according to their correlation structure such that concepts are built through the accumulation of examples. We derive mean-field equations that govern network overlap with sparse examples, dense examples, and dense concepts, and calculate capacity formulas for each type of retrieved pattern. Moreover, we explain how the network can retrieve one type of target pattern from its corresponding cue pattern of a different type.

The capability of autoassociative networks to generalize across correlated examples is well-established \cite{Fontanari.1990, Stariolo.1992, Dominguez.1998}. Another line of investigation has also shown that Hopfield-like networks can build a hierarchy of memories that obeys an ultrametric, tree-like structure \cite{Mezard.1985, Dotsenko.1985, Cortes.1987, Virasoro.1988, Gutfreund.1988, Krogh.1988}. Each pattern at one level in the structure serves as a concept-like trunk from which multiple correlated branches are generated to form the next, more example-like level. While these models are insightful and influential, they possess certain disadvantages that our network can address. They typically use an activity threshold or, equivalently, an external field to move between levels, which is also the case in this work. In one ultrametric model, the field is inhomogeneous and proportional to the pattern retrieved \cite{Gutfreund.1988}. Our activity threshold is homogeneous and does not require memory of the pattern retrieved, though implementing such a feature may improve retrieval performance. In another hierarchical model, coarser representations are stored more sparsely and retrieved at higher threshold \cite{Krogh.1988}. This arrangement prevents the network from leveraging the higher capacity of sparser patterns to store finer representations, which are more numerous. Our network instead leverages the higher capacity of sparse patterns to store the finer, more numerous example representations. Moreover, ultrametric Hopfield networks often involve complex storage procedures that require a priori knowledge of concepts or other examples \cite{Dotsenko.1985, Cortes.1987, Gutfreund.1988, Krogh.1988}. They do not permit the unsupervised learning of concepts through the accumulation of examples over time, which is achieved by our simple Hebbian learning rule and strengthens the biological significance of our model.



\section*{Acknowledgments}

LK is supported by JSPS KAKENHI for Early-Career Scientists (22K15209) and has been supported by the Miller Institute for Basic Research in Science and a Burroughs Wellcome Fund Collaborative Research Travel Grant. TT is supported by Brain/MINDS from AMED (JP19dm0207001) and JSPS KAKENHI (JP18H05432).



\input{Manuscript.bbl}


\input{Appendices}


\end{document}
