\section{Expectation of softmax of sum of independent truncated normal random variables}
\label{sec:appdx_expectation}
\subsection{Binary case}
In two dimensions, with classes indexed by $l\in\{0,1\}$, the softmax function can be expressed as $\text{softmax}(x)_l=\sigma(x_l-x_{1-l}),$ where $\sigma$ is the logistic sigmoid. We can represent $\sigma(x)$ as a Taylor expansion about some point $\mu$:
\begin{equation}
    \sigma(x) = \sigma(\mu)+\frac{d}{dx}\sigma(\mu)(x-\mu)+\frac{1}{2}\frac{d^2}{dx^2}\sigma(\mu)(x-\mu)^2+\frac{1}{3!}\frac{d^3}{dx^3}\sigma(\mu)(x-\mu)^3+\cdots.
\end{equation}
If $x = y+\sum_{k=1}^K z_k,$ where the $z_k\sim\mathcal{N}_+(0,v)$ are independent, then $x=\mu+\sum_k(z_k-m_1),$ where $\mu=y+Km_1$ and $m_p$ is the $p$th raw moment of $\mathcal{N}_+(0,v)$ (so $m_1$ is its mean). The expectation of $\sigma(x)$ is then
\begin{equation}
    \mathbb{E}[\sigma(x)] = \sigma(\mu)+\frac{d}{dx}\sigma(\mu)\mathbb{E}\left[\sum_k(z_k-m_1)\right]+\frac{1}{2}\frac{d^2}{dx^2}\sigma(\mu)\mathbb{E}\left[(\sum_k(z_k-m_1))^2\right]+\frac{1}{3!}\frac{d^3}{dx^3}\sigma(\mu)\mathbb{E}\left[(\sum_k(z_k-m_1))^3\right]+\cdots.
\end{equation}
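Since the $z_k$ are independent, the multinomial theorem gives
\begin{equation}
    \mathbb{E}\left[\left(\sum_k(z_k-m_1)\right)^p\right]=\sum_{h_1+h_2+\cdots+h_K=p;\,h_k\geq 0}\begin{pmatrix}
        p \\ h_1,h_2,\dots,h_K
    \end{pmatrix}\tilde{m}_{h_1}\tilde{m}_{h_2}\cdots \tilde{m}_{h_K},
\end{equation}
where $\tilde{m}_h=\mathbb{E}[(z_k-m_1)^h]$ is the $h$th central moment of $\mathcal{N}_+(0,v)$ (so $\tilde{m}_0=1$ and $\tilde{m}_1=0$), which follows from the raw moments $m_p$ by binomial expansion.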
The moments and partitions needed can be efficiently calculated~\citep{kelleher14,orjebin14}. Computing derivatives of $\sigma$ can be done recursively by noting that $(\sigma^p)'=p(\sigma^p-\sigma^{p+1})$: every derivative of $\sigma$ is a polynomial in $\sigma$, so differentiation is a matrix multiplication acting on the vector of polynomial coefficients.
\\\\
For $z\sim\mathcal{N}_-(0,v),$ $\mathbb{E}[z^p]=(-1)^pm_p,$ so we can compute expectations of $\sigma(y-\sum_kz_k)$ in the same way.

\subsection{Extension to $L$-ary}
Note that
\begin{equation}
    \frac{\exp(y_l+z)}{\sum_{l'\neq l}\exp(y_{l'})+\exp(y_l+z)}=\sigma(y_l-\log(\sum_{l'\neq l}\exp(y_{l'}))+z)
\end{equation}
and, for $l'\neq l$,
\begin{equation}
    \frac{\exp(y_{l'})}{\sum_{\tilde{l}\neq l}\exp(y_{\tilde{l}})+\exp(y_l+z)}=\frac{\exp(y_{l'})}{\sum_{\tilde{l}\neq l}\exp(y_{\tilde{l}})}\sigma(\log(\sum_{\tilde{l}\neq l}\exp(y_{\tilde{l}}))-y_l-z),
\end{equation}
so we can turn an expectation over a single variable within a softmax into an expectation of a sigmoid, to which the expansion from the binary case applies.
%ie exp(y_l)msigm(logm-mu_l-z)