\documentclass[twoside]{article}
\usepackage{aistats2022}

% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} 
\usepackage{multirow}
\usepackage[table]{xcolor}

% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2021} with \usepackage[nohyperref]{icml2021} above.
\usepackage{hyperref}

% Attempt to make hyperref and algorithmic work together better:
\newcommand{\theHalgorithm}{\arabic{algorithm}}

% for graphical model
\usepackage{tikz}
\usetikzlibrary{bayesnet}
\usetikzlibrary{arrows}
\usepackage{color}
\usepackage{soul}
\usepackage{graphicx}
\usepackage{caption}

\usepackage[inline]{enumitem}
\usepackage{hhline}
\usepackage{colortbl}
\usetikzlibrary{backgrounds}

\usepackage{math}
\def\etal{\emph{et al.}}
\def\ie{\emph{i.e.}}
\def\eg{\emph{e.g.}}
\newcommand\todo[1]{\textcolor{red}{\textbf{[#1]}}} %TODO
\newcommand\fede[1]{\textcolor{blue}{\textbf{[#1]}}} %TODO
\newcommand{\mounia}[1]{\textcolor{green}{#1}}
\usepackage{cleveref}

\newcommand{\us}{MIST}

\newcommand{\dif}{\text{d}}
\newcommand{\tV}{{X}}%\bm{t}}
\newcommand{\uM}{{U}}
\newcommand{\uV}{\bm{u}}
\newcommand{\zV}{\bm{z}}
\newcommand{\mM}{{M}}
\newcommand{\sM}{{S}}
\newcommand{\fV}{\bm{f}}
\newcommand{\fM}{{F}}
\newcommand{\lM}{{L}}
\newcommand{\kl}{\text{KL}}

\usepackage[round]{natbib}
\renewcommand{\bibname}{References}
\renewcommand{\bibsection}{\subsubsection*{\bibname}}


\begin{document}
\onecolumn
\aistatstitle{Supplementary: Efficient Inference for Dynamic Topic Modeling with Large Vocabularies}
\section{Importance Sampling}
We can write the probability of words in a document conditioned on the parameter $\bm\eta_d$ and $\bm\beta$ as:
\begin{align}
    p(W_d|\bm\eta_d, \bm\beta) = \prod_{n=1}^{N_d}\text{Multi}(1, \sigma(\bm\xi_d)) = \prod_{n=1}^{N_d}\text{Cat}(\sigma(\bm\xi_d)) = \mathcal{\tilde{L}}_W.
\end{align}
Its derivative can be derived as:
\begin{align}
    \nabla \mathcal{\tilde{L}}_W 
    &= \E_{q(\bm\beta | H)q(\bm\eta_d)}\sum_{n=1}^{N_d}\left[ \nabla \log \text{Cat}(\sigma(\bm\xi_d))\right] \\
    &= \E_{q(\bm\beta | H)q(\bm\eta_d)}\sum_{n=1}^{N_d}\left[ \nabla \bm\xi_{d,n}
    - \nabla \log \sum_{j=1}^{P}\exp(\bm\xi_d)_j\right] \\
    &= \E_{q(\bm\beta | H)q(\bm\eta_d)}\sum_{n=1}^{N_d}\left[ \nabla \bm\xi_{d,n} 
    - \frac{1}{\sum_{j=1}^{P}\exp({\bm\xi_{d}}_j)}\nabla \sum_{i=1}^{P}\exp(\bm\xi_d)_i\right] \\
    &= \E_{q(\bm\beta | H)q(\bm\eta_d)}\sum_{n=1}^{N_d}\left[ \nabla \bm\xi_{d,n} 
    - \sum_{i=1}^{P}\frac{\exp(\bm\xi_d)_i}{\sum_{j=1}^{P}\exp(\bm\xi_d)_j}\nabla \bm\xi_{d,i} \right].
\end{align}

To approximate this derivative, we consider a random sample of $M$ words from the vocabulary and use those to approximate the normalisation constant.
% 
Consider a sample vector $\bm s \in \{1,\dots,P\}^{M+N_d}$, which represents a
sample of words in the vocabulary and stores the indices of the $N_d$ positive words (those appearing in document $d$) and the indices of the $M$ sampled words.
Let $\xi_{d,i}^{\prime} := \xi_{d,i} - \ln(Q_{di} / P)$ if $y_i=0$ (\ie, word $i$ does not appear in document $d$), and
$\xi_{d,i}^{\prime} := \xi_{d,i} -\ln (Q_{di})$ otherwise, where $Q_{di}$ is the proposal distribution.
We shift the true logits by the expected number of occurrences of a word $i$, ensuring that the sampled softmax is asymptotically unbiased. In our experiment we choose $Q$ to be a uniform distribution over the subset of words considered, so $Q_{di} = 1 / (N_d + M)$ \citep{jean2014using}.
Then:
\begin{align}
    % \E_{q(\bm\beta | H)q(\bm\eta_d)}\sum_{n=1}^{N_d}\left[ \nabla \bm\xi_{d,n} 
    % - \sum_{i=1}^{P}\frac{\exp(\xi_{i}) }{\sum_{j=1}^{P}\exp(\xi_{j})}\nabla \xi_{d,i} \right] \\
\nabla \mathcal{\tilde{L}}_W  \approx \E_{q(\bm\beta | H)q(\bm\eta_d)}\sum_{n=1}^{N_d}\left[ \nabla \bm\xi_{d,n} 
    - \sum_{i=1}^{M+N_d}\frac{\exp(\xi^{\prime}_i) }{\sum_{j=1}^{M+N_d}\exp(\xi^{\prime}_j)}\nabla \xi_{d,i} \right].
%     \\
% = \E_{q(\bm\beta | H)q(\bm\eta_d)}\left[ \sum_{n=1}^{N_d} \nabla \bm\xi_{d,n} 
%     - N_d \sum_{i=1}^{M+N_d}\frac{\exp(\xi^{\prime}_i) }{\sum_{j=1}^{M+N_d}\exp(\xi^{\prime}_j)}\nabla \xi_{d,i} \right]
\end{align}

\section{FITC}
The FITC approximation for the multi-output Gaussian process results in the following formulation:
\begin{align*}
    p_{\text{FITC}}&(\beta|U, Z_X, Z_H, X, H) \\
    = \N(&\beta|  K_{fu}K_{uu}^{-1}(U^\top)_:, \diag{K_{ff} - K_{fu}K_{uu}^{-1}K_{uf}}),
\end{align*}

where $\diag{\cdot}$ returns a diagonal matrix while keeping the diagonal entries, and $A_:$ denotes $\mathrm{vec}(A)$, the column-wise vectorisation of the matrix $A$.
Since $K_{fu}$, $K_{ff}$ and $K_{uu}$ have a Kronecker structure, we can rewrite mean and covariance to compute them efficiently as follows
\begin{align*}
    K_{fu}K_{uu}^{-1}(U^\top)_: = (K_{fu}^X{K_{uu}^X}^{-1}U^\top {K_{uu}^H}^{-\top}{K_{fu}^H}^\top)_:
\end{align*}
\begin{align*}
    & \diag{K_{ff} - K_{fu}K_{uu}^{-1}K_{uf}} \\
    &= \diag{K_{ff}} \\
    &- \left(\diag{K_{fu}^H{K_{uu}^H}^{-1}{K_{fu}^H}^\top} \otimes \diag{K_{fu}^X{K_{uu}^X}^{-1}{K_{fu}^X}^\top}\right)_:
\end{align*}
Note that the last line is a vectorised outer product between two vectors, which can be computed efficiently. We can use the same trick for $\diag{K_{ff}}$.

The full derivation is the following:

\begin{align*}
    &K_{fu}K_{uu}^{-1}(U^\top)_: \\
    &=
    (K_{fu}^H \otimes K_{fu}^X)(K_{uu}^H \otimes K_{uu}^X)^{-1}(U^\top)_: \\
    &= (K_{fu}^H \otimes K_{fu}^X)({K_{uu}^H}^{-1} \otimes {K_{uu}^X}^{-1})(U^\top)_: \\
    &= (K_{fu}^H{K_{uu}^H}^{-1} \otimes K_{fu}^X{K_{uu}^X}^{-1})(U^\top)_: \quad \text{(matrix eq)} \\
    &= (K_{fu}^X{K_{uu}^X}^{-1}U^\top(K_{fu}^H{K_{uu}^H}^{-1})^\top)_:\\
    &= (K_{fu}^X{K_{uu}^X}^{-1}U^\top {K_{uu}^H}^{-\top}{K_{fu}^H}^\top)_:
\end{align*}

\begin{align*}
    & \diag{K_{ff} - K_{fu}K_{uu}^{-1}K_{uf}} \\
    &= \diag{K_{ff}} - \diag{K_{fu}K_{uu}^{-1}K_{uf}} \\
    &= \diag{K_{ff}} - \diag{(K_{fu}^H{K_{uu}^H}^{-1} \otimes K_{fu}^X{K_{uu}^X}^{-1})(K_{fu}^H \otimes K_{fu}^X)^{\top}} \\
    &= \diag{K_{ff}} - \diag{K_{fu}^H{K_{uu}^H}^{-1}{K_{fu}^H}^\top \otimes K_{fu}^X{K_{uu}^X}^{-1}{K_{fu}^X}^\top} \\
    &= \diag{K_{ff}} - \left(\diag{K_{fu}^H{K_{uu}^H}^{-1}{K_{fu}^H}^\top} \otimes \diag{K_{fu}^X{K_{uu}^X}^{-1}{K_{fu}^X}^\top}\right)_:
\end{align*}

\section{Matrix Normal Distribution}
The matrix normal is related to the multivariate normal distribution in the following way:
\begin{align}
\mathbf{X} \sim \mathcal{MN}_{n\times p}(\mathbf{M}, \mathbf{U}, \mathbf{V}),
\end{align}
if and only if
\begin{align}
\mathrm{vec}(\mathbf{X}) \sim \mathcal{N}_{np}(\mathrm{vec}(\mathbf{M}), \mathbf{V} \otimes \mathbf{U})
\end{align}
where $\otimes$  denotes the Kronecker product and $\mathrm{vec}(\mathbf{M})$ denotes the vectorization of $\mathbf {M}$.

Sampling from the distribution and the $\kl$ divergence can be computed efficiently.
$U_{\bm\beta_k}$ can be sampled efficiently following the procedure:
\begin{enumerate*}[label=\emph{(\roman*)}]
    \item sample ${C}\sim\mathcal{MN}_{h\times x}(\mathbf{0},{I},{I})$, $C\in\R^{h\times x}$, a collection of independent samples from a standard normal distribution; then  
    \item let $U_{\bm\beta_k} = (M + {ACB})_:$, where $\Sigma^H = AA^\top$ and $\Sigma^\tV = B^\top B$.
\end{enumerate*}
%
The $\kl$ divergence between $q(U_{\bm\beta_k})$ and $p(U_{\bm\beta_k})$ can also be computed efficiently (see the KL divergence derivation below).

Sampling from the matrix normal distribution is a special case of the sampling procedure for the multivariate normal distribution. Let $\mathbf{X}$  be an $n$ by $p$ matrix of $np$ independent samples from the standard normal distribution, so that

\begin{align}
\mathbf{X}\sim\mathcal{MN}_{n\times p}(\mathbf{0},\mathbf{I},\mathbf{I}).
\end{align}
Then let

$\mathbf{Y}=\mathbf{M}+\mathbf{A}\mathbf{X}\mathbf{B}$,
so that

\begin{align}
\mathbf{Y}\sim\mathcal{MN}_{n\times p}(\mathbf{M},\mathbf{AA}^T,\mathbf{B}^T\mathbf{B}),
\end{align}
where $A$ and $B$ can be chosen by Cholesky decomposition or a similar matrix square root operation.

\subsection{KL Divergence}
The KL divergence between two matrix-variate normal distributions, \eg, $q(U_{\bm\beta_k})$ and $p(U_{\bm\beta_k})$, can be analytically computed as:
\begin{align*}
    &\kl(q(U_{\bm\beta_k}) || p(U_{\bm\beta_k})) = \frac12 \bigg( M_x\log \frac{|K^H_{uu}|}{|\Sigma^H|} + M_H\log \frac{|K^\tV_{uu}|}{|\Sigma^\tV|} \\
    &\hspace{9mm}+ \tr(M^\top(K^\tV_{uu})^{-1}M(K^H_{uu})^{-1}) + \tr((K^H_{uu})^{-1}\Sigma^H) 
    \tr((K^\tV_{uu})^{-1}\Sigma^\tV) - M_H M_x \bigg).
\end{align*}

To implement $\tr[M^\top (K^\tV)^{-1}M(K^H)^{-1}]$, we use $K^\tV = L_\tV L_\tV^\top$, 
$K^H = L_H L_H^\top$, $A = L_\tV^{-1} M L_H^{-\top}$, then $\tr[M^\top (K^\tV)^{-1}M(K^H)^{-1}] = \tr(A^\top A)$.

First, recall that
\begin{align}
\kl(q || p) = \int q(x) (\log q(x) - \log p(x)) \dif x,
\end{align}

which, in the case of two multivariate Gaussian distributions, say $q(x) = \N(m_1, S_1)$ and $p(x) = \N(m_2, S_2)$, is equal to
\begin{align}
    &\int \big[ \frac12 \log\frac{|S_2|}{|S_1|} - \frac12 (x-m_1)^\top S_1^{-1}(x-m_1) + \frac12 (x-m_2)^\top S_2^{-1}(x-m_2) \big] q(x) \dif x \\
    &= \frac12 \log\frac{|S_2|}{|S_1|} - \frac12 \tr\big\{\E[(x-m_1)(x-m_1)^\top]S_1^{-1}\big\} + \frac12 \E[ (x-m_2)^\top S_2^{-1}(x-m_2)] \\
    &= \frac12 \log\frac{|S_2|}{|S_1|} - \frac12 \tr\{\E[(x-m_1)(x-m_1)^\top]S_1^{-1}\} + \frac12 \E[(x-m_2)^\top S_2^{-1}(x-m_2)] \\
    &= \frac12 \log\frac{|S_2|}{|S_1|} - \frac12 \tr\{I_d\} + \frac12 (m_1-m_2)^\top S_2^{-1}(m_1-m_2) + \frac12 \tr\{S_2^{-1}S_1\} \\
    &= \frac12 [\log\frac{|S_2|}{|S_1|} - d + \tr\{S_2^{-1}S_1\} + (m_1-m_2)^\top S_2^{-1}(m_1-m_2) ]
\end{align}

Now, we can use a Kronecker representation of $S_1$ and $S_2$ as
$S_1 = S_h \otimes S_x$ and $S_2 = K_h \otimes K_x$. Let $M = m_1 - m_2$. Also, we consider a vectorised version of $M$, and we indicate it as $M_:$.
Then the KL divergence becomes:
(using $|V\otimes U| = |V|^n|U|^p$, and mixed product property of Kron)
\begin{align}
    &\frac12 [\log\frac{|K_h \otimes K_x|}{|S_h \otimes S_x|} - d + 
    \tr\{(K_h \otimes K_x)^{-1}(S_h \otimes S_x)\}
    + M_:^\top (K_h \otimes K_x)^{-1}M_: ] \\
    % 
    &= \frac12 [ n\log\frac{|K_h|}{|S_h|} + p\log\frac{|K_x|}{|S_x|} - np + 
    \tr\{(K_h^{-1} \otimes K_x^{-1})(S_h \otimes S_x)\}
    + M_:^\top (K_h^{-1} \otimes K_x^{-1})M_: ] \\
    % 
    &= \frac12 [ n\log\frac{|K_h|}{|S_h|} + p\log\frac{|K_x|}{|S_x|} - np + 
    \tr\{(K_h^{-1} S_h)\otimes(K_x^{-1} S_x)\}
    + M_:^\top ((K_h^{-1} \otimes K_x^{-1})M_:)]  \quad \text{(associativity)}\\
    % 
    &= \frac12 [ n\log\frac{|K_h|}{|S_h|} + p\log\frac{|K_x|}{|S_x|} - np + 
    \tr(K_h^{-1}S_h)\tr(K_x^{-1} S_x)
    + M_:^\top (K_x^{-1}MK_h^{-1})_: ] \quad \text{(Kronecker matrix equation)} \\
    % 
    &= \frac12 [ n\log\frac{|K_h|}{|S_h|} + p\log\frac{|K_x|}{|S_x|} - np + 
    \tr(K_h^{-1}S_h)\tr(K_x^{-1} S_x)
    + \tr[M^\top K_x^{-1}MK_h^{-1}] ]
\end{align}

\section{Variational Inference for Gaussian and Wishart process}
\paragraph{Inference for $\bm\mu$.}
We first augment the Gaussian process with a set of auxiliary variables and a set of corresponding time stamps, \ie, 
\begin{align}
p(\bm\mu|\tV) = \int p(\bm\mu| \uM_\mu, \tV, \zV_\mu) p(\uM_\mu| \zV_\mu) \dif \uM_\mu, \label{eqn:aux_mu}
\end{align}
where $\uM_\mu$ is the auxiliary variable for $\bm\mu$ and $\zV_\mu$ is the corresponding index. 
Both $p(\bm\mu| \uM_\mu, \tV, \zV_\mu)$ and $p(\uM_\mu| \zV_\mu)$ follow the same Gaussian processes as the one for $p(\bm\mu|\tV)$, \ie, these Gaussian processes have the same mean and kernel functions. 
As shown in \Cref{eqn:aux_mu}, the above augmentation does not change the prior distributions for $\bm\mu$.

The variational posterior of $\bm\mu$ is constructed in a special form to enable efficient inference \citep{pmlr-v5-titsias09a}: $q(\bm\mu, \uM_\mu) = p(\bm\mu| \uM_\mu) q(\uM_\mu)$. $q(\uM_\mu) = \mathcal{N}(\mM_{\mu},\sM_{\mu})$ is a multivariate normal distribution, in which the mean and covariance are variational parameters.
$p(\bm\mu| \uM_\mu)$ is a conditional Gaussian process \citep{hensman2013gaussian}.
When $\bm\mu$ is used in the down-stream distributions, a lower bound can be derived,
\begin{align}
\log p(\cdot|\bm\mu) \geq \mathbb{E}_{q(\bm\mu)}[\log p(\cdot|\bm\mu)] - \text{KL}\left(q(\uM_\mu)||p(\uM_\mu)\right), \label{eqn:bound_mu}
\end{align}
where $q(\bm\mu) = \int p(\bm\mu| \uM_\mu) q(\uM_\mu) \dif \uM_\mu$.

\paragraph{Inference for $\Sigma$.}
We derive a similar stochastic variational inference method for the Wishart Process. 
We augment each GP $p(\fV_{ij}|\tV)$ in the Wishart process with a set of auxiliary variables and a set of the corresponding inputs, 
\begin{align}
p(\fV_{ij}|\tV) = \int p(\fV_{ij}| \uV_{ij}, \tV, \zV_{ij}) p(\uV_{ij}| \zV_{ij}) \dif \uV_{ij}, 
\end{align}
where $\uV_{ij}$ is the auxiliary variable, $\zV_{ij}$ are the corresponding inputs and $p(\fV_{ij}| \uV_{ij})$
is a conditional Gaussian process \citep{hensman2013gaussian}.
%
We define the variational posterior of $\fV_{ij}$ to be $q(\fV_{ij}, \uV_{ij}) = p(\fV_{ij}| \uV_{ij}) q(\uV_{ij})$, where $q(\uV_{ij}) = \N(\bm m_{ij}, \bm s_{ij})$. We also define the variational posterior of $\bm{\ell}$ to be $q(\bm{\ell}) = \N(\bm{m}_\ell, {S}_\ell)$, where ${S}_\ell$ is a diagonal matrix. As the diagonal elements of $\lM{}$ need to be positive, we apply a {change of variable} to the variational posterior of the diagonal elements, \ie, $\ell_m = \log(1+\exp(\hat{\ell}_m)), q(\hat{\ell}_m) = \N(\bm m_{\ell_m}, S_{\ell_m})$.
% 
Note that $\bm z_\mu$ and $\bm z_{ij}$ are variational parameters instead of random variables. For this reason, we will omit them from the notation for convenience.

We can derive a variational lower bound with such a set of variational posteriors for all the entries $\{\fV_{ij}\}$ and $\bm{\ell}$, when $\cov$ is used for some down-stream distributions,
\begin{align}\label{eqn:bound_sigma}
% &\log p(\cdot|\cov) \geq \mathbb{E}_{q(\fM)}[p(\cdot|\cov)] - \sum_{i,j} \text{KL}\left(q(\uV_{ij})||p(\uV_{ij})\right),\\
&\log p(\cdot|\cov) \geq \mathbb{E}_{q(\fM)q(\bm\ell)}[\log p(\cdot|\cov)] - \sum_{i,j} \text{KL}\left(q(\uV_{ij})||p(\uV_{ij})\right) 
- \text{KL}\left(q(\bm{\ell})||p(\bm{\ell})\right),
\end{align}
where $q(\fM) = \prod_{ij}\int p(\fV_{ij}| \uV_{ij}) q(\uV_{ij}) \dif \uV_{ij}$.

\subsection{Lower Bound for \us{}}
After deriving the variational lower bounds for the individual components of \us{}, we assemble these components together to form the final variational lower bound.
The word distributions for individual topics are used in defining the distribution of individual words for each document $d$, $p(W_d|\bm\eta_d, \bm\beta^{(\tV_d)})$. 
% To simplify the notation, we follow \cite{tomasi2020stochastic} and 
% Let $q(\Sigma) \propto q(\bm\ell)q(F)$, and $\kl(q(\Sigma) || p(\Sigma)) = \sum_{i,j} \kl\left(q(\uV_{ij})||p(\uV_{ij})\right) + \kl\left(q(\bm{\ell})||p(\bm{\ell})\right)$.
Combining the lower bounds (10), (4), (8) from the main paper with %\eqref{eqn:bound_doc}, \eqref{eq:bound_H}, \eqref{eqn:bound_beta}, 
\eqref{eqn:bound_mu} and \eqref{eqn:bound_sigma}, 
% and the lower bounds for GPs and GWP,
we can derive the complete variational lower bound $\mathcal{L}$ of \us{}.
\begin{align*}
\begin{split}
\log ~&p(W) \geq  \mathbb{E}_{q(\bm\mu)q(\bm\ell)q(F)q(\bm\beta)}\left[\mathcal{L}_W\right]
% &\sum_{d=1}^D \Big( \mathbb{E}_{q(\bm\eta_d)q(\bm\beta^{(\tV_d)})}\left[\log p(W_d | \bm\eta_{d}, \bm\beta^{(\tV_d)})\right] \\
% &\hspace{2mm}-\mathbb{E}_{q(\bm\eta_d)q(\bm\mu_{\tV_d})q(\Sigma_{\tV_d})}\left[\kl\left(q(\bm\eta_d)||p(\bm\eta_d| \bm\mu_{\tV_d}, \cov_{\tV_d})\right)\right] \Big) \\
- \kl\left(q(\uM_\beta)||p(\uM_\beta)\right) - \kl\left(q(H)||p(H)\right) \\
&- \kl\left(q(\uM_\mu)||p(\uM_\mu)\right)  - \kl\left(q(\bm{\ell})||p(\bm{\ell})\right) - \sum_{i,j} \kl\left(q(\uV_{ij})||p(\uV_{ij})\right)  = \mathcal{L}.
\end{split}
\end{align*}
% 
The first term of $\mathcal{L}$ can be further decomposed by plugging in (3),%\Cref{eqn:bound_doc},
\begin{align}\label{eq:empirical-elbo}
&\E_{q(\bm\mu)q(\bm\ell)q(F)q(\bm\beta)}\left[\mathcal{L}_W\right] = \nonumber\\
&\hspace{2mm}\sum_{d=1}^D \Big( \E_{q(\bm\eta_d)q(\bm\beta)}\left[\log p(W_d | \bm\eta_{d}, \bm\beta^{(\tV_d)})\right] - \mathbb{E}_{q(\bm\eta_d)q(\bm\mu_{\tV_d})q(\Sigma_{\tV_d})}\left[\kl\left(q(\bm\eta_d)||p(\bm\eta_d| \bm\mu_{\tV_d}, \cov_{\tV_d})\right)\right] \Big). \nonumber
\end{align}
% This formulation allows us to easily perform mini-batch training by data sub-sampling. For each mini-batch, we randomly sub-sampling the data set and re-weight the term $\mathbb{E}_{q(\bm\mu)q(\Sigma)q(\bm\beta)}\left[\mathcal{L}_W\right]$ according to the ratio between the size of dataset and the size of the mini-batch as shown in \Cref{eqn:minibatch_eta}.
Note that all variational parameters of $q(\bm\mu)$, $q(\bm\ell)$, $q(F)$, $q(\bm\beta)$, $q(\bm\eta)$ are optimised.

\section{Datasets}
We include a complete list of details for the datasets we used in our analysis.

We considered the following datasets: State of the Union corpus (\emph{SotU}), department of justice press releases (\emph{DoJ}), Elsevier corpus (\emph{Abstracts}) \citep{kershaw2020elsevier}, Blog Authorship Corpus (\emph{Blogs}) \citep{schler2006effects}, NeurIPS conference papers (\emph{NeurIPS}) \citep{perrone2017poisson}, A Million News Headlines (\emph{News}), Twitter sentiment classification (\emph{Twitter}) \citep{go2009twitter}.

\paragraph{State of the Union corpus (1790-2018).\footnote{\url{https://kaggle.com/rtatman/state-of-the-union-corpus-1989-2017}}}
The dataset includes a yearly address of the US president, from 1790 to 2018 (229 years).
Our vocabulary includes 1442 words after preprocessing. We split the data into 170 documents as training and 57 documents as test data. License: CC BY-SA 4.0

\paragraph{Department of justice press releases (2009-2018).\footnote{\url{https://kaggle.com/jbencina/department-of-justice-20092018-press-releases}}}
The dataset includes 13087 press releases from the Department of Justice 
from 2009 to 2018 (115 unique timestamps), preprocessed to include 2622 unique words. Documents were split into 9674 for training and 3413 for testing. License: CC0: Public Domain

\paragraph{Elsevier OA CC-BY Corpus \citep{kershaw2020elsevier}.\footnote{\url{https://data.mendeley.com/datasets/zm33cdndxs/2}}} 
The dataset includes 40k open access (OA) CC-BY abstracts taken from articles from across Elsevier’s journals, published from 2010 to 2019. We consider a random sample of 6898 for training and the same size for testing, including 3000 words in the vocabulary. License: CC BY 4.0
% We also consider a smaller subset for comparison purposes, comprising 2929 documents in the training set and 2929 in the test set, including only 1000 words in the vocabulary.

\paragraph{Blog Authorship Corpus \citep{schler2006effects}.\footnote{\url{https://u.cs.biu.ac.il/~koppel/BlogCorpus.htm}}} 
The corpus consists of the posts of 19k bloggers gathered from \texttt{blogger.com} from June 1999 to August 2004. The corpus incorporates a total of 681k posts, from which we draw a random sample of 5649 for training and 5650 for testing. After our preprocessing, we considered 3000 words in our vocabulary. License: free use for non-commercial research purposes.

\paragraph{NeurIPS conference papers (1987-2015) \citep{perrone2017poisson}.\footnote{\url{https://archive.ics.uci.edu/ml/datasets/NIPS+Conference+Papers+1987-2015}}}
The dataset includes 5804 conference papers from 1987 to 2015 including an average of 34 papers per year. 
We preprocessed the dataset leading to 4799 words. We used 4237 documents as training data and 1567 as test data.

\paragraph{A Million News Headlines.\footnote{\url{https://kaggle.com/therohk/million-headlines}}} 
The dataset includes 1.2M news headlines published over a period of 17 years (from 2003 to 2019). After our preprocessing, we used a random sample of size 8526 for training and 2822 for test purposes with a vocabulary size of 3000. License: CC0: Public Domain
% We also consider a smaller dataset with vocabulary size of 1000, comprising 3732 training and 1207 test samples.

\paragraph{Twitter sentiment classification \citep{go2009twitter}.\footnote{\url{https://www.kaggle.com/kazanova/sentiment140}}}
The dataset contains 1.6M tweets, from April to May 2009. We randomly sampled 4525 tweets for training and the same number for testing for computational efficiency. 
We preprocessed samples using a tweet tokenizer, removing usernames and replacing repeated character sequences (length 3 or more) with sequences of length 3 \citep{bird2009natural}.
After our preprocessing we considered 3000 tokens. %9217 tokens.

For the last two datasets we consider an extended version with the highest number of tokens available, that is 22459 for the headlines dataset, and 83582 for Twitter, and we subsample 1M documents in each dataset in our experiments. We refer to such datasets as \textit{extended}. However, as both the samples and the dimensionality are different, they are effectively different datasets (hence not comparable) with respect to their smaller counterparts.

\subsection{Experiment settings}
We split each dataset considering 75\% of the samples as training and 25\% as test. Documents associated with the same time stamps were assigned to the same split.

For each dynamic topic model we used a Mat\'ern 3/2 kernel for $\bm\beta$, to allow topics to quickly incorporate new words. This is important especially to incorporate neologisms, and particularly for datasets such as NeurIPS conference papers and Elsevier corpus, where the names of novel models become quoted in citations (for example, ``LDA'' starting to appear in publications together with ``topic modeling'' after its introduction in 2003). 
For the other parameters $\bm\mu$ and $\bm f$ we use a squared exponential kernel, as we expect a smooth temporal evolution of both topic probabilities and their correlation. 
We initialise amplitude and length scale of kernels as $1$ and $0.5$ respectively, and we optimise for them %along with other prior parameters for the distribution of $\beta$, $\mu$ and $f$, 
using the approximate empirical Bayes approach~\citep{maritz2018empirical}. 

%\paragraph{Hyperparameters.}
%We implemented our models using the TensorFlow framework \citep{dillon2017tensorflow}. 
Experiments were conducted using Adam optimiser with learning rate $0.001$ and up to $10$k epochs until convergence.
% 
We experimented with different numbers of topics, and report the results using a default choice of 30 for all datasets (20 for SotU) to maintain consistency with previous works.
We also experimented with different numbers of inducing points for the three components $\bm\beta$, $\bm\mu$ and $\bm f$, thus controlling the complexity of the variational posterior used by both DCTM and our models (static models such as LDA and CTM do not have such dynamic components). The number of inducing points used for such components is 15, 20 and 15, respectively. \us{} has an additional component for the latent embedding of words in $\bm\beta$; we used $M_H = 200$ in $Q = 10$ dimensions.
We initialised the posterior for $H$ by transforming the words in our vocabulary using ELMO embeddings \citep{peters2018deep} pre-trained on the 1 Billion Word Benchmark, and take the first $Q$ principal components using a PCA transformation.

For the posterior of $\bm\eta$, when using a static encoder (\eg, for DCTM) we considered a dense neural network with three layers with size $500$, $300$ and $200$, respectively. To account for the increased input dimensionality in our meta-encoder we instead used a dense neural network with three layers, with size $1000$, $600$ and $400$, respectively.\footnote{Since the encoder is a collection of variational parameters, we emphasise that increasing its size does not overfit the model.}

\section{Approximate Perplexity}
The perplexity metric is computed as:
\begin{align}
  \text { perplexity }=\exp \left\{-\frac{1}{\left|D\right|} \sum_{d \in D} \frac{1}{N_d}\E_{q(\bm\beta | H)q(\bm\eta_d)}\sum_{n=1}^{N_d}\left[\log p(W_d | \bm\xi_{d})\right]\right\}
\end{align}
However, in case of a large vocabulary the log probability cannot be computed exactly, so we need to resort to an approximation by sampling $M$ random negative words, which do not appear in the document:
\begin{align}\label{eq:approx-perplexity}
    \E_{q(\bm\beta | H)q(\bm\eta_d)}\sum_{n=1}^{N_d}\left[\log p(W_d | \bm\xi_{d})\right]
\approx \E_{q(\bm\beta | H)q(\bm\eta_d)}\left[\sum_{n=1}^{N_d} \bm\xi_{d,n} 
    - \log\sum_{n=1}^{M+N_d} \exp\left(\bm\xi_{d,n} - \log(r_{nd})\right) \right],
%     \\
% = \E_{q(\bm\beta | H)q(\bm\eta_d)}\left[ \sum_{n=1}^{N_d} \nabla \bm\xi_{d,n} 
%     - N_d \sum_{i=1}^{M+N_d}\frac{\exp(\xi^{\prime}_i) }{\sum_{j=1}^{M+N_d}\exp(\xi^{\prime}_j)}\nabla \xi_{d,i} \right]
\end{align}
where $r_{nd} = M / (P - N_d)$ if $n$ is one of the negative words (and $r_{nd} = 1$ otherwise) is the uniform probability of picking word $n$.
% . The ELBO for a test document $d^*$ is computed using \Cref{eq:empirical-elbo}. 

\section{Additional Results}
\paragraph{Meta-encoder effect.}
\begin{table}[t]
    \caption{Effect of using meta-encoder on the perplexity.}
    \label{tab:encoder-effect}
    \centering
\resizebox{0.99\linewidth}{!}{%
\begin{tabular}{cccccccc}\toprule
    Meta-Enc.  & SotU & DoJ & Abstracts & Blogs & News  & Twitter & NeurIPS \\\midrule
    Yes
    & \textbf{846.81} & 498.96 & \textbf{1120.13} & \textbf{949.76} & 1414.90 & \textbf{866.81} & \textbf{888.59} \\
    No & 852.86 & \textbf{493.66} & 1158.02 & 951.95 & \textbf{1414.37} & 871.13 & 967.97\\
    \bottomrule
    \end{tabular}}
\end{table}
% \begin{table}[t]
%     \caption{Effect of using meta-encoder on the perplexity.}
%     \label{tab:encoder-effect}
%     \centering
% \resizebox{0.5\linewidth}{!}{%
% \begin{tabular}{ccc}\toprule
%     Meta-Encoder  & Yes & No \\\midrule
%     SotU  & \textbf{846.81} & 852.86 \\
%     DoJ & 498.96 & \textbf{493.66}  \\
%     Abstracts & \textbf{1120.13}& 1158.02  \\
%     Blogs & \textbf{949.76} & 951.95  \\
%     News  & 1414.90 & \textbf{1414.37}  \\
%     Twitter & \textbf{866.81} & 871.13  \\
%     NeurIPS & \textbf{888.59} & 967.97 \\
%     \bottomrule
%     \end{tabular}
% }
% \end{table}

To highlight the benefit of using the meta-encoder, we also trained a set of \us{} with the encoders as defined in~\citep{tomasi2020stochastic}, in which the encoder only takes the document representation as inputs.
The comparison is shown in \Cref{tab:encoder-effect}.
%
\us{}, with the meta-encoder, performs better on five out of seven datasets than the one without the meta-encoder.
DoJ is the only dataset on which \us{} without the meta-encoder performs noticeably better.
It is special because the topics in DoJ change very little across time. 
% This result shows that our meta-encoder would not clearly benefit from encoding such dynamics when the temporal dynamics of $\bm\beta$ does not change much.
Indeed, this result suggests that the benefit of using our meta-encoder is limited when the temporal dynamics of $\bm\beta$ do not change much.
% \mounia{We argue that our meta-encoder would not clearly benefit from encoding such dynamics when the temporal dynamics of $\bm\beta$ does not change much.}

% The benefits are especially clear in documents with larger time span and large amount of documents. \todo{other}
% The results on NeurIPS allows for interesting considerations.
% The best performing model on the small dataset (1047 words) is \us-E, a variant of our model with no meta-encoder. This suggests that, in this case, incorporating dynamics in the posterior of $\bm\eta$ parameter may be redundant. Indeed, a closer inspection reveals that static models such as LDA and CTM perform already quite well, without considering temporal dynamics at all. In such cases, the number of documents over time is enough for a reliable inference.
% However, this does not hold when the dimension of the dataset increases. With 4799 words in the vocabulary (almost 5 times than the small dataset), the perplexity of the baselines more than doubles for LDA and CTM, and the performance gap between static and dynamic models increases. Here, the best performing model is \us{}, that includes our meta-encoder. Indeed, the benefit of our model is most visible when the number of words is higher, as their co-occurrence in documents is not enough to capture low frequency words.

% The performance difference between \us, \us-F and \us-E largely depends on the dataset at hand. Notably, \us-E outperforms the other versions in only two cases, NeurIPS (1047) and News (1000), the two small versions of the correspondent datasets. This suggests how, in presence of a restricted vocabulary, encoding a temporal information for every document is not necessarily beneficial. Vice versa, results on larger datasets emphasise how, incorporating additional contexts for the documents allows for a better inference of the words in the vocabulary.
% 
% \todo{The performance difference between FITC and not can go in the suppl if we have time}
% We note how in general the performance of \us{} is comparable to \us-F. As \us-F considers the FITC approximation the model is much faster than \us, but achieves comparable and sometimes even better results than the former. 

\paragraph{Coherence.}
To evaluate topics extracted from our model, we also compute a measure of coherence \citep{roder2015exploring}.
The coherence is a measure which quantifies how coherent a set of texts is, based on the topics extracted by the topic model. We compute the normalised pointwise mutual information (NPMI) score (the higher the better).
A limitation of such metric is that it requires a fixed set of topics to compute against a set of documents. However our topic model is dynamic, meaning each document could potentially have a different set of topics (in terms of the appearing words) based on the timestamp of the document. We approximate this measure to retrieve a static set of topics based on the most frequent words of a topic across all time points (\cref{tab:coherence}).
The results show that, while the metric is more appropriate to static models which infer independent topics (such as LDA), our model is able to outperform the state of the art on 4 out of 8 datasets.

\begin{table}[]
    \centering
    \caption{Coherence metric across datasets and methods. The higher the better.}
    \label{tab:coherence}
    \begin{tabular}{lcrrrr}
\toprule
{} &       \#words & LDA &       CTM &      DCTM &      MIST \\
\midrule
SotU          & 1442  & -0.592569 & -0.578372 & \textbf{-0.550996} & -0.561410 \\
DoJ           & 2622  & \textbf{-0.359855} & -0.413999 & -0.363901 & -0.366874 \\
Abstracts     & 3000  & -0.468714 & -0.446985 & -0.432409 & \textbf{-0.402740} \\
Blogs         &  3000 & \textbf{-0.222944} & -0.365012 & -0.387720 & -0.238367 \\
News    &  3000 & -0.343660 & -0.562780 & -0.592340 & \textbf{-0.297206} \\
Twitter & 3000  & -0.173941 & -0.547871 & -0.721763 & \textbf{-0.119556} \\
NeurIPS       &  4799 & -0.633054 & -0.400259 & -0.501386 & \textbf{-0.365073} \\
Twitter (ext.)   & 83582  &  \textbf{0.002190} & -0.226310 &       - & -0.122431 \\
\bottomrule
\end{tabular}
\end{table}

\begin{table}[t]
    \caption{Top 10 words associated with four popular topics.}\label{tab:word-topic-distribution}
    \centering
    \resizebox{0.6\columnwidth}{!}{%
    \begin{tabular}{cp{2em}l}
        \toprule
        Topic & Color & Top Words \\
        \midrule
18 & \cellcolor[HTML]{1f77b4} &      state time dynam sequenc model transit use process observ trajectori \\
6  & \cellcolor[HTML]{ff7f0e} &  estim distribut inform sampl probabl statist densiti mean entropi measur\\
8  & \cellcolor[HTML]{2ca02c} &          network unit layer input train output hidden learn neural weight\\
2  & \cellcolor[HTML]{d62728} &           imag object featur use pixel visual model segment recognit face\\
\bottomrule
\end{tabular}}
\end{table}
\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{neurips_p_eta.png}
    \caption{Mean of the mixture of topics $\sigma(\bm\eta)$ for a few selected topics throughout the time span predicted using \us{}.\protect\footnotemark{}}\label{fig:neurips-topics}
\end{figure}
\footnotetext{Computed as $\langle\sigma(\bm\eta_{t_*}) \rangle_{p(\bm\eta_{t_*}|D)}$, where $p(\bm\eta_{t_*}|D) = \int p(\bm\eta_{t_*}|\bm{\mu_{t_*}},\cov_{t_*} ) p(\bm{\mu_{t_*}}|D) p(\cov_{t_*}|D) \dif \bm{\mu_{t_*}} \dif  \cov_{t_*}$.}

\Cref{fig:neurips-topics} shows the mean of topics over time as computed by \us{} on the NeurIPS dataset.
% 
The distribution shows a decreasing trend for topic 8, associated with \emph{neural networks} (consistent with prior knowledge and previous results on this dataset).
Similar considerations can be made for topic 1, associated with \emph{neuroscience}.
Topic 19 (associated with \emph{topic modeling}) has a spike between the years 2004 and 2007. We can attribute this to the interest for topic modeling after the introduction of LDA in 2003, and the following publications of CTM and DTM in 2006.


\bibliography{main_bib}
\bibliographystyle{abbrvnat}

\end{document}

