\section{Proofs of the theorems and lemmas}
\subsection{Proofs on Model Structures and Representations}\label{ProofOfStructure}
\paragraph{Proof of Lemma \ref{lemma: convolution kernel}} It is clear that $h_{n_1}(t)$ is linear w.r.t.\ $x(s)$, for $n_1 = 1, \dots, l$ and $s = 0, \dots, t$. We can therefore write $h_{n_1}(t) = \sum_{s = 0}^{t} h(s, n_1) x(t - s)$ for some coefficients $h(s, n_1)$.
To prove this lemma, we first apply induction on $s$ and $n_1$ to prove the following statement:
\begin{equation}
    h(s, n_1) = \sum_{\substack{i_{1}+i_{2}+...+i_{n_1}=s\\i_{1},...,i_{n_1}\in\mathbb{N}}}\prod_{j=1}^{n_1}(A_{n_1-j+1}^{i_{n_1-j+1}}B_{n_1-j+1}), \forall n_1 = 1, \dots, l
\end{equation}
When $s = 0$, it is clear that $h(0, n_1) = \prod_{j=1}^{n_1}B_{n_1-j+1}$, $\forall n_1$.
When $n_1 = 1$, it is clear that $h(s, 1) = A_1^{s} B_1$, $\forall s$. 
Then suppose this representation for $h$ is true for $h(s, n_1 - 1)$ and $h(s-1, n_1)$, then we have 
\begin{align}
    h(s, n_1) &= A_{n_1}h(s-1, n_1) + B_{n_1}h(s, n_1 - 1)\\
    &= A_{n_1}\sum_{\substack{i_{1}+i_{2}+...+i_{n_1}=s-1\\i_{1},...,i_{n_1}\in\mathbb{N}}}\prod_{j=1}^{n_1}(A_{n_1-j+1}^{i_{n_1-j+1}}B_{n_1-j+1}) \\ &+ B_{n_1}\sum_{\substack{i_{1}+i_{2}+...+i_{n_1-1}=s\\i_{1},...,i_{n_1 - 1}\in\mathbb{N}}}\prod_{j=1}^{n_1 - 1}(A_{n_1-j}^{i_{n_1-j}}B_{n_1-j})\\
    &= \sum_{\substack{i_{1}+i_{2}+...+i_{n_1}=s\\i_{1},...,i_{n_1}\in\mathbb{N}\\ i_{n_1} \ge 1}}\prod_{j=1}^{n_1}(A_{n_1-j+1}^{i_{n_1-j+1}}B_{n_1-j+1}) \\ &+ \sum_{\substack{i_{1}+i_{2}+...+i_{n_1}=s\\i_{1},...,i_{n_1}\in\mathbb{N}\\ i_{n_1} = 0}}\prod_{j=1}^{n_1}(A_{n_1-j+1}^{i_{n_1-j+1}}B_{n_1-j+1})\\
    &= \sum_{\substack{i_{1}+i_{2}+...+i_{n_1}=s\\i_{1},...,i_{n_1}\in\mathbb{N}}}\prod_{j=1}^{n_1}(A_{n_1-j+1}^{i_{n_1-j+1}}B_{n_1-j+1})
\end{align}
Then we have that with the expression of $h(s-1, n_1)$ and $h(s, n_1 - 1)$, we can have the expression of $h(s, n_1)$ as above. Then with $h(0, n_1)$ and $h(s, 1)$, induction shows the correctness of the above expression. Thus
\begin{equation}
    \rho(t) = C^Th(t, l) = C^T\sum_{\substack{i_{1}+i_{2}+...+i_{l}=t\\i_{1},...,i_{l}\in\mathbb{N}}}\prod_{j=1}^{l}(A_{l-j+1}^{i_{l-j+1}}B_{l-j+1})
\end{equation}
\hfill $\square$

\subsection{Rewriting of Explicit Forms} \label{Rewriting}
Before we come to the proof of \textbf{Theorem \ref{theorem 4.1}}, we prove the following lemmas to aid our proofs. 

\paragraph{Definition 1} Denote by $F_k (\alpha_1, \dots, \alpha_n)$ the following function:
\begin{center}
    $F_k (\alpha_1, \dots, \alpha_n) = \sum_{i = 1}^n \frac{\alpha_i^{k+n-1}}{\prod_{j \ne i} (\alpha_i - \alpha_j)}$
\end{center}
where $n$ is the number of inputs taken into the function, and $\alpha_i$ are distinct complex numbers.
\paragraph{Corollary 1} $F_k (\alpha_1, \dots, \alpha_n) = F_k (\alpha_{\sigma (1)}, \dots, \alpha_{\sigma (n)})$ where $\sigma$ is a permutation. And $F_k (\alpha_1, \dots, \alpha_n) = F_k (\alpha_1, \dots, \alpha_n, 0)$. 
\par The corollary is easy to verify, so we omit the proof here. 
\paragraph{Lemma 3.} For all integer $k$ satisfying $ 0\le k \le n-2$, and distinct complex number $\alpha_1, \dots, \alpha_n$, we have 
\begin{center}
    $\sum_{i=1}^{n} \frac{\alpha_i^{k}}{\prod_{j \ne i} (\alpha_i - \alpha_j)} = 0$
\end{center}
\paragraph{Proof of Lemma 3.} 
For a given integer $0 \le k \le n-2$, consider the following polynomial in $x$ of degree at most $n-1$:
\begin{center}
    $f(x) = \sum_{i=1}^{n} \frac{\alpha_i^{k}\prod_{j \ne i} (x - \alpha_j)}{\prod_{j \ne i} (\alpha_i - \alpha_j)}$
\end{center}
It is obvious by the properties of Lagrange Interpolation Polynomial that $f(x) = x^k$, thus the coefficient of $x^{n-1}$ is $0$, this gives exactly
\begin{center}
    $\sum_{i=1}^{n} \frac{\alpha_i^{k}}{\prod_{j \ne i} (\alpha_i - \alpha_j)} = 0$
\end{center}
\hfill $\square$
\paragraph{Lemma 4.} Denote by $I(n, k)$ the following set
\begin{center}
    $I(n,k) = \{(i_1, \dots, i_n): \sum_{j = 1}^ni_{j} = k; \quad  i_{j} \ge 0, i_{j} \in \mathbb{Z}, \forall j \}$
\end{center}
Then for distinct $\alpha_1, \dots, \alpha_n$, we have
\begin{center}
    $\sum_{(i_1, \dots, i_n) \in I(n,k)} (\prod_{j = 1}^n \alpha_j^{i_j}) = F_k (\alpha_1, \dots, \alpha_n)$
\end{center}
\paragraph{Proof of Lemma 4.} We do induction over $n$. It is easy to calculate that when $n = 1$, $\forall k \ge 0$, 
\begin{center}
    $\sum_{(i_1, \dots, i_n) \in I(n,k)} (\prod_{j = 1}^n \alpha_j^{i_j}) = \alpha_1^k = F_k (\alpha_1)$
\end{center}
Now suppose that for $n-1$ and $\forall k \ge 0$, the equality holds. Then we consider the case of $n$. 
\begin{align}
    \sum_{(i_1, \dots, i_n) \in I(n,k)} (\prod_{j = 1}^n \alpha_j^{i_j}) &= \sum_{k_1 = 0}^k \alpha_{n}^{k - k_1}(\sum_{(i_1, \dots, i_{n-1}) \in I(n-1,k_1)} (\prod_{j = 1}^{n-1} \alpha_j^{i_j})) \\
    &= \sum_{k_1 = 0}^k \alpha_{n}^{k - k_1}F_{k_1}(\alpha_1, \dots, \alpha_{n-1})\\
    &= \sum_{k_1 = 0}^k  \sum_{i = 1}^{n-1} \frac{\alpha_i^{k_1+n-2}\alpha_{n}^{k - k_1}}{\prod_{j \ne i;j \ne n} (\alpha_i - \alpha_j)}\\
    &= \sum_{i = 1}^{n-1} \frac{\alpha_i^{k+n-1} - \alpha_n^{k+1}\alpha_i^{n-2}}{(\prod_{j \ne i; j \ne n} (\alpha_i - \alpha_j)) (\alpha_i - \alpha_n)}\\
    &= \sum_{i = 1}^{n-1} \frac{\alpha_i^{k+n-1} - \alpha_n^{k+1}\alpha_i^{n-2}}{\prod_{j \ne i} (\alpha_i - \alpha_j)}\\
    &= \sum_{i = 1}^{n-1} \frac{\alpha_i^{k+n-1}}{\prod_{j \ne i} (\alpha_i - \alpha_j)} + \alpha_n^{k+1}(0 - \sum_{i = 1}^{n-1} \frac{\alpha_i^{n-2}}{\prod_{j \ne i} (\alpha_i - \alpha_j)})\\
    &= \sum_{i = 1}^{n-1} \frac{\alpha_i^{k+n-1}}{\prod_{j \ne i} (\alpha_i - \alpha_j)} + \alpha_n^{k+1} (\frac{\alpha_n^{n-2}}{\prod_{j \ne n} (\alpha_n - \alpha_j)})\\
    &= F_k(\alpha_1, \dots, \alpha_n)
\end{align}
Where the second last step uses the result of \textbf{Lemma 3}. 
\hfill $\square$
\par As the set $\{(\alpha_1, \dots, \alpha_n): \text{$\alpha_1, \dots, \alpha_n$ are distinct}\}$ is open and dense in $\mathbb{C}^n$, and taking finite sum in the form of $\sum_{(i_1, \dots, i_n) \in I(n,k)} (\prod_{j = 1}^n \alpha_j^{i_j}) = F_k (\alpha_1, \dots, \alpha_n)$ is continuous w.r.t. $(\alpha_1, \dots, \alpha_n)$ under norms on $\mathbb{C}^n$, thus we can extend the definition of $F_k(\alpha_1, \dots, \alpha_n)$ to the entire $\mathbb{C}^n$ by taking limits. 
\paragraph{Corollary 2.} For a $l$ layer SSM $\rho \in \mathcal{H}_{\infty,l}^{m}$, suppose it is defined by matrix $A_i, i = 1, \dots, l$; $B_i, i = 1, \dots, l$ and $C$ as given in the formulation \ref{linear deep ssm}. Then we have 
\begin{equation}
    \rho(t)= \sum_{1\le j_{1},...,j_{l}\le m}C_{j_l}[\prod_{p=1}^{l-1}(B_{l-p+1})_{(j_{l-p+1}, j_{l-p})}] (B_1)_{j_1}[ \sum_{\substack{i_{1}+i_{2}+...+i_{l}=t\\i_{1},...,i_{l}\in\mathbb{N}}}\prod_{q=1}^{l}((A_{l-q+1})_{(j_{l-q+1}, j_{l-q+1})}^{i_{l-q+1}}) ]
\end{equation}
If we denote $\alpha(x,y)$ as $\alpha(x,y) = (A_x)_{(y,y)}$ for diagonal matrices $A_x$, then we have 
\begin{equation}
    \rho(t)= \sum_{1\le j_{1},...,j_{l}\le m}C_{j_l}[\prod_{p=1}^{l-1}(B_{l-p+1})_{(j_{l-p+1}, j_{l-p})}] (B_1)_{j_1}F_t(\alpha(1, j_1), \dots, \alpha(l, j_l))
\end{equation}
This corollary follows directly from \textbf{Lemma \ref{lemma: convolution kernel}} (expanding the matrix products entrywise, using that the $A_i$ are diagonal) together with \textbf{Lemma 4}.
\paragraph{Lemma 5.} Let $M \ge N$ and $T$ be fixed positive integers, and denote $B(\alpha, \epsilon) = \{c \in \mathbb{C}: |c-\alpha| < \epsilon\}$ for real number $\epsilon > 0$. Let $b_1, \dots, b_N$ be fixed complex numbers satisfying $\sum_{i = 1}^{N} b_i \ne 0$, and let $\alpha_1, \dots, \alpha_N$ be pairwise distinct non-zero complex numbers. Then for $T > N + M + 1$ and sufficiently small $\epsilon$, we have 
\begin{equation}
    \lim_{\epsilon \to 0}(\inf_{\substack{\beta_1, \dots, \beta_M \in \cup_{i = 1}^N B(\alpha_i, \epsilon)\\ c_1, \dots, c_M \in \mathbb{C},\ \sum_{i = 1}^M c_i = 0}} \sum_{t = 1} ^ T |(\sum_{i=1}^N b_i \alpha_i^t)-(\sum_{i = 1}^M c_i \beta_i^t)|) > f(\{\alpha_i\}, \{b_i\}) >0
\end{equation}
Where $f(\{\alpha_i\}, \{b_i\})$ is a real number independent of $\epsilon$. 
\paragraph{Proof of Lemma 5.} It is clear that for sufficiently small $\epsilon$, $\beta_i$ have to be non-zero, and $B(\alpha_i, \epsilon)$ are disjoint. Then we rewrite $(\sum_{i = 1}^M c_i \beta_i^t)$ as 
\begin{equation}
    \sum_{i = 1}^M c_i \beta_i^t = \sum_{j = 1}^N \sum_{\beta_{ji} \in B(\alpha_j, \epsilon)} c_{ji} \beta_{ji}^t
\end{equation}
Then by considering the first-order asymptotic of $\sum_{\beta_{ji} \in B(\alpha_j, \epsilon)} c_{ji} \beta_{ji}^t$ w.r.t. $\epsilon$, we have that 
\begin{equation}
    \sum_{\beta_{ji} \in B(\alpha_j, \epsilon)} c_{ji} \beta_{ji}^t = B_j(t) \alpha_j^t + o(1) 
\end{equation}
Where $B_j(t)$ are polynomials with $\sum_{j = 1}^N deg(B_j) \le M - N$ (The possible existence of polynomial comes from cases where $\sum_{\beta_{ji} \in B(\alpha_j, \epsilon)} c_{ji}= 0$, leading to cancellation of the originally highest order term. In the first order asymptotic we consider the highest order not canceled out in the summation). 
\par If $\sum_{j = 1}^N deg(B_j) \ne 0$, then it is clear that such $f(\{\alpha_i\}, \{b_i\})$ exists, as at least one $B_j(t)$ grows at least linearly w.r.t. $t$ but the corresponding $b_j$ remains constant. 
\par If $\sum_{j = 1}^N deg(B_j) = 0$, then there's no highest-order cancellation in the way above (canceling the entire term doesn't count in this case). Then, writing $B_j(t) = B_j$, the infimum becomes 
\begin{equation}
    \lim_{\epsilon \to 0}(\inf_{\sum_{i = 1}^N B_i = 0} \sum_{t = 1} ^ T |(\sum_{i=1}^N b_i \alpha_i^t)-(\sum_{i = 1}^N B_i \alpha_i^t)| + O(\epsilon))
\end{equation}
And it is clear from $\sum_{i = 1}^N b_i \ne 0$ that such limit is strictly greater than zero. Thus the $f(\{\alpha_i\}, \{b_i\})$ exists. 
\hfill $\square$



\subsection{Proofs of Main Theorems} \label{ProofMain}
\paragraph{Lemma 6.} $\mathcal{H}_{\infty,1}^{l(m-1)+2}\not\subseteq\mathcal{H}_{\infty,l}^{m}$. 
\paragraph{Proof of Lemma 6.} We consider $\rho \in \mathcal{H}_{\infty,1}^{l(m-1)+2}$ defined by matrices $A_0, B_0, C_0$, where $A_0 = Diag\{\sigma_1, \dots, \sigma_K\}$ with $K = l(m-1) +2$ and distinct nonzero $\sigma_i$'s. Then we have $\rho(t) = \sum_{i=1}^K (B_0)_i(C_0)_i \sigma_i^t$. 
Assume the contrary: if $\rho \in \mathcal{H}_{\infty,l}^{m}$, then suppose it is defined by $A_1, \dots, A_l$; $B_1, \dots, B_l$; $C$ as in the formulation, and denote $\alpha(x, y) = (A_x)_{(y,y)}$ to be the diagonal elements of $A_x$. According to the formulation in \textbf{Corollary 2}, it is clear that $\{\alpha(i,j): 1 \le i \le l, 1\le j \le m\} = \{\sigma_i: 1 \le i \le K\} \cup \{0\}$.
\par If there are any two non-zero $\alpha(x_1,y_1) = \alpha(x_2, y_2)$, then we perturb them up to the magnitude of $\epsilon$ such that all the non-zero $\alpha(x, y)$ are distinct, and satisfying that $\alpha(x, y) \in (\cup_{i = 1}^K B(\sigma_i, \epsilon)) \cup\{0\}$. As $\epsilon \to 0$, the perturbation of $\rho(t)$ also goes to $0$ for any fixed $t$.  

\par Now we consider $F_t(\alpha(1, j_1), \dots, \alpha(l, j_l))$, and eliminate all the zeros according to the rule in \textbf{Corollary 1}. We then write $F_t(\alpha(1, j_1), \dots, \alpha(l, j_l)) = F_t(\beta_1, \dots, \beta_{l_1})$, where $l_1 \ge 2$ because there are at most $l-2$ zeros, the $\beta_i$ are all the non-zero elements in $\{\alpha(i, j_i)\}$, and the $\beta_i$ are distinct because the non-zero $\{\alpha(i, j_i)\}$ have been perturbed to be distinct. 
\par Recalling the definition, we have
\begin{align}
    F_t(\beta_1, \dots, \beta_{l_1}) &= \sum_{i = 1}^{l_1} \frac{\beta_i^{t+l_1-1}}{\prod_{j \ne i} (\beta_i - \beta_j)}\\
    &= \sum_{i = 1}^{l_1} (\frac{\beta_i^{l_1-2}}{\prod_{j \ne i} (\beta_i - \beta_j)}) \beta_i^{t+1}
\end{align}
where the sum of coefficients is 
\begin{equation}
    \sum_{i = 1}^{l_1} (\frac{\beta_i^{l_1-2}}{\prod_{j \ne i} (\beta_i - \beta_j)}) = 0
\end{equation}
This follows from \textbf{Lemma 3} (applied with $n = l_1$ and $k = l_1 - 2$), and it applies to all such $F_t(\alpha(1, j_1), \dots, \alpha(l, j_l))$. Then by taking $N = K$, $M = lm$, $b_i = \frac{(B_0)_i(C_0)_i}{\sigma_i}$, $\alpha_i = \sigma_i$, and $\beta_j = \alpha(x,y)$ (with the perturbation applied), we have exactly the setting of \textbf{Lemma 5}. 
\par Then according to \textbf{Lemma 5}, we have that for the perturbed $\hat{\rho}(t)$, $\sum_{t = 1}^{2ml+1} |\rho(t) - \hat{\rho}(t)| > f_0 > 0$, where $f_0$ is completely determined by the original system defining $\rho$ and independent of the magnitude of perturbation $\epsilon$. 
\par However, we know that for any bounded range of $t$, the summation $\sum_{\substack{i_{1}+i_{2}+...+i_{l}=t\\i_{1},...,i_{l}\in\mathbb{N}}}\prod_{q=1}^{l}((A_{l-q+1})_{(j_{l-q+1}, j_{l-q+1})}^{i_{l-q+1}})$ in \textbf{Corollary 2} has perturbation bounded by a function of $\epsilon$ that goes to zero for $\epsilon \to 0$. This is in contradiction with the positive bound $f_0$. This concludes the proof. 
\hfill $\square$
\paragraph{Lemma 7.} $\mathcal{H}_{\infty,l}^{m}\subseteq\mathcal{H}_{\infty,1}^{lm}$.
\paragraph{Proof of Lemma 7.} For $\rho \in \mathcal{H}_{\infty,l}^{m}$ defined by matrices $A_i, i = 1, \dots, l$; $B_i, i = 1, \dots, l$; $C$ according to the formulation, we construct a $1$ layer SSM of width $ml$ defined by vector $C_0, B_0 \in \mathbb{C}^{ml \times 1}$ and $A_0 \in \mathbb{C}^{ml\times ml}$, such that the hidden state $f^{(t)}$ at time $t$ is exactly $({h}^T_1(t), \dots, {h}^T_l(t))^T$. 
\par We denote $A_0 = (A_0)_{(ij)}, 1 \le i, j\le l$, where each $(A_0)_{(ij)}$ is a $m \times m$ block. And $B_0 = (\beta_1, \dots, \beta_l)$, $C_0 = (c_1, \dots, c_l)$ are separated into $l$ of $m \times 1$ vectors stacked together.
\begin{equation}
    (A_0)_{(ij)}=\left\{\begin{matrix}
 (\prod_{k = 1}^{i-j} B_{i +1 - k}) A_j, i > j\\
 A_i, i = j\\
0, i < j
\end{matrix}\right.
\end{equation}
And $\beta_i = \prod_{j = 1}^{i} B_{i+1 -j}$, $C_0 = (0, \dots, 0, C)$. By direct calculation, we could verify that the $\hat\rho$ corresponding to the SSM defined by $A_0, B_0, C_0$ is exactly $\rho$, which means $\rho \in \mathcal{H}_{\infty,1}^{lm}$. 
\hfill $\square$
\paragraph{Corollary 3.} For distinct complex numbers $\gamma_1, \dots, \gamma_n$, we have that for non-negative integer $t$ 
\begin{equation}
    F_t(\gamma_1, \dots, \gamma_n) - \frac{\gamma_{n-1}}{\gamma_{n-1} - \gamma_{n}}F_t(\gamma_1, \dots, \gamma_{n-1}) = \frac{\gamma_n}{\gamma_n - \gamma_{n-1}}F_t(\gamma_1, \dots, \gamma_{n-2}, \gamma_{n})
\end{equation}
This result in \textbf{Corollary 3} could be derived directly by expanding $F_t$ according to \textbf{Definition 1} and simple calculations. 
\paragraph{Lemma 8.} Given $n$ distinct non-zero complex number $\beta_1, \dots, \beta_n$, and $n$ complex number $Z_1, \dots, Z_n$. If we define $H_k, k = 1, \dots, n$ to be
\begin{equation}
    H_k = \sum_{j = k}^n Z_j \frac{\beta_k\prod_{p = 1}^{k - 1}(\beta_j - \beta_p)}{\beta_j^{k}}
\end{equation}
Then we have 
\begin{equation}
    \sum_{k = 1}^n H_k F_t(\beta_1, \dots, \beta_k) = \sum_{k =1}^n Z_k \beta_k^t
\end{equation}
Furthermore, if $|\beta_i|$ is in an non-decreasing order, then we have $|H_k| \le 2^n \max_{1\le i \le n} |Z_i|, \forall k$. 
\paragraph{Proof of Lemma 8.} We use induction to prove the following stronger result for all $u = 1, \dots, n$:
\begin{equation}
    \sum_{k = u}^n H_k F_t(\beta_1, \dots, \beta_k) = \sum_{k = u}^{n} Z_k \frac{\prod_{p =1}^{u-1}(\beta_k - \beta_p)}{\beta_k^{u-1}}F_t(\beta_1, \dots, \beta_{u-1}, \beta_k)
\end{equation}
In particular, when $u = 1$, it becomes exactly the result we want. 
\par When $u = n$, it is clear that the above is true. We then calculate the following for $u \ge 2$:
\begin{align}
    &\sum_{k = u-1}^{n} Z_k \frac{\prod_{p =1}^{u-2}(\beta_k - \beta_p)}{\beta_k^{u-2}}F_t(\beta_1, \dots, \beta_{u-2}, \beta_k) - \sum_{k = u}^{n} Z_k \frac{\prod_{p =1}^{u-1}(\beta_k - \beta_p)}{\beta_k^{u-1}}F_t(\beta_1, \dots, \beta_{u-1}, \beta_k)\\
    &= Z_{u-1} \frac{\prod_{p =1}^{u-2}(\beta_{u-1} - \beta_p)}{\beta_{u-1}^{u-2}}F_t(\beta_1, \dots, \beta_{u-2}, \beta_{u-1}) \\
    &+ \sum_{k = u}^{n} Z_k (\frac{\prod_{p =1}^{u-2}(\beta_k - \beta_p)}{\beta_k^{u-2}}F_t(\beta_1, \dots, \beta_{u-2}, \beta_k)-\frac{\prod_{p =1}^{u-1}(\beta_k - \beta_p)}{\beta_k^{u-1}}F_t(\beta_1, \dots, \beta_{u-1}, \beta_k))\\
    &= Z_{u-1} \frac{\prod_{p =1}^{u-2}(\beta_{u-1} - \beta_p)}{\beta_{u-1}^{u-2}}F_t(\beta_1, \dots, \beta_{u-2}, \beta_{u-1}) \\
    &+ \sum_{k = u}^{n} Z_k \frac{\prod_{p =1}^{u-1}(\beta_k - \beta_p)}{\beta_k^{u-1}}(\frac{\beta_k}{(\beta_k - \beta_{u-1})}F_t(\beta_1, \dots, \beta_{u-2}, \beta_k)-F_t(\beta_1, \dots, \beta_{u-1}, \beta_k))\\
    &= Z_{u-1} \frac{\prod_{p =1}^{u-2}(\beta_{u-1} - \beta_p)}{\beta_{u-1}^{u-2}}F_t(\beta_1, \dots, \beta_{u-2}, \beta_{u-1}) \\
    &+ \sum_{k = u}^{n} Z_k \frac{\prod_{p =1}^{u-1}(\beta_k - \beta_p)}{\beta_k^{u-1}}(\frac{\beta_{u-1}}{\beta_{k} - \beta_{u-1}}F_t(\beta_1, \dots,\beta_{u-2} , \beta_{u-1}))\\
    &= \sum_{k = u-1}^{n} Z_k \frac{\beta_{u-1}\prod_{p =1}^{u-2}(\beta_k - \beta_p)}{\beta_k^{u-1}}(F_t(\beta_1, \dots,\beta_{u-2} , \beta_{u-1}))\\
    &= H_{u-1}F_t(\beta_1, \dots,\beta_{u-2} , \beta_{u-1})
\end{align}
Then it is clear that the stronger result holds for all $u = 1, \dots, n$. Thus taking $u = 1$ we have the desired result. 
\par If the non-decreasing order of $|\beta_i|$ holds, then we have that 
\begin{align}
    |H_k| &= |\sum_{j = k}^n Z_j \frac{\beta_k\prod_{p = 1}^{k - 1}(\beta_j - \beta_p)}{\beta_j^{k}}| \\
    &\le \sum_{j = k}^n |Z_j| \cdot |\frac{\beta_k}{\beta_j}| \cdot {\prod_{p = 1}^{k - 1}|\frac{(\beta_j - \beta_p)}{\beta_j}|}\\
    &\le \sum_{j = k}^n |Z_j| \cdot 2^{k-1}\\
    &\le 2^n \max_{1\le i \le n} |Z_i|
\end{align}
\hfill $\square$
\paragraph{Lemma 9.} $\mathcal{H}_{\infty,1}^{l(m-1)+1}\subseteq\mathcal{H}_{\infty,l}^{m}$.
\paragraph{Proof of Lemma 9.} For simplicity we denote $K = l(m-1)+1$. 
\par Then for $\rho \in \mathcal{H}_{\infty,1}^{l(m-1)+1}$, suppose that it is defined by vectors $B_0, C_0$, and matrix $A_0 = Diag\{\sigma_1, \dots, \sigma_K\}$. Then from \textbf{Corollary 2} we have $\rho(t) = \sum_{i =1}^K (C_0)_i (B_0)_i \sigma_i^t$.  
\par As the case of $\sigma_i = \sigma_j$ or $\sigma_i = 0$ will lead to the degenerate case, which is weaker than the non-degenerate case, thus we do not consider them here. Now we assume that $\sigma_i$ are distinct and non-zero. Due to the symmetry of the above form, we can assume without loss of generality that $|\sigma_i|$ is in an non-decreasing order. 
\par we define the following functions:
\begin{equation}
    \alpha(i,j) = \left\{\begin{matrix}
 \sigma_j, \quad \text{for }i = 1; j = 1, \dots, m  \\
 \sigma_{(i-1)(m-1)+ j +1 }, \quad \text{for }i = 2, \dots, l; j = 1, \dots, m-1\\
0, \quad \text{for }i = 2, \dots, l; j = m
\end{matrix}\right.
\end{equation}
\begin{equation}
    Z(i,j) = \left\{\begin{matrix}
 (B_0)_j(C_0)_j, \quad \text{for }i = 1; j = 1, \dots, m  \\
 (B_0)_{(i-1)(m-1)+ j +1 }(C_0)_{(i-1)(m-1)+ j +1 }, \quad \text{for }i = 2, \dots, l; j = 1, \dots, m-1\\
0, \quad \text{for }i = 2, \dots, l; j = m
\end{matrix}\right.
\end{equation}
\begin{equation}
    H(i,j) = \left\{\begin{matrix}
 \sum_{p = i}^l Z(p, j) \frac{\alpha(i,j)\prod_{q = 1}^{i - 1}(\alpha(p, j) - \alpha(q, j))}{\alpha(p, j)^{i}}, \quad \text{for } i = 1, \dots, l; j = 1, \dots, m-1\\
 Z(1, m), \quad \text{for } i = 1; j = m\\
0, \quad \text{for }i = 2, \dots, l; j = m
\end{matrix}\right.
\end{equation}
Also, denote by $Z_0 = 2 *(\max_{1 \le i \le K} |(B_0)_i(C_0)_i|)^{\frac{1}{l+1}}$. 
\par Then we construct diagonal matrices $A_1, \dots, A_l$; matrices $B_2, \dots, B_l$; vectors $B_1, C$ such that the $l$ layer SSM of width $m$ defined by these parameters has exactly the same $\rho$. 
\begin{itemize}
    \item $B_1 = C = Z_0 1_m$, where $1_m$ is the vector consisting of all $1$.
    \item $(A_i)_{(j,j)} = \alpha(i,j)$, for $i = 1, \dots, l; j =1, \dots, m $.
    \item $(B_i)_{(j,j)} = Z_0$, for $i = 2, \dots, l-1; j = 1, \dots, m-1$. 
    \item $(B_i)_{(m,m)} = Z_0$, for $i = 3, \dots, l$.
    \item $(B_2)_{(m,m)} = \frac{H(1,m)}{Z_0^l}$. 
    \item $(B_l)_{(j,j)} = \frac{H(l,j)}{Z_0^l}$, for $j = 1, \dots, m-1$.
    \item $(B_i)_{(m, j)} = \frac{H(i-1,j)}{Z_0^l}$, for $i = 2, \dots, l; j = 1, \dots, m-1$
    \item All the unmentioned elements are set to zero.
\end{itemize}
Under such construction, we suppose it defines $\hat\rho$, then following \textbf{Corollary 2} and \textbf{Lemma 8}, we have 
\begin{align}
    \hat\rho(t) &= \sum_{1\le j_{1},...,j_{l}\le m}C_{j_l}[\prod_{p=1}^{l-1}(B_{l-p+1})_{(j_{l-p+1}, j_{l-p})}] (B_1)_{j_1}F_t(\alpha(1, j_1), \dots, \alpha(l, j_l))\\
    &= Z(1,m)\alpha(1,m)^t + \sum_{j = 1}^{m-1}\sum_{i = 1}^l H(i,j)F_t(\alpha(1,j), \dots, \alpha(i,j))\\
    &= Z(1,m)\alpha(1,m)^t + \sum_{j = 1}^{m-1}\sum_{i = 1}^l Z(i,j)\alpha(i,j)^t\\
    &= \sum_{i =1}^K (C_0)_i (B_0)_i \sigma_i^t\\
    &= \rho(t)
\end{align}
Thus $\rho = \hat\rho \in \mathcal{H}_{\infty,l}^{m}$. 
\hfill $\square$
\paragraph{Remark 1.} As for the degenerate cases, since there are fewer parameters in $\rho(t)$, the construction can be achieved by simple modifications of the above construction, or simply by taking limits, as the set of non-degenerate $(\sigma_1, \dots, \sigma_K)$ is open and dense in $\mathbb{C}^{K}$. 
\paragraph{Remark 2.} It can be seen from \textbf{Lemma 8} that all the entries of $B_i$, $C$ are bounded by $Z_0 = 2 \cdot (\max_{1 \le i \le K} |(B_0)_i(C_0)_i|)^{\frac{1}{l+1}}$. As for the degenerate cases, we can do the construction by taking limits, thus for these cases the same bound holds. 

\paragraph{Proof of Theorem \ref{theorem 4.1}} This is direct from \textbf{Lemma 6}, \textbf{Lemma 7}, \textbf{Lemma 9}. 
\hfill $\square$
\paragraph{Proof of Theorem \ref{theorem 4.2}} For $\rho\in\mathcal{H}_{c_{1},1}^{l(m-1)+1}$, we have that 
\begin{equation}
    Z_0 = 2 *(\max_{1 \le i \le K} |(B_0)_i(C_0)_i|)^{\frac{1}{l+1}} \le 2c_1^{\frac{2}{l+1}}
\end{equation}
Then it is direct from \textbf{Lemma 9, Remark 2}.
\hfill $\square$
\paragraph{Proof of Theorem \ref{theorem 4.3}} From \textbf{Theorem \ref{theorem 4.2}}, it suffices that 
\begin{equation}
    2c_1^{\frac{2}{l+1}} \le c_2
\end{equation}
Which means 
\begin{equation}
    l \ge \lceil\frac{2\ln(c_{1})}{\ln{(\frac{c_{2}}{2})}}-1\rceil
\end{equation}
For this $l$, we already have that $m = \lceil \frac{K}{l} \rceil +1$ satisfies that $K \le l(m-1)$, and \textbf{Lemma 9, Remark 2} shows that the bound also holds for degenerate cases, which is valid to be used here. 
\hfill $\square$
\paragraph{Proof of Corollary \ref{Hermite property}} To prove this corollary, we only need to show that $\mathcal{G}_{c_{1},1}^{l(m-1)+1} \subseteq \mathcal{H}_{\sqrt{l(m-1)+1}c_{1},1}^{l(m-1)+1}$. Assume $\rho \in \mathcal{G}_{c_{1},1}^{l(m-1)+1}$ is defined by $A_1, B_1, C$, and based on the property of normal matrices, we can decompose as $A_1 = U\Sigma U^*$ for diagonal $\Sigma$ and unitary $U$. Then we have 
\begin{align}
    \rho(t) &= C^T A_1^{t}B_1\\
    &= C^T U \Sigma^t U^*B_1 \\
    &= (U^TC)^T \Sigma^t (U^*B_1)
\end{align}
Therefore $\rho$ is also defined by $\Sigma, (U^TC), (U^* B_1)$. And by the norm-preserving property of unitary matrices, we have $\|(U^TC)\|_2 = \|C\|_2$, $\|U^*B_1\|_2 = \|B_1\|_2$. Then $\|(U^TC)\|_\infty \le \sqrt{l(m-1)+1}c_{1}$, $\|(U^*B_1)\|_\infty \le \sqrt{l(m-1)+1}c_{1}$, leading to $\rho \in \mathcal{H}_{\sqrt{l(m-1)+1}c_{1},1}^{l(m-1)+1}$. Then by \textbf{Theorem \ref{theorem 4.2}}, the conclusion holds. 
\hfill $\square$
\paragraph{Proof of Lemma \ref{expansion}} This lemma can be directly obtained from \textbf{Corollary 2} by indexing $F_t(\alpha(1, j_1), \dots, \alpha(l, j_l))$ as according to \textbf{Definition 1}. 
\hfill $\square$

\paragraph{Remark 3.} We could define an even larger space as follows: 
\begin{align}
    \mathcal{L}_{c,l}^{m}=&\{\rho(t):y(t)=(\rho\ast x)(t),A_{1},...,A_{l}\in\mathbb{C}^{m\times m}\thinspace\mbox{diagonalizable},B_{2},...,B_{l}\in\mathbb{C}^{m\times m},\\ &C,B_{1}\in\mathbb{C}^{m\times1},\max_{i=1,...,l}r(A_{i})<1,\|C\|_{\infty}\le c, \|B_{1}\|_{\infty}\le c,\max_{2\le k\le l}\max_{1\le i,j\le m}|(B_{k})_{ij}|\le c\}
\end{align}

The difference between $\mathcal{H}_{c,l}^{m}$ and $\mathcal{L}_{c,l}^{m}$ is not as substantial as one might initially expect. In fact, if we remove the norm constraint---that is, as $c$ approaches infinity---the two hypothesis spaces become identical.
\begin{lemma}[Representing ability equivalence in two hypothesis space]
\label{lemma: equaivalence}
Given $m,l\ge1$, then we have 
\begin{equation}
    \mathcal{L}_{\infty,l}^{m}=\mathcal{H}_{\infty,l}^{m}
\end{equation}
\end{lemma}

\paragraph{Proof of Lemma \ref{lemma: equaivalence}} As it is clear that $\mathcal{H}_{\infty,l}^{m} \subseteq \mathcal{L}_{\infty,l}^{m}$, we only need to prove that for all $\rho \in \mathcal{L}_{\infty,l}^{m}$, we have $\rho \in \mathcal{H}_{\infty,l}^{m}$. 
Suppose this $l$ layer SSM is defined by matrix $A_i, i = 1, \dots, l$; $B_i, i = 1, \dots, l$ and $C$ as given in the formulation. 
Then suppose $A_i = P_i D_i P_i^{-1}$ is the eigendecomposition of the diagonalizable matrix $A_i$, where $P_i$ is invertible and $D_i$ is diagonal. 
Then let $\hat{C} =  P_l^T C$, $\hat{B}_j = P_j^{-1} B_jP_{j-1}, j = 2, \dots, l$, $\hat{B}_1 = P_1^{-1}B_1$, $\hat{A}_i = D_i, i = 1, \dots, l$. It is clear that for all $t$, 
\begin{align}
    \rho(t) &= C^T\sum_{\substack{i_{1}+i_{2}+...+i_{l}=t\\i_{1},...,i_{l}\in\mathbb{N}}}\prod_{j=1}^{l}(A_{l-j+1}^{i_{l-j+1}}B_{l-j+1}) \\
    &= \hat{C}^T\sum_{\substack{i_{1}+i_{2}+...+i_{l}=t\\i_{1},...,i_{l}\in\mathbb{N}}}\prod_{j=1}^{l}(\hat{A}_{l-j+1}^{i_{l-j+1}}\hat{B}_{l-j+1})
\end{align}
As $D_i$ are diagonal matrices, we have that $\rho \in \mathcal{H}_{\infty,l}^{m}$. 
\hfill $\square$



\section{Experiment Details}
The model architecture used in our experiments on the nonlinear S4 model for the MNIST dataset is described as follows:
\label{appendx:S4D structure}
\begin{figure}[!ht]
   \centering
   \includegraphics[width=0.5\textwidth]{figs/S4_structure.png}
   \caption{Architecture of the nonlinear S4 model used in the MNIST experiments.}
   \label{fig: S4D structure}
\end{figure}