\section{Proofs of theoretical results}
\label{app:proof}
\lemmaone*
\begin{proof}
Based on our remark above, the only thing left to prove is that the integral $\int_{-\infty}^\infty \E{G^{2}(u; z, \delta)}{z \sim \lambda }du$ is finite. To this end, we have (by using equation \eqref{eq: expected zo})
\begin{align*}
    \int_{-\infty}^\infty \E{G^{2}(u; z, \delta)}{z \sim \lambda }du &= \int_{-\infty}^\infty\frac{1}{\delta}\int_{\frac{|u|}{\delta}}^\infty z^\alpha \lambda(z) dz du = \frac{2}{\delta}\int_{0}^\infty\int_{\frac{|u|}{\delta}}^\infty z^\alpha \lambda(z)dz du\\
    &= \frac{2}{\delta}\int_{0}^\infty\int_{0}^{|z|\delta}|z|^\alpha \lambda(z) du dz = 2\int_{0}^\infty z^{\alpha+1} \lambda(z) dz, 
    % &=2\int_{0}^{\frac{B}{\delta}}z^{\alpha+1}\lambda(z)dz +\frac{2B}{\delta}\int_{\frac{B}{\delta}}^\infty z^\alpha \lambda(z) dz,
\end{align*}
which proves the lemma, as by assumptions the resulting integral is finite.
\end{proof}
\thmone*
\begin{proof}
We have \begin{align*}
\frac{d}{du} &\E{c\, h(u+\delta z)}{z \sim \tilde{\lambda} }
= c\,\frac{d}{du}\int_{-\infty}^\infty h(u+\delta z)\Tilde{\lambda}(z)dz=c\, \frac{d}{du}\int_{-\frac{u}{\delta}}^{\infty}\tilde{\lambda}(z) dz
= \frac{c}{\delta}\,\Tilde{\lambda}(-\frac{u}{\delta})=\frac{c}{\delta}\, \Tilde{\lambda}(\frac{u}{\delta}),
\end{align*}
which coincides with \eqref{eq: expected zo}.
\end{proof}
For our following result, note that a surrogate function is differentiable almost everywhere, which follows from the Lebesgue theorem on the differentiability of monotone functions. So, taking derivatives here is understood in an ``almost everywhere'' sense.

\thmtwo*
\begin{proof}
Let us assume that $u\geq 0$ (the other case is similar). Then, 
\begin{align*}
\E{cG^2(u;z,\delta)}{z\sim \lambda} = \frac{c}{\delta}\int_{\frac{u}{\delta}}^\infty z^\alpha \lambda(z)dz =-\frac{1}{\delta}\int_{\frac{u}{\delta}}^\infty z^\alpha \frac{\delta^2}{z^\alpha}g'(z\delta)dz
\end{align*}
which after the change of variables $t=\delta z$ becomes $-\int_{u}^\infty g'(t)dt = g(u)$ and finishes our proof.
\end{proof}
\subsection{Obtaining full-surrogates on Expectation}
\label{app:dist_to_surr}
We demonstrate performance of \lzo over different distributions of $z$, such as standard Normal, Uniform$([-\sqrt{3}, \sqrt{3}])$ and Laplace$(0, \frac{1}{\sqrt{2}})$, for $m\in \{1,5\}$ and $\delta=0.05$. The distributions are of unit variance so that parameter $\delta$ is comparable across the methods. We supply \spgd algorithm the back-propagation threshold $\tilde{B}_{th}$ obtained in Table \ref{tab:expected_th}. Table \ref{tab:dist} shows the performance of the methods on the N-MNIST dataset in terms of accuracy and speedup. The \lzo method obtains better train and test accuracies for all cases with a slight compromise in the speedup, except for the uniform distribution where it offers better speedup for $m=1$ compared to the \spgd method. 


\subsubsection{From standard Gaussian}
Recall that the standard normal distribution $N(0,1)$ has PDF of the form $\frac{1}{\sqrt{2\pi}}\exp(-\frac{z^2}{2})$. Consequently, it is straightforward to obtain
\begin{align}
\E{G^{2}(u; z, \delta)}{z \sim \lambda } &=  \frac{1}{\sqrt{2 \pi}}\int_{\abs{z} > \frac{\abs{u}}{\delta}} \frac{\abs{z}}{2 \delta} \exp(-\frac{z^2}{2}) dz =\frac{1}{\delta \sqrt{2\pi}} \exp(-\frac{u^2}{2\delta^2}).
\end{align} 

\subsubsection{From Uniform Continuous}
Consider the PDF of a continuous uniform distribution: 
$$f(z; a, b) = \begin{cases}
    \frac{1}{b-a} & \text{for} \, z \in [a,b]\\
    0    & \text{otherwise},
\end{cases}$$ 
where $a<b$ are some real numbers. For the distribution to be even and the resulting scaling constant of the surrogate to be 1 (which translates to $\E{z}{}=0$ and $\E{z^2}{}=1$, respectively) we set, $a=-\sqrt{3}$, $b=\sqrt{3}$. Then,
\begin{align}
\E{G^{2}(u; z, \delta)}{z \sim \lambda }&=  \int_{\abs{z} > \frac{\abs{u}}{\delta}}  \frac{\abs{z}}{2\delta} f(z) dz \nonumber \\
    &=\frac{1}{2\sqrt{3}}[\int_{-\sqrt{3}}^{-\frac{\abs{u}}{\delta}}  \frac{\abs{z}}{2 \delta}  dz + \int_{\frac{\abs{u}}{\delta}}^{\sqrt{3}}  \frac{\abs{z}}{2 \delta}  dz]
    =\frac{1}{4\sqrt{3}\delta } z^2 \biggr \rvert_{\frac{\abs{u}}{\delta}}^{\sqrt{3}} \nonumber \\
    &= \begin{cases}
        \frac{1}{4\sqrt{3}\delta } (3 - \frac{u^2}{\delta^2}) & \text{if } \frac{\abs{u}}{\delta} < \sqrt{3}, \\
        0 & \text{otherwise.}
    \end{cases}
 \end{align}

\subsubsection{From Laplacian Distribution}
The PDF of Laplace distribution is given by:
$$f(z; \mu, b) = \frac{1}{2b} \exp(-\frac{\abs{z-\mu}}{b})$$ 
with mean $\mu$ and variance $2b^2$. Setting, $b=\frac{1}{\sqrt{2}}$ and $\mu=0$
%So that, $\lambda = \frac{1}{\sqrt{2}} \exp(-\sqrt{2}\abs{z})$
and using \eqref{eq:lzo} we obtain,
\begin{align}
    &\E{G^{2}(u; z, \delta)}{z \sim \lambda }
    =\frac{2}{\sqrt{2}}\int_{\frac{\abs{u}}{\delta}}^{\infty}  \frac{\abs{z}}{2 \delta} \exp(-\sqrt{2}\abs{z})dz 
    =\frac{1}{\delta \sqrt{2}}\int_{\frac{\abs{u}}{\delta}}^{\infty}  z \exp(-\sqrt{2}z)dz \nonumber \\
    &=-\frac{1}{\delta \sqrt{2}} (\frac{z}{\sqrt{2}}+\frac{1}{2})  \exp(-\sqrt{2}z)\biggr \rvert_{\frac{\abs{u}}{\delta}}^{\infty} 
    = \frac{1}{2\delta} (\frac{\abs{u}}{\delta}+\frac{1}{\sqrt{2}})  \exp(-\sqrt{2}\frac{\abs{u}}{\delta}).
 \end{align}

\subsection{Simulating a specific Surrogate}
\label{app:surr_to_dist}
\subsubsection{Sigmoid}
Consider the Sigmoid surrogate function, where the Heaviside is approximated by the differentiable Sigmoid function \cite{zenke2018superspike}. The corresponding surrogate gradient is given by,
\begin{align*}
   \frac{dx}{du} = \frac{d}{du}\frac{1}{1 + \exp(-k u)} = \frac{k \exp(-k u)}{(1 + \exp(-ku))^2} =: g(u)
\end{align*}
and, $$g'(u) = -\frac{k^2 \exp(-ku)(1-\exp(-ku))}{(1+\exp(-ku))^3}$$
Observe that $g(u)$ satisfies our definition of a surrogate ($g(u)$ being even, non-decreasing on $(-\infty,0)$ and 
$\int_{-\infty}^{\infty} g(u) du = 1<\infty$).
Thus, according to Theorem \ref{thm: main2}, we have
\begin{align*}
    &c=-2\delta^2\int_{0}^\infty \frac{g'(t\delta)}{t}dt
    =2\delta^2 k^2\int_{0}^\infty \frac{\exp(-k \delta t)(1-\exp(-k\delta t))}{t(1+\exp(-k\delta t))^3} dt
    = \frac{\delta^2 k^2}{a^2},
\end{align*}
where,  $a:=\sqrt{\frac{1}{0.4262}}$. The corresponding PDF is given by
\begin{align}
    \lambda(z)=-\frac{\delta^2}{c}\frac{g'(\delta z)}{z}
    =a^2\frac{\exp(-k \delta z)(1-\exp(-k\delta z))}{z(1+\exp(-k\delta z))^3}
    \label{eq:pdf_sigmoid}
\end{align}
Note that the temperature parameter $k$ comes from the surrogate to be simulated, while $\delta$ is used by \lzo. Choosing $k\delta = a$ (so that $c=1$), we compute the expected back-propagation threshold of \spgd for $m=1$ as $\tilde{B}_{th} = \delta \, \E{\abs{z}}{z\sim \lambda }$, with
\begin{align}
\E{\abs{z}}{z\sim \lambda } &= 2a^2 \int_{0}^{\infty} z  \frac{\exp(-az)(1-\exp(-az))}{z(1+\exp(-az))^3} dz =\frac{a}{2} = 0.7659.
\label{eq:sigmoid_bth}
\end{align}
\subsubsection{Fast Sigmoid}
Consider also the Fast Sigmoid surrogate gradient \cite{zenke2018superspike, nieves2021sparse} that avoids computing the exponential function in Sigmoid to obtain the gradient: 
\begin{align*}
   \frac{dx}{du} = \frac{1}{(1 + k\abs{u})^2} =: g(u).
\end{align*}
We choose $\alpha=-1$ (note that $\alpha=1$ does not work in this case) and apply Theorem~\ref{thm: main2} so that,
\begin{align*}
   c&=-2\delta^2\int_{0}^\infty \frac{1}{z^\alpha}g'(z\delta)dz =4\delta^2\int_{0}^\infty \frac{1}{z^\alpha}\frac{k\sign(z \delta )}{(1 + k\abs{z \delta})^3}dz \\
   &=4\delta^2 k \int_{0}^\infty \frac{z}{(1 + k\delta z)^3}dz = \frac{4}{k} \int_{0}^\infty \frac{t}{(1 + t)^3}dt = \frac{2}{k}.
\end{align*}
The PDF is then given by
\begin{align}
\lambda(z)= -\frac{1}{c} \frac{\delta^2}{z^\alpha}g'(z \delta) = k^2\delta^2  \frac{z \sign(z \delta)}{(1 + k\abs{z \delta})^3}. 
\end{align}
To compute the expected back-propagation threshold, we note,
\begin{align*}
    \tilde{B}_{th}=\delta\E{\abs{z}}{z\sim \lambda } = 2\delta^3 k^2 \int_{0}^{\infty}\frac{z^2 \sign(z \delta)}{(1 + k\abs{z \delta})^3}dz = 2\delta^3 k^2  \int_{0}^{\infty}\frac{ z^2 }{(1 + kz \delta)^3}dz = \frac{2}{k}  \int_{0}^{\infty}\frac{ x^2 }{(1 + x)^3}dx
\end{align*}
The above integral does not converge. However, if we consider a finite support $[-a, a]$, we may compute
$\frac{2}{k}  \int_{0}^{a}\frac{ x^2 }{(1 + x)^3}dx$.

\subsubsection{Inverse Transform Sampling}\label{ssec: inverse transform}
To simulate a given surrogate in \lzo, one needs to sample from the corresponding distribution described by the PDF $\lambda$. Given a sample $r \sim$ Unif$([0,1])$ and the inverse CDF $\Lambda^{-1}$ of the distribution, the inverse sampling technique returns $\Lambda^{-1}(r)$ as a sample from the distribution. Suppose the inverse CDF is not computable analytically from the PDF (or not implementable practically). In that case, we may choose a finite support over which the PDF is evaluated at a sufficiently dense set of points and compute the discretized CDF using the Riemann sum. The inverse discretized CDF is then computed empirically and stored as a list for a finite number of regularly spaced points in $[0,1]$. Sampling from the uniform distribution then amounts to randomly choosing the indices of the list and picking the corresponding inverse CDF values.

\subsection{Expected Back-propagation Thresholds}
\label{app:threshold}
In what follows, $m$ is the number of samples used in \eqref{eqn:lzo_sum}, while $k$ is the index of a particular sample. To compute the expected back-propagation threshold, we observe that a neuron is inactive in \lzo back-propagation if,
\begin{align*}
 \abs{u_i^{(l)}[t] - u_{th}} > \abs{z_k}\delta, \quad \text{for $k=1,\dots,m$}, \\   
\text{or,} \abs{u_i^{(l)}[t] - u_{th}} > t\delta, \,\text{where} \, t=\max_k\{\abs{z_1}, \cdots,\abs{z_m}\}
\end{align*}
 Assume $z_k \sim \lambda$, where $\lambda(t)$ denotes the PDF of the sampling distribution, with the corresponding CDF denoted by $F_{z_k}$. The PDF, $\tilde{\lambda}$, of the random variable $\abs{z_k}$ is given by
\begin{align}
\label{eq:sym_pdf}
\tilde{\lambda}(x)=\begin{cases}
   0, & \text{if } x <0 \\
   2\lambda(x), & \text{otherwise.}
\end{cases}    
\end{align}

The corresponding CDF is obtained by integrating the previous expression,
\begin{align}
\label{eq:sym_cdf}
F_{\abs{z_{k}}}(x)=\begin{cases}
   0, & \text{if } x <0 \\
   2(F_{z_k}(x) - F_{z_k}(0)), & \text{otherwise.}
\end{cases}
\end{align}
Further note that, 
\begin{align}
    F_{t}(x)=P(t < x)=\prod_{k=1}^{m} P(\abs{z_k} < x) = F_{\abs{z_k}}^m(x)
    %\nonumber\\ &= P(\abs{z_1} <  x, \cdots, \abs{z_k} < x, \cdots ,\abs{z_m} < x)\nonumber\\
\end{align}
If we denote the PDF of the random variable $t$ as $\hat{\lambda}$, we obtain
\begin{align}
\hat{\lambda}(x)= m F_{\abs{z_k}}^{m-1}(x) \tilde{\lambda}(x).    
\end{align}
Finally, the expected back-propagation threshold takes the form 
\begin{align}
\tilde{B}_{th}=\delta\E{t}{} = \delta \int_{0}^{\infty} t \hat{\lambda}(t)dt.
\end{align}
In the cases of distributions used in experimental sections, the previous expression simplifies. Table \ref{tab:expected_th} gives the numerical values for some particular $m$. To obtain an expected back-propagation threshold, we would like to evaluate:
\begin{align*}
    \tilde{B}_{th}=\delta\E{t}{} = \delta \int_{0}^{\infty} t \hat{\lambda}(t)dt = \delta m \int_{0}^{\infty} t F_{\abs{z}}^{m-1}(t) \tilde{\lambda}(t)dt
\end{align*}
For the standard normal distribution, $\lambda=$ Normal$(0,1)$ we have,
$F_{\abs{z}}(t)=\erf(\frac{t}{\sqrt{2}})$ giving,
\begin{align}
    \tilde{B}_{th}=\frac{2\delta m}{\sqrt{2 \pi}} \int_{0}^{\infty} t \erf^{m-1}(\frac{t}{\sqrt{2}}) \exp(-\frac{t^2}{2})dt.
\end{align}

For uniform continuous, $\lambda=$ Unif$([-\sqrt{3}, \sqrt{3}])$,  we have,  $F_{\abs{z}}(t)=\frac{t}{\sqrt{3}}$ giving,
\begin{align}
    \tilde{B}_{th}=\frac{\delta m}{\sqrt{3}} \int_{0}^{\sqrt{3}} t (\frac{t}{\sqrt{3}})^{m-1}dt = \delta \sqrt{3} \frac{m}{m+1}.
\end{align}

For Laplace distribution, $\lambda=$ Laplace$(0, \frac{1}{\sqrt{2}})$, we have,  $F_{\abs{z}}(t)=1-\exp(-\sqrt{2} t)$,
\begin{align}
    \tilde{B}_{th}=\delta m \sqrt{2} \int_{0}^{\infty} t (1-\exp(-\sqrt{2} t))^{m-1} \exp(-\sqrt{2}t)dt.
\end{align}





%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Additional Results}
%\subsection{From distributions to surrogates}
% Figure \ref{fig:loss_dist} shows the training loss for the methods after each gradient update for $m=1$, averaged over five trials. The \lzo loss is consistently closer to the \sur loss, which results in better training and test accuracies. In contrast, the fixed truncation used by \spgd affects its gradients. 
\subsection{Computational Speedup in \spgd}
The \spgd framework derives its speedup by performing the back-propagation in a layerwise fashion. To summarize their finding, let us start with eqn.~\ref{eq:lif_discrete}:
\begin{align*}
u^{(l)}_{i}[t] &= \beta u^{(l)}_{i}[t-1] + \sum_{j} w_{ij} x^{(l-1)}_{j}[t] - x^{(l)}_{i}[t-1] u_{th}    
\end{align*}

which, after unfolding the recurrence and using the fact that $u^{(l)}_{i}[0]=0$, can be restated as:

\begin{align}
u^{(l)}_{i}[t] &=  \sum_{j} \sum_{k=0}^{t-1} \beta^{t-k-1}x^{(l-1)}_{j}[k] w_{ij} - \sum_{k=0}^{t-1} \beta^{t-k-1} x^{(l)}_{i}[k] u_{th}    
\end{align}

where the \textbf{input trace}, $\sum_{k=0}^{t-1} \beta^{t-k-1}x^{(l-1)}_{j}[k]$, can be computed in a layer-wise fashion from the forward propagation. Note that, by ignoring the reset mechanism in the gradient computation, the gradient of the loss can be written as:

\begin{align}
    \frac{\partial l}{\partial w_{ij}} &= \sum_{t}\frac{\partial l[t]}{\partial x[t]} \frac{\partial x[t]}{\partial u[t]} \frac{\partial u[t]}{\partial w_{ij}}\nonumber \\
    &= \sum_{t}\frac{\partial l[t]}{\partial x[t]} \frac{\partial x[t]}{\partial u[t]} \sum_{k=0}^{t-1} \beta^{t-k-1}x^{(l-1)}_{j}[k]
\end{align}

where the term $\frac{\partial l[t]}{\partial x[t]}$ comes from the next layer, and the term $\frac{\partial x[t]}{\partial u[t]}$ is given by \eqref{eq:lzo}, so it can be computed in the forward propagation along with the \textbf{input trace}. Thus, whenever many neurons are inactive, i.e., $\frac{\partial x[t]}{\partial u[t]}=0$, the \spgd framework can reduce the computational burden of the back-propagation.
\subsection{Further comparison with \spgd}

\begin{figure*}
%\vskip 0.2in
\begin{center}
\includegraphics[width=0.32 \textwidth]{plots/loss_nmnist_normal_1-crop.pdf}
\includegraphics[width=0.32 \textwidth]{plots/loss_nmnist_laplace_1-crop.pdf}
\includegraphics[width=0.32 \textwidth]{plots/loss_nmnist_uc_1-crop.pdf}\\
\vspace{1mm}
\includegraphics[width=0.32 \textwidth]{plots/speedup_backward_normal_1-crop.pdf}
\includegraphics[width=0.32 \textwidth]{plots/speedup_backward_laplace_1-crop.pdf}
\includegraphics[width=0.32 \textwidth]{plots/speedup_backward_uc_1-crop.pdf}\\
\vspace{1mm}
\includegraphics[width=0.32 \textwidth]{plots/speedup_overall_normal_1-crop.pdf}
\includegraphics[width=0.32 \textwidth]{plots/speedup_overall_laplace_1-crop.pdf}
\includegraphics[width=0.32 \textwidth]{plots/speedup_overall_uc_1-crop.pdf}

\caption{We plot training loss, backward speedup (Back.), and overall speedup (Over.) of \spgd and \lzo after each gradient update performed on NMNIST data, as summarized in Table \ref{tab:dist}. The columns represent plots for three distributions Normal$(0,1)$, Laplace$(0, \frac{1}{\sqrt{2}})$, and Unif$([-\sqrt{3}, \sqrt{3}])$ respectively, with $\delta=0.05$ and $m=1$. The \lzo algorithm converges faster than the \spgd method and provides comparable backward and overall speedup.}
\label{fig:loss_dist}
\end{center}
%\vskip -0.4in
\end{figure*}

\begin{table*}
\caption{Performance comparison on NMNIST for $m=1$}
\label{tab:dist}
%\vskip 0.15in
\begin{center}
\begin{footnotesize}
\begin{sc}
\begin{tabular}{lcccc}
\toprule
Method & Train & Test & Back. & Over.  \\ %& Train & Test & Back. & Over.
\midrule
\multicolumn{5}{c}{$z \sim$ Normal$(0,1)$, $\delta=0.05, m=1$ }\\
\midrule
%\sur &95.25 $\pm$ 0.14& 93.70$\pm$ 0.10& 1 & 1 &95.44 $\pm$ 0.22& 93.76$\pm$ 0.10& 1 & 1 \\
\spgd &93.26 $\pm$ 0.31& 91.86$\pm$ 0.29& 99.57 & 3.38 \\
\lzo   &94.38 $\pm$ 0.12& 93.29$\pm$ 0.08& 92.27 & 3.34 \\
\midrule
\multicolumn{5}{c}{$z \sim$ Laplace$(0, \frac{1}{\sqrt{2}})$, $\delta=0.05, m=1$ }\\ %
\midrule
%\sur   &95.61 $\pm$ 0.16& 93.76$\pm$ 0.08& 1 & 1 &95.42 $\pm$ 0.03& 93.73$\pm$ 0.04& 1 & 1 \\
\spgd  &93.97 $\pm$ 0.43& 92.65$\pm$ 0.52& 88.2 & 3.19 \\
\lzo    &94.25 $\pm$ 0.17& 93.05$\pm$ 0.09& 83.7 & 3.07\\
\midrule
\multicolumn{5}{c}{ $z \sim$ Unif$([-\sqrt{3}, \sqrt{3}])$, $\delta=0.05, m=1$ }\\ 
\midrule
%\sur   &95.15 $\pm$ 0.19& 93.74$\pm$ 0.09& 1 & 1 &95.15 $\pm$ 0.19& 93.74$\pm$ 0.09& 1 & 1\\
\spgd &93.34 $\pm$ 0.44& 91.85$\pm$ 0.35& 83.2 & 3.26 \\
\lzo &94.24 $\pm$ 0.46& 93.05$\pm$ 0.37 & 84.8 & 3.43 \\ 
\midrule
\multicolumn{5}{c}{Sigmoid, $\delta=0.05, k \approx 30.63, m=1$ }\\ %
\midrule
%\surt  &94.58$\pm$ 0.31& 75.48$\pm$ 0.70& 1 & 1\\
\spgd &92.96$\pm$ 0.26& 91.04$\pm$ 0.32& 87.45 & 3.00\\
\lzo    &93.98$\pm$ 0.08& 92.97$\pm$ 0.05& 83.54 & 3.02\\
\midrule
\multicolumn{5}{c}{FastSigmoid, $\delta=0.05, k=100, m=1$ }\\
\midrule
%\surt  &93.33$\pm$ 0.05& 91.20$\pm$ 0.11& 1 & 1\\
\spgd &93.24$\pm$ 0.23& 92.16$\pm$ 0.20& 84.87 & 3.18\\
\lzo  &93.44$\pm$ 0.13& 92.52$\pm$ 0.09& 73.23 & 3.11\\

\bottomrule
\end{tabular}
\end{sc}
\end{footnotesize}
\end{center}
\vskip -0.1in
\end{table*}

\begin{table}
\caption{Performance comparison on NMNIST for $m=5$}
\label{tab:nmnist_5}
\vskip -0.5in
\begin{center}
\begin{footnotesize}
\begin{sc}
\begin{tabular}{lcccc}
\toprule
Method & Train & Test & Back. & Over. \\
\midrule
\multicolumn{5}{c}{$z \sim$ Normal$(0,1)$, $\delta=0.05, m=5$ }\\
\midrule
\spgd & 95.02 $\pm$ 0.29& 93.39$\pm$ 0.25 & 80.0& 3.40\\
\lzo  & 95.20 $\pm$ 0.22& 93.69$\pm$ 0.17& 77.7 & 3.22 \\
\midrule
\multicolumn{5}{c}{$z \sim$ Laplace$(0, \frac{1}{\sqrt{2}})$, $\delta=0.05, m=5$ }\\
\midrule
\spgd  & 94.73 $\pm$ 0.29& 93.13$\pm$ 0.23& 72.9 & 3.15\\
\lzo   & 95.07 $\pm$ 0.03& 93.63$\pm$ 0.05& 69.4 & 2.80 \\
\midrule
\multicolumn{5}{c}{ $z \sim$ Unif$([-\sqrt{3}, \sqrt{3}])$, $\delta=0.05, m=5$ }\\
\midrule
\spgd &94.82 $\pm$ 0.27& 93.38$\pm$ 0.17& 76.4 & 3.14 \\
\lzo &94.95 $\pm$ 0.35& 93.47$\pm$ 0.23 & 73.5 & 2.91 \\
\bottomrule
\end{tabular}
\end{sc}
\end{footnotesize}
\end{center}
%\vskip -0.3in
\end{table}



 In Section~\ref{sec:surr_to_dist}, we derived surrogates corresponding to distributions and distributions corresponding to popular surrogates. We implement \lzo, with $\delta=0.05$ for different distributions such as Normal$(0,1)$, Laplace$(0, \frac{1}{\sqrt{2}})$, and Unif$([-\sqrt{3}, \sqrt{3}])$. They all have a unit variance to ensure $\delta$ is comparable across the distributions. The corresponding back-propagation thresholds for \spgd are derived in Table \ref{tab:expected_th}.
 
 For the Sigmoid surrogate, we take the temperature parameter $k=a/\delta\approx 30.63$ so that $c=\frac{\delta^2 k^2}{a^2}=1$ for the corresponding PDF derived in eqn.~\ref{eq:pdf_sigmoid}. We supply \spgd method the corresponding back-propagation threshold, $\Tilde{B}_{th} = 0.766\delta$, as obtained in eqn.~\ref{eq:sigmoid_bth}.
 
 For the Fast Sigmoid surrogate, we choose $k=100$ following \cite{nieves2021sparse} so that $c=\frac{2}{k}$. To compute the expected back-propagation threshold, we consider finite support $[-10,10]$ used in the inverse transform sampling of $z$ and evaluate $\tilde{B}_{th} = 0.0461$. Table \ref{tab:dist} reports accuracies obtained by \lzo and \spgd across various sampling distributions, with $m=1$. \lzo offers better accuracies compared to \spgd across all the distributions, with a slight compromise in speed-up in most cases. For uniform distribution, \lzo offers even better speed-up than \spgd.

Table \ref{tab:nmnist_5} reports the details of the comparison over the N-MNIST dataset for sampling $z$ from various distributions with $m=5$. The \spgd method is supplied with the respective surrogate and back-propagation threshold as computed in Table~\ref{tab:expected_th}. The \lzo method achieves better accuracies than \spgd for all the distributions, though the speed-up is reduced due to higher sampling cost at $m=5$.


In Table~\ref{tab:parameter}, we further provide the hyper-parameters for the comparison in the \spgd framework, which are replicated from their work. The FMNIST data uses latency encoding, where each input pixel $x$ is converted to a single spike whose timing is determined by
\begin{align*}
T(x) = \begin{cases}
    \tau_{\text{eff}} \log \frac{x}{x-\theta}, & \text{if } x > \theta, \\
    \infty, & \text{otherwise.}
\end{cases}    
\end{align*}

Table \ref{tab:parameter_general} gives the hyper-parameter setting for comparison in general framework reported in Tab. \ref{tab:acc_comp}.
 
\begin{table}[!t]
\caption{Hyper-parameter settings for general comparison}
\label{tab:parameter_general}
\vskip -0.5in
\begin{center}
\begin{footnotesize}
\begin{tabular}{lccccc}
\toprule
& CIFAR-10/100 & ImageNet-100 & DVS-CIFAR-10 & DVS-Gesture & N-Caltech/NCARS\\
Number epochs & 300 & 300 & 300 & 200 & 200  \\
Mini batch size & 64 & 64, 72 & 64 & 64 & 16  \\
T & 6,4,2 & 4 & 10 & 10 & 10\\
LIF: $\beta$ & 0.5 & 1 & 0.5 & 0.5 & 0.5 \\ 
LIF: $u_0$ & 0 & 0 & 0 & 0 & 0\\
LIF: $u_{th}$ & 1 & 1 & 1 & 1 & 1 \\
\lzo: $\delta$ & 0.5 & 0.5 & 0.5 & 0.5 & 0.5  \\
\lzo: m & 5 & 5, 20 & 1 & 5 & 5\\
\lzo: $\lambda$ & N(0,1)& N(0,1)& N(0,1) & N(0,1)& N(0,1)\\ 
$\lambda_{TET}$ & 0.05 & 0.001 & 0.0001 & 0.05 & 0.05 \\
%Optimiser & Adam &  Adam & Adam  & Adam & Adam\\
Learning Rate & 0.001 & 0.1  & 0.001 & 0.001  & 0.001\\
%Adam: Betas & (0.9; 0.999) & (0.9; 0.999) & (0.9; 0.999) & (0.9; 0.999) & (0.9; 0.999)\\
%Rate Scheduler & CosineAnn. & CosineAnn. & CosineAnn. & CosineAnn. & CosineAnn.\\
\bottomrule
\multicolumn{6}{c}{Optimizer: Adam with betas: (0.9; 0.999), Rate Scheduler: cosine annealing}\\
\end{tabular}
\end{footnotesize}
\end{center}
%\vskip -0.3in
\end{table}


\begin{table}[!t]
\caption{Hyper-parameter settings for comparison in \spgd framework}
\label{tab:parameter}
\vskip -0.5in
\begin{center}
\begin{footnotesize}
\begin{tabular}{lccc}
\toprule
& FMNIST & N-MNIST & SHD\\
Number of Input Neurons & 784 & 1156 & 700 \\
Number of Hidden & 200 & 200 & 200 \\
Number of classes & 10 & 10 & 20 \\
Number epochs & 20 & 20 & 20 \\
Mini batch size & 256 & 256 & 256 \\
T & 100 & 300 & 500 \\
$\Delta t$ & 1ms & 1ms & 2ms \\
%$\tau$ hidden & 10ms & 10ms & 10ms  \\
%$\tau$ readout & 10ms & 10ms & 20ms \\
$\tau_{\text{eff}}$ & 20ms & N/A & N/A \\
$\theta$& 0.2 & N/A & N/A \\
$u_0$ & 0 & 0 & 0 \\
%$u_{rest}$ & 0 & 0 & 0 \\
$u_{th}$ & 1 & 1 & 1 \\
\lzo: $\delta$ & 0.05 & 0.05 & 0.05\\
%Bth & 0:2&  0:2 & 0:2  \\
%& 100 & 100 & 100 \\
Optimiser & Adam & Adam & Adam  \\
Learning Rate & 0.0002 & 0.0002 & 0.001 \\
Betas & (0.9; 0.999) & (0.9; 0.999) & (0.9; 0.999) \\
%$\lambda^{(low)}$ & 100 & 100 & 100  \\
%$\nu^{(low)}$ & 0.001 & 0.001 & 0.001 \\
%$\lambda^{(up)}$ & 0.06 & 0.06 & 0.06 \\
%$\nu^{(up)}$ & 1 & 1 & 10 \\
\bottomrule
\end{tabular}
\end{footnotesize}
\end{center}
%\vskip -0.3in
\end{table}




%\clearpage
\subsection{Dropout effect in \lzo}
A fixed neuron has an active gradient for $m=1$, only if $\abs{u_i[t]-u_{th}}< \abs{z}\delta$ as described in eqn.~\ref{eq:lzo}. Observe that $u_i[t]$ is specific to an input data point and time-step. The same neuron can be inactive for another data-point, or at a different time-step. Moreover, a neuron can have a non-zero gradient with respect to a single data point, even if the spikes are zero. Fixing a neuron and time-step, we plot the distribution of membrane potential (minus membrane threshold), spikes, and non-zero gradients over different data points across batches, where we implement \lzo on MNIST data with $\delta=0.5$. The plot captures the sparsity of spikes and sparsity of neuron gradients for \lzo.
\begin{figure*}
%\vskip 0.2in
\begin{center}
\includegraphics[width=0.32 \textwidth]{plots/dist_T_1.pdf}
\includegraphics[width=0.32 \textwidth]{plots/dist_T_2.pdf}
\includegraphics[width=0.32 \textwidth]{plots/dist_T_4.pdf}
\\
\includegraphics[width=0.32 \textwidth]{plots/spikes_T_1.pdf}
\includegraphics[width=0.32 \textwidth]{plots/spikes_T_2.pdf}
\includegraphics[width=0.32 \textwidth]{plots/spikes_T_4.pdf}
\\
\includegraphics[width=0.32 \textwidth]{plots/active_T_1.pdf}
\includegraphics[width=0.32 \textwidth]{plots/active_T_2.pdf}
\includegraphics[width=0.32 \textwidth]{plots/active_T_4.pdf}

\caption{Fixing a neuron, we plot the distribution of membrane potential, spikes, and gradient activity over different data points across batches, where we implement \lzo on MNIST data with $\delta=0.5$. The plot captures the sparsity of spikes and sparsity of neuron gradients for \lzo at different time-steps.}
\label{fig:dist_spk_act}
\end{center}
%\vskip -0.4in
\end{figure*}


%Let us assume the difference of membrane potential and membrane threshold of a neuron denoted by, $u$,  is distributed according to the pdf $f_{u}$ and it is symmetric. Following the notation in eqn. \ref{eq:sym_pdf} and eqn. \ref{eq:sym_cdf} we write the pdf and cdf of the random variable $\abs{u}$ as, $f_{\abs{u}}$ and $F_{\abs{u}}$, respectively. A neuron will back-propagate zero gradient, whenever $\abs{u}> \abs{z}\delta$ as given in eqn.\ref{eq:lzo}. Thus, it is natural ask probability of this event. As $z$ is sampled independent of $u$, we may write:
% \begin{align}
% Pr(\abs{u}> \abs{z}\delta) = \int_{0}^{\infty} \int_{0}^{y} f_{\abs{z}\delta}(x) f_{\abs{u}}(y) dx dy = \int_{0}^{\infty} F_{\abs{z}\delta}(y) f_{\abs{u}}(y) dy = \E{F_{\abs{z}\delta}(y)}{y \sim f_{\abs{u}}} 
% \end{align}