
\documentclass{article} % For LaTeX2e
\usepackage{iclr2024_conference,times}

% Optional math commands from https://github.com/goodfeli/dlbook_notation.
\input{math_commands.tex}

\usepackage{hyperref}
\usepackage{url}
\usepackage{graphicx}
\usepackage{wrapfig}
\usepackage{multirow}
\usepackage{booktabs}
\usepackage{makecell}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{bbding}
\usepackage{enumitem}

\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}

\newtheorem{proposition}{Proposition}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{definition}{Definition}

%\title{Formatting Instructions for ICLR 2024 \\ Conference Submissions}
\title{Spatio-Temporal Approximation: A Training-Free SNN Conversion for Transformers}

% Authors must not appear in the submitted version. They should be hidden
% as long as the \iclrfinalcopy macro remains commented out below.
% Non-anonymous submissions will be rejected without review.

\author{Antiquus S.~Hippocampus, Natalia Cerebro \& Amelie P. Amygdale \thanks{ Use footnote for providing further information
		about author (webpage, alternative address)---\emph{not} for acknowledging
		funding agencies.  Funding acknowledgements go at the end of the paper.} \\
	Department of Computer Science\\
	Cranberry-Lemon University\\
	Pittsburgh, PA 15213, USA \\
	\texttt{\{hippo,brain,jen\}@cs.cranberry-lemon.edu} \\
	\And
	Ji Q. Ren \& Yevgeny LeNet \\
	Department of Computational Neuroscience \\
	University of the Witwatersrand \\
	Joburg, South Africa \\
	\texttt{\{robot,net\}@wits.ac.za} \\
	\AND
	Coauthor \\
	Affiliation \\
	Address \\
	\texttt{email}
}

% The \author macro works with any number of authors. There are two commands
% used to separate the names and addresses of multiple authors: \And and \AND.
%
% Using \And between authors leaves it to \LaTeX{} to determine where to break
% the lines. Using \AND forces a linebreak at that point. So, if \LaTeX{}
% puts 3 of 4 authors names on the first line, and the last on the second
% line, try using \AND instead of \And before the third author name.


%\iclrfinalcopy % Uncomment for camera-ready version, but NOT for submission.
\begin{document}
	
	\maketitle
	
	\begin{abstract}
		Spiking neural networks (SNNs) are energy-efficient and hold great potential for large-scale inference. Since training SNNs from scratch is costly and has limited performance, converting pretrained artificial neural networks (ANNs) to SNNs is an attractive approach that retains robust performance without additional training data and resources. However, while existing conversion methods work well on convolution networks, emerging Transformer models introduce unique mechanisms like self-attention and test-time normalization, leading to non-causal non-linear interactions unachievable by current SNNs. To address this, we approximate these operations in both temporal and spatial dimensions, thereby providing the first SNN conversion pipeline for Transformers. We propose \textit{Universal Group Operators} to approximate non-linear operations spatially and a \textit{Temporal-Corrective Self-Attention Layer} that approximates spike multiplications at inference through an estimation-correction approach. Our algorithm is implemented on a pretrained ViT-B/32 from CLIP, inheriting its zero-shot classification capabilities, while improving control over conversion losses. To our knowledge, this is the first direct training-free conversion of a pretrained Transformer to a purely event-driven SNN, promising for neuromorphic hardware deployment.
	\end{abstract}
	
	\section{Introduction}
	The recent success of large Transformer models has increased the need for efficient inference. Spiking neural networks (SNNs), as the third generation of neural networks, use multi-step sparse spike accumulations instead of dense multiply-accumulations, providing significant advantages in energy and speed. This makes SNNs a prospective candidate to replace ANNs for large-scale deployment. 
	
	Due to the non-differentiability of spiking neurons, obtaining large-scale SNNs remains a challenge. 
	Existing methods using surrogate gradients \citep{bellec2018long,neftci2019surrogate,lee2020enabling} or synaptic plasticity \citep{bicknell2021synaptic, liu2022biologically} require training from scratch on large datasets, incurring high complexity, and still struggle to achieve high performance.
	Instead, in practice, limited training data and resources create a more urgent need to directly convert powerful ANNs into equivalent SNNs in a training-free fashion \citep{diehl2015fast}. Such ANN-to-SNN conversion replaces ANN activations with temporal spike sequences, nearly preserving all capabilities of the source model. Thus, it can directly reduce the inference power consumption of open-source ANN models without other modification, even for those pretrained on large private datasets.
	
    % Obtaining large-scale high-performance SNNs remains challenging due to the non-differentiability of spike neurons. Current training methods using surrogate gradients \citep{bellec2018long,neftci2019surrogate,lee2020enabling} or synaptic plasticity \citep{bicknell2021synaptic, liu2022biologically} struggle to train deep SNN from scratch on large datasets. Instead, another main approach is ANN-to-SNN conversion that converts floating-point ANN features into equivalent temporal spike sequences. Such training-free conversion can nearly preserve the strong capabilities of the source ANN model without requiring additional training data and resources, thus enabling direct utilization of modern open-source ANN models pretrained on large private datasets.
	
	Nevertheless, such training-free conversion seems to be impossible for mainstream large-scale ANNs based on \textbf{Transformers} \citep{vaswani2017attention,dosovitskiy2020image,radford2021learning}. Their computational characteristics differ from those of convolutional networks, leading to two critical conflicts \citep{li2022spikeformer}.
	First, the matrix products between variable features in self-attention are non-causal during inference, relying on complete input spike sequences. Such multiplications are incompatible with the additive accumulation over time in SNN and thus cannot be directly calculated.
	Second, unlike ReLU and BatchNorm in CNNs, operations such as GELU and LayerNorm in Transformers depend on complicated non-linearities at test-time, so they cannot be accurately represented by the quantized piece-wise linearity of spiking neurons.
	
	
	% Considering such computational discrepancies between Transformers and SNNs, our idea is to develop spiking approximators to universally simulate the raw calculations, which poses two key challenges: 1) applicability to all modules regardless of spatial position or input range; 2) conformance to SNN temporal properties like accumulation over time and stationary spike emissions. These require innovations in the construction and operational mechanism of existing SNN conversion.
	
	% In biological networks, neurons often perform complex computations in groups. e.g. Bayesian inference can be reduced to a linear combination of neuron group activities. Inspired by this, we propose spiking approximators as the computational proxy to reduce the computational discrepancies between Transformer and SNNs. Approximators in the SNN perspective can be decoupled into two dimensions, spatial and temporal. Spatially, it’s natural to use a set of neurons to fit a correlation computation or a nonlinear operation. However, how to make it apply to all possible inputs to guarantee its generalization at different network positions is something that needs to be considered in depth. Also, temporally, this approximator has to ensure its conformance to SNN properties like accumulation and stability over time These require innovations in the construction and operational mechanism of existing SNN conversion.
	
    Considering such inherent discrepancies, existing spiking networks cannot strictly implement Transformer operations through a directly corresponding structure. Fortunately, the spatial population coding and temporal memory properties of SNNs can be further leveraged to enhance the representational capacity on both dimensions. By redefining spiking computations as a gradual approximation process to ANN floating-point values, we propose our conversion pipeline, termed Spatio-Temporal Approximation (STA), consisting of two novel spiking modules as universal \textbf{approximators}. Spatially, we adopt the strategy of trading space for precision, introducing local neuron populations to simulate precise non-linearities through multiple discrete binary spikes. These modules are driven by synthetic data regardless of their actual input at inference for universality. Temporally, to obtain stationary spike emissions for rate-coding, we remodel the non-causal multiplications into an estimation-correction process. Based on the accumulated input memory, we first approximately estimate future reactions, then correct the results with the actual input as time progresses.
    
	% To address these challenges, we delves deeper into the spatial population and temporal memory properties of SNNs, expanding their computational scale for enhanced capacity. Our conversion pipeline, termed Spatio-Temporal Approximation (STA), consists of two novel spiking modules. Spatially, we adopt a strategy of trading space for precision, introducing local neuron groups to model non-linearities and using synthetic-data-driven training to ensure generability. Temporally, we modify the non-causal multiplications into an estimation-correction process, using accumulated membrane potentials to first approximately estimate future reactions, and then correct the results as time progresses.
	
	%To address this, we leverage the spatial representation capacity and temporal memorial characteristic of SNN, constructing two novel spiking modules for our conversion pipeline, termed Spatio-Temporal Approximation (STA). Spatially, we introduce local spiking neuron groups driven by synthetic data to model precise non-linearities in general scenarios. Temporally, we modify the non-causal multiplications into an estimation-correction process, using accumulated membrane potentials to first approximately estimate future reactions, and then correct the results as time progresses.
	
	% Since such computational differences impede the direct accurate conversion from Transformers to SNNs, we propose to construct approximate models through the discretized computation of spiking neurons as a universal simulation for raw calculations. Considering the powerful capabilities of SNNs in both spatial and temporal domains, we construct two novel spiking modules as the components of our conversion pipeline, termed as Spatio-Temporal Approximation (STA). Spatially, we leverage groups of spiking neurons to build universal approximators of nonlinear operators, thereby simulating precise computations by multiple imprecise binary spikes. Temporally, we modify the non-causal multiplications into an estimation-correction process. The temporarily accumulated membrane potentials are used to approximately estimate future reactions and correct the result over time.
	
	With our STA pipeline, we convert a ViT-B/32 model pretrained with CLIP \citep{radford2021learning} into an SNN. The resulting SNN directly inherits capabilities such as zero-shot classification and transferability from the large multimodal Transformer. It also achieves state-of-the-art accuracy for SNNs on multiple benchmarks after supervised fine-tuning. Additionally, our converted SNN requires no floating-point operations, enabling energy-efficient deployment on neuromorphic hardware.
	
	In summary, our main contributions are as follows:
	\begin{itemize}[itemsep=2pt, topsep=0pt, parsep=0pt, leftmargin=15pt, rightmargin=15pt]
	    \item We propose Spatio-Temporal Approximation (STA), a training-free pipeline to convert ANN Transformers to SNNs via universal approximations in both spatial and temporal domains.
		\item We provide theoretical analysis on the error bounds and convergence rates of both key modules in STA, proving their efficacy in approximating ANN computation.
		\item To our knowledge, we are the first to directly convert a pretrained mainstream Transformer (ViT-B/32 from CLIP) into an SNN without additional training or fine-tuning, while still retaining the generalization performance of the original model. 
	\end{itemize}
	
	\section{Related Work}
	\subsection{ANN-to-SNN conversion}
	Converting ANNs to SNNs is an active area of research for improving performance and training efficiency on large-scale tasks \citep{diehl2015fast}, whereby ReLU activations in the ANN are replaced by ``soft-reset'' IF neurons \citep{rueckauer2017conversion,han2020rmp}. Its key directions include:
	
	\textbf{Training-free conversion} is conducted directly on pretrained ANNs through threshold balancing \citep{diehl2015fast,rueckauer2017conversion}, parameter calibration \citep{li2021free} and functional spike emission \citep{wang2022signed,li2022efficient}. These methods convert the ANN to an SNN and calibrate it with only a few examples, without retraining or fine-tuning. Thus, they can be applied to high-performing open-source ANN models. However, these methods are mostly limited to CNNs, lacking applicability to Transformers \citep{li2022spikeformer} and suffering from long simulation steps.
	
	\textbf{Training-dependent conversion} tailors the ANN for SNN compatibility before conversion \citep{bu2021optimal,ding2021optimal,jiang2023unified}, or fine-tunes the SNN after conversion \citep{wang2022towards}. Despite reducing conversion loss and latency, they depend on given datasets, entailing greater training costs and weaker generalization, while maintaining CNN-like structural constraints.
	
	Our work presents a training-free approach that extends conversion beyond CNNs to Transformers. As spiking equivalents of Attention Blocks, our proposed modules approximate them spatially and temporally, thus retaining the applicability of large-scale pretrained models to complex scenarios.
	
	\subsection{Transformer and Spike-Based Transformer}
	\textbf{Transformers} have achieved impressive results on numerous tasks like natural language processing \citep{brown2020language,devlin2018bert} and computer vision \citep{dosovitskiy2020image} via the self-attention mechanism that captures global dependencies by aggregating features across spatial dimensions. Transformers differ from CNNs in two key aspects: \textbf{1)} interactions between spatial features, and \textbf{2)} complex non-linearity/normalization, both not achievable by existing SNNs.
	
	\textbf{Spike-Based Transformers} are recently proposed models for direct SNN training. \citet{li2022spikeformer} substitutes the activations with spiking neurons but retains many floating-point operations. \citet{zhou2022spikformer} introduces a purely spiking self-attention module by modifying the Softmax operation. \citet{zhou2023spikingformer} presents the first fully event-driven Transformer through tailored residual connections. Additionally, \citet{zhang2022spiking,zhang2022spike} design specialized Transformers for event-based cameras, which do not readily extend to conventional visual data. All these models differ from ANN Transformers structurally and require training from scratch, while our method directly leverages conversion to inherit capabilities from pretrained ANN Transformers without training.

	
	\section{Preliminaries and Problem Analysis}
	
	\subsection{Neurons for ANN \& SNN}
	In ANNs using ReLU activation, for neurons in layer $l$, we denote their output as vector $\vx^l$, and the weight matrix between layer $l-1$ and $l$ as $\mW^l$. Ignoring bias, the floating-point inference process is:
	\begin{equation}\label{eq: ANN_forward}
		\vx^{l} = \max\left(\mW^l\vx^{l-1},0\right),\quad l=1,2,\dots,L.
	\end{equation}
	
	As for SNNs, similar to \citet{han2020rmp}, we consider the soft-reset Integrate-and-Fire (IF) neurons. When the $l$-th layer receives weighted binary spikes $\vs^{l-1}(t)\in \{0,1\}$, the update rule is:
	\begin{flalign}\label{eq: IF_forward}
	&	&\vm^l(t)=\vp^l(t-1)+\mW^l\vv_{th}^{l-1}\otimes\vs^{l-1}(t),
	&	&\left\{
			\begin{array}{l}
				\vs^l(t)=H(\vm^l(t)-\vv_{th}^l) \\ \\	\vp^l(t)=\vm^l(t)-\vv_{th}^{l}\otimes\vs^{l}(t)
			\end{array} \right. ,&&
	\end{flalign}
	where $\vm^l(t)$ and $\vp^l(t)$ represent the potentials before and after the trigger of spike $\vs^{l}(t)$, $\vv_{th}^l$ is the threshold, and $H(\cdot)$ is Heaviside step function. The firing rate is measured as the average number of spikes over time $T$, denoted as $\bar{\vs}^{l}$. The converted SNN exhibits similarities with ReLU ANN on the activation values for each layer, i.e., $\vx^{l} \approx \bar{\vs}^{l}$, because of their comparable linear growth arithmetic. 
	
	\subsection{Operations in Transformers}
	A basic attention block in Transformer is shown in Fig.~\ref{fig:transformer}, relying on two main types of operations that differ from those in conventional CNNs for conversion. More details on the modules in Transformer are provided in Appendix~\ref{subsec:detail_transformer}.
	
	1) \textbf{Non-linear operators.} While CNNs primarily use ReLU activation for non-linearity, Transformer involves more complex nonlinear functions like GELU \citep{hendrycks2016gaussian}, square root, exponentiation, etc., which cannot be directly achieved by the piece-wise linear dynamics of IF neurons. This requires us to approximate their computational characteristics in the spatial domain.
	
	2) \textbf{Variable Scalar / Matmul product.} The inference in CNNs is conducted through variable features multiplied by constant weight matrices, while Transformers contain more \textit{variable-variable} multiplications, such as the query-key products in self-attention. Additionally, LayerNorm in Transformer computes normalization coefficients dynamically during inference, preventing integration into weight matrices as with BatchNorm in CNNs \citep{rueckauer2017conversion}. Thus, computing these multiplications with spiking neurons is challenging and may require temporal modifications.
	
	\begin{figure}[tbp]
		\vspace{-0.6cm}
		\centering
		\includegraphics[width =1\columnwidth]{figs/transformer_modules.pdf}
		\vspace{-0.8cm}
		\caption{The modules and operators in each Residual Attention Block of ViT.}
		\label{fig:transformer}
		\vspace{-0.3cm}
	\end{figure}
	
	
	\section{Spatial Approximation for Non-linearity}
	\label{sec:nonlinear}
	As Transformer's floating-point non-linearity poses challenges for SNN conversion, our goal is to develop spiking counterparts that simulate their spatial reactions. The proposed approximators should: \textbf{1)} consist only of IF neurons, and \textbf{2)} be universally applicable to all operations, models and data. Due to the insufficient representation capability of each single neuron, we adopt groups of neurons to substitute individual operators. These approximators are pre-trained on synthetic floating-point data independent of real examples, and are thus universally applicable to all scenarios.
	
	\subsection{Neuron Groups for Universal Approximation}
	\label{ss:ugo}
	We first examine common non-linear operators like GELU or square root that are low-dimensional with complicated computations. We note that with the Universal Approximation Theorem \citep{hornik1989multilayer}, single-layer ANNs can approximate these continuous functions over definite intervals. Further, ANNs with ReLU activation can be efficiently converted to equivalent SNNs. Therefore, we propose the Universal Group Operator (UGO), a small group of spiking neurons for approximation.
	
	\begin{definition}[Universal Group Operator]
		\label{def:ugo}
		Let $f:x\mapsto y$ defined on domain $x\in\mathcal{D}$ be a real continuous unary function. Its spiking universal group operator $\hat{f}$ comprises two fully connected (FC) layers surrounding a single hidden IF layer with $N$ neurons, such that $\exists \epsilon>0$ where for any spike input $\vx_s$ with mean $\bar{\vx_s}=x$, the output spikes $\vy_s$ satisfy $\mathbb{E}\left|\bar{\vy_s}-y\right|\le \epsilon$.
		
		The input and output layers have weights $\vw_1, \vw_2\in \mathbb{R}^N$, and biases $\vb_1\in \mathbb{R}^N, b_2\in \mathbb{R}$, respectively.
	\end{definition}
	
	\textbf{Construction.} Three stages are required to obtain a universal group operator, shown in Fig.~\ref{fig:ugo}:
	\begin{figure}
	\vspace{-0.6cm}
		\begin{minipage}[h]{.6\linewidth}
			\centering
			\includegraphics[width =  1\columnwidth]{figs/nonlinear_fitting.pdf}
			\caption{Spatial approximation process with UGO.}
			\label{fig:ugo}
		\end{minipage}
		\begin{minipage}[h]{.40\linewidth}
			\centering
			\includegraphics[width = 0.9 \columnwidth]{figs/gelu_standard.pdf}
			\caption{An approximated UGO for GELU with $N=16, T=16$.}
			\label{fig:gelu}
		\end{minipage}
	\end{figure}
	
	1. \textit{Data Synthesis.} On account of LayerNorm in Transformers, the input range of any function $f$ is always empirically restricted to a small continuous interval $\mathcal{D}$, e.g., statistically, $\mathcal{D}=\left[-10,10\right]$ for GELU. To enable the UGO to approximate $f$ without real training data, we roughly synthesize a mixture of uniform/normal distribution $\mathcal{\hat{D}}$ that covers $\mathcal{D}$, and sample $M$ points $\left\{x_i\right\}$ from $\mathcal{\hat{D}}$ to cover all possible inputs. The floating-point data pairs $\left\{x_i,f(x_i)\right\}$ serve as the synthetic training data.
	
	2. \textit{ANN Construction.} We manually select a suitable hyperparameter size $N$ to define the scale of an ANN $\hat{f}_{\mathrm{ReLU}}$ based on the complexity of $f$, with typically $N\in \left[8,32\right]$ for balanced accuracy and efficiency. It is then trained on the synthetic data using ReLU or other tailored activation as in \citet{jiang2023unified} for approximation.
	
	3. \textit{SNN Conversion.} The pretrained ANN is finally converted to an SNN $\hat{f}_{\mathrm{IF}}$ of IF neurons over $T$ time-steps using existing methods like \citet{li2021free}. It conducts purely event-driven inference via spike accumulation and can directly replace its ANN counterpart with equivalent functionality.
	
	The universal group operators thus allow implementation of all low-dimensional operations in Transformers for SNN conversion. As the synthesized data covers all possible inputs during inference, the pretrained UGOs are universally applicable to all test samples at high accuracy. Fig.~\ref{fig:gelu} demonstrates a conversion result for GELU with $N=16, T=16$, and more details are in Appendix~\ref{subsec:ugo_setting}.
	
	\textbf{Approximation Error Analysis.}
	While bringing high efficiency, the small scale of UGOs also raises concerns about their accuracy and generalizability. To qualitatively analyze how the design impacts performance, we consider errors from three sources: insufficient sampling, limited parameterization and spiking quantization. This yields the following error bound:
	\begin{theorem}[Error Bound for Spatial Approximation]
		\label{theo:error}
		For an optimal $\hat{f}^*$, the error $\epsilon^*$ satisfies
		\begin{equation} \label{eq: error_bound}
			\epsilon^*\leq
			\underbrace{\mathcal{O}\left(\sqrt{\frac{N\log N \log M}{M}}\right)}_{\text{Empirical Gap}}+
			\underbrace{\mathcal{O}\left(\frac{\mathcal{L}_f \left|y\right|_{\max}}{N^2}\right)}_{\text{Parameterization Gap}}+
			\underbrace{\frac{\left\|\vw_1\left|x\right|_{\max}+\vb_1 \right\|_\infty\cdot \|\vw_2\|_1}{T}}_{\text{Quantization Gap}},
		\end{equation}
	where $\mathcal{L}_f$ is the Lipschitz constant of $f$ on $\mathcal{D}$. The proof is given in Appendix~\ref{subsec:proof_1}.
	\end{theorem}
	The three terms correspond to the successive gaps between the function $f$, the optimal learner, the optimal fixed-scale ANN, and its SNN counterpart. This theoretical analysis guides our implementation in two aspects:
	
	1. \textit{ANN training:} The Quantization Gap reflects that the two weighted layers contribute differently to the error depending on distinct norms $\left\|\vw_1\left|x\right|_{\max}+\vb_1 \right\|_\infty$ and $\|\vw_2\|_1$. Thus, unlike common $L1/L2$ regularizations, this term is adopted as a layer-specific regularization during training.
	
	2. \textit{Hyperparameter determination:} While larger $M$ and $T$ always improve performance, the optimal scale $N$ depends on the case. Since $\|\vw_2\|_1$ can be as large as $N\cdot\vw_{2\max}$, all three gaps correlate differently with $N$, requiring an experimental search for a balance between accuracy and conversion loss.
	
	\subsection{Integration for High-Dimensional Operations}
	\label{subsec:high-dimension}
	\begin{wrapfigure}{r}{0.44\columnwidth}
	\vspace{-0.9cm}
	\centering
	\includegraphics[width=0.44\columnwidth]{figs/layernorm_subop.pdf}
	\vspace{-0.6cm}
	\caption{Integration for LayerNorm.}
	\label{fig:LN}
	\vspace{-0.9cm}
	\end{wrapfigure}
	By proposing the universal group operator, we have achieved event-driven unary operations. However, such a scheme is infeasible for normalization functions like LayerNorm and Softmax, as their higher-dimensional input space cannot be sufficiently covered by the synthesized training data as in UGOs.
	
	To address this issue, we achieve them by integrating three types of basic spiking operations. Take LayerNorm as an example, as in Fig.~\ref{fig:LN} (and Softmax in Appendix~\ref{subsec:detail_high}). The ANN implementation is $\mathrm{LN}(x_i)=\gamma \dfrac{x_i-\mu}{\sqrt{\sigma^2+\epsilon}}+\beta$, where $\epsilon$ is a small constant, which is decomposed into the following parts:
	
	1. \textit{Weighted addition:} Simple, high-dimensional computations such as zero-centering and variance for binary inputs via fixed-weight linear layers.
	
	2. \textit{Universal group operator:} The normalization coefficient $1/\sqrt{\sigma^2+\epsilon}$ computed by a UGO.
	
	3. \textit{Multiplication:} Scalar or Matmul product between two variables, to be achieved in Section~\ref{sec:attention}.
	
	Such modular integration enables constructing high-dimensional spiking operators with UGOs, demonstrating the spatial aspect of our Spatio-Temporal Approximation pipeline. Nevertheless, performing variable multiplication in SNNs remains an unresolved issue due to its temporal characteristics. This computational requirement arises not just for normalization, but is critical for self-attention in Transformers. Therefore, we next focus on the spiking implementation of multiplications.
	
	\section{Temporal Approximation for Multiplications}
	\label{sec:attention}
	% Unlike conventional networks, the self-attention in Transformer performs multiplications between variable feature matrices rather than fixed weights. During inference, these matrices are encoded by incomplete temporal sequences, so directly computing their product is non-causal. Naively avoiding this can lead to uneven spike outputs and performance degradation. To address this, we propose an estimation-correction mechanism. The product is first estimated using the temporally available sequences, and then corrected by the next actual spike input. This distributes each spikes' contribution to the product across all time steps, smoothing the output for enhanced stability of multiplication.
	
	Unlike conventional networks, the self-attention in Transformer performs multiplications between variable feature matrices rather than fixed weights. During inference, these matrices are encoded by incomplete temporal sequences, so directly computing their product is non-causal. Naively avoiding this can lead to uneven spike outputs and performance degradation. To address this, we propose the Temporal-Corrective Self-Attention Layer (TCSA), employing an estimation-correction mechanism. The product is first estimated using the temporally available sequences, and then corrected by the next actual spike input. This distributes each spike's contribution to the product across all time steps, smoothing the output for enhanced stability of multiplication.
	
	\subsection{Temporal Split for Spike-based Multiplication}
	To analyze this problem, we first consider basic matrix multiplication $\mA\cdot\mB$. For simplicity, assume a matrix $\mM$ with shared scalar threshold $v_m$ for each element is split into a spike sequence $\mM_s(t)\in \{0,1\},t=1,\dots,T$. In conventional architectures, such operations typically occur between fixed-weight matrix $\mW$ and binary variable features $\mX$, computed as
	\begin{equation}
	    \mW\mX = \mW\cdot v_x\bar{\mX_s}=\frac{v_x}{T} \sum_{t=1}^{T}{\mW \mX_s(t)}.
	\end{equation}
	Thus, $v_x\mW \mX_s(t)$ is used as the weighted spike output at each step, and these outputs are accumulated to obtain the result.
	
	In contrast, for common \textit{inter-variable} multiplications in Transformer such as query-key products, the operations are rather different. Note that before the input at step $t$, both matrices are incomplete, with only inputs at $[1,t-1]$ available in their temporal split sequences.
	
	\begin{definition}[Naive Temporal Split for Causality]
	\label{def:naive}
	Let $\mA$ and $\mB$ be two variable matrices, and let $\mA_s$ and $\mB_s$ be their encoded spiking sequences over $T$ steps with thresholds $v_a$ and $v_b$, respectively.
	The temporary product $\Phi(t)$ is the sum of all currently available binary terms in the matrix product at step $t$ considering causality:
	\begin{equation}
	\label{eq:tmp_product}
        \Phi(t)\triangleq \sum_{i=1}^t \mA_s(i)\sum_{j=1}^t \mB_s(j) =\sum_{i,j=1}^t \mA_s(i)\mB_s(j).
	\end{equation}
	Since $\Phi(t-1)$ is available before step $t$, the increment $\phi(t)$ to obtain $\Phi(t)$ is defined as below:
	\begin{equation}
	\phi(t)\triangleq \Phi(t)-\Phi(t-1) = \mA_s(t)\mB_s(t) + \mA_s(t)\sum_{i=1}^{t-1}\mB_s(i) + \sum_{i=1}^{t-1}\mA_s(i)\mB_s(t),
	\end{equation}
	which uses only Boolean ANDs and additions. Accordingly, let $\mP(t) \triangleq \frac{v_a v_b}{T}\phi(t)$ be the output at $t$:
	\begin{equation}\label{eq: naive_increment}
	    \bar{\mP}= \frac{1}{T}\sum_{t=1}^{T}\mP(t) = \frac{1}{T}\sum_{t=1}^T \frac{v_a v_b}{T}\phi(t) = \frac{v_a v_b}{T^2}\Phi(T) = \mA\mB,
	\end{equation}
	which aligns with the objective of ANN-to-SNN conversion.
	\end{definition}

	
	\subsection{Estimation-Correction for Firing-Rate Stability}
	Although the naive method in Def.~\ref{def:naive} maintains numerical equivalence in the conversion, its output $\mP(t)$ contains $2t-1$ terms with the incomplete sequence temporarily. This implies a linearly growing magnitude over time, leading to uneven firing rates along the time dimension. As these spikes propagate, the large inputs in the last few steps make subsequent neurons hoard substantial residual membrane potential, preventing effective spike emission. To mitigate such instability, it is necessary to estimate the distribution of future input spikes earlier on, so as to react proactively.
	
	\begin{figure}[tbp]
		\vspace{-0.6cm}
		\centering
		\includegraphics[width =1\columnwidth]{figs/temporal_multiplication.pdf}
		\vspace{-0.4cm}
		\caption{Spike multiplications with naive temporal split and estimation-correction mechanism.}
		\label{fig:multiplication}
			\vspace{-0.1cm}
	\end{figure}
	
	\textbf{Methodology.} Considering the temporal consistency of rate-coding, we propose that by regarding the available sequence at $t$ as a $t$-point sampling of the complete $T$-step simulation, the overall firing rate can be approximated by that of a shorter $t$-step time interval. The estimation is thus defined as:
	\begin{theorem}[Temporal Estimation] The unbiased estimations of $\mA$ and product $\mA \mB$ at step $t$ are
	\begin{equation}
	    \hat{\mA}(t)=\frac{v_a}{t}\sum_{i=1}^t \mA_s(i),\quad \Psi(t)=\hat{\mA}(t)\hat{\mB}(t)=\frac{v_a v_b}{t^2}\Phi(t).
	\end{equation}
	\end{theorem}
	Such estimation provides two key benefits: 1) Guaranteed evenness: As $\mathbb{E}\Psi(t)=\mA \mB$ for any $t$, the estimation is independent of $t$ with small temporal variation, resulting in sparse spike outputs. 2) Progressive approximation: Since $\lim_{t\to T}\Psi(t)=\Psi(T)=\mA \mB$, the estimate gradually approximates the exact statistic for the full sequence. Each step's output brings the estimate closer to the final result. Thus, we propose:
	\begin{definition}[Temporal Correction]
	    The corrective increment $\mQ(t)$, used as the output sequence, is:
	    \begin{equation}\label{eq: corrective_increment}
	        \mQ(t) \triangleq t\Psi(t) - (t-1)\Psi(t-1) = \frac{v_a v_b}{t}\left[\frac{1}{1-t}\Phi(t-1)+\phi(t)\right]
	    \end{equation}
	    where all computations are Boolean ANDs and their weighted additions, such that
	    \begin{equation}
	        \bar{\mQ} = \frac{1}{T}\sum_{t=1}^{T}\mQ(t) = \Psi(T) =  \mA \mB.
	    \end{equation}
	\end{definition}
	This mechanism is the core of our Temporal-Corrective Self-Attention Layer as a spiking self-attention module, and is also similarly adopted in Section.\ref{subsec:high-dimension} for multiplications. In practice, spike multiplications are always constantly weighted, e.g., $v_a\mA_s(t_1)\mW_A\mW_B v_b\mB_s(t_2)$, and the weights of additions at each step $t$ can be pre-integrated into the linear layers $\mW$ before inference. Thus, the computations remain hardware friendly. Moreover, our estimation-correction algorithm allows reusing accumulated $\Phi(t)$ values from prior time steps during the update, reducing computations. 

	\textbf{Estimation Error Analysis.} The performance of our corrective multiplication method relies heavily on accurate estimation. We quantitatively analyzed how our estimate $\Psi$ converges to the ground truth over time steps. Considering that all multiplications are obtained from scalar multiplications, for clarity, we assume all elements are independent with a threshold $v_{th}=1$.
	\begin{theorem}[Convergence Rate of Temporal Estimation]
	Assume two independent floating-point elements $a$ \& $b$ whose converted $T$-step spiking sequences follow a stationary independent process with $Ta$ \& $Tb$ spikes emitted. Denoting the number of arrived spikes by step $t$ as $x$, the estimated $\Psi(t)$ satisfies (proof in Appendix~\ref{subsec:proof_3}):
	\begin{equation}
	    \mathbb{E}\left\{\Psi(t)\right\}=ab,\qquad
 	    \mathbb{D}\left\{\Psi(t)\right\}=\frac{ab(1-a)(1-b)}{(T-1)^2}\cdot\left(\frac{T}{t}-1\right)^2 \propto \left(\frac{1}{t}-\frac{1}{T}\right)^2.
	\end{equation}
% 	\begin{align}
% 	    &\mathbb{E}\left\{\hat{A}(t)\right\}=a, &\mathbb{D}\left\{\hat{A}(t)\right\}=\frac{a(1-a)}{T-1}\cdot\frac{T-t}{t}\\
% 	    &\mathbb{E}\left\{\Psi(t)\right\}=ab,
% 	    &\mathbb{D}\left\{\Psi(t)\right\}=\frac{ab(1-a)(1-b)}{(T-1)^2}\cdot\frac{T-t}{t}^2
% 	\end{align}
	\end{theorem}
	It demonstrates the estimation error decreases quadratically with $t$ initially, then stabilizes in the final few steps. This mechanism acts as a smoothing filter, providing the temporal component of our Spatio-Temporal Approximation pipeline.
	
	\section{Implementation and Experiments}
	To demonstrate the advantages of our training-free Transformer conversion approach, we apply our pipeline to the Image Encoder of CLIP \citep{radford2021learning}, a prevalent Language-Image model. This allows our converted model to leverage CLIP's powerful generalization abilities such as zero-shot classification. In comparison to conventional ResNet architectures, Transformers can better exploit large-scale pretraining to achieve superior performance. Furthermore, for a fair comparison with existing methods, we fine-tune the pretrained ViT on benchmarks like CIFAR and ImageNet, achieving state-of-the-art results of SNN with smaller conversion error and faster simulation.
	
	\subsection{Conversion Implementation}
	Our work enables all Transformer computations in SNN to be conducted without specified conversion methodology. In practice, we combine prior techniques to complete the entire conversion, including MMSE \citep{li2021free} to determine optimal neuron thresholds, signed neurons \citep{wang2022signed} to handle negative weighted inputs, and burst spikes \citep{li2022efficient} to mitigate lagging inputs and reduce residual potentials. Implementation details are provided in Appendix.\ref{subsec:implementation}.
	
	\subsection{Zero-shot Classification}
	\textbf{Settings and Models.} CLIP is a multi-modal ANN trained on image-text pairs with diversified Image Encoder backbones including ResNet and Vision Transformer (ViT). It performs various tasks based on natural language prompts. Since no existing methods directly convert Transformers, we use pretrained ResNet-50 backbone for our baselines. Following standard CLIP configuration for zero-shot prediction, we evaluate on CIFAR-10/100, ImageNet-200 benchmarks, and distribution-shifted CIFAR-10.1/10.2 datasets. Details in Appendix.\ref{subsec:datasets}.
	
\begin{table}[htbp]
\centering
\vspace{-1.2cm}
\caption{Comparison with other backbones and baselines on \textbf{zero-shot} classification of CLIP.}
\label{tab:zero-shot}
\setlength\tabcolsep{3pt} 
\begin{center}
\begin{tabular}{lllccccc}
\toprule
Dataset                       & Model                       & Method   & ANN Acc.             & T=32  & T=64  & T=128 & T=256 \\ \midrule
\multirow{5}{*}{CIFAR-10}     & \multirow{2}{*}{ResNet-50} &Calib. \citep{li2021free} &\multirow{2}{*}{72.35}  & 64.08 & 68.13 & 71.04 &71.19       \\
                              &                            &SNM \citep{wang2022signed}     &                       & 58.69 & 61.22 & 70.68 &70.88       \\ 
                               &\multirow{2}{*}{ResNet-101} &Calib. \citep{li2021free} &\multirow{2}{*}{79.64}  &38.21 &55.37 &67.44 &71.21      \\           
                              &  &SNM \citep{wang2022signed} & &43.25  &52.68 &68.42 &72.96      \\          
                              & ViT-B/32                   & \textbf{STA (Ours)}    & 89.74                  & \textbf{87.71}  & \textbf{88.20} & \textbf{88.29} & \textbf{88.34}       \\ \midrule
\multirow{3}{*}{CIFAR-100}    & \multirow{2}{*}{ResNet-50} & Calib. \citep{li2021free} & \multirow{2}{*}{41.01} & 24.67 & 33.41 & 38.20  &39.01       \\
                              &                            & SNM \citep{wang2022signed}    &                        & 35.64 & 34.71 & 39.95 &41.13       \\ 
                              & ViT-B/32                   & \textbf{STA (Ours)}    & 64.26                  & \textbf{62.55} & \textbf{62.74} & \textbf{62.98} &\textbf{63.01}       \\ \midrule
\multirow{3}{*}{ImageNet-200} & \multirow{2}{*}{ResNet-50} & Calib. \citep{li2021free} & \multirow{2}{*}{45.63} & 22.50  & 34.51 & 41.82 &42.03       \\
                              &                            & SNM \citep{wang2022signed}    &                        & 25.43 & 38.17 & 42.25 &42.95       \\ 
                              & ViT-B/32                   & \textbf{STA (Ours)}    & 62.25                  &\textbf{59.79}       &\textbf{61.24}       &\textbf{61.53}       &\textbf{61.66}      \\ \midrule
\multirow{3}{*}{CIFAR-10.1} & \multirow{2}{*}{ResNet-50} & Calib. \citep{li2021free} & \multirow{2}{*}{65.05} & 61.01  &63.44 &64.39 &64.42       \\
                              &                            & SNM \citep{wang2022signed}    &                        & 44.56 & 58.26 & 63.53 &64.06       \\ 
                              & ViT-B/32                   & \textbf{STA (Ours)}    & 84.15                  &\textbf{83.05}       &\textbf{83.25}       &\textbf{83.58}       &\textbf{83.52}     \\ \midrule
\multirow{3}{*}{CIFAR-10.2} & \multirow{2}{*}{ResNet-50} & Calib. \citep{li2021free} & \multirow{2}{*}{63.90} & 58.97  &61.01 &62.50 &62.68       \\
&                            & SNM \citep{wang2022signed}    &                        & 46.83 & 54.68 & 62.94 &63.08       \\ 
& ViT-B/32                   & \textbf{STA (Ours)}    & 80.35                  &\textbf{78.55}       &\textbf{79.65}       &\textbf{79.77}       &\textbf{79.83}     \\ \bottomrule
\end{tabular}
\end{center}
\vspace{-0.6cm}
\end{table}
	
	\textbf{Classification performance.} The results in Table~\ref{tab:zero-shot} show that the converted ViT model substantially exceeds ResNet across all datasets and time settings. This confirms that large-scale pretrained Transformers are superior to convolutional networks for zero-shot classification, emphasizing the value of SNN conversion targeted on Transformers over CNNs.
	
	\textbf{Accuracy loss from conversion.} Despite having more parameters than ResNet-50 (87.8M vs 25.6M), our ViT model still experiences much lower accuracy drop after conversion. Two main factors contribute: 1) Self-attention layers have lower precision requirements than convolutions, making them less prone to numerical errors. 2) Transformer architecture provides more robust features with larger label margins, maintaining predictions even under conversion perturbations.
	
	\textbf{Limitations of existing works.} We make two key observations: 1) Larger convolutional networks like ResNet-101 do not improve SNN conversion performance over ResNet-50, as their ANN accuracy still lags behind ViT while depth exacerbates conversion errors. This highlights the need for advanced architectures like Transformers. 2) Many current conversion methods only succeed on models like resnet-20 or VGG-16, while being incompatible with deep residual networks. Thus we selectively demonstrate those with better ResNet-50 results from CLIP.
	
	\subsection{Standard Classification and Ablation Studies}
	
	\begin{wrapfigure}{r}{0.46\columnwidth}
	\vspace{-0.8cm}
	\centering
	\includegraphics[width=0.45\columnwidth]{figs/ablation.pdf}
	\vspace{-0.5cm}
	\caption{Ablations on components in CIFAR-100, T=32.}
	\vspace{-0.3cm}
	\label{fig:ablation}
	\end{wrapfigure}
	
	\textbf{Standard Classification.} We fine-tune our ViT on benchmarks and compare its performance on conventional image classification tasks to resnet-20 and pretrained ResNet-50 baselines from CLIP. Table~\ref{tab:cifar-100} shows results on CIFAR-100, with other results on CIFAR-10 / ImageNet in Appendix~\ref{subsec:results_classification}. Compared to other conversion methods, our algorithm achieves near peak accuracy with fewer steps ($T=32$ or $64$), while most baselines require over $128$ steps for optimal accuracy. The remaining small accuracy gap to ANN ViT is largely due to the unavoidable approximation error from the Universal Group Operators. This demonstrates the faster simulation time advantages of our approach. 

	\textbf{Ablations.} We also conduct ablation experiments to analyze the spatial and temporal impact in our pipeline, in Fig.~\ref{fig:ablation}. Our results lead to the following conclusions:
	1) UGO nearly eliminates the three Gaps in Eq.\ref{eq: error_bound}, thereby retaining nonlinear computation capabilities after spatial approximation.
	2) The estimation-correction mechanism for temporal multiplication prevents large residual potential accumulation caused by output lag, thus significantly improving performance over the naive method.
	
	\begin{table}[tbp]
		\vspace{-0.8cm}
    \centering
    \caption{Comparison with other backbones and baselines on standard classification of CIFAR-100}
    \label{tab:cifar-100}
    \begin{center}
    \begin{tabular}{llccccc}
         \toprule Model &Method &ANN Acc. & T=32  & T=64  & T=128 & T=256 \\ \midrule
         \multirow{6}{*}{resnet-20} &RMP \citep{han2020rmp} &\multirow{6}{*}{76.12} &30.60 &42.61 &62.59 &69.86 \\
         &TSC \citep{han2020deep} & &35.87 &49.70 &65.42 &70.59 \\
         &Opt. \citep{deng2020optimal} & &49.81 &69.82 &75.75 &75.94 \\
         &Calib. \citep{li2021free} & &74.25 &75.08 &75.58 &76.24 \\
         &SNM \citep{wang2022signed} & &74.58 &75.89 &76.11 &76.18 \\
         &Burst \citep{li2022efficient} & &71.14 &75.50 &75.89 &76.03 \\
         \midrule
         \multirow{3}{*}{\makecell[l]{ResNet-50 \\(CLIP)}} & Opt. \citep{deng2020optimal} &\multirow{3}{*}{81.13} &64.48 &71.71 &76.67 &79.52 \\
         &Calib. \citep{li2021free} & &75.61 &77.29 &78.13 &80.02\\
         &SNM \citep{wang2022signed} & &68.24 &75.30 &77.91 &80.75 \\
         \midrule
         ViT-B/32 &\textbf{STA (Ours)} &87.35  &\textbf{84.15} &\textbf{85.25} &\textbf{85.69} &\textbf{85.98} \\
         \bottomrule
    \end{tabular}
    \end{center}
    \vspace{-0.2cm}
    \end{table}
	
	\subsection{Energy Estimation}
	The energy efficiency of SNN stems from two aspects: 1) Sparsity and event-driven computation, where only a small fraction of synapses are active during inference. 2) Low-power synaptic operations like Boolean logic and weighted additions instead of expensive floating-point operations. The consumption of ANN inference is characterized by floating-point operations ($FLOPs$) with energy cost $E_{MAC}$, while SNNs rely on synaptic operations ($SOPs$) with $E_{AC}$. Therefore, the ratio of inference energy for SNN versus ANN for a module is estimated in \citet{rathi2020diet} as:
	\begin{equation}\label{eq:energy}
	    \gamma=\frac{E_{SNN}}{E_{ANN}} = \frac{SOPs\cdot E_{AC}} {FLOPs\cdot E_{MAC}},\quad \mathrm{with}\ E_{MAC}\approx 4.6\,\mathrm{pJ},\ E_{AC}\approx 0.9\,\mathrm{pJ}.
	\end{equation}
	Using an empirical firing rate denoted as $\eta$, we analyze both components in our pipeline:
	
	\textbf{Universal Group Operator.} A unary non-linear operator like GELU requires $FLOPs\approx 70$ primarily due to exponents in $\tanh$, while a UGO with $N$ neurons requires $SOPs=2NT\eta$. For a high accuracy implementation with $N=32, T=32, \eta\approx 9.1\%$, UGOs reduce computational costs by $41\%$ compared to GELU. This saving is further amplified in high-dimension operations.
	
	\textbf{Spike Multiplications.} We illustrate this with the $N\times N$ query-key matrix products, where $FLOPs=3N^3$. While naively implementing matrix multiplication requires $O(T^2)$ spike products, our proposed TCSA layer reduces complexity to $O(T)$ with accumulated $\Phi(t)$. Specifically, $SOPs=4TN^3\eta$. With $\eta\in [3\%,13\%]$ at $T=32$ across all $12$ blocks, the attention modules achieve $33\%$ savings on average, up to $75\%$ for the sparsest cases.

    Admittedly, considering the unique computational demands of Transformer, its energy savings from SNN conversion are not superior to those of convolutional spiking networks. However, our work still demonstrates potential for low power usage: training UGOs with sparsity constraints or optimizing multiplication estimations could further reduce the $\eta$ in our Spatio-Temporal Approximation pipeline. In addition, the latest hardware \citep{pei2019towards} allows utilizing both floating-point and event-driven computation synergistically, thereby further improving energy performance. 
	
	\section{Conclusion and Discussion}
	For the first time, this paper establishes a bridge between mainstream pretrained Transformers and SNNs. By designing novel spiking operators and layers, we approximate Transformers in both spatial and temporal dimensions in a purely event-driven fashion, breaking with convention. Since all Transformer-based models share similar computation modules, our proposed pipeline is broadly applicable to various language and vision models, including the Text Encoder in CLIP, or even Large Language Models, as our subsequent work. These pretrained large models are often transferable without additional training or fine-tuning, and our training-free conversion pipeline avoids performance degradation, promoting practical SNN usage on various downstream applications. While the converted ViT has slightly higher computations than conventional spiking CNNs, it provides stronger performance and robustness with fewer simulation steps. This enables potential energy-efficient deployment of open-source large models in the future with neuromorphic hardware.
	
	
	\bibliography{iclr2024_conference}
	\bibliographystyle{iclr2024_conference}
	
	\newpage
	\appendix
	\begin{center}{\Large{
	Appendix}}
	\end{center}
	\section{Modules in Transformers}
	\label{subsec:detail_transformer}
	This section provides a detailed overview of all the modules, operators, and formulas in the Residual Attention Block of the Transformer. Taking ViT-B/32 as an example, it consists of one convolutional layer, 12 sequentially connected Residual Attention Blocks, and a final linear layer. Each Attention Block contains the following modules:
	
	1. \textbf{Multi-Head Attention} is the core module of each block, allowing the model to jointly attend to information from different representation subspaces. In self-attention, the same input vector is first projected $h$ times into queries $\mQ$, keys $\mK$, and values $\mV$ using different learned linear projections. Attention is then performed in parallel for each projection:
	\begin{equation}
	    {Head}_i = \mathrm{Softmax} (\mQ_i \mK_i^T) \mV_i,\quad for\ i=1,...,h
	\end{equation}
    The outputs of the $h$ heads are concatenated and projected once more by $\mW^O$ to get the final values:
    \begin{equation}
        MultiHead(\mQ,\mK,\mV)=\mathrm{Concat}(Head_1,...,Head_h) \mW^O
    \end{equation}
    
    The uniqueness of Attention is that it performs a large number of matrix multiplications between feature matrices, such as $\mX=\mQ_i \mK_i^T$, which is different from the multiplications with constant weight matrices like $\mW^O$.
    
    In addition, the Softmax function is applied to the result of the query-key multiplication for the attention weights in each head. Specifically, Softmax normalizes the attention weights to output a probability distribution:
    \begin{equation}
        \mathrm{Softmax}(x_i)=\frac{e^{x_i}}{\sum_j e^{x_j}},
    \end{equation}
    which requires nonlinear operations like exponentiation and inverse.
    
    2. \textbf{Layer Normalization} (LayerNorm), is used twice in each block before and after the attention module to stabilize and accelerate training. It normalizes the activations of each layer by subtracting the mean and dividing by the standard deviation:
    \begin{equation}
    \mathrm{LN}(x_i)=\gamma \dfrac{x_i-\mu}{\sqrt{\sigma^2+\epsilon}}+\beta,
    \end{equation}
    where $\mu$ and $\sigma^2$ are the mean and variance calculated over all hidden units in the same layer. 
    
    Notably, unlike BatchNorm which tracks global statistics during training across entire channels, LayerNorm normalizes each element independently. This normalizing occurs at both training and inference, requiring dynamic statistics, which hinders weights absorption into matrices compared to BatchNorm for SNN conversion. Therefore, it also requires more complex nonlinear operations like square root and inverse at inference.
	    
    3. \textbf{Gaussian Error Linear Unit} (GELU) is used as the non-linear activation function in the MLP in each self-attention block. It applies the cumulative distribution function of the Gaussian distribution to each input element $x_i$, and is approximated as:
    \begin{equation}
    \mathrm{GELU}(x_i) = 0.5x_i\left(1 + \tanh\left(\sqrt{2/\pi}\left(x_i + 0.044715x_i^3\right)\right)\right),
    \end{equation}
    which allows gradients to flow efficiently through the activation during backpropagation.
    
	\begin{figure}[htbp]
	\centering
	\includegraphics[width =1\columnwidth]{figs/transformer_modules.pdf}
	\vspace{-0.8cm}
	\caption{The modules and operators in each Residual Attention Block of ViT.}
	\vspace{-0.3cm}
	\end{figure}
	
	\newpage
	\section{Settings of Universal Group Operator}
	\label{subsec:ugo_setting}
	The Universal Group Operator achieves four main operations: 1) \textbf{Exponentiation} in Softmax, 2) \textbf{GELU} in MLP, 3) \textbf{Inverse} in Softmax, 4) \textbf{Inverse of Square Root} in LayerNorm. The settings required for training a UGO are organized as follows:
	\begin{enumerate}
	    \item \textit{Data Synthesis.} The synthesized distribution $\hat{\mathcal{D}}$ and the sample number $M$.
	    \item \textit{ANN Construction.} The hidden-layer size $N$, the loss function, optimizer and scheduler.
	    \item \textit{SNN Conversion.} The selected threshold $V_{th}$.
	\end{enumerate}
	These settings are summarized in Table.\ref{tab:train_ugo}, where the Loss Penalty refers to the Quantization Gap in Eq.\ref{eq: error_bound}. To empirically demonstrate $\mathcal{D}$ in practice, the inputs of each module under real sampling conditions are provided in Fig.\ref{fig:input}.
	
	The fitting results of Universal Group Operators implemented in our algorithm are shown in Fig.~\ref{fig:result}.
	
		\begin{table}[hbp]
    \centering
    \vspace{-0.3cm}
    \caption{Hyperparameters and settings for UGO training.}
    \label{tab:train_ugo}
    \begin{center}
    \setlength\tabcolsep{3pt} 
    \begin{tabular}{lllll}
    \toprule
    & Exp & GELU &Inverse &LayerNorm \\
    \midrule
    \multirow{2}{*}{ $\hat{\mathcal{D}}$} &U(-35,3) 50\% &U(-25,25) 50\% &U(3,38) 75\% &\multirow{2}{*}{U(0.01,1) 100\%}
   \\ &U(-12,2) 50\% &U(-1,1) 50\% &U(2,75) 25\% \\
       \specialrule{0em}{1pt}{1pt}
   $M$ &\multicolumn{4}{c}{164384 samples * 128 batch * 1000 epoch}\\
      \midrule
    $N$ &32 &32 & 16 &8\\
        \specialrule{0em}{3pt}{3pt}
    $Loss$ &Huber &Huber+Penalty &Huber+Penalty &MSE\\
    \specialrule{0em}{3pt}{3pt}
    $Optim-LR$ &SGD-0.01 & SGD-0.01 &SGD-0.01 &Adam-0.01\\
    \specialrule{0em}{3pt}{3pt}
    \multirow{3}{*}{$Scheduler$} &MultiStepLR &StepLR &MultiStepLR &CosineAnnealingLR \\
    &milestones=[500,800] &step=100 &milestones=[500,800] & \\
    &$\gamma=0.5$ &$\gamma=0.5$ &$\gamma=0.1$\\
    \midrule
    $V_{th}$ &\multicolumn{4}{c}{Determined by \citet{li2021free}}\\
         \bottomrule
    \end{tabular}
    \end{center}
    \end{table}

	
	\begin{figure}[htbp]
	\centering
	\includegraphics[width =1\columnwidth]{figs/UGO_input.pdf}
	\caption{Empirical input distribution $\mathcal{D}$ sampled from 10 CIFAR-10 input images.}
	\label{fig:input}
	\vspace{-0.3cm}
	\end{figure}
	
	\begin{figure}[htbp]
	\centering
	\includegraphics[width =1\columnwidth]{figs/fitting_result.pdf}
	\caption{Fitting Results for the Universal Group Operators.}
	\label{fig:result}
	\end{figure}
	
	\newpage
	\section{Proof for Theorem 1}
	\label{subsec:proof_1}
	\newtheorem*{thm1}{Theorem 1}
		\begin{thm1}[Error Bound for UGO]
		For an optimal $\hat{f}^*$, its approximation error $\epsilon^*$ satisfies
		\begin{equation} 
			\epsilon^*\leq
			\underbrace{\mathcal{O}\left(\sqrt{\frac{N\log N \log M}{M}}\right)}_{Empirical\ Gap}+
			\underbrace{\mathcal{O}\left(\frac{\mathcal{L}_f \left|y\right|_{\max}}{N^2}\right)}_{Parameterization \ Gap}+
			\underbrace{\frac{\left\|\vw_1\left|x\right|_{\max}+\vb_1 \right\|_\infty\cdot \|\vw_2\|_1}{T}}_{Quantization\ Gap},
		\end{equation}
	where $\mathcal{L}_f$ is the Lipschitz constant of $f$ on $\mathcal{D}$.
	\end{thm1}
	\begin{proof}
		We decompose the error into three gaps:
		\begin{itemize}
		    \item The \textbf{Empirical Gap} between $f$ and the optimal learning machine $f_m$ due to limited $M$ and model complexity with respect to $N$.
		    \item The \textbf{Parameterization Gap} between $f_m$ and a single-layer ANN $f_n$ with $N$ neurons due to the limited parameters determined by $N$.
		    \item The \textbf{Quantization Gap} between $f_n$ and the UGO $\hat{f}$ due to spiking discretization with respect to $T$.
		\end{itemize}
		
		\textbf{Empirical Gap.} We first quote a lemma from \citet{bartlett2019nearly}.
		\begin{lemma}
		    For deep neural networks with arbitrary piecewise linear activation function where $W$ is the number of weights and $L$ is the number of layers, its VC-dimension is bounded by $\Omega(WL\log(W/L))$ and  $\mathcal{O}(WL\log(W))$.
		\end{lemma}
		For $f_m$, we have $L=1$ and $W=2N$, thus ${d}_{VC}=\mathcal{O}(N\log(N))$.
		According to the classical conclusion in \citet{vapnik1999nature}, the empirical gap between $f_m$ and $f$ with $M$ samples is:
		\begin{equation}
		\label{eq:emp}
		    \epsilon_{emp}=\mathcal{O}\left(\sqrt{\frac{d_{VC} \log \frac{M}{d_{VC}}}{M}}\right)=\mathcal{O}\left(\sqrt{\frac{N\log N\log M}{M}}\right).
		\end{equation}
		
		\textbf{Parameterization Gap.} Considering that the parameter quantity $2N$ of the ANN is much smaller than the sampling quantity $M$ in practice, the ANN $f_n$ cannot empirically fit all data, leading to a gap between $f_n$ and $f_m$. We modify the conclusion from \citet{lu2021deep}:
		\begin{lemma}
		For deep ReLU networks with width $N$ and depth $L$ approximating $f\in C([0,1]^d)$ with Lipschitz constant $\mathcal{L}_f$, the optimal approximation error is $\mathcal{O}\left(\mathcal{L}_f \cdot N^{-2/d} \cdot L^{-2/d}\right)$.
		\end{lemma}
		Accordingly, the gap in our implementation for $f$ is 
		\begin{equation}
		\label{eq:parm}
		    \epsilon_{parm}=\vert y\vert_{max}\cdot \mathcal{O}\left(\mathcal{L}_f \cdot N^{-2}\right)=\mathcal{O}\left(\frac{\mathcal{L}_f \left|y\right|_{\max}}{N^2}\right).
		\end{equation}
		
		\textbf{Quantization Gap.} When converting the ANN $f_n$ to an SNN, we set the threshold as the maximum output of neurons to avoid truncation errors:
		\begin{equation}
		    V_{th}=\max(\vw_1\cdot \vx+\vb_1)\leq \left\|\vw_1\left|x\right|_{\max}+\vb_1\right\|_{\infty}.
		\end{equation}
		The quantization error on the IF neuron outputs is $\frac{V_{th}}{T}$, so the error of the result is:
		\begin{equation}
		\label{eq:quant}
		    \epsilon_{quant}\leq \frac{V_{th}}{T} \cdot \|\vw_2\|_1\leq\frac{\left\|\vw_1\left|x\right|_{\max}+\vb_1 \right\|_\infty\cdot \|\vw_2\|_1}{T}.
		\end{equation}
		It can also be generalized to:
		\begin{equation}
		    \epsilon_{quant}\leq \frac{N\left|\vw_2\right|}{T}\left\|\vw_1\left|x\right|_{\max}+\vb_1 \right\|_\infty
		\end{equation}
		
		Combining Eq.\ref{eq:emp}, \ref{eq:parm}, \ref{eq:quant}, the result is proved.
	\end{proof}

    \newpage
    
	\section{Details of High-Dimensional Operations}
	\label{subsec:detail_high}
	We have roughly introduced the decomposition process of high-dimensional operations represented by LayerNorm in Section~\ref{subsec:high-dimension}. In this section, the implementation details of LayerNorm and Softmax will be explained.
	
	\subsection{LayerNorm}
	
	In order to adapt $\mathrm{LN}(\vx)=\gamma\frac{\vx-\mu}{\sqrt{\sigma^2+\epsilon}}+\beta$ to SNN computations, we unroll it into the following steps:
	
	\begin{enumerate}
	    \item Calculate the decentralized inputs $\vx-\mu=\vx-\frac{1}{n}\sum_{i=1}^{n}x_i=\mW^{\mathrm{dc}}\vx$, $x_i\in\{0,1\}$.
	    \item Calculate self-scalar product of $\vx-\mu$, i.e. $\vh=(\mW^{\mathrm{dc}}\vx)\circ (\mW^{\mathrm{dc}}\vx)$, which can be implemented with TCSA in SNN.
	    \item Calculate the variance $\sigma^2=\bar{\vh}=\mW^{\mathrm{avg}}\vh=\mW^{\mathrm{avg}}\left[(\mW^{\mathrm{dc}}\vx)\circ (\mW^{\mathrm{dc}}\vx)\right]$.
	    \item Approximate the inverse of the standard deviation via (spiking) UGO, i.e. $\frac{1}{\sqrt{\sigma^2+\epsilon}}\approx\hat{f}(\sigma^2)=v_{th}\vy$, $y_i\in\{0,1\}$
	    \item Calculate scalar product of the inverse and the decentralized inputs with factor $\gamma,\beta$ to get $\mathrm{LN}(\vx)=\gamma\frac{\vx-\mu}{\sqrt{\sigma^2+\epsilon}}+\beta \approx \gamma v_{th}(\mW^{\mathrm{dc}}\vx)\circ \vy+\beta$, implemented with TCSA in SNN.
	\end{enumerate}
	
	\subsection{Softmax}
	

    \begin{figure}
	\vspace{-0.8cm}
		\begin{minipage}[h]{.4\linewidth}
			\centering
			\vspace{-0.3cm}
			\includegraphics[width =  1\columnwidth]{figs/layernorm_subop.pdf}
			\caption{Integration for Layernorm.}
			\label{fig:ln}
		\end{minipage}
		\begin{minipage}[h]{.6\linewidth}
			\centering
			\vspace{-0.3cm}
			\includegraphics[width = 0.99 \columnwidth]{figs/softmax_subop.pdf}
			\vspace{-0.34cm}
			\caption{Integration for Softmax.}
			\label{fig:softmax}
		\end{minipage}
		\vspace{-0.4cm}
	\end{figure}
	
% 	\begin{figure}[tbp]
% 	\vspace{-0.6cm}
% 	\centering
% 	\includegraphics[width =0.6\columnwidth]{figs/softmax_subop.pdf}
% 	\caption{Integration for Softmax}
% 	\label{fig:softmax}
% 	\vspace{-0.3cm}
% 	\end{figure}
	
	Like LayerNorm, Softmax can also be roughly decomposed into three suboperations: weighted summation, UGO approximation, and split multiplication. Specifically, as shown in Fig.\ref{fig:softmax}, Softmax is unrolled into the following steps:
	
	\begin{enumerate}
	    \item Translate the inputs to $(-\infty,1]$ by subtracting an offset $(v_{th}x_i)_{max}-1$ ($v_{th}$ can be neuron-wise and $(v_{th}x_i)_{max}$ is usually in dimension 0) to ensure that no overflow occurs during the exponential operation. This translation has no effect on the result, as it will be cancelled out by the numerator and denominator in the subsequent division. Similar to LayerNorm, $x_i\in\{0,1\}$.
	    \item Clamp the translated inputs to a suitable range that the exponential UGO can handle, avoiding UGO output exceptions due to too small or too large input.
	    \item Approximate the exponential function value via (spiking) UGO, i.e. $e^{x_i}\approx \hat{f}_{\mathrm{exp}}(x_i)=v^{\mathrm{exp}}_{th}y^{\mathrm{exp}}_i$, $y^{\mathrm{exp}}_i\in\{0,1\}$.
	    \item Calculate $\sum_{i=1}^{n}e^{x_i} = \sum_{i=1}^{n}v^{\mathrm{exp}}_{th}y^{\mathrm{exp}}_i$.
	    \item Clamp $\sum_{i=1}^{n}e^{x_i}$ to a suitable range that the inverse UGO can handle.
	    \item Approximate the inverse via (spiking) UGO, i.e. $\frac{1}{\sum_{i=1}^{n}e^{x_i}}\approx  \hat{f}_{\mathrm{inv}}\left(\sum_{i=1}^{n}e^{x_i}\right)=v^{\mathrm{inv}}_{th}y^{\mathrm{inv}}$, $y^{\mathrm{inv}}\in\{0,1\}$.
	    \item Calculate scalar product of the exponentials and the inverse to get $\mathrm{Softmax}(\vx)=\frac{e^{\vx}}{\sum_{i=1}^{n}e^{x_i}}\approx v^{\mathrm{inv}}_{th}y^{\mathrm{inv}}\cdot v^{\mathrm{exp}}_{th}\vy^{\mathrm{exp}}$.
	\end{enumerate}

	\newpage
	
	\section{Proof for Theorem 3}
	\label{subsec:proof_3}
	\newtheorem*{thm3}{Theorem 3}
	
	\begin{thm3}[Convergence Rate of Temporal Estimation]
	Assume two independent floating-point elements $a$ \& $b$ whose converted $T$-step spiking sequences follow a stationary independent process with $Ta$ \& $Tb$ spikes emitted. Denoting the number of arrived spikes by step $t$ as $x$, the estimated $\Psi(t)$ satisfies:
	\begin{equation}
	    \mathbb{E}\left\{\Psi(t)\right\}=ab,\qquad
 	    \mathbb{D}\left\{\Psi(t)\right\}=\frac{ab(1-a)(1-b)}{(T-1)^2}\cdot\left(\frac{T}{t}-1\right)^2.
	\end{equation}
% 	\begin{align}
% 	    &\mathbb{E}\left\{\hat{A}(t)\right\}=a, &\mathbb{D}\left\{\hat{A}(t)\right\}=\frac{a(1-a)}{T-1}\cdot\frac{T-t}{t}\\
% 	    &\mathbb{E}\left\{\Psi(t)\right\}=ab,
% 	    &\mathbb{D}\left\{\Psi(t)\right\}=\frac{ab(1-a)(1-b)}{(T-1)^2}\cdot\frac{T-t}{t}^2
% 	\end{align}
	\end{thm3}
	
	\begin{proof}
	\label{proof:estimation}
	Considering a single scalar $a$, let $N(t)$ denote the number of spikes from sequences $a_s$ that have arrived by time $t$. Given $N(T)=Ta$, the probability of emitting $x$ spikes in the first $t$ steps is:
	\begin{align}
	   &P\left(N(t)=x|N(T)=Ta\right)\notag\\
	   =&P\left(N(T)=Ta|N(t)=x\right)\cdot\frac{P\left(N(t)=x\right)}{P\left(N(T)=Ta\right)}\notag\\
	   =&P(N(T-t)=Ta-x)\cdot\frac{P\left(N(t)=x\right)}{P\left(N(T)=Ta\right)}\notag\\
	   =&\binom{T-t}{Ta-x}\binom{t}{x}{\binom{T}{Ta}}^{-1}.
	\end{align}
	For the expectation of $x$ and corresponding estimation $\hat{a}(t)$:
	\begin{align}
	    &\mathbb{E}(x)
	    =\sum_x x\cdot \frac{\binom{T-t}{Ta-x}\binom{t}{x}}{\binom{T}{Ta}}
	    =t\sum_x \frac{\binom{T-t}{Ta-x}\binom{t-1}{x-1}}{\binom{T}{Ta}}
	    =t\frac{\binom{T-1}{Ta-1}}{\binom{T}{Ta}}
	    =t\frac{Ta}{T}
	    =ta\\
	    &\mathbb{E}(\hat{a}(t))=\frac{\mathbb{E}(x)}{t} =a.
	\end{align}
	The second order is similarly derived as:
	\begin{align}
	\mathbb{E}(x^2)
	=&\sum_x x^2\cdot\frac{\binom{T-t}{Ta-x}\binom{t}{x}}{\binom{T}{Ta}}\notag
	\\=&t\left\{
	\sum_x (x-1)\frac{\binom{T-t}{Ta-x}\binom{t-1}{x-1}}{\binom{T}{Ta}}+
	\sum_x \frac{\binom{T-t}{Ta-x}\binom{t-1}{x-1}}{\binom{T}{Ta}}
	\right\}\notag
	\\=&t(t-1)\sum_x\frac{\binom{T-t}{Ta-x} \binom{t-2}{x-2}}{\binom{T}{Ta}}+ta\notag
	\\=&ta(t-1)\frac{(Ta-1)}{(T-1)}+ta.
	\end{align}
	For the variance:
	\begin{align}
	    &\mathbb{D}(x)=\mathbb{E}(x^2)-\mathbb{E}^2(x)=ta \frac{(1-a)(T-t)}{T-1} \\
	    &\mathbb{D}\left(\hat{a}(t)\right)=\frac{\mathbb{D}(x)}{t^2}=\frac{a(1-a)}{T-1}\cdot \left(\frac{T}{t}-1\right).
	\end{align}
	As the input elements $a$ \& $b$ are independent in neural networks, the statistics of their product are:
	\begin{align}
	    \mathbb{E}\left(\Psi(t)\right)&=\mathbb{E}\left(\hat{a}\hat{b}\right)=ab \\
	    \mathbb{D}\left(\Psi(t)\right)&=\frac{ab(1-a)(1-b)}{(T-1)^2}\cdot\left(\frac{T}{t}-1\right)^2.
	\end{align}
	\end{proof}

	\newpage
	\section{Implementation of UGO-approximated ANN conversion to SNN}
	\label{subsec:implementation}
	
	Considering the large network size of ViT, traditional methods like MaxNorm~\citep{rueckauer2017conversion} are not sufficient to preserve the ANN's performance. Therefore, we use several advanced, training-free techniques for conversion.
	
	\subsection{Threshold Balancing}
	Once a UGO-approximated ANN is obtained, all activation functions have been replaced with ReLU, which makes it convenient to convert it directly into an SNN. A small number of training samples are needed to obtain the maximum activation value and its quantized estimate, which determine the threshold potential of the neurons~\citep{li2021free}. Our optimization objective is:
	\begin{equation}\label{eq: mmse}
	    \min_{\vv_{th}^{l}} \left\| \mathrm{QT}\left(\vz^{l},T,\vv_{th}^{l}\right) - \mathrm{ReLU}\left(\vz^{l}\right)\right\|^2,\qquad
	    \mathrm{QT}\left(\vz^{l},T,\vv_{th}^{l}\right) = \frac{\vv_{th}^{l}}{T} \cdot \mathrm{clamp}\left( \left \lfloor \frac{T}{\vv_{th}^{l}} \vz^{l} \right \rfloor,0,T\right),
	\end{equation}
	where $\vz^l=\mW^l\vx^{l-1}$. As Eq.~\ref{eq: mmse} has no closed-form solution, we find the optimal threshold by grid search, enumerating $n$ ($n=100$ in our experiments) values within $[0.5\vz_{\max}^l,\vz_{\max}^l]$, where $\vz_{\max}^l$ is the maximum activation value observed on these samples.
	
	\subsection{Signed Neurons with Memory Potential}
	
	For the IF neuronal structure, which can only release positive spikes, if the input from a neuron's negative-weighted synapse arrives late and the positive-weighted input has already been converted into a spike, a portion of the significant negative potential information will not be transmitted. Therefore, we introduce Signed Neurons with Memory potentials (SNM), which allow negative spikes to be released~\citep{wang2022signed}, to ensure that negative-weighted information is not lost. From this perspective, Eq.~\ref{eq: IF_forward} can be refined as:
	\begin{align}
	    \vm^l(t)&=\vp^l(t-1)+\mW^l\vs^{l-1}(t), \notag \\
	    \tilde{\vr}^l(t) &= \vr^l(t-1), \notag \\
	    s^{l,i}(t)&=\left\{
	        \begin{array}{ll}\label{eq: SNM}
                v_{th}^{l,i}, & m^{l,i}(t) \geq v_{th}^{l,i} \\
                -v_{th}^{l,i}, & m^{l,i}(t)\leq -v_{th}^{l,i} \quad \text { and } \quad \tilde{r}^{l,i}(t)>0 \\
                0, & \text { otherwise }
            \end{array}\right., \\
        \vp^l(t)&=\vm^l(t)-\vs^l(t), \notag \\
        \vr^l(t)&=\tilde{\vr}^l(t) + \vs^l(t), \notag
	\end{align}
	where $\tilde{\vr}^l(t)$ and $\vr^l(t)$ denote the memory potential before and after spike triggering at time step $t$, respectively.
	
	\subsection{Burst Spikes with $\rho$-Scale Threshold}
	
	In order to minimize the effect of lagging inputs generated during SNN inference, we use the burst spikes mechanism, which allows neurons to clear off residual potentials in the form of $\Gamma$ ($\Gamma=2$ in our experiments) high-frequency spikes between regular emissions~\citep{li2022efficient}. The threshold of the residual potential is set to $\rho\vv_{th}$. Considering its small scale relative to $\vv_{th}$, and to avoid disrupting the quantization relationship established by Eq.~\ref{eq: mmse}, we set $\rho = 0.5$.
	
	\subsection{Algorithm}
	
	\begin{algorithm}[htbp]
        \caption{STA Conversion Pipeline}
        \label{algo: STA_conversion}
        \renewcommand{\algorithmicrequire}{\textbf{Input}}  
        \renewcommand{\algorithmicensure}{\textbf{Output}} 
        \begin{algorithmic}
            \REQUIRE{Pretrained ANN; Non-linearities $\{f_i\}$ with synthetic distributions $\{\hat{\mathcal{D}}_i\}$; Simulation length $T$}
            \FOR{each nonlinear function $f_i$ in Transformer}
                \STATE Initialize UGO model $\hat{f}_i$ with $N_i$ hidden neurons
                \STATE Sample $M$ points $\{x_j\}_{j=1,...,M}$ from $\hat{\mathcal{D}}_i$
                \STATE Optimize $\hat{f}_i$ using labels $\{f_i(x_j)\}_{j=1,...,M}$
            \ENDFOR
            \STATE Replace non-linearities $\{f_i\}$ in pretrained ANN with $\{\hat{f}_i\}$
            \STATE Replace multiplications in pretrained ANN with TCSA (cf.~Eq.~\ref{eq: corrective_increment})
            \FOR{$l=1,2,\dots,L$-th ReLU layer in the ANN}
                \STATE Collect the input $\vx^{l}$ and the output $\vx^{l+1}$
                \STATE Find the optimal threshold $\vv_{th}^{l}$ for SNN by grid search (cf.~Eq.~\ref{eq: mmse})
            \ENDFOR
            \ENSURE{Converted SNN}
        \end{algorithmic}
    \end{algorithm}
    
    \begin{algorithm}[htbp]
        \caption{STA Inference}
        \label{algo: STA_inference}
        \renewcommand{\algorithmicrequire}{\textbf{Input}}  
        \renewcommand{\algorithmicensure}{\textbf{Output}} 
        \begin{algorithmic}
            \REQUIRE{Converted SNN; Simulation length $T$; Burst length $\Gamma$; Burst scale $\rho$}
            \FOR{$t=1,2,\dots,T$}
                \FOR{each forward operation $g$ in the SNN}
                    \IF{$g$ is a multiplication layer}
                        \STATE Output the temporary scaled product $\sum_{j=1}^{t}k(t)\mX_1(j)\mW_1\mW_2\mX_2(j)$ using TCSA (cf.~Eq.~\ref{eq: corrective_increment})
                    \ENDIF
                    \IF{$g$ is a non-spiking linear layer}
                        \STATE Output $\mW\vx(t)+\vb$
                    \ENDIF
                    \IF{$g$ is a spiking linear layer constructed by IF neurons}
                        \STATE Calculate $\Delta \vv=\mW\vx(t)+\vb$
                        \STATE Release positive and negative spikes with threshold $\vv_{th}$, and update potentials $\vv$ (cf.~Eq.~\ref{eq: SNM})
                        \FOR{$i=1,2,\dots,\Gamma$}
                            \STATE Release spikes with threshold $\rho\vv_{th}$, and update potentials $\vv$
                        \ENDFOR
                    \ENDIF
                \ENDFOR
            \ENDFOR
        \end{algorithmic}
    \end{algorithm}
    
	
	\newpage
	\section{Supplementary for Experiments}
	\subsection{Datasets}
	\label{subsec:datasets}
	\textbf{CIFAR-10}
	is a dataset developed by the Canadian Institute for Advanced Research (CIFAR), widely used as a benchmark for developing and evaluating image classification models due to its manageable size and variety of classes. It consists of 60,000 color images sampled from the TinyImages dataset~\citep{torralba2008tinyimages}, divided into 50,000 training images and 10,000 testing images. There are 10 different classes in CIFAR-10, including common objects such as airplanes, cars, birds, and cats.
	
	\textbf{CIFAR-10.1 \& CIFAR-10.2} 
	are new testing sets for CIFAR-10, each incorporating 2,000 images from the TinyImages dataset. There are small distribution shifts between them and the original dataset, which may be attributed to different generation conditions (such as illumination, angle, etc.) or adversarial attacks. Therefore, they are used to assess the robustness and generalization of models trained on CIFAR-10~\citep{recht2018cifar10.1,lu2020harder}.
	
	In our experiments, since there is no training set for the above two datasets, we use the training samples of CIFAR-10 to determine the threshold potentials for SNN. The high accuracy in the results demonstrates that the converted SNN not only retains the generalization ability of the pretrained model but also has the robustness to this distribution shift.
	
	\textbf{CIFAR-100}
	is also a subset of TinyImages Dataset and serves as a more challenging version of CIFAR-10, consisting of 100 fine-grained classes, categorized into 20 superclasses. Like CIFAR-10, it includes 50,000 training images and 10,000 testing images.
	
	\textbf{ImageNet}
	is one of the largest public image databases, containing about 14 million images labeled into 1,000 categories (the full dataset has over 20,000 categories). Unlike the TinyImages dataset, it consists of high-resolution images from the Internet. ImageNet supports a wide range of tasks such as image classification, object detection, and semantic segmentation in large-scale scenarios. ImageNet-200 is a well-chosen subset of ImageNet containing 200 categories that can help train and evaluate models more efficiently.
	
	\subsection{Additional Results}
	\label{subsec:results_classification}
	We provide additional results on standard classification tasks using fine-tuned ViT-B/32 from CLIP, as well as other models like ResNet-20 and ResNet-34 trained directly on these datasets. The pretrained ResNet-50 does not surpass the performance of direct training, while ViT-B/32 generalizes well.
	
	 To determine the optimal scale $N$ and training method for the Universal Group Operator (UGO), we conducted ablation experiments on the CIFAR-100 dataset with $T=32$. By modifying the UGO parameters, we compared the impact of different settings on overall accuracy. The results in Table~\ref{tab:ablation_ugo} show that as $N$ increases, accuracy grows at a decelerating rate while efficiency continually declines. Considering both factors, we selected a balanced setting that achieves good accuracy without excessive computational cost. The ablation experiments guided our selection of an appropriate UGO scale and training method.
	 
	 To verify the effectiveness of the techniques described in Section~\ref{subsec:implementation}, we conducted several sets of ablation experiments on CIFAR-10 with $T=32$. Table~\ref{tab:ablation_techniques} shows that the SNM structure significantly improves the performance of the converted SNN. The neuron-wise search of the potential threshold and the introduction of the burst spikes mechanism also play an important role in model conversion, especially when combined with SNM. This may be due to the unique dynamic characteristics caused by the estimation-correction mechanism.
	
	\begin{table}[htbp]
    \centering
    \vspace{-0.4cm}
    \caption{Comparison with other backbones and baselines on standard classification of CIFAR-10}
    \label{tab:cifar-10}
    \begin{center}
    \begin{tabular}{llccccc}
         \toprule Model &Method &ANN Acc. & $T=32$  & $T=64$  & $T=128$ & $T=256$ \\ \midrule
         \multirow{6}{*}{ResNet-20} &RMP \citep{han2020rmp} &\multirow{6}{*}{95.68} &38.04 &59.73 &90.10 &90.47 \\
         &TSC \citep{han2020deep} & &57.64 &71.22 &91.30 &92.30 \\
         &Opt. \citep{deng2020optimal} & & 87.30 &92.50 &94.32 &95.28 \\
         &Calib. \citep{li2021free} & &94.77 &95.02 &95.17 &95.44 \\
         &SNM \citep{wang2022signed} & &94.13 &95.43 &95.75 &95.69 \\
         &Burst \citep{li2022efficient} & &94.92 &95.51 &95.40 &95.61 \\
         \midrule
         \multirow{3}{*}{\makecell[l]{ResNet-50 \\(CLIP)}} & Opt. \citep{deng2020optimal} &\multirow{3}{*}{95.71} &71.37 &81.52 &85.43 &88.10 \\
         &Calib. \citep{li2021free} & &87.64 &91.85 &92.79 &94.60\\
         &SNM \citep{wang2022signed} & &90.30 &91.42 &92.44 &94.31 \\
         \midrule
         ViT-B/32 &\textbf{STA (Ours)} &96.16  &\textbf{95.49} &\textbf{95.74} &\textbf{95.68} &\textbf{95.82} \\
         \bottomrule
    \end{tabular}
    \end{center}
    \end{table}
	
	\begin{table}[htbp]
    \centering
    \caption{Comparison with other backbones and baselines on standard classification of ImageNet}
    \label{tab:imagenet}
    \begin{center}
    \begin{tabular}{llccccc}
         \toprule Model &Method &ANN Acc. & $T=32$  & $T=64$  & $T=128$ & $T=256$ \\ \midrule
         \multirow{4}{*}{ResNet-34} &RMP \citep{han2020rmp} &70.64 &- &- &- &55.65 \\
         &Opt. \citep{deng2020optimal} &70.95 &33.01 &59.52 &67.54 &70.06 \\
         &Calib. \citep{li2021free} &75.66 &64.54 &71.12 &73.45 &74.61 \\
         &SNM \citep{wang2022signed} &73.30 &55.28 &62.72 &65.53 &69.31 \\
         \midrule
         ViT-B/32 &\textbf{STA (Ours)} &83.60  &\textbf{78.72} &\textbf{82.33} &\textbf{82.56} &\textbf{82.79} \\
         \bottomrule
    \end{tabular}
    \end{center}
    \end{table}
    
	\begin{table}[htbp]
    \centering
    \caption{Ablations for settings of Universal Group Operators on CIFAR-100 classification, $T=32$.}
    \label{tab:ablation_ugo}
    \begin{center}
    \begin{tabular}{lcccc}
    \toprule $N$ and Penalty & Exp & GELU & Inverse & LayerNorm \\ \midrule
    \multicolumn{5}{c}{$T=32$, $N$ in Table~\ref{tab:train_ugo}, Baseline Accuracy=84.15}
    \\ \midrule
    8 &-10.46 &-5.24 &-6.43 &\textit{+0.00} \\
    8 + Penalty &-12.62 &-4.92 &-9.76 &-6.14\\
    16 & -4.62 &-2.43 &-0.14 &\textbf{+0.02} \\
    16 + Penalty &-4.51 &-1.41 &\textit{+0.00} &-1.79\\
    32 &\textit{+0.00} &-0.40 &+0.08 &-0.24 \\
    32 + Penalty &-0.08 &\textbf{\textit{+0.00}} &\textbf{+0.56} &-1.58\\
    64 &\textbf{+0.22} &-0.03 &+0.52 &-0.05 \\
         \bottomrule
    \end{tabular}
    \end{center}
    \end{table}
    
    \begin{table}[htbp]
    \centering
    \vspace{-0.4cm}
    \caption{Ablations for conversion techniques on CIFAR-10, $T=32$}
    \label{tab:ablation_techniques}
    \begin{center}
    \begin{tabular}{cccc}
    \toprule
    \multicolumn{3}{c}{Technique Settings} & \multirow{2}{*}{SNN Acc.} \\ \cmidrule(r){1-3}
    MMSE            & Burst Spikes          & Use SNM       &                           \\ \midrule
    layer-wise      & $\Gamma=0$            & $\times$      & 11.35                    \\
    layer-wise      & $\Gamma=0$            & $\surd$       & 15.57                    \\
    neuron-wise     & $\Gamma=0$            & $\times$      & 13.48                    \\
    neuron-wise     & $\Gamma=0$            & $\surd$       & 54.32                    \\
    neuron-wise     & $\Gamma=2,\rho=0.5$   & $\times$      & 19.84                    \\
    neuron-wise     & $\Gamma=2,\rho=0.5$   & $\surd$       & 95.26                    \\ \bottomrule
    \end{tabular}
    \end{center}
    \end{table}
    
\end{document}
