System,Author(s),Publication date,Year,Reference,Citations,Peer reviewed?,Link,Parameters,Hardware,Training Compute,Epoch,Epoch (pretrain),Epoch (finetune),uncertain,Pretrain Dataset Size,Inferred_compute (6ND),Finetune Dataset Size,Dataset Size,Dataset(s),Perplexity (WT103),Perplexity (WT2),Perplexity (PTB),Zero-shot?,Include?,Uses Cache,Outlier?,Architecture,Base Model,GitHub,Organizations,Organization Categorization,Comments,Complete row,Tokenizer,Vocabulary
CD-GraB (WT2),"A. Feder Cooper, Wentao Guo, Khiem Pham, Tiancheng Yuan, Charlie F. Ruan, Yucheng Lu, Christopher De Sa",2023/02/02,2023,CD-GraB: Coordinating Distributed Example Orders for Provably Accelerated Training,0,0,https://arxiv.org/pdf/2302.00845.pdf,UNK,,,50,,,0.00,2.08E+06,#VALUE!,,2.08E+06,WikiText-2,,223.44,,0.0,0,0,0,Recurrent,LSTM,,,,,0,GPT2Tokenizer,50257
LSTM-300units,"Martin Sundermeyer, Ralf Schlüter, Hermann Ney",2012/09/01,2012,LSTM Neural Networks for Language Modeling,2503,,http://www.quaero.org/media/files/bibliographie/sundermeyer_lstm_neural_interspeech2012.pdf,1.20E+07,,,UNK,,,0.00,9.29E+05,#VALUE!,,9.29E+05,,,,114.50,0.0,1,0,0,Recurrent,LSTM,,,,,1,Word-level,10000
RNN,"Tomas Mikolov, Geoffrey Zweig",2012/12/01,2012,Context dependent recurrent neural network language model,716,,https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/rnn_ctxt.pdf,6.00E+06,,,UNK,,,0.00,9.29E+05,#VALUE!,,9.29E+05,Penn TreeBank,,,124.70,0.0,1,0,0,Recurrent,RNN,,,,,1,Word-level,10000
Alleviated TOI 10 (WT103),"Noémien Kocher, Christian Scuito, Lorenzo Tarantino, Alexandros Lazaridis, Andreas Fischer, Claudiu Musat",2019/09/18,2019,Alleviating Sequence Information Loss with Data Overlapping and Prime Batch Sizes,0,1,https://arxiv.org/abs/1909.08700,UNK,,,1000,,,1.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,WikiText-103,32.85,,,0.0,1,0,0,Recurrent,LSTM,https://github.com/nkcr/overlap-ml,,,,0,?,?
GPT2-LayerFusion-WS,"James O' Neill, Greg Ver Steeg, Aram Galstyan",2020/07/29,2020,Compressing Deep Neural Networks via Layer Fusion,5,,https://arxiv.org/pdf/2007.14917,,,,,,,0.00,1.03E+08,,2.08E+06,1.05E+08,,,13.71,,0.0,0,0,0,Transformer,GPT,,,,,0,?,?
TransfoRNN(d=1024)(2-layer) (WT2),"Tze Yuang Chong, Xuyang Wang, Lin Yang, Junjie Wang",2021/04/04,2021,TransfoRNN: Capturing the Sequential Information in Self-Attention Representations for Language Modeling,0,0,https://arxiv.org/pdf/2104.01572,9.76E+07,,,,,,0.00,2.08E+06,0.00E+00,,2.08E+06,,,94.80,,0.0,0,0,1,Transformer/RNN,TransfoRNN,,,,,1,Own,33000
RNN+LDA+KN5+cache,"Tomas Mikolov, Geoffrey Zweig",2012/12/01,2012,Context dependent recurrent neural network language model,716,,https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/rnn_ctxt.pdf,9.00E+06,,,UNK,,,0.00,9.29E+05,#VALUE!,,9.29E+05,Penn TreeBank,,,92.00,0.0,1,1,0,Recurrent,RNN,,,,,1,Word-level,10000
RNN+LDA,"Tomas Mikolov, Geoffrey Zweig",2012/12/01,2012,Context dependent recurrent neural network language model,716,,https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/rnn_ctxt.pdf,7.00E+06,227,,UNK,,,0.00,9.29E+05,#VALUE!,,9.29E+05,Penn TreeBank,,,113.70,0.0,1,0,0,Recurrent,RNN,,,,,1,Word-level,10000
RNN (SGD+CLR) (PTB),"Yoshua Bengio, Nicolas Boulanger-Lewandowski, Razvan Pascanu",2012/12/04,2012,Advances in Optimizing Recurrent Networks,665,,https://arxiv.org/abs/1212.0901,2.05E+06,,,UNK,,,0.00,9.29E+05,#VALUE!,,9.29E+05,,,,128.35,0.0,1,0,0,Recurrent,RNN,,,,,1,Word-level,10000
LBL,"Andriy Mnih, Yee Whye Teh",2012/06/27,2012,A Fast and Simple Algorithm for Training Neural Probabilistic Language Models,835,,https://arxiv.org/pdf/1206.6426,2.00E+06,,,45,,,0.00,9.29E+05,5.02E+14,,9.29E+05,,,,159.10,0.0,1,0,0,Probabilistic,NPLM,,,,,1,,10000
Deep RNN,"Stephen Merity, Caiming Xiong, James Bradbury, Richard Socher",2013/12/11,2013,Pointer Sentinel Mixture Models,1558,,https://arxiv.org/abs/1609.07843,6.00E+06,,,64,,,0.00,9.29E+05,2.14E+15,,9.29E+05,Penn TreeBank,,,107.50,0.0,1,0,0,Recurrent,,,,,,1,,10000
DOT(S)-RNN,"Razvan Pascanu, Caglar Gulcehre, Kyunghyun Cho, Yoshua Bengio",2013/12/20,2013,How to Construct Deep Recurrent Neural Networks,1255,,https://arxiv.org/pdf/1312.6026.pdf,6.16E+06,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,107.50,0.0,1,0,0,Recurrent,RNN,,,,,1,Word-level,10000
RNN+weight noise+dynamic eval,Alex Graves,2013/08/04,2013,Generating Sequences With Recurrent Neural Networks,4734,,https://arxiv.org/abs/1308.0850,5.40E+07,,,14,,,0.00,9.29E+05,4.21E+15,,9.29E+05,,,,117.00,0.0,1,0,0,Recurrent,RNN,,,,,1,Word-level,10000
Large regularized LSTM,"Wojciech Zaremba, Ilya Sutskever, Oriol Vinyals",2014/09/08,2014,Recurrent Neural Network Regularization,3224,,https://arxiv.org/abs/1409.2329,6.60E+07,,9.10E+16,55,,,0.00,9.29E+05,2.02E+16,,9.29E+05,Penn TreeBank,,,78.40,0.0,1,0,0,Recurrent,LSTM,https://github.com/wojzaremba/lstm,,,,1,,10000
SCRN(Structurally Constrained Recurrent Network),"Tomas Mikolov, Armand Joulin, Sumit Chopra, Michael Mathieu, Marc'Aurelio Ranzato",2014/12/24,2014,Learning Longer Memory in Recurrent Neural Networks,306,,https://arxiv.org/abs/1412.7753,2.65E+07,,,UNK,,,1.00,9.29E+05,#VALUE!,,9.29E+05,,,,115.00,0.0,1,0,0,Recurrent,RNN,https://github.com/facebookarchive/SCRNNs,,,,1,,10000
SPN-4,"W. Cheng, Stanley Kok, Hoai Vu Pham, Hai Leong Chieu, K. M. A. Chai",2014/01/01,2014,Language modeling with sum-product networks,102,,https://spn.cs.washington.edu/papers/is14.pdf,5.00E+06,,,UNK,,,0.00,1.01E+06,#VALUE!,,1.01E+06,Penn TreeBank,,,100.00,0.0,1,0,0,,,https://github.com/stakok/lmspn,,,,1,Word-level,10000
4-gram + 8 DENN,"Kartik Audhkhasi, Abhinav Sethy, Bhuvana Ramabhadran",2014/12/22,2014,Diverse Embedding Neural Network Language Models,1,0,https://arxiv.org/pdf/1412.7063,1.61E+07,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,109.32,0.0,1,0,0,N-gram,,,,,,1,,10000
Stack RNN,"Armand Joulin, Tomas Mikolov",2015/03/03,2015,Inferring Algorithmic Patterns with Stack-Augmented Recurrent Nets,440,,https://arxiv.org/abs/1503.01007,2.01E+06,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,118.00,0.0,1,0,0,Recurrent,RNN,https://github.com/facebook/Stack-RNN,,,,1,?,?
genCNN + dyn eval,"Mingxuan Wang, Zhengdong Lu, Hang Li, Wenbin Jiang, Qun Liu",2015/03/17,2015,genCNN: A Convolutional Architecture for Word Sequence Prediction,33,,https://aclanthology.org/P15-1151/,8.00E+06,,7.30E+16,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,Penn TreeBank,,,106.30,0.0,1,0,0,,,,,,,1,,10000
LSTM-Char-Large,"Yoon Kim, Yacine Jernite, David Sontag, Alexander M. Rush",2015/08/26,2015,Character-Aware Neural Language Models,2033,,https://arxiv.org/abs/1508.06615,1.90E+07,,,25,,,0.00,9.29E+05,2.65E+15,,9.29E+05,Penn TreeBank,,,78.90,0.0,1,0,0,Recurrent,LSTM,https://github.com/yoonkim/lstm-char-cnn,,,,1,Word-level,10000
Search-Proven Best LSTM,"R. Józefowicz, Wojciech Zaremba, Ilya Sutskever",2015/07/06,2015,An Empirical Exploration of Recurrent Network Architectures,2207,,https://proceedings.mlr.press/v37/jozefowicz15.pdf,2.00E+07,,,30,,,0.00,9.29E+05,3.34E+15,,9.29E+05,,,,79.83,0.0,1,0,0,Recurrent,LSTM,,,,,1,Word-level,10000
"Variational (untied weights, MC) LSTM (Large)","Yarin Gal, Zoubin Ghahramani",2015/12/16,2015,A Theoretically Grounded Application of Dropout in Recurrent Neural Networks,1838,,https://arxiv.org/abs/1512.05287?context=stat,6.60E+07,,,16,,,0.00,8.88E+05,5.62E+15,,8.88E+05,Penn TreeBank,,,73.40,0.0,1,0,0,Recurrent,RNN,https://github.com/yaringal/BayesianRNN,,,,1,Word-level,10000
2nd order FOFE-FNNLM,"Shiliang Zhang, Hui Jiang, Mingbin Xu, Junfeng Hou, Lirong Dai",2015/05/06,2015,A Fixed-Size Encoding Method for Variable-Length Sequences with its Application to Neural Network Language Models,18,,https://arxiv.org/abs/1505.01504,6.00E+06,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,Penn TreeBank,,,108.00,0.0,1,0,0,Feed-forward,FNN-LM,,,,,1,,10000
VD-LSTM+REAL Large,"Hakan Inan, Khashayar Khosravi, Richard Socher",2016/11/04,2016,Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling,397,,https://arxiv.org/abs/1611.01462,5.10E+07,,,75,,,0.00,9.29E+05,2.13E+16,,9.29E+05,Penn TreeBank,,,68.50,0.0,1,0,0,Recurrent,LSTM,,,,,1,,10000
"GCRN-M1, dropout","Youngjoo Seo, Michaël Defferrard, Pierre Vandergheynst, Xavier Bresson",2016/12/22,2016,Structured Sequence Modeling with Graph Convolutional Recurrent Networks,674,,https://arxiv.org/pdf/1612.07659,4.20E+07,,,13,,,0.00,9.29E+05,3.04E+15,,9.29E+05,,,,98.67,0.0,1,0,0,Recurrent,GCRN,,,,,1,,10000
EGRU (PTB),"Anand Subramoney, Khaleelulla Khan Nazeer, Mark Schöne, Christian Mayr, David Kappel",2022/06/13,2022,Efficient recurrent architectures through activity sparsity and sparse back-propagation through time,1,1,https://arxiv.org/pdf/2206.06178v3.pdf,5.50E+07,,,2500,,,0.00,9.29E+05,7.66E+17,,9.29E+05,,,,57.00,0.0,0,0,1,Recurrent,,https://github.com/Efficient-Scalable-Machine-Learning/EvNN,,,,1,,
Variational RHN + WT,"Julian Georg Zilly, Rupesh Kumar Srivastava, Jan Koutník, Jürgen Schmidhuber",2016/07/12,2016,Recurrent Highway Networks,493,,https://arxiv.org/abs/1607.03474,2.30E+07,,,20,,,0.00,9.29E+05,2.56E+15,,9.29E+05,Penn TreeBank,,,65.40,0.0,1,0,0,Recurrent,RHN,https://github.com/julian121266/RecurrentHighwayNetworks,,,,1,Word-level?,10000
VD-RHN,"Julian Georg Zilly, Rupesh Kumar Srivastava, Jan Koutník, Jürgen Schmidhuber",2016/07/12,2016,Recurrent Highway Networks,493,,https://arxiv.org/abs/1607.03474,3.20E+07,,,20,,,0.00,9.29E+05,3.57E+15,,9.29E+05,Penn TreeBank,,,68.50,0.0,1,0,0,Recurrent,RHN,https://github.com/julian121266/RecurrentHighwayNetworks,,,,1,Word-level?,10000
Pointer Sentinel-LSTM (medium),"Stephen Merity, Caiming Xiong, James Bradbury, Richard Socher",2016/09/26,2016,Pointer Sentinel Mixture Models,1558,,https://arxiv.org/abs/1609.07843,2.10E+07,,,64,,,0.00,9.29E+05,7.49E+15,,9.29E+05,Penn TreeBank,,,70.90,0.0,1,0,0,Recurrent,LSTM,,,,,1,,10000
Relational Memory Core,"Adam Santoro, Ryan Faulkner, David Raposo, Jack Rae, Mike Chrzanowski, Theophane Weber, Daan Wierstra, Oriol Vinyals, Razvan Pascanu, Timothy Lillicrap",2018/06/05,2018,Relational recurrent neural networks,235,,https://arxiv.org/abs/1806.01822,UNK,,,,,,0.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,WikiText-103,31.60,,,0.0,1,0,0,Recurrent,LSTM,,Carnegie Mellon University; Google Brain,Industry - Academia Collaboration,,0,,250000
SPALM + RelationLM,"Qi Liu, Dani Yogatama, Phil Blunsom",2022/01/24,2022,Relational Memory-Augmented Language Models,21,,https://arxiv.org/pdf/2201.09680,1.24E+08,,,,,,0.00,1.03E+08,0.00E+00,,1.03E+08,WikiText-103,18.60,,,0.0,0,0,0,Transformer,Transformer-XL,,Tianjin University; Microsoft Research; Beijing Institute of Technology,Industry - Academia Collaboration,,1,,267735
Zoneout + Variational LSTM (WT2),"Stephen Merity, Caiming Xiong, James Bradbury, Richard Socher",2016/09/26,2016,Pointer Sentinel Mixture Models,1558,,https://arxiv.org/abs/1609.07843,2.10E+07,,,64,,,0.00,2.08E+06,1.68E+16,,2.08E+06,WikiText-2,,100.90,,0.0,1,0,0,Recurrent,LSTM,,,,,1,,33278
Pointer Sentinel-LSTM,"Stephen Merity, Caiming Xiong, James Bradbury, Richard Socher",2016/09/26,2016,Pointer Sentinel Mixture Models,1558,,https://arxiv.org/abs/1609.07843,2.00E+07,,,64,,,0.00,2.08E+06,1.60E+16,,2.08E+06,WikiText-2,,80.80,,0.0,1,0,0,Recurrent,LSTM,,,,,1,,33278
Characterizing Verbatim Short-Term Memory in Neural Language Models (108M),"Kristijan Armeni, Christopher Honey, Tal Linzen",2022/10/24,2022,Characterizing Verbatim Short-Term Memory in Neural Language Models,3,1,https://arxiv.org/pdf/2210.13569.pdf,1.08E+08,,,,,,0.00,1.03E+08,0.00E+00,,1.03E+08,,40.30,,,0.0,0,0,0,,,https://github.com/KristijanArmeni/verbatim-memory-in-NLMs,NT Communication Science Laboratories; Tohoku University,Industry - Academia Collaboration,,1,,
Zoneout + Variational LSTM (PTB),"Stephen Merity, Caiming Xiong, James Bradbury, Richard Socher",2016/09/26,2016,Pointer Sentinel Mixture Models,1558,,https://arxiv.org/abs/1609.07843,2.10E+07,,,64,,,0.00,9.29E+05,7.49E+15,,9.29E+05,Penn TreeBank,,,80.60,0.0,1,0,0,Recurrent,LSTM,,,,,1,,10000
Byte-mLSTM+emb+WN+VD,"Ben Krause, Liang Lu, Iain Murray, Steve Renals",2016/09/26,2016,Multiplicative LSTM for sequence modelling,216,,https://arxiv.org/pdf/1609.07959,4.60E+07,,,,,,0.00,2.08E+06,0.00E+00,,2.08E+06,,,88.80,,1.0,1,0,0,Recurrent,LSTM,https://github.com/benkrause/mLSTM,,,,1,Word-level,33000
Neural cache model (size=2000) (300M),"Edouard Grave, Armand Joulin, Nicolas Usunier",2016/12/13,2016,Improving Neural Language Models with a Continuous Cache,302,,https://arxiv.org/abs/1612.04426,3.00E+08,,,UNK,,,0.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,WikiText-103,40.80,,,0.0,1,1,0,Recurrent,LSTM,,,,,1,Word-level,260000
LSTM (WT103),"Edouard Grave, Armand Joulin, Nicolas Usunier",2016/12/13,2016,Improving Neural Language Models with a Continuous Cache,302,,https://arxiv.org/abs/1612.04426,1.10E+07,,,UNK,,,0.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,WikiText-103,48.70,,,0.0,1,0,0,Recurrent,LSTM,,,,,1,Word-level,260000
LSTM (PTB),"Edouard Grave, Armand Joulin, Nicolas Usunier",2016/12/13,2016,Improving Neural Language Models with a Continuous Cache,302,,https://arxiv.org/abs/1612.04426,3.28E+07,,,UNK,,,0.00,9.29E+05,#VALUE!,,9.29E+05,Penn TreeBank,,,82.30,0.0,1,0,0,Recurrent,LSTM,,,,,1,Word-level,10000
Gated HORNN (3rd order),"Rohollah Soltani, Hui Jiang",2016/04/30,2016,Higher Order Recurrent Neural Networks,77,,https://arxiv.org/pdf/1605.00064,8.97E+06,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,100.00,0.0,1,0,0,Recurrent,RNN,,,,,1,,10000
Word-Independent-SRNN+KN5,"Youssef Oualil, Clayton Greenberg, Mittul Singh, Dietrich Klakow",2017/03/23,2017,Sequential Recurrent Neural Networks for Language Modeling,7,,https://arxiv.org/pdf/1703.08068,5.32E+06,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,94.00,0.0,1,0,0,Recurrent,RNN,,,,,1,,10000
AWD-LSTM - 3-layer LSTM (tied) + continuous cache pointer (WT2),"Stephen Merity, Nitish Shirish Keskar, Richard Socher",2017/08/07,2017,Regularizing and Optimizing LSTM Language Models,1176,,https://arxiv.org/abs/1708.02182,3.30E+07,,,750,,,0.00,2.08E+06,3.09E+17,,2.08E+06,WikiText-2,,52.00,,0.0,1,1,0,Recurrent,LSTM,https://github.com/salesforce/awd-lstm-lm,,,,1,,30000
AWD-LSTM - 3-layer LSTM (tied) + continuous cache pointer (PTB),"Stephen Merity, Nitish Shirish Keskar, Richard Socher",2017/08/07,2017,Regularizing and Optimizing LSTM Language Models,1176,,https://arxiv.org/abs/1708.02182,2.40E+07,,,500,,,0.00,9.29E+05,6.69E+16,,9.29E+05,Penn TreeBank,,,52.80,0.0,1,1,0,Recurrent,LSTM,https://github.com/salesforce/awd-lstm-lm,,,,1,,10000
EGRU (WT2),"Anand Subramoney, Khaleelulla Khan Nazeer, Mark Schöne, Christian Mayr, David Kappel",2022/06/13,2022,Efficient recurrent architectures through activity sparsity and sparse back-propagation through time,1,1,https://arxiv.org/pdf/2206.06178v3.pdf,7.40E+07,,,2500,,,0.00,2.08E+06,2.31E+18,,2.08E+06,,,68.90,,0.0,0,0,1,Recurrent,,https://github.com/Efficient-Scalable-Machine-Learning/EvNN,,,,1,,
TRIMELMext (7M),"Zexuan Zhong, Tao Lei, Danqi Chen",2022/05/25,2022,Training Language Models with Memory Augmentation,35,,https://arxiv.org/abs/2205.12674,7.00E+06,,,47.72,,,0.00,1.03E+08,2.06E+17,1.03E+08,2.06E+08,WikiText-103,42.36,,,0.0,0,0,0,Transformer,Transformer-XL,https://github.com/princeton-nlp/TRIME,Tsinghua University; Beijing Academy of Artificial Intelligence; MIT CSAIL; Shanghai Qi Zhi Institute,Academia,,1,,
AFP+FPI (PTB),"Zhengxiong Wang, Anton Ragni",2021/06/04,2021,Approximate Fixed-Points in Recurrent Neural Networks,1,0,https://arxiv.org/pdf/2106.02417,2.04E+06,,,20,,,0.00,9.29E+05,2.27E+14,,9.29E+05,Penn TreeBank,,,129.40,0.0,0,0,0,Recurrent,AFP,,,,,1,,
AWD-LSTM,"Gábor Melis, Chris Dyer, Phil Blunsom",2017/07/18,2017,On the State of the Art of Evaluation in Neural Language Models,555,,https://arxiv.org/abs/1707.05589,3.30E+07,,,,,,0.00,2.08E+06,0.00E+00,,2.08E+06,WikiText-2,,65.80,,0.0,1,0,0,Recurrent,LSTM,,,,,1,GPT-2 tokenizer,52000
LTM,"Anupiya Nugaliyadde, Kok Wai Wong, Ferdous Sohel, Hong Xie",2019/04/18,2019,Language Modeling through Long Term Memory Network,19,,https://arxiv.org/pdf/1904.08936,UNK,,,,,,0.00,9.29E+05,#VALUE!,,9.29E+05,,,,83.00,0.0,1,0,0,Recurrent,LTM,,"KU Leuven, Leuven, Belgium; Apple",Industry - Academia Collaboration,,0,,
D-LSRC(200)+KN5,"Youssef Oualil, Mittul Singh, Clayton Greenberg, Dietrich Klakow",2017/08/22,2017,Long-Short Range Context Neural Networks for Language Modeling,19,,https://arxiv.org/pdf/1708.06555,7.16E+06,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,92.00,0.0,1,0,0,Recurrent,LSRC,,,,,1,Word-level,10000
D-LSRC(100)+KN5,"Youssef Oualil, Mittul Singh, Clayton Greenberg, Dietrich Klakow",2017/08/22,2017,Long-Short Range Context Neural Networks for Language Modeling,19,,https://arxiv.org/pdf/1708.06555,5.97E+06,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,93.00,0.0,1,0,0,Recurrent,LSRC,,,,,1,Word-level,10000
AWD-LSTM+WT+Cache+IOG (PTB),"Sho Takase, Jun Suzuki, Masaaki Nagata",2017/09/26,2017,Input-to-Output Gate to Improve RNN Language Models,7,,https://arxiv.org/pdf/1709.08907,3.00E+07,,,5,,,0.00,9.29E+05,8.36E+14,,9.29E+05,,,,53.00,0.0,1,1,0,Recurrent,LSTM,https://github.com/nttcslab-nlp/iog,,,,1,Word-level,10000
GLM-130B,"Aohan Zeng, Xiao Liu, Zhengxiao Du, Zihan Wang, Hanyu Lai, Ming Ding, Zhuoyi Yang, Yifan Xu, Wendi Zheng, Xiao Xia, Weng Lam Tam, Zixuan Ma, Yufei Xue, Jidong Zhai, Wenguang Chen, Peng Zhang, Yuxiao Dong, Jie Tang",2022/10/05,2022,GLM-130B: An Open Bilingual Pre-trained Model,131,,https://openreview.net/forum?id=-Aw0rrrPUF,1.30E+11,,,1,,,1.00,6.32E+11,4.93E+23,,6.32E+11,,10.76,10.55,18.90,1.0,0,0,0,Transformer,GLM,https://github.com/THUDM/GLM-130B,,,,1,,
AWD-LSTM+WT+Cache+IOG (WT2),"Sho Takase, Jun Suzuki, Masaaki Nagata",2017/09/26,2017,Input-to-Output Gate to Improve RNN Language Models,7,,https://arxiv.org/pdf/1709.08907,5.30E+07,,,5,,,0.00,2.08E+06,3.31E+15,,2.08E+06,,,51.70,,0.0,1,1,0,Recurrent,LSTM,https://github.com/nttcslab-nlp/iog,,,,1,Word-level,33000
4 layer Densely Connected LSTM,"Fréderic Godin, Joni Dambre, Wesley De Neve",2017/07/19,2017,Improving Language Modeling using Densely Connected Recurrent Neural Networks,7,,https://arxiv.org/pdf/1707.06130,1.40E+07,,,100,,,0.00,9.29E+05,7.80E+15,,9.29E+05,,,,76.80,0.0,1,0,0,Recurrent,LSTM,,,,,1,Word-level,10000
Densely Connected LSTM + Var. Dropout,"Fréderic Godin, Joni Dambre, Wesley De Neve",2017/07/19,2017,Improving Language Modeling using Densely Connected Recurrent Neural Networks,7,,https://arxiv.org/pdf/1707.06130,2.30E+07,,,100,,,0.00,9.29E+05,1.28E+16,,9.29E+05,,,,78.30,0.0,1,0,0,Recurrent,,,,,,1,Word-level,10000
GL-LWGC-AWD-MoS-LSTM + dynamic evaluation (PTB),"Ziv Aharoni, Gal Rattner, Haim Permuter",2017/08/29,2017,Gradual Learning of Recurrent Neural Networks,4,1,https://arxiv.org/abs/1708.08863,2.60E+07,,,1000,,,0.00,9.29E+05,1.45E+17,,9.29E+05,Penn TreeBank,,,46.34,0.0,1,0,0,Recurrent,LSTM,,,,,1,Word-level,10000
S + I-Attention (3),"Artyom Gadetsky, Ilya Yakubovskiy, Dmitry Vetrov",2018/06/26,2018,Conditional Generators of Words Definitions,56,,https://arxiv.org/abs/1806.10090,UNK,,,35.00,,,0.00,1.03E+08,#VALUE!,1.08E+06,1.04E+08,Oxford Dictionary,43.54,,,1.0,1,0,0,Recurrent,LSTM,https://github.com/sbos/AdaGram.jl,University of California San Diego,Academia,,0,,
GL-LWGC-AWD-MoS-LSTM + dynamic evaluation (WT2),"Ziv Aharoni, Gal Rattner, Haim Permuter",2017/08/29,2017,Gradual Learning of Recurrent Neural Networks,4,1,https://arxiv.org/abs/1708.08863,3.80E+07,,,1000,,,0.00,2.08E+06,4.74E+17,,2.08E+06,WikiText-2,,40.46,,0.0,1,0,0,Recurrent,LSTM,,,,,1,Word-level,33000
Decay RNN,"Gantavya Bhatt, Hritik Bansal, Rishubh Singh, Sumeet Agarwal",2020/05/17,2020,How much complexity does an RNN architecture need to learn syntax-sensitive dependencies?,7,,https://arxiv.org/abs/2005.08199,1.40E+06,,,,,,0.00,,0.00E+00,,0.00E+00,Linzen et al 2016,76.67,,,1.0,0,0,0,Recurrent,,https://github.com/bhattg/Decay-RNN-ACL-SRW2020,Stanford University; Salesforce Research,Industry - Academia Collaboration,,0,,
Fraternal dropout + AWD-LSTM 3-layer (WT2),"Konrad Zolna, Devansh Arpit, Dendi Suhubdy, Yoshua Bengio",2017/10/31,2017,Fraternal Dropout,55,,https://arxiv.org/abs/1711.00066,3.40E+07,,,520,,,0.00,2.08E+06,2.21E+17,,2.08E+06,WikiText-2,,64.10,,0.0,1,0,0,Recurrent,,,,,,1,Moses,?
Grown to Prune Two-layer stacked LSTM,"Xin Yuan, Pedro Savarese, Michael Maire",2020/07/30,2020,Growing Efficient Deep Networks by Structured Continuous Sparsification,37,,https://arxiv.org/pdf/2007.15353,,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,78.68,0.0,0,0,0,Recurrent,LSTM,,"Tsinghua University; Beijing National Research Center for Information Science and Technology; Beijing University of Posts and Telecommunications; Huawei Noah’s Ark Lab, Hong Kong, China",Industry - Academia Collaboration,,0,,
Engin-Medium(NE),"Zhongping Zhang, Yiwen Gu, Bryan A. Plummer",2021/12/11,2021,Show and Write: Entity-aware Article Generation with Image Information,0,0.5,https://arxiv.org/pdf/2112.05917,3.55E+08,,,3,,,0.00,,0.00E+00,,0.00E+00,,,15.40,,0.0,1,0,0,,,,,,,0,,
Fraternal dropout + AWD-LSTM 3-layer (PTB),"Konrad Zolna, Devansh Arpit, Dendi Suhubdy, Yoshua Bengio",2017/10/31,2017,Fraternal Dropout,55,,https://arxiv.org/abs/1711.00066,2.40E+07,,,520,,,0.00,9.29E+05,6.96E+16,,9.29E+05,Penn TreeBank,,,56.80,0.0,1,0,0,Recurrent,,,,,,1,Word-level,10000
NAS+ESS (23M),"Yinqiao Li, Chi Hu, Yuhao Zhang, Nuo Xu, Yufan Jiang, Tong Xiao, Jingbo Zhu, Tongran Liu, Changliang Li",2020/05/06,2020,Learning Architectures from an Extended Search Space for Language Modeling,12,,https://arxiv.org/pdf/2005.02593,2.30E+07,,,30,,,0.00,1.03E+08,4.26E+17,0.00E+00,1.03E+08,,,,45.60,0.0,0,0,0,Recurrent,RNN,,,Beihang University; Chongqing University; Mila,,1,,
EI-REHN-1200D,"Hyunsin Park, Chang D. Yoo",2017/08/14,2017,Early Improving Recurrent Elastic Highway Network,6,,https://arxiv.org/pdf/1708.04116,1.20E+07,,,100,,,0.00,9.29E+05,6.69E+15,,9.29E+05,,,,66.20,0.0,1,0,0,Recurrent,RHN,,,,,1,Word-level,10000
Char-CNN-BiLSTM,"Chris Larson, Tarek Lahlou, Diana Mingels, Zachary Kulis, Erik Mueller",2019/06/13,2019,Telephonetic: Making Neural Language Models Robust to ASR and Semantic Noise,1,0,https://arxiv.org/pdf/1906.05678,UNK,,,,,,0.00,9.29E+05,#VALUE!,,9.29E+05,,,,37.49,0.0,1,0,0,Recurrent,LSTM,,iCoSys Institute; EPFL; Swisscom; University of Fribourg,Industry - Academia Collaboration,,0,,
EI-REHN-1000D,"Hyunsin Park, Chang D. Yoo",2017/08/14,2017,Early Improving Recurrent Elastic Highway Network,6,,https://arxiv.org/pdf/1708.04116,1.90E+07,,,100,,,0.00,9.29E+05,1.06E+16,,9.29E+05,,,,68.70,0.0,1,0,0,Recurrent,RHN,,,,,1,Word-level,10000
AWD-LSTM + dynamic eval (PTB),"Ben Krause, Emmanuel Kahembwe, Iain Murray, Steve Renals",2017/09/21,2017,Dynamic Evaluation of Neural Sequence Models,130,,https://arxiv.org/abs/1709.07432,2.40E+07,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,Penn TreeBank,,,51.10,0.0,1,0,0,Recurrent,LSTM,https://github.com/benkrause/dynamic-evaluation,,,,1,,10000
LSTM + dynamic eval,"Ben Krause, Emmanuel Kahembwe, Iain Murray, Steve Renals",2017/09/21,2017,Dynamic Evaluation of Neural Sequence Models,130,,https://arxiv.org/abs/1709.07432,5.00E+07,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,Penn TreeBank,,59.80,,0.0,1,0,0,Recurrent,LSTM,https://github.com/benkrause/dynamic-evaluation,,,,1,,33000
AWD-LSTM + dynamic eval (WT2),"Ben Krause, Emmanuel Kahembwe, Iain Murray, Steve Renals",2017/09/21,2017,Dynamic Evaluation of Neural Sequence Models,130,,https://arxiv.org/abs/1709.07432,3.30E+07,,,,,,0.00,2.08E+06,0.00E+00,,2.08E+06,WikiText-2,,44.30,,0.0,1,0,0,Recurrent,LSTM,https://github.com/benkrause/dynamic-evaluation,,,,1,,33000
"AWD-LSTM-MoS + dynamic evaluation (WT2, 2017)","Zhilin Yang, Zihang Dai, Ruslan Salakhutdinov, William W. Cohen",2017/11/10,2017,Breaking the Softmax Bottleneck: A High-Rank RNN Language Model,358,,https://arxiv.org/abs/1711.03953,3.50E+07,,,1000,,,0.00,2.08E+06,4.37E+17,,2.08E+06,WikiText-2,,40.68,,0.0,1,0,0,Recurrent,LSTM,https://github.com/zihangdai/mos,,,,1,Top-words,100000
"AWD-LSTM-MoS + dynamic evaluation (PTB, 2017)","Zhilin Yang, Zihang Dai, Ruslan Salakhutdinov, William W. Cohen",2017/11/10,2017,Breaking the Softmax Bottleneck: A High-Rank RNN Language Model,358,,https://arxiv.org/abs/1711.03953,2.20E+07,,,1000,,,0.00,9.29E+05,1.23E+17,,9.29E+05,Penn TreeBank,,,47.69,0.0,1,0,0,Recurrent,LSTM,https://github.com/zihangdai/mos,,,,1,Word-level,10000
NMM(LSTM+RNN),"Youssef Oualil, Dietrich Klakow",2017/08/23,2017,A Neural Network Approach for Mixing Language Models,10,,https://arxiv.org/pdf/1708.06989,5.18E+06,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,102.00,0.0,1,0,0,Recurrent,RNN+LSTM,,,,,1,,10000
TrellisNet,"Shaojie Bai, J. Zico Kolter, Vladlen Koltun",2018/10/15,2018,Trellis Networks for Sequence Modeling,132,,https://arxiv.org/abs/1810.06682,1.80E+08,,,25,,,0.00,1.03E+08,2.78E+18,0.00E+00,1.03E+08,WikiText-103,29.19,,,0.0,1,0,0,Recurrent/Convolutional,TrellisNet,https://github.com/locuslab/trellisnet,,,,1,Word-level,268000
Local Transformer,"Aurko Roy, Mohammad Saffar, Ashish Vaswani, David Grangier",2020/03/12,2020,Efficient Content-Based Sparse Attention with Routing Transformers,349,,https://arxiv.org/abs/2003.05997,#VALUE!,,,,,,0.00,1.03E+08,#VALUE!,,1.03E+08,WikiText-103,19.80,,,0.0,1,0,0,Transformer,Local transformer,https://github.com/google-research/google-research/tree/master/routing_transformer,,,,0,,
TF-LM-discourse LSTM (PTB),"Lyan Verwimp, Hugo Van hamme, Patrick Wambacq",2018/05/01,2018,TF-LM: TensorFlow-based Language Modeling Toolkit,7,,https://aclanthology.org/L18-1470.pdf,UNK,,,39,,,0.00,9.29E+05,#VALUE!,,9.29E+05,,,,84.10,0.0,1,0,0,Recurrent,LSTM,https://github.com/lverwimp/tf-lm,,,,0,,
(ensemble): AWD-LSTM-DOC (fin) × 5 (WT2),"Sho Takase, Jun Suzuki, Masaaki Nagata",2018/08/30,2018,Direct Output Connection for a High-Rank Language Model,36,,https://arxiv.org/abs/1808.10143,1.85E+08,,,300,,,0.00,2.08E+06,6.93E+17,,2.08E+06,WikiText-2,,53.09,,0.0,0,0,0,Recurrent,LSTM,https://github.com/nttcslab-nlp/doc_lm,,,,1,,
TrellisNet-MoS (1.4x larger),"Shaojie Bai, J. Zico Kolter, Vladlen Koltun",2018/10/15,2018,Trellis Networks for Sequence Modeling,132,,https://arxiv.org/abs/1810.06682,1.80E+08,,,25,,,0.00,1.03E+08,2.78E+18,0.00E+00,1.03E+08,Penn TreeBank,29.19,,54.19,0.0,1,0,0,Recurrent/Convolutional,TrellisNet,https://github.com/locuslab/trellisnet,,,,1,Word-level,268000
RHN(depth=40),"Ron Shoham, Haim Permuter",2018/05/23,2018,Highway State Gating for Recurrent Highway Networks: improving information flow through time,0,1,https://arxiv.org/pdf/1805.09238,UNK,,,300,,,0.00,9.29E+05,#VALUE!,,9.29E+05,,,,63.60,0.0,1,0,0,Recurrent,RHN,,National Tsing Hua University; Google,Industry - Academia Collaboration,,0,,
QRNN,"Stephen Merity, Nitish Shirish Keskar, James Bradbury, Richard Socher",2018/02/01,2018,Scalable Language Modeling: WikiText-103 on a Single GPU in 12 hours,4,0,https://mlsys.org/Conferences/doc/2018/50.pdf,1.35E+08,,3.60E+17,14.00,,,0.00,1.03E+08,1.17E+18,0.00E+00,1.03E+08,WikiText-103,33.00,,,0.0,1,0,0,Recurrent/Convolutional,QRNN,,,,,1,Word-level,267735
Temporal Convolutional Attention-based Network(TCAN) (PTB),"Hongyan Hao, Yan Wang, Yudi Xia, Jian Zhao, Furao Shen",2020/02/28,2020,Temporal Convolutional Attention-based Network For Sequence Modeling,33,,https://arxiv.org/pdf/2002.12530,1.30E+07,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,26.92,0.0,0,0,0,,,https://github.com/haohy/TCAN,University of Washington,Industry,,1,,
2-layer skip-LSTM + dropout tuning (PTB),"Gábor Melis, Charles Blundell, Tomáš Kočiský, Karl Moritz Hermann, Chris Dyer, Phil Blunsom",2018/05/23,2018,Pushing the bounds of dropout,14,,https://arxiv.org/abs/1805.09208,5.40E+06,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,Penn TreeBank,,,55.30,0.0,1,0,0,Recurrent,LSTM,,,,,1,Word-level?,?
GCNN-14,"Yann N. Dauphin, Angela Fan, Michael Auli, David Grangier",2016/12/23,2016,Language Modeling with Gated Convolutional Networks,2176,,https://arxiv.org/abs/1612.08083,UNK,,,,,,0.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,WikiText-103,,,108.70,0.0,1,0,0,Recurrent,GCNN,,,,,0,,
2-layer skip-LSTM + dropout tuning (WT2),"Gábor Melis, Charles Blundell, Tomáš Kočiský, Karl Moritz Hermann, Chris Dyer, Phil Blunsom",2018/05/23,2018,Pushing the bounds of dropout,14,,https://arxiv.org/abs/1805.09208,5.40E+06,,,,,,0.00,2.08E+06,0.00E+00,,2.08E+06,WikiText-2,,63.70,,0.0,1,0,0,Recurrent,LSTM,,,,,1,Word-level?,?
AWD-LSTM-MoS+Noisin+dynamic evaluation,"Adji B. Dieng, Rajesh Ranganath, Jaan Altosaar, David M. Blei",2018/05/03,2018,Noisin: Unbiased Regularization for Recurrent Neural Networks,26,,https://arxiv.org/pdf/1805.01500,2.20E+07,,,400,,,0.00,9.29E+05,4.91E+16,,9.29E+05,,,,47.60,0.0,1,0,0,Recurrent,LSTM,,,,,1,Word-level,10000
LSTM+Noise(Beta),"Adji B. Dieng, Rajesh Ranganath, Jaan Altosaar, David M. Blei",2018/05/03,2018,Noisin: Unbiased Regularization for Recurrent Neural Networks,26,,https://arxiv.org/pdf/1805.01500,5.10E+07,,,200,,,0.00,2.08E+06,1.27E+17,,2.08E+06,,,82.90,,0.0,1,0,0,Recurrent,LSTM,,,,,1,,30000
Dropout-LSTM+Noise(Laplace),"Adji B. Dieng, Rajesh Ranganath, Jaan Altosaar, David M. Blei",2018/05/03,2018,Noisin: Unbiased Regularization for Recurrent Neural Networks,26,,https://arxiv.org/pdf/1805.01500,1.30E+07,,,200,,,0.00,2.08E+06,3.24E+16,,2.08E+06,,,82.10,,0.0,1,0,0,Recurrent,,,,,,1,,30000
Dropout-LSTM+Noise(Bernoulli) (WT2),"Adji B. Dieng, Rajesh Ranganath, Jaan Altosaar, David M. Blei",2018/05/03,2018,Noisin: Unbiased Regularization for Recurrent Neural Networks,26,,https://arxiv.org/pdf/1805.01500,5.10E+07,,,200,,,0.00,2.08E+06,1.27E+17,,2.08E+06,,,76.80,,0.0,1,0,0,Recurrent,,,,,,1,,30000
Dropout-LSTM+Noise(Bernoulli) (PTB),"Adji B. Dieng, Rajesh Ranganath, Jaan Altosaar, David M. Blei",2018/05/03,2018,Noisin: Unbiased Regularization for Recurrent Neural Networks,26,,https://arxiv.org/pdf/1805.01500,5.10E+07,,,200,,,0.00,9.29E+05,5.69E+16,,9.29E+05,,,,66.10,0.0,1,0,0,Recurrent,,,,,,1,Word-level,10000
MGK 8 heads (small),"Tam Nguyen, Tan M. Nguyen, Dung D. Le, Duy Khuong Nguyen, Viet-Anh Tran, Richard G. Baraniuk, Nhat Ho, Stanley J. Osher",2021/10/16,2021,Improving Transformers with Probabilistic Attention Keys,12,,https://arxiv.org/pdf/2110.08678,4.00E+07,,,120,,,0.00,1.03E+08,2.97E+18,0.00E+00,1.03E+08,,33.93,,,0.0,0,0,0,Transformer,Transformer,https://github.com/minhtannguyen/transformer-mgk,DeepMind; University of Oxford,,,1,,
Multi-cell LSTM,"Thomas Cherian, Akshay Badola, Vineet Padmanabhan",2018/11/15,2018,Multi-cell LSTM Based Neural Language Model,6,,https://arxiv.org/pdf/1811.06477,7.20E+06,,,50,,,0.00,9.29E+05,2.01E+15,,9.29E+05,,,,77.12,0.0,1,0,0,Recurrent,LSTM,,,,,1,Word-level,10000
Multipop Adaptive Continuous Stack (WT2),"Dani Yogatama, Yishu Miao, Gabor Melis, Wang Ling, Adhiguna Kuncoro, Chris Dyer, Phil Blunsom",2018/02/15,2018,Memory Architectures in Recurrent Neural Network Language Models,59,,https://openreview.net/forum?id=SkFqf0lAZ,2.60E+07,,,UNK,,,0.00,2.08E+06,#VALUE!,,2.08E+06,,,72.40,,0.0,1,0,0,Recurrent,RNN,,,,,1,Word-level,33000
Multipop Adaptive Continuous Stack (PTB),"Dani Yogatama, Yishu Miao, Gabor Melis, Wang Ling, Adhiguna Kuncoro, Chris Dyer, Phil Blunsom",2018/02/15,2018,Memory Architectures in Recurrent Neural Network Language Models,59,,https://openreview.net/forum?id=SkFqf0lAZ,1.10E+07,,,UNK,,,0.00,9.29E+05,#VALUE!,,9.29E+05,,,,63.50,0.0,1,0,0,Recurrent,RNN,,,,,1,Word-level,10000
MGK 4 heads (medium),"Tam Nguyen, Tan M. Nguyen, Dung D. Le, Duy Khuong Nguyen, Viet-Anh Tran, Richard G. Baraniuk, Nhat Ho, Stanley J. Osher",2021/10/16,2021,Improving Transformers with Probabilistic Attention Keys,12,,https://arxiv.org/pdf/2110.08678,9.00E+07,,,120,,,0.00,1.03E+08,6.67E+18,0.00E+00,1.03E+08,,28.86,,,0.0,0,0,0,Transformer,Transformer,https://github.com/minhtannguyen/transformer-mgk,Salesforce Research,Industry,,1,,
LSTM+NeuralCache,"Lyan Verwimp, Joris Pelemans, Hugo Van hamme, Patrick Wambacq",2018/09/24,2018,Information-Weighted Neural Cache Language Models for ASR,3,1,https://arxiv.org/pdf/1809.08826,2.10E+06,,,39,,,0.00,2.08E+06,1.02E+15,,2.08E+06,,,66.20,,0.0,1,1,0,Recurrent,LSTM,,,,,1,Word-level,33000
Characterizing Verbatim Short-Term Memory in Neural Language Models (117M),"Kristijan Armeni, Christopher Honey, Tal Linzen",2022/10/24,2022,Characterizing Verbatim Short-Term Memory in Neural Language Models,3,1,https://arxiv.org/pdf/2210.13569.pdf,1.17E+08,,,,,,0.00,1.03E+08,0.00E+00,,1.03E+08,,37.50,,,0.0,0,0,0,,,https://github.com/KristijanArmeni/verbatim-memory-in-NLMs,NTT Communication Science Laboratories; Tohoku University,Industry - Academia Collaboration,,1,,
AWD-LSTM-MoS+PDR + dynamic evaluation (PTB),Siddhartha Brahma,2018/08/14,2018,Improved Language Modeling by Decoding the Past,5,,https://arxiv.org/abs/1808.05908,2.20E+07,,,1200,,,0.00,8.88E+05,1.41E+17,,8.88E+05,Penn TreeBank,,,47.30,0.0,1,0,0,Recurrent,LSTM,,,,,1,Word-level,10000
AWD-LSTM-MoS+PDR + dynamic evaluation (WT2),Siddhartha Brahma,2018/08/14,2018,Improved Language Modeling by Decoding the Past,5,,https://arxiv.org/abs/1808.05908,3.50E+07,,,,,,0.00,2.05E+06,0.00E+00,,2.05E+06,WikiText-2,,40.30,,0.0,1,0,0,Recurrent,LSTM,,,,,1,Word-level,33000
"AWD-LSTM-MoS + dynamic evaluation (WT2, 2018)","Chengyue Gong, Di He, Xu Tan, Tao Qin, Liwei Wang, Tie-Yan Liu",2018/09/18,2018,FRAGE: Frequency-Agnostic Word Representation,152,,https://arxiv.org/abs/1809.06858,3.50E+07,,,,,,0.00,2.08E+06,0.00E+00,,2.08E+06,WikiText-2,,39.14,,0.0,1,0,0,Recurrent,LSTM,https://github.com/ChengyueGongR/Frequency-Agnostic,,,,1,,33278
LSTM+GraB,"Yucheng Lu, Wentao Guo, Christopher De Sa",2022/05/22,2022,GraB: Finding Provably Better Data Permutations than Random Reshuffling,7,,https://arxiv.org/pdf/2205.10733,UNK,,,50,,,0.00,2.08E+06,#VALUE!,,2.08E+06,,,199.40,,0.0,0,0,0,Recurrent,LSTM,https://github.com/EugeneLYC/GraB,,,,0,,
"AWD-LSTM-MoS + dynamic evaluation (PTB, 2018)","Chengyue Gong, Di He, Xu Tan, Tao Qin, Liwei Wang, Tie-Yan Liu",2018/09/18,2018,FRAGE: Frequency-Agnostic Word Representation,152,,https://arxiv.org/abs/1809.06858,2.40E+07,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,Penn TreeBank,,,46.54,0.0,1,0,0,Recurrent,LSTM,https://github.com/ChengyueGongR/Frequency-Agnostic,,,,1,,10000
Fine-tuned-AWD-LSTM-DOC(fin),"Vadim Popov, Mikhail Kudinov",2018/11/12,2018,Fine-tuning of Language Models with Discriminator,2,0,https://arxiv.org/pdf/1811.04623,2.30E+07,,,15,,,0.00,9.29E+05,1.92E+15,,9.29E+05,,,,52.12,0.0,1,0,0,Recurrent,LSTM,,,,,1,,100000
LSTM-MemoryAug (WT2),"Ke Li, Daniel Povey, Sanjeev Khudanpur",2020/09/29,2020,Neural Language Modeling With Implicit Cache Pointers,4,1,https://arxiv.org/pdf/2009.13774,2.85E+07,,,,,,1.00,2.08E+06,0.00E+00,,2.08E+06,,,74.30,,0.0,0,0,0,Recurrent,LSTM,,,,,1,,
T2R 75% + Pretrain,"Jungo Kasai, Hao Peng, Yizhe Zhang, Dani Yogatama, Gabriel Ilharco, Nikolaos Pappas, Yi Mao, Weizhu Chen, Noah A. Smith",2021/03/24,2021,Finetuning Pretrained Transformers into RNNs,30,,https://arxiv.org/abs/2103.13076,4.50E+08,,1.93E+19,34.47,,,1.00,1.03E+08,9.59E+18,1.03E+08,2.06E+08,WikiText-103,18.50,,,0.0,0,0,0,Transformer,ELU,https://github.com/jungokasai/T2R/,Facebook AI Research; Google Brain,Industry,,1,,
RHN+HSG(depth=40),"Ron Shoham, Haim Permuter",2018/05/23,2018,Highway State Gating for Recurrent Highway Networks: improving information flow through time,0,1,https://arxiv.org/pdf/1805.09238,UNK,,,300,,,0.00,9.29E+05,#VALUE!,,9.29E+05,,,,61.70,0.0,1,0,0,Recurrent,RHN,,Eindhoven University of Technology; University of Twente,Academia,,0,,
"LSTM (Hebbian, Cache, MbPA)","Jack W Rae, Chris Dyer, Peter Dayan, Timothy P Lillicrap",2018/03/27,2018,Fast Parametric Learning with Activation Memorization,44,,https://arxiv.org/abs/1803.10049,4.52E+07,,2.40E+19,90.00,,,0.00,1.03E+08,2.51E+18,0.00E+00,1.03E+08,WikiText-103,29.20,,,0.0,1,1,0,Recurrent,LSTM,,,,,1,,267735
AWD-LSTM-DOC (fin) (37M),"Sho Takase, Jun Suzuki, Masaaki Nagata",2018/08/30,2018,Direct Output Connection for a High-Rank Language Model,36,,https://arxiv.org/abs/1808.10143,3.70E+07,,,300,,,0.00,2.08E+06,1.39E+17,,2.08E+06,WikiText-2,,58.03,,0.0,1,0,0,Recurrent,LSTM,https://github.com/nttcslab-nlp/doc_lm,,,,1,,33278
AWD-LSTM-DOC (fin) (23M),"Sho Takase, Jun Suzuki, Masaaki Nagata",2018/08/30,2018,Direct Output Connection for a High-Rank Language Model,36,,https://arxiv.org/abs/1808.10143,2.30E+07,,,300,,,0.00,2.08E+06,8.61E+16,,2.08E+06,WikiText-2,,,52.38,0.0,1,0,0,Recurrent,LSTM,https://github.com/nttcslab-nlp/doc_lm,,,,1,,10000
RNNLM + Dynamic KL Regularization,"Thanapon Noraset, David Demeter, Doug Downey",2018/01/01,2018,Controlling Global Statistics in Recurrent Neural Network Text Generation,6,,https://ojs.aaai.org/index.php/AAAI/article/view/11993,2.77E+07,,,20,,,0.00,9.29E+05,3.09E+15,,9.29E+05,Penn TreeBank,,,77.80,0.0,1,0,0,Recurrent,RNN,,"Northeastern University, Shenyang, China; NiuTrans Research, Shenyang, China; Chinese Academy of Sciences; Kingsoft AI Lab, Beijing, China",Industry - Academia Collaboration,,1,?,?
RNNLM + Dynamic KL Regularization (WT2),"Thanapon Noraset, David Demeter, Doug Downey",2018/01/01,2018,Controlling Global Statistics in Recurrent Neural Network Text Generation,6,,https://ojs.aaai.org/index.php/AAAI/article/view/11993,8.76E+07,,,20,,,0.00,2.08E+06,2.19E+16,,2.08E+06,,,86.80,,0.0,1,0,0,Recurrent,RNN,,,,,1,?,?
aLSTM(depth-2)+RecurrentPolicy (PTB),"Sebastian Flennerhag, Hujun Yin, John Keane, Mark Elliot",2018/05/22,2018,Breaking the Activation Function Bottleneck through Adaptive Parameterization,12,,https://arxiv.org/pdf/1805.08574,2.40E+07,,,180,,,0.00,9.29E+05,2.41E+16,,9.29E+05,,,,55.30,0.0,1,0,0,Recurrent,LSTM,https://github.com/flennerhag/alstm,,,,1,Word-level,10000
aLSTM(depth-2)+RecurrentPolicy (WT2),"Sebastian Flennerhag, Hujun Yin, John Keane, Mark Elliot",2018/05/22,2018,Breaking the Activation Function Bottleneck through Adaptive Parameterization,12,,https://arxiv.org/pdf/1805.08574,3.20E+07,,,190,,,0.00,2.08E+06,7.59E+16,,2.08E+06,,,64.50,,0.0,1,0,0,Recurrent,LSTM,https://github.com/flennerhag/alstm,,,,1,Word-level,267735
Alleviated TOI 10 (PTB),"Noémien Kocher, Christian Scuito, Lorenzo Tarantino, Alexandros Lazaridis, Andreas Fischer, Claudiu Musat",2019/09/18,2019,Alleviating Sequence Information Loss with Data Overlapping and Prime Batch Sizes,0,1,https://arxiv.org/abs/1909.08700,UNK,,,1000,,,1.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,Penn TreeBank,,,56.46,0.0,1,0,0,Recurrent,LSTM,https://github.com/nkcr/overlap-ml,,,,0,,
LSTM (2018),"Shaojie Bai, J. Zico Kolter, Vladlen Koltun",2018/03/04,2018,An Empirical Evaluation of Generic Convolutional and Recurrent Networks for Sequence Modeling,4024,,https://arxiv.org/abs/1803.01271,1.30E+07,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,Penn TreeBank,,,78.93,0.0,1,0,0,Recurrent,LSTM,http://github.com/locuslab/TCN,,,,1,Word-level,10000
4 layer QRNN (h=2500),"Stephen Merity, Nitish Shirish Keskar, Richard Socher",2018/03/22,2018,An Analysis of Neural Language Modeling at Multiple Scales,183,,https://arxiv.org/abs/1803.08240,2.60E+07,,2.40E+17,14.00,,,0.00,1.03E+08,2.25E+17,0.00E+00,1.03E+08,WikiText-103,33.00,,,0.0,1,0,0,Recurrent/Convolutional,QRNN,https://github.com/salesforce/awd-lstm-lm,,,,1,Word-level,260000
Transformer (Adaptive Input Embeddings),"Alexei Baevski, Michael Auli",2018/09/28,2018,Adaptive Input Representations for Neural Language Modeling,337,,https://arxiv.org/abs/1809.10853,2.47E+08,,7.30E+18,180,,,0.00,1.03E+08,2.75E+19,0.00E+00,1.03E+08,WikiText-103,18.70,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/facebookresearch/fairseq,,,,1,Word-level,267735
Transformer-XL + SIS,"Sagar Verma, Jean-Christophe Pesquet",2021/05/03,2021,Sparsifying Networks via Subdifferential Inclusion,9,,https://web.archive.org/web/20220122141508/http://proceedings.mlr.press/v139/verma21b/verma21b.pdf,2.46E+08,1.03E+08,1.04E+19,UNK,,,0.00,1.03E+08,#VALUE!,,1.03E+08,,21.10,,,0.0,0,0,0,Transformer,Transformer-XL,https://sagarverma.github.io/compression,University of Washington; Facebook AI Research; Allen Institute for AI,Industry - Academia Collaboration,,1,,
Transformer-XL Large,"Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov",2019/01/09,2019,Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context,3155,,https://arxiv.org/abs/1901.02860,2.57E+08,,1.09E+19,UNK,,,0.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,WikiText-103,18.30,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/kimiyoung/,,,,1,Word-level,260000
GLM-10B,"Zhengxiao Du, Yujie Qian, Xiao Liu, Ming Ding, Jiezhong Qiu, Zhilin Yang, Jie Tang",2021/03/18,2021,GLM: General Language Model Pretraining with Autoregressive Blank Infilling,131,,https://arxiv.org/abs/2103.10360,1.00E+10,,,1,,,0.00,6.32E+11,3.79E+22,,6.32E+11,,,12.00,22.52,1.0,0,0,0,Transformer,GLM,https://github.com/THUDM/GLM,,,,1,,
Transformer-XL-ptb,"Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov",2019/01/09,2019,Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context,3155,,https://arxiv.org/abs/1901.02860,2.57E+08,,1.09E+19,UNK,,,0.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,Penn TreeBank,,,54.52,1.0,1,0,0,Transformer,Transformer-XL,https://github.com/kimiyoung/,,,,1,Word-level,10000
R-Transformer,"Zhiwei Wang, Yao Ma, Zitao Liu, Jiliang Tang",2019/07/12,2019,R-Transformer: Recurrent Neural Network Enhanced Transformer,93,,https://arxiv.org/abs/1907.05572,1.58E+07,,,100,,,0.00,8.88E+05,8.40E+15,,8.88E+05,Penn TreeBank,,,84.38,0.0,1,0,0,Transformer,R-Transformer,https://github.com/DSE-MSU/R-transformer,,,,1,Word-level,10000
RSM,"David Rawlinson, Abdelrahman Ahmed, Gideon Kowadlo",2019/05/28,2019,Learning distant cause and effect using only local and immediate credit assignment,3,1,https://arxiv.org/pdf/1905.11589,,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,166.00,0.0,0,0,0,,,https://github.com/Cerenaut/rsm,,,,0,,
DOC + Finetune∗ + Partial Shuffle (WT2),Ofir Press,2019/03/11,2019,Partially Shuffling the Training Data to Improve Language Models,4,0,https://arxiv.org/abs/1903.04167,6.73E+07,,,,,,0.00,2.08E+06,0.00E+00,,2.08E+06,WikiText-2,,57.85,,0.0,1,0,0,Recurrent,LSTM,https://github.com/ofirpress/PartialShuffle,,,,1,Word-level,33278
DOC + Finetune∗ + Partial Shuffle (PTB),Ofir Press,2019/03/11,2019,Partially Shuffling the Training Data to Improve Language Models,4,0,https://arxiv.org/abs/1903.04167,3.70E+07,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,Penn TreeBank,,,52.00,0.0,1,0,0,Recurrent,LSTM,https://github.com/ofirpress/PartialShuffle,"Johns Hopkins University, New York University",Academia,,1,Word-level,10000
L_UL-seq,"Sean Welleck, Ilia Kulikov, Stephen Roller, Emily Dinan, Kyunghyun Cho, Jason Weston",2019/08/12,2019,Neural Text Generation with Unlikelihood Training,365,,https://arxiv.org/abs/1908.04319,2.47E+08,,,,,,0.00,1.03E+08,0.00E+00,0.00E+00,1.03E+08,WikiText-103,25.42,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/facebookresearch/unlikelihood_training,,,,1,Word-level,260000
"Mogrifier (d2, MoS2, MC) + dynamic eval","Gábor Melis, Tomáš Kočiský, Phil Blunsom",2019/09/04,2019,Mogrifier LSTM,109,,https://arxiv.org/abs/1909.01792,3.50E+07,,,,,,0.00,2.08E+06,0.00E+00,,2.08E+06,WikiText-2,,38.60,,0.0,1,0,0,Recurrent,LSTM,https://github.com/deepmind/lamb,,,,1,Word-level,33000
"Mogrifier (d2, MC) + dynamic eval","Gábor Melis, Tomáš Kočiský, Phil Blunsom",2019/09/04,2019,Mogrifier LSTM,109,,https://arxiv.org/abs/1909.01792,2.40E+07,,,,,,0.00,9.23E+05,0.00E+00,,9.23E+05,Penn TreeBank,,,44.80,0.0,1,0,0,Recurrent,LSTM,https://github.com/deepmind/lamb,,,,1,Word-level,10000
GRU + p-tHSM (pretrain via Brown) (PTB),"Nan Jiang, Wenge Rong, Min Gao, Yikang Shen, Zhang Xiong",2017/08/19,2017,Exploration of Tree-based Hierarchical Softmax for Recurrent Language Models,5,,https://www.researchgate.net/profile/Yikang-Shen-2/publication/318830618_Exploration_of_Tree-based_Hierarchical_Softmax_for_Recurrent_Language_Models/links/5b2c050aa6fdcc8506bc6f4a/Exploration-of-Tree-based-Hierarchical-Softmax-for-Recurrent-Language-Models.pdf,5.20E+06,,,UNK,,,0.00,9.29E+05,#VALUE!,,9.29E+05,6.97E+05,,,128.78,0.0,0,0,0,Recurrent,GRU,,,,,1,,
Megatron-LM (2.5B),"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, Bryan Catanzaro",2019/09/17,2019,Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism,905,,https://arxiv.org/abs/1909.08053,2.50E+09,,4.00E+21,4.40,,,0.00,4.64E+10,3.06E+21,,4.64E+10,WikiText-103; CC-Stories; RealNews; OpenWebtext,12.76,,,1.0,,0,0,Transformer,GPT,https://github.com/NVIDIA/Megatron-LM,,,,1,Own,51200
Characterizing Verbatim Short-Term Memory in Neural Language Models (182M),"Kristijan Armeni, Christopher Honey, Tal Linzen",2022/10/24,2022,Characterizing Verbatim Short-Term Memory in Neural Language Models,3,1,https://arxiv.org/pdf/2210.13569.pdf,1.82E+08,,,,,,0.00,1.03E+08,0.00E+00,,1.03E+08,,41.90,,,0.0,0,0,0,,,https://github.com/KristijanArmeni/verbatim-memory-in-NLMs,NTT Communication Science Laboratories; Tohoku University,Industry - Academia Collaboration,,1,,
Megatron-LM (8.3B),"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, Bryan Catanzaro",2019/09/17,2019,Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism,905,,https://arxiv.org/abs/1909.08053,8.30E+09,,9.10E+21,4.40,,,0.00,4.64E+10,1.02E+22,,4.64E+10,WikiText-103; CC-Stories; RealNews; OpenWebtext,10.81,,,1.0,1,0,0,Transformer,GPT,https://github.com/NVIDIA/Megatron-LM,,,,1,Own,51200
Megatron-LM (355M),"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, Bryan Catanzaro",2019/09/17,2019,Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism,905,,https://arxiv.org/abs/1909.08053,3.55E+08,,4.35E+20,4.40,,,0.00,4.64E+10,4.35E+20,,4.64E+10,WikiText-103; CC-Stories; RealNews; OpenWebtext,19.31,,,1.0,1,0,0,Transformer,GPT,https://github.com/NVIDIA/Megatron-LM,,,,1,Own,51200
MMLSTM,"Kai Shuang, Rui Li, Mengyu Gu, Jonathan Loo, Sen Su",2019/12/05,2019,Major–Minor Long Short-Term Memory for Word-Level Language Model,14,,http://repository.uwl.ac.uk/id/eprint/6490/1/Loo_etal_IEEE_TNNLS_2019_Major-minor_long_short-term_memory_for_word-level_language_model.pdf,7.50E+07,,,50,,,0.00,1.03E+08,2.32E+18,0.00E+00,1.03E+08,WikiText-103,44.69,,,0.0,1,0,1,Recurrent,LSTM,,,,,1,Word-level,268000
LSTM+Adam+Lookahead,"Michael R. Zhang, James Lucas, Geoffrey Hinton, Jimmy Ba",2019/07/19,2019,"Lookahead Optimizer: k steps forward, 1 step back",612,,https://arxiv.org/pdf/1907.08610,7.19E+06,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,57.72,0.0,1,0,0,Recurrent,LSTM,https://github.com/michaelrzhang/lookahead,,,,1,?,?
bRSM + cache,"Jeremy Gordon, David Rawlinson, Subutai Ahmad",2019/12/02,2019,Long Distance Relationships without Time Travel: Boosting the Performance of a Sparse Predictive Autoencoder in Sequence Modeling,4,0,https://arxiv.org/abs/1912.01116,2.55E+06,,,15,,,0.00,9.29E+05,2.13E+14,,9.29E+05,,,,103.50,0.0,1,1,0,Recurrent,bRSM,,,,,1,,10000
GPT-2 (1542M),"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, Ilya Sutskever",2019/02/14,2019,Language Models are Unsupervised Multitask Learners,6654,,https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf,1.50E+09,,1.50E+21,20,,,0.00,4.00E+09,7.20E+20,,4.00E+09,WebText,17.48,18.34,35.76,1.0,1,0,0,Transformer,GPT,https://github.com/openai/gpt-2,,,,1,Own,50257
Trust-Regularization+Finetune+Dynamic-Eval,"Yangyang Shi, Mei-Yuh Hwang, Xin Lei, Haoyu Sheng",2019/04/08,2019,Knowledge Distillation For Recurrent Neural Network Language Modeling With Trust Regularization,24,,https://arxiv.org/pdf/1904.04163,7.00E+06,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,47.60,0.0,0,0,0,Recurrent,RNN,,,,,1,,
GPT-2 (762M),"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, Ilya Sutskever",2019/02/14,2019,Language Models are Unsupervised Multitask Learners,6654,,https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf,7.62E+08,,7.62E+20,100,,,0.00,4.00E+09,1.83E+21,,4.00E+09,WebText,22.05,,,1.0,1,0,0,Transformer,GPT,https://github.com/openai/gpt-2,,,,1,Own,50257
GPT-2 (345M),"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, Ilya Sutskever",2019/02/14,2019,Language Models are Unsupervised Multitask Learners,6654,,https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf,3.45E+08,,3.45E+20,100,,,0.00,4.00E+09,8.28E+20,,4.00E+09,WebText,26.37,,,1.0,1,0,0,Transformer,GPT,https://github.com/openai/gpt-2,,,,1,Own,50257
GPT-2 (117M),"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, Ilya Sutskever",2019/02/14,2019,Language Models are Unsupervised Multitask Learners,6654,,https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf,1.17E+08,,1.17E+20,100,,,0.00,4.00E+09,2.81E+20,,4.00E+09,WebText,37.50,,,1.0,1,0,1,Transformer,GPT,https://github.com/openai/gpt-2,,,,1,Own,50257
Sandwich Transformer,"Ofir Press, Noah A. Smith, Omer Levy",2019/11/10,2019,Improving Transformer Models by Reordering their Sublayers,57,,https://arxiv.org/abs/1911.03864,2.09E+08,,,180,,,0.00,7.00E+08,1.58E+20,0.00E+00,7.00E+08,Toronto Books Corpus,17.84,,,1.0,1,0,0,Transformer,Transformer-XL,https://github.com/ofirpress/sandwich_transformer,,,,1,BERT,29000
LSTM(large)+Sememe+cell,"Yujia Qin, Fanchao Qi, Sicong Ouyang, Zhiyuan Liu, Cheng Yang, Yasheng Wang, Qun Liu, Maosong Sun",2019/10/20,2019,Improving Sequence Modeling Ability of Recurrent Neural Networks via Sememes,19,,https://arxiv.org/pdf/1910.08910,4.80E+07,,,40,,,0.00,2.08E+06,2.40E+16,,2.08E+06,,,85.76,,0.0,1,0,1,Recurrent,LSTM,https://github.com/thunlp/SememeRNN,,,,1,Word-level,33378
Progressive LRD,"Habib Hajimolahoseini, Walid Ahmed, Mehdi Rezagholizadeh, Vahid Partovinia, Yang Liu",2022/10/12,2022,Strategies for Applying Low Rank Decomposition to Transformer-Based Models,0,1,https://web.archive.org/web/20221130215920/https://neurips2022-enlsp.github.io/papers/paper_33.pdf,3.10E+07,1.03E+08,6.20E+19,UNK,,,0.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,,22.00,,,0.0,0,0,0,Transformer,GPT,,OpenAI,Industry,,1,,
TRIMELMlong (150M),"Zexuan Zhong, Tao Lei, Danqi Chen",2022/05/25,2022,Training Language Models with Memory Augmentation,35,,https://arxiv.org/abs/2205.12674,1.50E+08,,,139.81,,,0.00,1.03E+08,1.30E+19,1.03E+08,2.06E+08,WikiText-103,22.66,,,0.0,0,0,0,Transformer,Transformer-XL,https://github.com/princeton-nlp/TRIME,Microsoft,Industry,,1,,
LSTM(medium)+Sememe+cell,"Yujia Qin, Fanchao Qi, Sicong Ouyang, Zhiyuan Liu, Cheng Yang, Yasheng Wang, Qun Liu, Maosong Sun",2019/10/20,2019,Improving Sequence Modeling Ability of Recurrent Neural Networks via Sememes,19,,https://arxiv.org/pdf/1910.08910,1.00E+07,,,40,,,0.00,2.08E+06,5.00E+15,,2.08E+06,,,89.16,,0.0,1,0,0,Recurrent,LSTM,https://github.com/thunlp/SememeRNN,,,,1,Word-level,33378
DARTS (second order),"Hanxiao Liu, Karen Simonyan, Yiming Yang",2018/06/24,2018,DARTS: Differentiable Architecture Search,3990,,https://arxiv.org/abs/1806.09055,2.30E+07,,1.10E+16,300,,,0.00,9.29E+05,3.85E+16,,9.29E+05,Penn TreeBank,,,55.70,0.0,0,0,0,NAS,DARTS,https://github.com/quark0/darts,Seoul National University; Hanyang University,Academia,,1,,
SPALM + kNN,"Dani Yogatama, Cyprien de Masson d’Autume, Lingpeng Kong",2021/04/26,2021,Adaptive Semiparametric Language Models,70,,https://web.archive.org/web/20230210050534/https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00371/100688/Adaptive-Semiparametric-Language-Models,UNK,1.10E+08,,UNK,,,0.00,1.10E+08,#VALUE!,,1.10E+08,,17.66,,,0.0,0,0,0,Transformer,Transformer,,University of Waterloo; Microsoft Research,Industry - Academia Collaboration,,0,,
AWD-LSTM + Phrase Induction + finetuning,"Hongyin Luo, Lan Jiang, Yonatan Belinkov, James Glass",2019/06/04,2019,"Improving Neural Language Models by Segmenting, Attending, and Predicting the Future",12,,https://arxiv.org/abs/1906.01702,2.40E+07,,,,,,0.00,,0.00E+00,9.29E+05,9.29E+05,,,,55.70,0.0,1,0,0,Recurrent,LSTM,https://github.com/luohongyin/PILM,,,,1,Word-level,10000
RGC+ASQ (WT2),"Jiarui Fang, Haohuan Fu, Guangwen Yang, Cho-Jui Hsieh",2018/08/13,2018,RedSync : Reducing Synchronization Traffic for Distributed Deep Learning,28,,https://arxiv.org/pdf/1808.04357,2.09E+08,,,,,,0.00,2.08E+06,0.00E+00,,2.08E+06,,,87.84,,0.0,0,0,0,,,,Naver AI Lab; Yonsei University,Industry,,1,,
Transformer-XL Large + Phrase Induction,"Hongyin Luo, Lan Jiang, Yonatan Belinkov, James Glass",2019/06/04,2019,"Improving Neural Language Models by Segmenting, Attending, and Predicting the Future",12,,https://arxiv.org/abs/1906.01702,2.57E+08,,,1,,,0.00,1.03E+08,1.59E+17,1.03E+08,2.06E+08,WikiText-103,17.40,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/luohongyin/PILM,,,,1,Word-level,268000
AdvSoft + 4 layer QRNN + dynamic evaluation,"Dilin Wang, Chengyue Gong, Qiang Liu",2019/06/10,2019,Improving Neural Language Modeling via Adversarial Training,95,,https://arxiv.org/abs/1906.03805,2.60E+07,,3.60E+17,,,,0.00,1.03E+08,0.00E+00,,1.03E+08,WikiText-103,28.00,,,0.0,1,0,0,Recurrent/Convolutional,QRNN,https://github.com/ChengyueGongR/advsoft,,,,1,Word-level,268000
AWD-LSTM + MoS + Partial Shuffled,"Dilin Wang, Chengyue Gong, Qiang Liu",2019/06/10,2019,Improving Neural Language Modeling via Adversarial Training,95,,https://arxiv.org/abs/1906.03805,3.50E+07,,,750,,,0.00,2.08E+06,3.28E+17,,2.08E+06,WikiText-2,,38.07,,0.0,1,0,0,Recurrent/Convolutional,QRNN,https://github.com/ChengyueGongR/advsoft,,,,1,Word-level,33000
RGC+ASQ (PTB),"Jiarui Fang, Haohuan Fu, Guangwen Yang, Cho-Jui Hsieh",2018/08/13,2018,RedSync : Reducing Synchronization Traffic for Distributed Deep Learning,28,,https://arxiv.org/pdf/1808.04357,6.90E+07,,,40,,,0.00,9.29E+05,1.54E+16,,9.29E+05,,,,74.69,0.0,0,0,0,,,,Naver AI Lab; Yonsei University,Industry,,1,,
4 layer QRNN + dynamic evaluation,"Dilin Wang, Chengyue Gong, Qiang Liu",2019/06/10,2019,Improving Neural Language Modeling via Adversarial Training,95,,https://arxiv.org/abs/1906.03805,2.60E+07,,3.60E+17,,,,0.00,1.03E+08,0.00E+00,,1.03E+08,WikiText-103,31.60,,,0.0,1,0,0,Recurrent/Convolutional,QRNN,https://github.com/ChengyueGongR/advsoft,,,,1,Word-level,268000
Adversarial + AWD-LSTM-MoS + partial shuffled,"Dilin Wang, Chengyue Gong, Qiang Liu",2019/06/10,2019,Improving Neural Language Modeling via Adversarial Training,95,,https://arxiv.org/abs/1906.03805,2.20E+07,,,450,,,0.00,9.23E+05,5.48E+16,,9.23E+05,Penn TreeBank,,,46.01,0.0,1,0,0,Recurrent/Convolutional,QRNN,https://github.com/ChengyueGongR/advsoft,,,,1,?,?
Transformer-XL+AdamGapAware(GA),"Saar Barkai, Ido Hakimi, Assaf Schuster",2019/09/24,2019,Gap Aware Mitigation of Gradient Staleness,12,,https://arxiv.org/pdf/1909.10802,2.57E+08,,,,,,0.00,1.03E+08,0.00E+00,1.03E+08,2.06E+08,,26.48,,,0.0,1,0,0,Transformer,Transformer-XL,,,,,1,,?
TCN (13M),"Shaojie Bai, J. Zico Kolter, Vladlen Koltun",2018/02/15,2018,Convolutional Sequence Modeling Revisited,64,,https://openreview.net/forum?id=rk8wKk-R-,1.30E+07,,,UNK,,,0.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,WikiText-103,,,90.17,0.0,0,0,1,Convolutional,TCN,,,,,1,,
Transformer-XL + RMS dynamic eval,"Ben Krause, Emmanuel Kahembwe, Iain Murray, Steve Renals",2019/04/17,2019,Dynamic Evaluation of Transformer Language Models,40,,https://arxiv.org/abs/1904.08378,2.57E+08,,,,,,0.00,1.03E+08,0.00E+00,1.03E+08,2.06E+08,WikiText-103,16.40,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/benkrause/dynamiceval-transformer,,,,1,,268000
Transformer-XL DeFINE (107M),"Sachin Mehta, Rik Koncel-Kedziorski, Mohammad Rastegari, Hannaneh Hajishirzi",2019/11/27,2019,DeFINE: DEep Factorized INput Token Embeddings for Neural Sequence Modeling,21,,https://arxiv.org/abs/1911.12385,1.07E+08,,5.20E+18,20,,,0.00,1.03E+08,1.33E+18,,1.03E+08,WikiText-103; Penn TreeBank,25.72,,,0.0,1,0,0,Recurrent,LSTM,,,,,1,,260000
Adaptive LSTM + DeFINE,"Sachin Mehta, Rik Koncel-Kedziorski, Mohammad Rastegari, Hannaneh Hajishirzi",2019/11/27,2019,DeFINE: DEep Factorized INput Token Embeddings for Neural Sequence Modeling,21,,https://arxiv.org/abs/1911.12385,4.87E+07,,6.20E+18,20,,,0.00,1.03E+08,6.02E+17,,1.03E+08,WikiText-103; Penn TreeBank,35.94,,,0.0,1,0,0,Recurrent,LSTM,,,,,1,,260000
Transformer-XL DeFINE (141M),"Sachin Mehta, Rik Koncel-Kedziorski, Mohammad Rastegari, Hannaneh Hajishirzi",2019/11/27,2019,DeFINE: DEep Factorized INput Token Embeddings for Neural Sequence Modeling,21,,https://arxiv.org/abs/1911.12385,1.41E+08,,6.20E+18,20,,,0.00,1.03E+08,1.75E+18,,1.03E+08,WikiText-103; Penn TreeBank,24.17,,,0.0,1,0,0,Recurrent,LSTM,,,,,1,,260000
AWD-LSTM + DeFINE,"Sachin Mehta, Rik Koncel-Kedziorski, Mohammad Rastegari, Hannaneh Hajishirzi",2019/11/27,2019,DeFINE: DEep Factorized INput Token Embeddings for Neural Sequence Modeling,21,,https://arxiv.org/abs/1911.12385,2.00E+07,,,20,,,0.00,9.29E+05,2.23E+15,,9.29E+05,WikiText-103; Penn TreeBank,,,54.20,0.0,1,0,0,Recurrent,LSTM,,,,,1,,10000
AWD-LSTM-DRILL + dynamic evaluation† (PTB),"Nikolaos Pappas, James Henderson",2019/05/14,2019,Deep Residual Output Layers for Neural Language Generation,7,,https://arxiv.org/abs/1905.05513,2.40E+07,,,1000,,,0.00,9.29E+05,1.34E+17,,9.29E+05,Penn TreeBank,,,49.40,0.0,1,0,0,Recurrent,LSTM,https://github.com/idiap/drill,,,,1,?,10000
AWD-LSTM-DRILL + dynamic evaluation† (WT2),"Nikolaos Pappas, James Henderson",2019/05/14,2019,Deep Residual Output Layers for Neural Language Generation,7,,https://arxiv.org/abs/1905.05513,3.40E+07,,,1000,,,0.00,2.08E+06,4.24E+17,,2.08E+06,WikiText-2,,42.00,,0.0,1,0,0,Recurrent,LSTM,https://github.com/idiap/drill,,,,1,?,33278
dense-IndRNN+dynamic eval,"Shuai Li, Wanqing Li, Chris Cook, Yanbo Gao",2019/10/11,2019,Deep Independently Recurrent Neural Network (IndRNN),45,,https://arxiv.org/abs/1910.06251,4.41E+07,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,Penn TreeBank,,,49.95,0.0,1,0,0,Recurrent,,,,,,1,Word-level,10000
DEQ-TrellisNet,"Shaojie Bai, J. Zico Kolter, Vladlen Koltun",2019/09/03,2019,Deep Equilibrium Models,496,,https://arxiv.org/abs/1909.01377,1.10E+08,,,12,,,0.00,1.03E+08,8.16E+17,0.00E+00,1.03E+08,Penn TreeBank,,,57.10,0.0,1,0,0,Transformer,DEQ,https://github.com/locuslab/deq,,,,1,?,10000
"DEQ-Transformer (Medium, Adaptive Embedding)","Shaojie Bai, J. Zico Kolter, Vladlen Koltun",2019/09/03,2019,Deep Equilibrium Models,496,,https://arxiv.org/abs/1909.01377,1.10E+08,,,12,,,0.00,1.03E+08,8.16E+17,0.00E+00,1.03E+08,WikiText-103,23.20,,,0.0,1,0,0,Transformer,DEQ,https://github.com/locuslab/deq,,,,1,?,260000
RNN Baseline,"Sho Takase, Jun Suzuki, Masaaki Nagata",2019/07/14,2019,Character n-Gram Embeddings to Improve RNN Language Models,26,,https://ojs.aaai.org/index.php/AAAI/article/view/4437,1.53E+08,,,UNK,,,0.00,1.03E+08,#VALUE!,,1.03E+08,,32.19,,,0.0,1,0,0,Recurrent/Convolutional,QRNN,,,,,1,Word-level,267735
RNN + char3-MS-vec,"Sho Takase, Jun Suzuki, Masaaki Nagata",2019/07/14,2019,Character n-Gram Embeddings to Improve RNN Language Models,26,,https://ojs.aaai.org/index.php/AAAI/article/view/4437,1.75E+08,,,UNK,,,0.00,1.03E+08,#VALUE!,,1.03E+08,,31.81,,,0.0,1,0,0,Recurrent/Convolutional,QRNN,,,,,1,Word-level,267735
RNN + char4-MS-vec,"Sho Takase, Jun Suzuki, Masaaki Nagata",2019/07/14,2019,Character n-Gram Embeddings to Improve RNN Language Models,26,,https://ojs.aaai.org/index.php/AAAI/article/view/4437,2.26E+08,,,UNK,,,0.00,1.03E+08,#VALUE!,,1.03E+08,WikiText-103,32.21,,,0.0,1,0,0,Recurrent/Convolutional,QRNN,,,,,1,Word-level,267735
Base LM + kNN LM + Continuous Cache,"Urvashi Khandelwal, Omer Levy, Dan Jurafsky, Luke Zettlemoyer, Mike Lewis",2019/11/01,2019,Generalization through Memorization: Nearest Neighbor Language Models,410,,https://arxiv.org/abs/1911.00172,2.47E+08,,7.30E+18,200.00,,,0.00,1.03E+08,3.05E+19,,1.03E+08,WikiText-103,16.12,,,0.0,0,1,0,Transformer,Transformer-XL,https://github.com/urvashik/knnlm,IIT Delhi,Academia,,1,,
RNN + char2-MS-vec,"Sho Takase, Jun Suzuki, Masaaki Nagata",2019/07/14,2019,Character n-Gram Embeddings to Improve RNN Language Models,26,,https://ojs.aaai.org/index.php/AAAI/article/view/4437,1.58E+08,,,UNK,,,0.00,1.03E+08,#VALUE!,,1.03E+08,,31.92,,,0.0,1,0,0,Recurrent/Convolutional,QRNN,,,,,1,Word-level,267735
LSTM-Medium+Behaviorial-Gating,"Prashanth Gurunath Shivakumar, Shao-Yen Tseng, Panayiotis Georgiou, Shrikanth Narayanan",2019/08/31,2019,Behavior Gated Language Models,3,0,https://arxiv.org/pdf/1909.00107,2.00E+07,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,78.75,0.0,1,0,0,Recurrent,LSTM,,,,,1,?,?
AWD-LSTM+Behaviorial-Gating,"Prashanth Gurunath Shivakumar, Shao-Yen Tseng, Panayiotis Georgiou, Shrikanth Narayanan",2019/08/31,2019,Behavior Gated Language Models,3,0,https://arxiv.org/pdf/1909.00107,2.70E+07,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,56.92,0.0,1,0,0,Recurrent,LSTM,,,,,1,?,?
LSTM-Large+Behaviorial-Gating,"Prashanth Gurunath Shivakumar, Shao-Yen Tseng, Panayiotis Georgiou, Shrikanth Narayanan",2019/08/31,2019,Behavior Gated Language Models,3,0,https://arxiv.org/pdf/1909.00107,6.70E+07,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,75.80,0.0,1,0,0,Recurrent,LSTM,,,,,1,?,?
NLM,"Junxian He, Graham Neubig, Taylor Berg-Kirkpatrick",2021/09/09,2021,Efficient Nearest Neighbor Language Models,55,,https://arxiv.org/abs/2109.04212,5.15E+08,,7.36E+18,,,,0.00,1.03E+08,0.00E+00,1.03E+08,2.06E+08,WikiText-103,18.66,,,0.0,0,0,0,Transformer,kNN-LM,https://github.com/jxhe/efficient-knnlm,Tianjin University; Microsoft Research; Beijing Institute of Technology,Industry - Academia Collaboration,,1,,
All-attention network + adaptive span,"Sainbayar Sukhbaatar, Edouard Grave, Guillaume Lample, Herve Jegou, Armand Joulin",2019/07/02,2019,Augmenting Self-attention with Persistent Memory,94,,https://arxiv.org/abs/1907.01470,1.33E+08,,4.60E+19,,,,0.00,1.03E+08,0.00E+00,0.00E+00,1.03E+08,WikiText-103,20.60,,,0.0,1,0,0,Transformer,All-attention network,,,,,1,Word-level,260000
Tensorized Transformer (small),"Xindian Ma, Peng Zhang, Shuai Zhang, Nan Duan, Yuexian Hou, Ming Zhou, Dawei Song",2019/06/24,2019,A Tensorized Transformer for Language Modeling,126,,https://arxiv.org/abs/1906.09777,1.20E+07,,,30.00,,,0.00,9.29E+05,2.01E+15,,9.29E+05,WikiText-103,,,57.90,0.0,1,0,0,Transformer,Tensorized Transformer,,,,,1,Word-level,10000
VD-LSTM+REAL Small,"Hakan Inan, Khashayar Khosravi, Richard Socher",2016/11/04,2016,Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling,397,,https://arxiv.org/abs/1611.01462,6.08E+06,,,60,,,1.00,2.08E+06,4.55E+15,,2.08E+06,WikiText-2,,98.90,,0.0,0,0,0,Recurrent,LSTM,,,,,1,,
Tensorized Transformer (large PTB),"Xindian Ma, Peng Zhang, Shuai Zhang, Nan Duan, Yuexian Hou, Ming Zhou, Dawei Song",2019/06/24,2019,A Tensorized Transformer for Language Modeling,126,,https://arxiv.org/abs/1906.09777,2.40E+07,,,30.00,,,0.00,9.29E+05,4.01E+15,,9.29E+05,WikiText-103,,,52.70,0.0,1,0,0,Transformer,Tensorized Transformer,,,,,1,Word-level,10000
retrieval-quality-kNN-LMs,"Andrew Drozdov, Shufan Wang, Razieh Rahimi, Andrew McCallum, Hamed Zamani, Mohit Iyyer",2022/10/28,2022,"You can’t pick your neighbors, or can you? When and how to rely on retrieval in the kNN-LM",7,,https://arxiv.org/pdf/2210.15859.pdf,2.47E+08,,,,,,0.00,1.03E+08,0.00E+00,,1.03E+08,,15.50,,,0.0,0,0,0,,,https://stanfordnlp.github.io/stanza/,University of Texas at Austin,Academia,,1,,
Tensorized Transformer (257M),"Xindian Ma, Peng Zhang, Shuai Zhang, Nan Duan, Yuexian Hou, Ming Zhou, Dawei Song",2019/06/24,2019,A Tensorized Transformer for Language Modeling,126,,https://arxiv.org/abs/1906.09777,2.57E+08,,,30.00,,,0.00,1.03E+08,4.76E+18,,1.03E+08,WikiText-103,21.20,,,0.0,1,0,0,Transformer,Tensorized Transformer,,,,,1,Word-level,260000
Tensorized Transformer (core-2),"Xindian Ma, Peng Zhang, Shuai Zhang, Nan Duan, Yuexian Hou, Ming Zhou, Dawei Song",2019/06/24,2019,A Tensorized Transformer for Language Modeling,126,,https://arxiv.org/abs/1906.09777,8.53E+07,,,30.00,,,0.00,1.03E+08,1.58E+18,,1.03E+08,WikiText-103,18.90,,,0.0,1,0,0,Transformer,Tensorized Transformer,,,,,1,Word-level,260000
Tensorized Transformer (151M),"Xindian Ma, Peng Zhang, Shuai Zhang, Nan Duan, Yuexian Hou, Ming Zhou, Dawei Song",2019/06/24,2019,A Tensorized Transformer for Language Modeling,126,,https://arxiv.org/abs/1906.09777,1.51E+08,,,30.00,,,0.00,1.03E+08,2.80E+18,,1.03E+08,WikiText-103,18.80,,,0.0,1,0,0,Transformer,Tensorized Transformer,,,,,1,Word-level,260000
TSLM+MoS (PTB),"Lipeng Zhang, Peng Zhang, Xindian Ma, Shuqin Gu, Zhan Su, Dawei Song",2019/01/31,2019,A Generalized Language Model in Tensor Space,21,,https://arxiv.org/pdf/1901.11167,2.63E+06,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,Penn TreeBank,,,83.60,0.0,1,0,0,,TSLM,,University of Manchester; The Alan Turing Institute,Academia,,1,,10000
GLM-2B,"Zhengxiao Du, Yujie Qian, Xiao Liu, Ming Ding, Jiezhong Qiu, Zhilin Yang, Jie Tang",2021/03/18,2021,GLM: General Language Model Pretraining with Autoregressive Blank Infilling,131,,https://arxiv.org/abs/2103.10360,2.00E+09,,,1,,,0.00,6.32E+11,7.58E+21,,6.32E+11,,11.65,14.90,33.31,1.0,0,0,0,Transformer,GLM,https://github.com/THUDM/GLM,,,,1,,
TSLM+MoS (WT2),"Lipeng Zhang, Peng Zhang, Xindian Ma, Shuqin Gu, Zhan Su, Dawei Song",2019/01/31,2019,A Generalized Language Model in Tensor Space,21,,https://arxiv.org/pdf/1901.11167,9.12E+06,,,,,,0.00,2.08E+06,0.00E+00,,2.08E+06,,,81.00,,0.0,1,0,0,,TSLM,,,,,1,,33278
Turing-NLG,Corby Rosset,2020/02/13,2020,Turing-NLG: A 17-billion-parameter language model by Microsoft,NA,,https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/,1.70E+10,,1.57E+22,3.39,,,0.00,4.64E+10,1.60E+22,,4.64E+10,,10.21,,,0.5,1,0,0,Transformer,NLG,,,,,1,?,?
Engin-Base (NE),"Zhongping Zhang, Yiwen Gu, Bryan A. Plummer",2021/12/11,2021,Show and Write: Entity-aware Article Generation with Image Information,0,0.5,https://arxiv.org/pdf/2112.05917,1.24E+08,,,3,,,0.00,,0.00E+00,,0.00E+00,,,20.70,,0.0,1,0,0,,,,,,,0,,
Amended-DARTS,"Kaifeng Bi, Changping Hu, Lingxi Xie, Xin Chen, Longhui Wei, Qi Tian",2019/10/25,2019,Stabilizing DARTS with Amended Gradient Estimation on Architectural Parameters,48,,https://arxiv.org/pdf/1910.11831,2.30E+07,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,54.80,0.0,0,0,0,NAS,DARTS,,"Stanford University; SambaNova Systems; Peking University; Adobe; University at Buffalo, SUNY",Industry - Academia Collaboration,,1,,
RNN+LSA+KN5+cache (model combination w/ linear extrapolation),"Tomas Mikolov, Geoffrey Zweig",2012/12/01,2012,Context dependent recurrent neural network language model,716,,https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/rnn_ctxt.pdf,3.14E+06,,,UNK,,,0.00,9.29E+05,#VALUE!,,9.29E+05,Penn TreeBank,,,72.90,0.0,0,1,0,Recurrent,RNN,,,,,1,,
Integer Transformer,"Ye Lin, Yanyang Li, Tengbo Liu, Tong Xiao, Tongran Liu, Jingbo Zhu",2020/09/17,2020,Towards Fully 8-bit Integer Inference for the Transformer Model,26,,https://arxiv.org/pdf/2009.08034,5.97E+07,,,,,,0.00,1.03E+08,0.00E+00,,1.03E+08,,18.16,,,0.0,1,0,0,Transformer,,,,,,1,,32000
2-layer-LSTM+Deep-Gradient-Compression,"Yujun Lin, Song Han, Huizi Mao, Yu Wang, William J. Dally",2017/12/05,2017,Deep Gradient Compression: Reducing the Communication Bandwidth for Distributed Training,1270,,https://arxiv.org/pdf/1712.01887,6.02E+06,,,40,,,0.00,9.29E+05,1.34E+15,,9.29E+05,,,,72.24,0.0,0,0,0,Recurrent,LSTM,https://github.com/synxlin/deep-gradient-compression,University of Liverpool; USC Information Sciences Institute,Academia,,1,,
Integer Transformer,"Ye Lin, Yanyang Li, Tengbo Liu, Tong Xiao, Tongran Liu, Jingbo Zhu",2020/09/17,2020,Towards Fully 8-bit Integer Inference for the Transformer Model,26,,https://arxiv.org/pdf/2009.08034,2.47E+08,,,,,,0.00,1.03E+08,0.00E+00,,1.03E+08,,30.79,,,0.0,1,0,0,Transformer,,,,,,1,,32000
TaLK Convolution,"Vasileios Lioutas, Yuhong Guo",2020/02/08,2020,Time-aware Large Kernel Convolutions,24,,https://arxiv.org/abs/2002.03184,2.40E+08,,,187.43,,,0.00,1.03E+08,2.78E+19,0.00E+00,1.03E+08,WikiText-103,23.30,,,0.0,1,0,0,Convolutional,Transformer,https://github.com/lioutasb/TaLKConvolutions,,,,1,,260000
GPT3-6.7B (rerun of original),"Greg Yang, Edward J. Hu, Igor Babuschkin, Szymon Sidor, Xiaodong Liu, David Farhi, Nick Ryder, Jakub Pachocki, Weizhu Chen, Jianfeng Gao",2020/05/28,2020,Tensor Programs V: Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer,37,,https://web.archive.org/web/20221014063419/https://arxiv.org/pdf/2203.03466.pdf,6.70E+09,,1.20E+22,1,,,0.00,4.99E+11,2.01E+22,,4.99E+11,,9.13,,,1.0,1,0,0,Transformer,GPT,https://github.com/microsoft/mup,,,,1,custom,?
Shortformer,"Ofir Press, Noah A. Smith, Mike Lewis",2020/12/31,2020,Shortformer: Better Language Modeling using Shorter Inputs,43,,https://arxiv.org/abs/2012.15832,2.40E+07,,,205,,,0.00,1.03E+08,3.04E+18,0.00E+00,1.03E+08,WikiText-103,18.15,,,0.0,1,0,0,Transformer,Shortformer,https://github.com/ofirpress/shortformer,,,,1,BERT,29000
"Segatron XL base, M=384","He Bai, Peng Shi, Jimmy Lin, Yuqing Xie, Luchen Tan, Kun Xiong, Wen Gao, Ming Li",2020/04/30,2020,Segatron: Segment-Aware Transformer for Language Modeling and Understanding,13,,https://arxiv.org/abs/2004.14996,1.51E+08,,,18.64,,,0.00,1.03E+08,1.74E+18,,1.03E+08,WikiText-103,22.50,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/rsvp-ai/segatron_aaai,,,,1,BERT,29000
NAS+ESS (156M),"Yinqiao Li, Chi Hu, Yuhao Zhang, Nuo Xu, Yufan Jiang, Tong Xiao, Jingbo Zhu, Tongran Liu, Changliang Li",2020/05/06,2020,Learning Architectures from an Extended Search Space for Language Modeling,12,,https://arxiv.org/pdf/2005.02593,1.56E+08,,,30,,,0.00,1.03E+08,2.89E+18,0.00E+00,1.03E+08,,29.20,,,0.0,0,0,0,Recurrent,RNN,,Salesforce Research,Industry,,1,,
"Segatron XL large, M=384","He Bai, Peng Shi, Jimmy Lin, Yuqing Xie, Luchen Tan, Kun Xiong, Wen Gao, Ming Li",2020/04/30,2020,Segatron: Segment-Aware Transformer for Language Modeling and Understanding,13,,https://arxiv.org/abs/2004.14996,2.57E+08,,,167.02,,,0.00,1.03E+08,2.65E+19,,1.03E+08,WikiText-103,17.10,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/rsvp-ai/segatron_aaai,,,,1,BERT,29000
rTop-k(distributed setting),"Leighton Pate Barnes, Huseyin A. Inan, Berivan Isik, Ayfer Ozgur",2020/05/21,2020,rTop-k: A Statistical Estimation Approach to Distributed SGD,41,,https://arxiv.org/pdf/2005.10761,6.90E+07,,,38,,,0.00,9.29E+05,1.46E+16,,9.29E+05,,,,82.49,0.0,1,0,0,Recurrent,RNN,,"Tianjin University, Tianjin, China; Beijing Institute of Technology, Beijing, China",Academia,,1,Word-level?,?
GPT-Neo-1.3B,"Sid Black, Leo Gao, Phil Wang, Connor Leahy, Stella Biderman",2021/03/21,2021,GPT-Neo: Large Scale Autoregressive Language Modeling with Mesh-Tensorflow,NA,,https://github.com/EleutherAI/gpt-neo,1.30E+09,,,1,,,0.00,4.00E+11,3.12E+21,,4.00E+11,,,13.10,,1.0,0,0,0,Transformer,GPT-Neo,https://github.com/EleutherAI/gpt-neo,,,,1,,
DiffStk-MRNN,"Ankur Mali, Alexander Ororbia, Daniel Kifer, Clyde Lee Giles",2020/04/04,2020,Recognizing Long Grammatical Sequences Using Recurrent Networks Augmented With An External Differentiable Stack,8,,https://arxiv.org/pdf/2004.07623,1.01E+06,,,50,,,1.00,9.29E+05,2.82E+14,,9.29E+05,,,,115.00,0.0,1,0,0,Recurrent,,,,,,1,?,?
Tensor-Transformer(1core)+PN (PTB),"Sheng Shen, Zhewei Yao, Amir Gholami, Michael W. Mahoney, Kurt Keutzer",2020/03/17,2020,PowerNorm: Rethinking Batch Normalization in Transformers,60,,https://arxiv.org/pdf/2003.07845,1.20E+07,,,30,,,0.00,9.29E+05,2.01E+15,,9.29E+05,,,,47.60,0.0,1,0,0,Transformer,Tensorized Transformer,https://github.com/sIncerass/powernorm,,,,1,Word-level,10000
TCN (148M),"Shaojie Bai, J. Zico Kolter, Vladlen Koltun",2018/02/15,2018,Convolutional Sequence Modeling Revisited,64,,https://openreview.net/forum?id=rk8wKk-R-,1.48E+08,,,UNK,,,0.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,WikiText-103,45.19,,,0.0,0,0,1,Convolutional,TCN,,,,,1,,
Tensor-Transformer(1core)+PN (WT103),"Sheng Shen, Zhewei Yao, Amir Gholami, Michael W. Mahoney, Kurt Keutzer",2020/03/17,2020,PowerNorm: Rethinking Batch Normalization in Transformers,60,,https://arxiv.org/pdf/2003.07845,8.53E+07,,,30,,,0.00,1.03E+08,1.58E+18,,1.03E+08,,17.90,,,0.0,1,0,0,Transformer,Tensorized Transformer,https://github.com/sIncerass/powernorm,,,,1,Word-level,268000
ENAS,"Hieu Pham, Melody Y. Guan, Barret Zoph, Quoc V. Le, Jeff Dean",2018/02/09,2018,Efficient Neural Architecture Search via Parameter Sharing,2760,,https://arxiv.org/abs/1802.03268,2.40E+07,,,150,,,0.00,9.29E+05,2.01E+16,,9.29E+05,Penn TreeBank,,,55.80,0.0,0,0,0,NAS,NAS,,Peking University,Academia,,1,,
Frage-AWD-LSTM-MemoryAug-NeuralCache (PTB),"Ke Li, Daniel Povey, Sanjeev Khudanpur",2020/09/29,2020,Neural Language Modeling With Implicit Cache Pointers,4,1,https://arxiv.org/pdf/2009.13774,2.40E+07,,,,,,1.00,9.29E+05,0.00E+00,,9.29E+05,,,,52.50,0.0,1,1,0,Recurrent,,,,,,1,Word-level,10000
Frage-AWD-LSTM-MemoryAug-NeuralCache (WT2),"Ke Li, Daniel Povey, Sanjeev Khudanpur",2020/09/29,2020,Neural Language Modeling With Implicit Cache Pointers,4,1,https://arxiv.org/pdf/2009.13774,3.30E+07,,,,,,1.00,2.08E+06,0.00E+00,,2.08E+06,,,55.60,,0.0,1,1,0,Recurrent,,,,,,1,,33000
AWD-FWM (WT2),"Imanol Schlag, Tsendsuren Munkhdalai, Jürgen Schmidhuber",2020/11/16,2020,Learning Associative Inference Using Fast Weight Memory,29,,https://arxiv.org/abs/2011.07831,3.70E+07,,,1600,,,0.00,2.08E+06,7.39E+17,,2.08E+06,WikiText-2,,61.65,,0.0,1,0,0,Recurrent,LSTM,,,,,1,Word-level,33000
Transformer + Average Attention Network,"Jian Guo Zhang, Jian Ping Li, Huang Li",2019/01/01,2019,Language Modeling with Transformer,126,,https://ieeexplore.ieee.org/abstract/document/9067534,UNK,,,,,,1.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,WikiText-103,22.13,,,0.0,0,0,0,Transformer,Transformer,,NVIDIA,Industry,,0,,
AWD-FWM (PTB),"Imanol Schlag, Tsendsuren Munkhdalai, Jürgen Schmidhuber",2020/11/16,2020,Learning Associative Inference Using Fast Weight Memory,29,,https://arxiv.org/abs/2011.07831,2.40E+07,,,1000,,,0.00,9.29E+05,1.34E+17,,9.29E+05,Penn TreeBank,,,54.48,0.0,1,0,0,Recurrent,LSTM,,,,,1,Word-level,10000
Compress-LSTM (4.6M),"Artem M. Grachev, Dmitry I. Ignatov, Andrey V. Savchenko",2019/02/06,2019,Compression of Recurrent Neural Networks for Efficient Language Modeling,37,,"https://arxiv.org/abs/1902.02380#:~:text=Compression%20of%20Recurrent%20Neural%20Networks%20for%20Efficient%20Language%20Modeling,-Artem%20M.&text=Recurrent%20neural%20networks%20have%20proved,real%2Dtime%20offline%20mobile%20applications.",4.64E+06,,,90,,,0.00,9.29E+05,2.33E+15,,9.29E+05,,,,117.66,0.0,0,0,0,Recurrent,LSTM,,,,,1,,
GPT-3 175B (davinci),"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, Dario Amodei",2020/05/28,2020,Language Models are Few-Shot Learners,13822,,https://arxiv.org/abs/2005.14165,1.75E+11,,3.14E+23,0.601,,,0.00,4.99E+11,3.15E+23,,4.99E+11,CommonCrawl; WebText2; Books1; Books2; Wikipedia,,,20.50,1.0,1,0,1,Transformer,GPT,https://github.com/openai/gpt-3/,,,,1,Own,50257
LSTM-3-layer+Gadam,"Diego Granziol, Xingchen Wan, Samuel Albanie, Stephen Roberts",2020/03/02,2020,Iterative Averaging in the Quest for Best Test Error,5,,https://arxiv.org/pdf/2003.01247,2.40E+07,,,200,,,0.00,9.29E+05,2.68E+16,,9.29E+05,,,,58.77,0.0,1,0,0,Recurrent,LSTM,,Eindhoven University of Technology; University of Twente,Academia,,1,Word-level,10000
Temporal Convolutional Attention-based Network(TCAN) (WT2),"Hongyan Hao, Yan Wang, Yudi Xia, Jian Zhao, Furao Shen",2020/02/28,2020,Temporal Convolutional Attention-based Network For Sequence Modeling,33,,https://arxiv.org/pdf/2002.12530,3.30E+07,,,,,,0.00,2.08E+06,0.00E+00,,2.08E+06,,,6.66,,0.0,0,0,0,,,https://github.com/haohy/TCAN,,,,1,,
SparseOPT-66B,"Elias Frantar, Dan Alistarh",2023/01/02,2023,SparseGPT: Massive Language Models Can Be Accurately Pruned in One-Shot,8,,https://arxiv.org/abs/2301.00774,3.30E+10,,,1.666666667,,,1.00,1.80E+11,5.94E+22,0.00E+00,1.80E+11,,9.32,,,1.0,0,0,0,Transformer,OPT,https://github.com/IST-DASLab/sparsegpt,,,,1,,
TransformerXL + spectrum control,"Lingxiao Wang, Jing Huang, Kevin Huang, Ziniu Hu, Guangtao Wang, Quanquan Gu",2020/03/11,2020,Improving Neural Language Generation with Spectrum Control,55,,https://openreview.net/forum?id=ByxY8CNtvr,1.51E+08,,4.60E+17,250,,,0.00,1.03E+08,2.33E+19,0.00E+00,1.03E+08,WikiText-103,23.20,,,0.0,1,0,0,Transformer,Transformer-XL,,,,,1,Word-level,260000
ONLSTM-SYD,"Wenyu Du, Zhouhan Lin, Yikang Shen, Timothy J. O'Donnell, Yoshua Bengio, Yue Zhang",2020/05/12,2020,Exploiting Syntactic Structure for Better Language Modeling: A Syntactic Distance Approach,15,,https://arxiv.org/pdf/2005.05864,2.50E+07,,,1000,,,0.00,9.29E+05,1.39E+17,,9.29E+05,,,,55.70,0.0,1,0,0,Recurrent,LSTM,https://github.com/wenyudu/SDLM,,,,1,,10000
ERNIE-Doc (151M),"Siyu Ding, Junyuan Shang, Shuohuan Wang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang",2020/12/31,2020,ERNIE-Doc: A Retrospective Long-Document Modeling Transformer,35,,https://arxiv.org/pdf/2012.15688,1.51E+08,,,190.88,,,0.00,1.03E+08,1.78E+19,,1.03E+08,,21.00,,,1.0,1,0,0,Transformer,Transformer-XL,https://github.com/PaddlePaddle/ERNIE/,,,,1,RoBERTa wordpieces tokenizer,50000
ERNIE-Doc (247M),"Siyu Ding, Junyuan Shang, Shuohuan Wang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang",2020/12/31,2020,ERNIE-Doc: A Retrospective Long-Document Modeling Transformer,35,,https://arxiv.org/pdf/2012.15688,2.47E+08,,,190.88,,,0.00,1.03E+08,2.91E+19,,1.03E+08,,16.80,,,1.0,1,0,0,Transformer,Transformer-XL,https://github.com/PaddlePaddle/ERNIE/,,,,1,RoBERTa wordpieces tokenizer,50000
Routing Transformer,"Aurko Roy, Mohammad Saffar, Ashish Vaswani, David Grangier",2020/03/12,2020,Efficient Content-Based Sparse Attention with Routing Transformers,349,,https://arxiv.org/abs/2003.05997,7.95E+07,,,,,,0.00,1.03E+08,0.00E+00,,1.03E+08,WikiText-103,15.80,,,0.0,1,0,0,Transformer,Local transformer,https://github.com/google-research/google-research/tree/master/routing_transformer,,,,1,Word-level,268000
KnGPT2,"Ali Edalati, Marzieh Tahaei, Ahmad Rashid, Vahid Partovi Nia, James J. Clark, Mehdi Rezagholizadeh",2021/10/15,2021,Kronecker Decomposition for GPT Compression,14,,https://web.archive.org/web/20221111092612/https://arxiv.org/pdf/2110.08152.pdf,8.30E+07,4.40E+09,1.24E+20,1,,,0.00,4.00E+09,1.99E+18,4.00E+08,4.40E+09,,20.50,,,1.0,0,0,0,Transformer,GPT,,University of Lugano; King Abdullah University of Science and Technology,Academia,,1,,
DeLight,"Sachin Mehta, Marjan Ghazvininejad, Srinivasan Iyer, Luke Zettlemoyer, Hannaneh Hajishirzi",2020/08/03,2020,DeLighT: Deep and Light-weight Transformer,98,,https://arxiv.org/abs/2008.00623,9.90E+07,,2.40E+19,62.14,,,0.00,1.03E+08,3.80E+18,0.00E+00,1.03E+08,WikiText-103,24.14,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/sacmehta/delight,,,,1,,260000
CT-MoS (PTB),"Pei-Hsin Wang, Sheng-Iou Hsieh, Shih-Chieh Chang, Yu-Ting Chen, Jia-Yu Pan, Wei Wei, Da-Chang Juan",2020/12/25,2020,Contextual Temperature for Language Modeling,6,,https://arxiv.org/pdf/2012.13575,2.40E+07,,,1000,,,0.00,9.29E+05,1.34E+17,,9.29E+05,,,,54.69,0.0,1,0,0,Recurrent,LSTM,,,,,1,,10000
CT-MoS + DynamicEval (WT2),"Pei-Hsin Wang, Sheng-Iou Hsieh, Shih-Chieh Chang, Yu-Ting Chen, Jia-Yu Pan, Wei Wei, Da-Chang Juan",2020/12/25,2020,Contextual Temperature for Language Modeling,6,,https://arxiv.org/pdf/2012.13575,4.50E+07,,,1000,,,0.00,2.08E+06,5.62E+17,,2.08E+06,,,40.96,,0.0,1,0,0,Recurrent,LSTM,,,,,1,,33000
CT-MoS (WT2),"Pei-Hsin Wang, Sheng-Iou Hsieh, Shih-Chieh Chang, Yu-Ting Chen, Jia-Yu Pan, Wei Wei, Da-Chang Juan",2020/12/25,2020,Contextual Temperature for Language Modeling,6,,https://arxiv.org/pdf/2012.13575,4.50E+07,,,1000,,,0.00,2.08E+06,5.62E+17,,2.08E+06,,,62.21,,0.0,1,0,0,Recurrent,LSTM,,,,,1,,33000
CT-MoS + DynamicEval (PTB),"Pei-Hsin Wang, Sheng-Iou Hsieh, Shih-Chieh Chang, Yu-Ting Chen, Jia-Yu Pan, Wei Wei, Da-Chang Juan",2020/12/25,2020,Contextual Temperature for Language Modeling,6,,https://arxiv.org/pdf/2012.13575,2.40E+07,,,1000,,,0.00,9.29E+05,1.34E+17,,9.29E+05,,,,47.42,0.0,1,0,0,Recurrent,LSTM,,,,,1,Word-level,10000
Feedback Transformer,"Angela Fan, Thibaut Lavril, Edouard Grave, Armand Joulin, Sainbayar Sukhbaatar",2020/02/21,2020,Addressing Some Limitations of Transformers with Feedback Memory,41,,https://arxiv.org/abs/2002.09402,1.26E+08,,,267.23,,,0.00,1.03E+08,2.08E+19,,1.03E+08,WikiText-103,18.30,,,0.0,1,0,0,Transformer,Feedback transformer,,,,,1,Word-level,268000
Transformer+Recurrent Windows of Context,"Davis Yoshida, Allyson Ettinger, Kevin Gimpel",2020/08/16,2020,Adding Recurrence to Pretrained Transformers for Improved Efficiency and Context Size,4,0.5,https://arxiv.org/pdf/2008.07027,1.24E+08,,1.17E+20,2,,,0.00,4.00E+09,5.95E+18,1.03E+08,4.10E+09,,26.73,,,0.0,1,0,0,Recurrent/Transformer,GPT,,,,,1,GPT2Tokenizer,50257
Transformer-XL+AdamP,"Byeongho Heo, Sanghyuk Chun, Seong Joon Oh, Dongyoon Han, Sangdoo Yun, Gyuwan Kim, Youngjung Uh, Jung-Woo Ha",2020/06/15,2020,AdamP: Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights,114,,https://arxiv.org/pdf/2006.08217,2.57E+08,,,UNK,,,0.00,1.03E+08,#VALUE!,1.03E+08,2.06E+08,,23.26,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/clovaai/adamp,,,,1,Word-level,267735
Transformer-XL+WN+AdamP,"Byeongho Heo, Sanghyuk Chun, Seong Joon Oh, Dongyoon Han, Sangdoo Yun, Gyuwan Kim, Youngjung Uh, Jung-Woo Ha",2020/06/15,2020,AdamP: Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights,114,,https://arxiv.org/pdf/2006.08217,2.57E+08,,,UNK,,,0.00,1.03E+08,#VALUE!,,1.03E+08,,22.77,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/clovaai/adamp,,,,1,Word-level,267735
3-Layer-Tensor-Transformer+AdaHessian,"Zhewei Yao, Amir Gholami, Sheng Shen, Mustafa Mustafa, Kurt Keutzer, Michael W. Mahoney",2020/06/01,2020,ADAHESSIAN: An Adaptive Second Order Optimizer for Machine Learning,151,,https://arxiv.org/pdf/2006.00719,1.20E+07,,,30,,,1.00,9.29E+05,2.01E+15,0.00E+00,9.29E+05,,,,51.50,0.0,1,0,0,Transformer,Tensorized Transformer,https://github.com/amirgholami/ADAHESSIAN.git,University of Massachusetts Amherst,Academia,,1,Word-level,10000
6-Layer-Tensor-Transformer+AdaHessian,"Zhewei Yao, Amir Gholami, Sheng Shen, Mustafa Mustafa, Kurt Keutzer, Michael W. Mahoney",2020/06/01,2020,ADAHESSIAN: An Adaptive Second Order Optimizer for Machine Learning,151,,https://arxiv.org/pdf/2006.00719,8.53E+07,,,30,,,1.00,1.03E+08,1.58E+18,0.00E+00,1.03E+08,,19.90,,,0.0,1,0,0,Transformer,Tensorized Transformer,https://github.com/amirgholami/ADAHESSIAN.git,University of Massachusetts Amherst,Academia,,1,Word-level,268000
SRU++ Large,Tao Lei,2021/02/24,2021,When Attention Meets Fast Recurrence: Training Language Models with Reduced Compute,35,,https://arxiv.org/abs/2102.12459,2.34E+08,,1.10E+19,34.08,,,0.00,1.03E+08,4.93E+18,0.00E+00,1.03E+08,WikiText-103,17.10,,,0.0,1,0,0,Recurrent,SRU,https://github.com/asappresearch/sru,,,,1,,260000
SRU++ Large only 2 attention layers (k=5),Tao Lei,2021/02/24,2021,When Attention Meets Fast Recurrence: Training Language Models with Reduced Compute,35,,https://arxiv.org/abs/2102.12459,2.25E+08,,8.00E+18,34.08,,,0.00,1.03E+08,4.74E+18,0.00E+00,1.03E+08,WikiText-103,17.30,,,0.0,1,0,0,Recurrent,SRU,https://github.com/asappresearch/sru,,,,1,,260000
SRU++ Base,Tao Lei,2021/02/24,2021,When Attention Meets Fast Recurrence: Training Language Models with Reduced Compute,35,,https://arxiv.org/abs/2102.12459,1.08E+08,,5.80E+18,25.56,,,0.00,1.03E+08,1.71E+18,0.00E+00,1.03E+08,WikiText-103,18.30,,,0.0,1,0,0,Recurrent,SRU,https://github.com/asappresearch/sru,,,,1,,260000
GPT-Neo-125M(finetuned),"Michael Santacroce, Zixin Wen, Yelong Shen, Yuanzhi Li",2021/03/21,2021,What Matters In The Structured Pruning of Generative Language Models?,1,1,https://arxiv.org/pdf/2302.03773.pdf,1.25E+08,,,40,,,1.00,3.00E+11,9.00E+21,,3.00E+11,,16.14,,,0.0,0,0,0,Transformer,GPT-Neo,https://github.com/santacml/nn_pruning_uniqueness,,,,1,,
"ALiBi (L=3072, Lvalid = 3072)","Ofir Press, Noah A. Smith, Mike Lewis",2021/08/27,2021,"Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation",124,,https://arxiv.org/abs/2108.12409,1.30E+09,,1.80E+20,205,,,0.00,1.03E+08,1.65E+20,0.00E+00,1.03E+08,WikiText-103,18.30,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/ofirpress/attention_with_linear_biases,,,,1,BERT,29000
Neural Architecture Search with base 8 and shared embeddings,"Barret Zoph, Quoc V. Le",2016/11/05,2016,Neural Architecture Search with Reinforcement Learning,5473,,https://arxiv.org/abs/1611.01578,5.40E+07,,,35,,,0.00,9.29E+05,1.05E+16,,9.29E+05,Penn TreeBank,,,62.40,0.0,0,0,0,Recurrent,RNN,https://github.com/tensorflow/models,Baidu,Industry,,1,,
Subformer (83M),"Machel Reid, Edison Marrese-Taylor, Yutaka Matsuo",2021/01/01,2021,Subformer: Exploring Weight Sharing for Parameter Efficiency in Generative Transformers,12,,https://arxiv.org/abs/2101.00234,8.30E+07,,,70.29,,,0.00,1.03E+08,3.61E+18,0.00E+00,1.03E+08,WikiText-103,20.88,,,0.0,1,0,0,Transformer,Subformer,https://github.com/machelreid/subformer,,,,1,,260000
Subformer (122M),"Machel Reid, Edison Marrese-Taylor, Yutaka Matsuo",2021/01/01,2021,Subformer: Exploring Weight Sharing for Parameter Efficiency in Generative Transformers,12,,https://arxiv.org/abs/2101.00234,1.22E+08,,,70.29,,,0.00,1.03E+08,5.30E+18,0.00E+00,1.03E+08,WikiText-103,19.90,,,0.0,1,0,0,Transformer,Subformer,https://github.com/machelreid/subformer,,,,1,,260000
Subformer (96M),"Machel Reid, Edison Marrese-Taylor, Yutaka Matsuo",2021/01/01,2021,Subformer: Exploring Weight Sharing for Parameter Efficiency in Generative Transformers,12,,https://arxiv.org/abs/2101.00234,9.60E+07,,,70.29,,,0.00,1.03E+08,4.17E+18,0.00E+00,1.03E+08,WikiText-103,20.39,,,0.0,1,0,0,Transformer,Subformer,https://github.com/machelreid/subformer,,,,1,,267000
Pythia-70m,"Stella Biderman, Hailey Schoelkopf, Quentin Anthony, Herbie Bradley, Kyle O'Brien, Eric Hallahan, Mohammad Aflah Khan, Shivanshu Purohit, USVSN Sai Prashanth, Edward Raff, Aviya Skowron, Lintang Sutawika, Oskar van der Wal",2023/04/03,2023,Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling,73,,https://arxiv.org/abs/2304.01373,7.00E+07,,,1,,,0.00,3.00E+11,1.26E+20,,3.00E+11,,,57.04,,0.5,0,0,1,Transformer,Pythia,https://github.com/EleutherAI/pythia,,,,1,,
TransfoRNN(d=1024)(2-layer) (PTB),"Tze Yuang Chong, Xuyang Wang, Lin Yang, Junjie Wang",2021/04/04,2021,TransfoRNN: Capturing the Sequential Information in Self-Attention Representations for Language Modeling,0,0,https://arxiv.org/pdf/2104.01572,4.99E+07,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,82.10,0.0,0,0,0,Transformer/RNN,TransfoRNN,,,,,1,,
BERT-Large-CAS (PTB+WT2+WT103),"Chenguang Wang, Mu Li, Alexander J. Smola",2019/04/20,2019,Language Models with Transformers,110,,https://arxiv.org/abs/1904.09408,3.95E+08,,,50,,,0.00,4.40E+09,5.21E+20,9.29E+05,4.40E+09,Penn TreeBank; WikiText-2; WikiText-103,,,31.34,0.0,0,0,0,Transformer,BERT,https://github.com/cgraywang/gluon-nlp-1/tree/lmtransformer/scripts/language_model,,,,1,,
Compressive Transformers for Long-Range Sequence Modelling,"Jack W. Rae, Anna Potapenko, Siddhant M. Jayakumar, Timothy P. Lillicrap",2019/11/13,2019,Compressive Transformers for Long-Range Sequence Modelling,330,,https://arxiv.org/abs/1911.05507,UNK,,1.60E+20,328.32,,,0.00,1.03E+08,#VALUE!,,1.03E+08,WikiText-103,17.10,,,0.0,1,0,0,Transformer,Transformer-XL,,Google Research,Industry,,0,,
DEQ-Transformer (Post-LN) + Jacobian Regularisation,"Shaojie Bai, Vladlen Koltun, J. Zico Kolter",2021/06/28,2021,Stabilizing Equilibrium Models by Jacobian Regularization,45,,https://arxiv.org/abs/2106.14342,9.80E+07,,2.90E+19,23,,,0.00,1.03E+08,1.39E+18,0.00E+00,1.03E+08,WikiText-103,24.90,,,0.0,1,0,0,Transformer,DEQ,,,,,1,,267000
Selfish-RNN (AWD-LSTM-MoS),"Shiwei Liu, Decebal Constantin Mocanu, Yulong Pei, Mykola Pechenizkiy",2021/01/22,2021,Selfish Sparse RNN Training,31,,https://arxiv.org/pdf/2101.09048,1.56E+07,,,1000,,,0.00,2.08E+06,1.95E+17,,2.08E+06,,,63.05,,0.0,1,0,0,Recurrent,LSTM,https://github.com/Shiweiliuiiiiiii/Selfish-RNN,,,,1,?,?
TRIMELMext (247M),"Zexuan Zhong, Tao Lei, Danqi Chen",2022/05/25,2022,Training Language Models with Memory Augmentation,35,,https://arxiv.org/abs/2205.12674,2.47E+08,,,204.72,,,0.00,1.03E+08,3.12E+19,1.03E+08,2.06E+08,WikiText-103,15.37,,,0.0,0,0,0,Transformer,Transformer-XL,https://github.com/princeton-nlp/TRIME,Michigan State University; TAL Education Group,Industry - Academia Collaboration,,1,,
Selfish-RNN (ON-LSTM),"Shiwei Liu, Decebal Constantin Mocanu, Yulong Pei, Mykola Pechenizkiy",2021/01/22,2021,Selfish Sparse RNN Training,31,,https://arxiv.org/pdf/2101.09048,1.13E+07,,,1000,,,0.00,9.29E+05,6.30E+16,,9.29E+05,,,,55.82,0.0,1,0,0,Recurrent,LSTM,https://github.com/Shiweiliuiiiiiii/Selfish-RNN,,,,1,?,?
Selfish-RNN (SNT-ASGD) Stacked LSTMs,"Shiwei Liu, Decebal Constantin Mocanu, Yulong Pei, Mykola Pechenizkiy",2021/01/22,2021,Selfish Sparse RNN Training,31,,https://arxiv.org/pdf/2101.09048,2.52E+07,,,100,,,0.00,9.29E+05,1.40E+16,,9.29E+05,,,,71.42,0.0,1,0,0,Recurrent,LSTM,https://github.com/Shiweiliuiiiiiii/Selfish-RNN,,,,1,?,?
Fairseq + UID: variance,"Jason Wei, Clara Meister, Ryan Cotterell",2021/05/15,2021,A Cognitive Regularizer for Language Modeling,11,,https://web.archive.org/web/20221010230611/https://arxiv.org/pdf/2105.07144.pdf,UNK,1.03E+08,,UNK,,,0.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,,29.58,,,0.0,1,0,0,Transformer,Transformer,,University of Edinburgh,Academia,,0,,
Quantized ADMM,"Junhao Xu, Xie Chen, Shoukang Hu, Jianwei Yu, Xunying Liu, Helen Meng",2021/11/29,2021,Low-bit Quantization of Recurrent Neural Network Language Models Using Alternating Direction Methods of Multipliers,9,,https://arxiv.org/pdf/2111.14836,,,,50,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,115.90,0.0,0,0,0,Recurrent,RNN,,,,,0,,
EN^2AS with performance reward,"Miao Zhang, Huiqi Li, Shirui Pan, Taoping Liu, Steven Su",2019/07/22,2019,Efficient Novelty-Driven Neural Architecture Search,1,1,https://arxiv.org/pdf/1907.09109,2.30E+07,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,57.36,0.0,0,0,0,NAS,NAS,,Lenovo Research,Industry,,1,,
Selfish-RNN (SNT-ASGD)RHNs,"Shiwei Liu, Decebal Constantin Mocanu, Yulong Pei, Mykola Pechenizkiy",2021/01/22,2021,Selfish Sparse RNN Training,31,,https://arxiv.org/pdf/2101.09048,7.60E+06,,,500,,,0.00,9.29E+05,2.12E+16,,9.29E+05,,,,64.03,0.0,1,0,0,Recurrent,RHN,https://github.com/Shiweiliuiiiiiii/Selfish-RNN,,,,1,?,?
BERT-Large-CAS (WT103),"Chenguang Wang, Mu Li, Alexander J. Smola",2019/04/20,2019,Language Models with Transformers,110,,https://arxiv.org/abs/1904.09408,3.95E+08,,,50,,,0.00,4.40E+09,5.21E+20,1.03E+08,4.50E+09,WikiText-103,20.42,,,0.0,0,0,0,Transformer,BERT,https://github.com/cgraywang/gluon-nlp-1/tree/lmtransformer/scripts/language_model,,,,1,,
SparseOPT-30B,"Elias Frantar, Dan Alistarh",2023/01/02,2023,SparseGPT: Massive Language Models Can Be Accurately Pruned in One-Shot,8,,https://arxiv.org/abs/2301.00774,1.50E+10,,,1.666666667,,,1.00,1.80E+11,2.70E+22,0.00E+00,1.80E+11,,9.79,,,1.0,0,0,0,Transformer,OPT,https://github.com/IST-DASLab/sparsegpt,,,,1,,
LSTM (WT2),"Edouard Grave, Armand Joulin, Nicolas Usunier",2016/12/13,2016,Improving Neural Language Models with a Continuous Cache,302,,https://arxiv.org/abs/1612.04426,3.28E+07,,,UNK,,,0.00,2.08E+06,#VALUE!,,2.08E+06,WikiText-2,,99.30,,0.0,0,0,0,Recurrent,LSTM,,,,,1,,
Gopher (7.1B),"Jack W. Rae, Sebastian Borgeaud, Trevor Cai, Katie Millican, Jordan Hoffmann, Francis Song, John Aslanides, Sarah Henderson, Roman Ring, Susannah Young, Eliza Rutherford, Tom Hennigan, Jacob Menick, Albin Cassirer, Richard Powell, George van den Driessche, Lisa Anne Hendricks, Maribeth Rauh, Po-Sen Huang, Amelia Glaese, Johannes Welbl, Sumanth Dathathri, Saffron Huang, Jonathan Uesato, John Mellor, Irina Higgins, Antonia Creswell, Nat McAleese, Amy Wu, Erich Elsen, Siddhant Jayakumar, Elena Buchatskaya, David Budden, Esme Sutherland, Karen Simonyan, Michela Paganini, Laurent Sifre, Lena Martens, Xiang Lorraine Li, Adhiguna Kuncoro, Aida Nematzadeh, Elena Gribovskaya, Domenic Donato, Angeliki Lazaridou, Arthur Mensch, Jean-Baptiste Lespiau, Maria Tsimpoukelli, Nikolai Grigorev, Doug Fritz, Thibault Sottiaux, Mantas Pajarskas, Toby Pohlen, Zhitao Gong, Daniel Toyama, Cyprien de Masson d'Autume, Yujia Li, Tayfun Terzi, Vladimir Mikulik, Igor Babuschkin, Aidan Clark, Diego de Las Casas, Aurelia Guy, Chris Jones, James Bradbury, Matthew Johnson, Blake Hechtman, Laura Weidinger, Iason Gabriel, William Isaac, Ed Lockhart, Simon Osindero, Laura Rimell, Chris Dyer, Oriol Vinyals, Kareem Ayoub, Jeff Stanway, Lorrayne Bennett, Demis Hassabis, Koray Kavukcuoglu, Geoffrey Irving",2021/12/08,2021,"Scaling Language Models: Methods, Analysis & Insights from Training Gopher",441,,https://arxiv.org/abs/2112.11446,7.10E+09,,6.31E+23,1.00,,,0.00,3.00E+11,1.28E+22,,3.00E+11,WikiText-103,10.81,,,1.0,1,0,0,Transformer,GPT,,Carnegie Mellon University; Intel Labs,Industry - Academia Collaboration,,1,Own,32000
DARTS,"Hanxiao Liu, Karen Simonyan, Yiming Yang",2018/06/24,2018,DARTS: Differentiable Architecture Search,3990,,https://arxiv.org/abs/1806.09055,3.30E+07,,1.10E+16,300,,,0.00,2.08E+06,1.24E+17,,2.08E+06,WikiText-2,,69.60,,0.0,0,0,0,NAS,DARTS,https://github.com/quark0/darts,,,,1,,
TF-LM-discourse LSTM (WT2),"Lyan Verwimp, Hugo Van hamme, Patrick Wambacq",2018/05/01,2018,TF-LM: TensorFlow-based Language Modeling Toolkit,7,,https://aclanthology.org/L18-1470.pdf,UNK,,,39,,,0.00,2.08E+06,#VALUE!,,2.08E+06,,,98.20,,0.0,1,0,0,Recurrent,LSTM,https://github.com/lverwimp/tf-lm,,,,0,,
Gopher (280B),"Jack W. Rae, Sebastian Borgeaud, Trevor Cai, Katie Millican, Jordan Hoffmann, Francis Song, John Aslanides, Sarah Henderson, Roman Ring, Susannah Young, Eliza Rutherford, Tom Hennigan, Jacob Menick, Albin Cassirer, Richard Powell, George van den Driessche, Lisa Anne Hendricks, Maribeth Rauh, Po-Sen Huang, Amelia Glaese, Johannes Welbl, Sumanth Dathathri, Saffron Huang, Jonathan Uesato, John Mellor, Irina Higgins, Antonia Creswell, Nat McAleese, Amy Wu, Erich Elsen, Siddhant Jayakumar, Elena Buchatskaya, David Budden, Esme Sutherland, Karen Simonyan, Michela Paganini, Laurent Sifre, Lena Martens, Xiang Lorraine Li, Adhiguna Kuncoro, Aida Nematzadeh, Elena Gribovskaya, Domenic Donato, Angeliki Lazaridou, Arthur Mensch, Jean-Baptiste Lespiau, Maria Tsimpoukelli, Nikolai Grigorev, Doug Fritz, Thibault Sottiaux, Mantas Pajarskas, Toby Pohlen, Zhitao Gong, Daniel Toyama, Cyprien de Masson d'Autume, Yujia Li, Tayfun Terzi, Vladimir Mikulik, Igor Babuschkin, Aidan Clark, Diego de Las Casas, Aurelia Guy, Chris Jones, James Bradbury, Matthew Johnson, Blake Hechtman, Laura Weidinger, Iason Gabriel, William Isaac, Ed Lockhart, Simon Osindero, Laura Rimell, Chris Dyer, Oriol Vinyals, Kareem Ayoub, Jeff Stanway, Lorrayne Bennett, Demis Hassabis, Koray Kavukcuoglu, Geoffrey Irving",2021/12/08,2021,"Scaling Language Models: Methods, Analysis & Insights from Training Gopher",441,,https://arxiv.org/abs/2112.11446,2.80E+11,,1.28E+22,1.00,,,0.00,3.00E+11,5.04E+23,,3.00E+11,WikiText-103,8.12,,,1.0,1,0,0,Transformer,GPT,,Facebook AI Research,Industry,,1,Own,32000
Transformer-C,"Simeng Sun, Mohit Iyyer",2021/04/08,2021,Revisiting Simple Neural Probabilistic Language Models,10,,https://arxiv.org/abs/2104.03474,1.48E+08,,,19.88,,,0.00,1.03E+08,1.82E+18,0.00E+00,1.03E+08,WikiText-103,25.10,,,0.0,1,0,0,Transformer,Transformer,https://github.com/SimengSun/revisit-nplm,,,,1,Word-level,267735
GRU + p-tHSM (pretrain via Brown) (WT103),"Nan Jiang, Wenge Rong, Min Gao, Yikang Shen, Zhang Xiong",2017/08/19,2017,Exploration of Tree-based Hierarchical Softmax for Recurrent Language Models,5,,https://www.researchgate.net/profile/Yikang-Shen-2/publication/318830618_Exploration_of_Tree-based_Hierarchical_Softmax_for_Recurrent_Language_Models/links/5b2c050aa6fdcc8506bc6f4a/Exploration-of-Tree-based-Hierarchical-Softmax-for-Recurrent-Language-Models.pdf,2.06E+08,,,UNK,,,0.00,1.03E+08,#VALUE!,,1.03E+08,WikiText-103,161.55,,,0.0,0,0,0,Recurrent,GRU,,DeepMind,Industry,,1,,
HSO,"Davis Yoshida, Kevin Gimpel",2021/12/16,2021,Reconsidering the Past: Optimizing Hidden States in Language Models,1,1,https://web.archive.org/web/20230220145200/https://arxiv.org/pdf/2112.08653.pdf,3.45E+08,4.10E+09,3.45E+20,UNK,,,0.00,4.00E+09,#VALUE!,1.03E+08,4.10E+09,,20.30,,,0.0,1,0,0,Transformer,GPT,,,,,1,GPT2Tokenizer,50257
RFA-GATE-Gaussian-Stateful Big,"Hao Peng, Nikolaos Pappas, Dani Yogatama, Roy Schwartz, Noah A. Smith, Lingpeng Kong",2021/03/03,2021,Random Feature Attention,200,,https://arxiv.org/abs/2103.02143,2.42E+08,,,47.72,,,0.00,1.03E+08,7.14E+18,0.00E+00,1.03E+08,WikiText-103,23.50,,,0.0,1,0,0,Transformer,Transformer-XL,,,,,1,,268000
(ensemble): AWD-LSTM-DOC (fin) × 5 (PTB),"Sho Takase, Jun Suzuki, Masaaki Nagata",2018/08/30,2018,Direct Output Connection for a High-Rank Language Model,36,,https://arxiv.org/abs/1808.10143,1.14E+08,,,300,,,0.00,9.29E+05,1.91E+17,,9.29E+05,Penn TreeBank,,,47.17,0.0,0,0,0,Recurrent,LSTM,https://github.com/nttcslab-nlp/doc_lm,"Ruhr University Bochum, Germany; University of London; Technische Universität Dresden, Dresden, Germany",Academia,,1,,
Adaptive Input Transformer + RD,"Xiaobo Liang, Lijun Wu, Juntao Li, Yue Wang, Qi Meng, Tao Qin, Wei Chen, Min Zhang, Tie-Yan Liu",2021/06/28,2021,R-Drop: Regularized Dropout for Neural Networks,245,,https://web.archive.org/web/20220518153557/https://arxiv.org/pdf/2106.14448.pdf,2.47E+08,1.03E+08,8.20E+19,UNK,,,0.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,,18.07,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/dropreg/R-Drop,,,,1,?,?
GPT-2-Medium+Pixelfly,"Tri Dao, Beidi Chen, Kaizhao Liang, Jiaming Yang, Zhao Song, Atri Rudra, Christopher Ré",2021/11/30,2021,Pixelated Butterfly: Simple and Efficient Sparse training for Neural Network Models,29,,https://arxiv.org/pdf/2112.00029,2.03E+08,,,100,,,0.00,1.03E+08,1.25E+19,0.00E+00,1.03E+08,,21.00,,,0.0,1,0,0,Transformer,GPT,https://github.com/HazyResearch/pixelfly,,,,1,GPT2Tokenizer,50257
GPT-2-Small+Pixelfly,"Tri Dao, Beidi Chen, Kaizhao Liang, Jiaming Yang, Zhao Song, Atri Rudra, Christopher Ré",2021/11/30,2021,Pixelated Butterfly: Simple and Efficient Sparse training for Neural Network Models,29,,https://arxiv.org/pdf/2112.00029,6.80E+07,,,100,,,0.00,1.03E+08,4.20E+18,0.00E+00,1.03E+08,,22.50,,,0.0,1,0,0,Transformer,GPT,https://github.com/HazyResearch/pixelfly,,,,1,GPT2Tokenizer,50257
PAR Transformer Large,"Swetha Mandava, Szymon Migacz, Alex Fit Florea",2020/09/09,2020,Pay Attention when Required,11,,https://arxiv.org/abs/2009.04534,,,,,,,0.00,1.03E+08,0.00E+00,0.00E+00,1.03E+08,WikiText-103,18.40,,,0.0,0,0,0,Transformer,Transformer-XL,,Carnegie Mellon University; University of California San Diego,Academia,,0,,
PermuteFormer,Peng Chen,2021/09/06,2021,PermuteFormer: Efficient Relative Position Encoding for Long Sequences,12,,https://arxiv.org/pdf/2109.02377,3.30E+07,,3.10E+18,30,,,0.00,1.03E+08,6.12E+17,0.00E+00,1.03E+08,,32.49,,,0.0,1,0,0,Transformer,Performer,https://github.com/cpcp1998/PermuteFormer,,,,1,Word-level,268000
VD-LSTM+REAL Medium,"Hakan Inan, Khashayar Khosravi, Richard Socher",2016/11/04,2016,Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling,397,,https://arxiv.org/abs/1611.01462,2.04E+07,,,,,,1.00,2.08E+06,0.00E+00,,2.08E+06,WikiText-2,,87.00,,0.0,0,0,0,Recurrent,LSTM,,,,,1,,
"MicroNet (Adaptive, Cache)","Zhongxia Yan, Hanrui Wang, Demi Guo, Song Han",2020/01/01,2020,MicroNet for Efficient Language Modeling,7,,http://proceedings.mlr.press/v123/yan20a.html?ref=https://githubhelp.com,8.30E+06,,,,,,0.00,1.03E+08,0.00E+00,0.00E+00,1.03E+08,WikiText-103,35.00,,,0.0,0,1,0,Transformer,Transformer-XL,,Google Brain; Carnegie Mellon University,Industry - Academia Collaboration,,1,,
Linear Transformer (large),"Imanol Schlag, Kazuki Irie, Jürgen Schmidhuber",2021/02/22,2021,Linear Transformers Are Secretly Fast Weight Programmers,78,,https://arxiv.org/pdf/2102.11174.pdf,9.00E+07,,,70,,,0.00,1.03E+08,3.89E+18,,1.03E+08,,31.50,,,0.0,1,0,0,Transformer,Linear Transformer,https://github.com/ischlag/fast-weight-transformers,,,,1,,268000
Linear Transformer (small),"Imanol Schlag, Kazuki Irie, Jürgen Schmidhuber",2021/02/22,2021,Linear Transformers Are Secretly Fast Weight Programmers,78,,https://arxiv.org/pdf/2102.11174.pdf,4.00E+07,,,120,,,0.00,1.03E+08,2.97E+18,,1.03E+08,,35.50,,,0.0,1,0,0,Transformer,Linear Transformer,https://github.com/ischlag/fast-weight-transformers,,,,1,,268000
RNS-RNN,"Brian DuSell, David Chiang",2021/09/05,2021,Learning Hierarchical Structures with Differentiable Nondeterministic Stacks,5,,https://arxiv.org/pdf/2109.01982,5.66E+06,,,100,,,0.00,9.29E+05,3.15E+15,,9.29E+05,,,,117.56,0.0,1,0,1,Recurrent,RNN,https://github.com/bdusell/nondeterministic-stack-rnn,,,,1,?,?
GPT-2 (fine-tuned with HYDRA),"Kabir Nagrecha, Arun Kumar",2021/10/16,2021,Hydra: A System for Large Multi-Model Deep Learning,4,0,https://arxiv.org/abs/2110.08633,1.54E+09,,,1,,,0.00,2.08E+06,1.92E+16,,2.08E+06,WikiText-2,,15.17,,0.0,1,0,0,Transformer,GPT,,,,,1,GPT2Tokenizer,50257
GPT-Neo-2.7B (finetuned on PTB),"Sid Black, Leo Gao, Phil Wang, Connor Leahy, Stella Biderman",2021/03/21,2021,GPT-Neo: Large Scale Autoregressive Language Modeling with Mesh-Tensorflow,NA,,https://github.com/EleutherAI/gpt-neo,2.70E+09,,,1,,,0.00,4.00E+11,6.48E+21,9.29E+05,4.00E+11,,,,14.70,0.0,1,0,0,Transformer,GPT-Neo,https://github.com/EleutherAI/gpt-neo,,,,1,,50257
GPT-Neo-2.7B (finetuned),"Sid Black, Leo Gao, Phil Wang, Connor Leahy, Stella Biderman",2021/03/21,2021,GPT-Neo: Large Scale Autoregressive Language Modeling with Mesh-Tensorflow,NA,,https://github.com/EleutherAI/gpt-neo,2.70E+09,,,1,,,0.00,4.00E+11,6.48E+21,2.08E+06,4.00E+11,,,10.78,,0.0,1,0,0,Transformer,GPT-Neo,https://github.com/EleutherAI/gpt-neo,,,,1,,50257
GPT-Neo-125M(finetuned),"Sid Black, Leo Gao, Phil Wang, Connor Leahy, Stella Biderman",2021/03/21,2021,GPT-Neo: Large Scale Autoregressive Language Modeling with Mesh-Tensorflow,NA,,https://github.com/EleutherAI/gpt-neo,1.25E+08,,,1,,,0.00,4.00E+11,3.00E+20,2.08E+06,4.00E+11,,,21.96,,0.0,1,0,0,Transformer,GPT-Neo,https://github.com/EleutherAI/gpt-neo,,,,1,,50257
GPT-Neo-125M,"Sid Black, Leo Gao, Phil Wang, Connor Leahy, Stella Biderman",2021/03/21,2021,GPT-Neo: Large Scale Autoregressive Language Modeling with Mesh-Tensorflow,NA,,https://github.com/EleutherAI/gpt-neo,1.25E+08,,,1,,,0.00,4.00E+11,3.00E+20,,4.00E+11,,,32.29,,1.0,1,0,0,Transformer,GPT-Neo,https://github.com/EleutherAI/gpt-neo,,,,1,,50257
GPT-Neo-2.7B,"Sid Black, Leo Gao, Phil Wang, Connor Leahy, Stella Biderman",2021/03/21,2021,GPT-Neo: Large Scale Autoregressive Language Modeling with Mesh-Tensorflow,NA,,https://github.com/EleutherAI/gpt-neo,2.70E+09,,,1,,,1.00,4.00E+11,6.48E+21,,4.00E+11,,,11.39,,1.0,1,0,0,Transformer,GPT-Neo,https://github.com/EleutherAI/gpt-neo,,,,1,,50257
GCNN-14,"Yann N. Dauphin, Angela Fan, Michael Auli, David Grangier",2016/12/23,2016,Language Modeling with Gated Convolutional Networks,2176,,https://arxiv.org/abs/1612.08083,UNK,,,35.00,,,0.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,WikiText-103,37.20,,,0.0,1,0,0,Recurrent,GCNN,,Stanford University; Salesforce Research,Industry - Academia Collaboration,,0,,
GPT-Neo-1.3B (finetuned),"Sid Black, Leo Gao, Phil Wang, Connor Leahy, Stella Biderman",2021/03/21,2021,GPT-Neo: Large Scale Autoregressive Language Modeling with Mesh-Tensorflow,NA,,https://github.com/EleutherAI/gpt-neo,2.70E+09,,,1,,,0.00,4.00E+11,6.48E+21,2.08E+06,4.00E+11,,,12.09,,0.0,1,0,0,Transformer,GPT-Neo,https://github.com/EleutherAI/gpt-neo,,,,1,,50257
GPT-J-6B,"Ben Wang, Aran Komatsuzaki",2021/06/09,2021,GPT-J-6B: 6B JAX-Based Transformer,NA,,https://huggingface.co/EleutherAI/gpt-j-6b,6.05E+09,,,1,,,1.00,4.02E+11,1.46E+22,,4.02E+11,,,10.88,,1.0,1,0,0,Transformer,GPT,https://github.com/kingoflolz/mesh-transformer-jax/,,,,1,,?
Delta RNN (+ full context),"Kazuki Irie, Imanol Schlag, Róbert Csordás, Jürgen Schmidhuber",2021/06/11,2021,Going Beyond Linear Transformers with Recurrent Fast Weight Programmers,42,,https://proceedings.neurips.cc/paper/2021/file/3f9e3767ef3b10a0de4c256d7ef9805d-Paper.pdf,4.46E+07,,,40,,,0.00,1.03E+08,1.10E+18,0.00E+00,1.03E+08,WikiText-103,32.80,,,0.0,1,0,0,Recurrent,RNN,https://github.com/IDSIA/recurrent-fwp,,,,1,,?
base LM+GNN+kNN,"Yuxian Meng, Shi Zong, Xiaoya Li, Xiaofei Sun, Tianwei Zhang, Fei Wu, Jiwei Li",2021/10/17,2021,GNN-LM: Language Modeling based on Global Contexts via GNN,22,,https://arxiv.org/abs/2110.08743,2.74E+08,,7.30E+18,,,,0.00,1.03E+08,0.00E+00,1.03E+08,2.06E+08,WikiText-103,16.80,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/ShannonAI/GNN-LM,,,,1,,260000
TransformerXL + PowerSGD + L-Greco,"Mohammadreza Alimohammadi, Ilia Markov, Elias Frantar, Dan Alistarh",2022/10/31,2022,L-GreCo: An Efficient and General Framework for Layerwise-Adaptive Gradient Compression,3,0,https://web.archive.org/web/20221101102609/https://arxiv.org/pdf/2210.17357.pdf,UNK,1.03E+08,4.14E+17,UNK,,,0.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,,24.08,,,0.0,0,0,0,Transformer,Transformer-XL,https://github.com/LGrCo/L-GreCo,DeepMind; University of Oxford,Industry - Academia Collaboration,,0,,
GLM-10B-bidirectional,"Zhengxiao Du, Yujie Qian, Xiao Liu, Ming Ding, Jiezhong Qiu, Zhilin Yang, Jie Tang",2021/03/18,2021,GLM: General Language Model Pretraining with Autoregressive Blank Infilling,131,,https://arxiv.org/abs/2103.10360,1.00E+10,,,1,,,0.00,6.32E+11,3.79E+22,,6.32E+11,,11.33,,,1.0,1,0,0,Transformer,GLM,https://github.com/THUDM/GLM,,,,1,BERT,30000
GLM-10B-unidirectional,"Zhengxiao Du, Yujie Qian, Xiao Liu, Ming Ding, Jiezhong Qiu, Zhilin Yang, Jie Tang",2021/03/18,2021,GLM: General Language Model Pretraining with Autoregressive Blank Infilling,131,,https://arxiv.org/abs/2103.10360,1.00E+10,,,1,,,0.00,6.32E+11,3.79E+22,,6.32E+11,,12.22,,,1.0,1,0,0,Transformer,GLM,https://github.com/THUDM/GLM,,,,1,BERT,30000
FNetAR Medium,"Tim Lou, Michael Park, Mohammad Ramezanali, Vincent Tang",2021/07/22,2021,FNetAR: Mixing Tokens with Autoregressive Fourier Transforms,2,0,https://arxiv.org/abs/2107.10932,3.43E+07,,,,,,0.00,1.03E+08,0.00E+00,0.00E+00,1.03E+08,WikiText-103,25.81,,,0.0,1,0,0,Transformer,Transformer-XL,,,,,1,Word-level,268000
FMMformer (2-kernel fast weight + Band20),"Tan M. Nguyen, Vai Suliafu, Stanley J. Osher, Long Chen, Bao Wang",2021/08/05,2021,FMMformer: Efficient and Flexible Transformer via Decomposed Near-field and Far-field Attention,15,,https://web.archive.org/web/20220803154831/https://proceedings.neurips.cc/paper/2021/file/f621585df244e9596dc70a39b579efb1-Paper.pdf,4.00E+07,1.03E+08,4.30E+17,UNK,,,0.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,,34.71,,,0.0,1,0,1,Transformer,FMMformer,,,,,1,?,?
S4,"Albert Gu, Karan Goel, Christopher Ré",2021/10/31,2021,Efficiently Modeling Long Sequences with Structured State Spaces,171,,https://arxiv.org/abs/2111.00396,2.49E+08,"8 A100 GPUs, 800k steps of training",,509.02,,,0.00,1.03E+08,7.83E+19,0.00E+00,1.03E+08,WikiText-103,20.95,,,0.0,1,0,0,State Space Model,S4,https://github.com/HazyResearch/state-spaces,,,,1,Word-level,268000
DiffQ Transformer (16L),"Alexandre Défossez, Yossi Adi, Gabriel Synnaeve",2021/04/20,2021,Differentiable Model Compression via Pseudo Quantization Noise,20,,https://arxiv.org/abs/2104.09987,2.57E+08,,,,,,0.00,1.03E+08,0.00E+00,,1.03E+08,WikiText-103,18.10,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/facebookresearch/diffq,,,,1,,?
T2R + Random Init,"Jungo Kasai, Hao Peng, Yizhe Zhang, Dani Yogatama, Gabriel Ilharco, Nikolaos Pappas, Yi Mao, Weizhu Chen, Noah A. Smith",2021/03/24,2021,Finetuning Pretrained Transformers into RNNs,30,,https://arxiv.org/abs/2103.13076,4.50E+08,,6.10E+19,205.48,,,1.00,1.03E+08,5.71E+19,,1.03E+08,,20.80,,,0.0,0,0,0,Transformer,ELU,https://github.com/jungokasai/T2R/,University of Washington; Allen Institute for AI,Industry - Academia Collaboration,,1,,
top-down frozen classifier,"Shucong Zhang, Cong-Thanh Do, Rama Doddipatla, Erfan Loweimi, Peter Bell, Steve Renals",2021/02/09,2021,Train your classifier first: Cascade Neural Networks Training from upper layers to lower layers,2,0,https://arxiv.org/pdf/2102.04697,UNK,,,UNK,,,0.00,,#VALUE!,,0.00E+00,,,65.20,,0.0,0,0,0,,,,,,,0,,
"GPT-2 (117M, SLW 110K)","Conglong Li, Minjia Zhang, Yuxiong He",2021/08/13,2021,Curriculum Learning: A Regularization Method for Efficient and Stable Billion-Scale GPT Model Pre-Training,20,,https://arxiv.org/abs/2108.06084,1.17E+08,,1.30E+20,1.10,,,0.00,1.57E+11,1.21E+20,,1.57E+11,Wikipedia; CC-Stories; RealNews; OpenWebtext,26.03,,,1.0,1,0,1,Transformer,GPT,,,,,1,?,?
"GPT-2 (1.5B, Curriculum Learning 45K)","Conglong Li, Minjia Zhang, Yuxiong He",2021/08/13,2021,Curriculum Learning: A Regularization Method for Efficient and Stable Billion-Scale GPT Model Pre-Training,20,,https://arxiv.org/abs/2108.06084,1.50E+09,,6.00E+20,2.20,,,0.00,1.57E+11,3.11E+21,,1.57E+11,Wikipedia; CC-Stories; RealNews; OpenWebtext,13.72,,,1.0,1,0,0,Transformer,GPT,,,,,1,?,?
Base LM + kNN LM + Continuous Cache,"Urvashi Khandelwal, Omer Levy, Dan Jurafsky, Luke Zettlemoyer, Mike Lewis",2019/11/01,2019,Generalization through Memorization: Nearest Neighbor Language Models,410,,https://arxiv.org/abs/1911.00172,2.47E+08,,7.30E+18,200.00,,,0.00,1.03E+08,3.05E+19,,1.03E+08,WikiText-103,15.79,,,0.0,0,1,0,Transformer,Transformer-XL,https://github.com/urvashik/knnlm,DeepMind; University of Oxford,,,1,,
GPT2+CoreLM+Fine-Tuning,"Nikolaos Stylianou, Ioannis Vlahavas",2021/11/04,2021,CoreLM: Coreference-aware Language Model Fine-Tuning,2,1,https://arxiv.org/pdf/2111.02687,1.32E+08,,,10,,,0.00,4.00E+06,3.17E+16,,4.00E+06,,29.51,31.80,,1.0,1,0,0,Transformer,GPT,,,,,1,GPT2 tokenizer,50257
CODA,"Lin Zheng, Zhiyong Wu, Lingpeng Kong",2021/05/31,2021,Cascaded Head-colliding Attention,2,1,https://arxiv.org/pdf/2105.14850,2.47E+08,,,,,,0.00,1.03E+08,0.00E+00,,1.03E+08,,18.48,,,0.0,1,0,0,Transformer,CODA,"https://github.com/LZhengisme/CODA, ",,,,1,Word-level,267744
Transformer-XL + AutoDropout (PTB),"Hieu Pham, Quoc V. Le",2021/01/05,2021,AutoDropout: Learning Dropout Patterns to Regularize Deep Networks,45,,https://arxiv.org/abs/2101.01761,2.40E+07,,,UNK,,,0.00,9.29E+05,#VALUE!,,9.29E+05,Penn TreeBank,,,54.90,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/google-research/googleresearch/tree/master/auto_dropout,,,,1,Word-level,10000
Transformer-XL + AutoDropout (WT2),"Hieu Pham, Quoc V. Le",2021/01/05,2021,AutoDropout: Learning Dropout Patterns to Regularize Deep Networks,45,,https://arxiv.org/abs/2101.01761,3.50E+07,,,UNK,,,0.00,2.08E+06,#VALUE!,,2.08E+06,WikiText-2,,59.90,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/google-research/googleresearch/tree/master/auto_dropout,,,,1,?,?
True-Regularization+Finetune,"Yangyang Shi, Mei-Yuh Hwang, Xin Lei, Haoyu Sheng",2019/04/08,2019,Knowledge Distillation For Recurrent Neural Network Language Modeling With Trust Regularization,24,,https://arxiv.org/pdf/1904.04163,7.00E+06,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,54.00,0.0,0,0,0,Recurrent,RNN,,,,,1,,
WeNet (PTB),"Zhiheng Huang, Bing Xiang",2019/04/08,2019,WeNet: Weighted Networks for Recurrent Network Architecture Search,5,,https://arxiv.org/pdf/1904.03819,2.30E+07,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,54.80,0.0,0,0,0,NAS,WeNet,,"Stanford University; SambaNova Systems; Peking University; Adobe; University at Buffalo, SUNY",Industry - Academia Collaboration,,1,,
GPT2-Large+LHOPT,"Diogo Almeida, Clemens Winter, Jie Tang, Wojciech Zaremba",2021/06/02,2021,A Generalizable Approach to Learning Optimizers,13,,https://web.archive.org/web/20221027150413/https://arxiv.org/pdf/2106.00958.pdf,7.60E+08,1.03E+08,1.60E+21,1,,,0.00,1.03E+08,4.70E+17,0.00E+00,1.03E+08,,32.50,,,0.0,1,0,1,Transformer,GPT,https://github.com/openai/LHOPT,,,,1,?,?
$\infty$-former (SM),"Pedro Henrique Martins, Zita Marinho, André F. T. Martins",2021/09/01,2021,$\infty$-former: Infinite Memory Transformer,31,,https://arxiv.org/abs/2109.00301,1.17E+08,,1.20E+20,1,,,0.00,4.00E+09,2.81E+18,2.55E+08,4.26E+09,WikiText-103,16.61,,,1.0,1,0,0,Transformer,GPT,https://github.com/deep-spin/infinite-former,,,,1,Word-level,268000
base LM+GNN+kNN,"Yuxian Meng, Shi Zong, Xiaoya Li, Xiaofei Sun, Tianwei Zhang, Fei Wu, Jiwei Li",2021/10/17,2021,GNN-LM: Language Modeling based on Global Contexts via GNN,22,,https://arxiv.org/abs/2110.08743,2.74E+08,,7.30E+18,,,,0.00,1.03E+08,0.00E+00,1.03E+08,2.06E+08,WikiText-103,14.80,,,0.0,0,0,0,Transformer,Transformer-XL,https://github.com/ShannonAI/GNN-LM,IBM Research,Industry,,1,,
NoPos,"Adi Haviv, Ori Ram, Ofir Press, Peter Izsak, Omer Levy",2022/03/30,2022,Transformer Language Models without Positional Encodings Still Learn Positional Information,30,,https://arxiv.org/abs/2203.16634,1.30E+09,,,199.92,,,0.00,1.03E+08,1.61E+20,0.00E+00,1.03E+08,The Pile (Subset),20.97,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/adihaviv/NoPos,,,,1,word-level,267000
Chinchilla,"Jordan Hoffmann, Sebastian Borgeaud, Arthur Mensch, Elena Buchatskaya, Trevor Cai, Eliza Rutherford, Diego de Las Casas, Lisa Anne Hendricks, Johannes Welbl, Aidan Clark, Tom Hennigan, Eric Noland, Katie Millican, George van den Driessche, Bogdan Damoc, Aurelia Guy, Simon Osindero, Karen Simonyan, Erich Elsen, Jack W. Rae, Oriol Vinyals, Laurent Sifre",2022/03/29,2022,Training Compute-Optimal Large Language Models,136,,https://arxiv.org/abs/2203.15556,7.00E+10,,5.76E+23,1,,,0.00,1.40E+12,5.88E+23,0.00E+00,1.40E+12,WikiText-103,7.16,,,1.0,1,0,0,Transformer,GPT,,,,,1,SentencePiece,32000
VRNS-RNN-3-3-5,"Brian DuSell, David Chiang",2022/10/04,2022,The Surprising Computational Power of Nondeterministic Stack RNNs,1,1,https://arxiv.org/pdf/2210.01343,1.50E+06,,,,,,1.00,9.29E+05,0.00E+00,,9.29E+05,,,,120.12,0.0,1,0,1,Recurrent,RNN,,,,,1,?,?
RETRO-7B,"Sebastian Borgeaud†, Arthur Mensch†, Jordan Hoffmann†, Trevor Cai, Eliza Rutherford, Katie Millican, George van den Driessche, Jean-Baptiste Lespiau, Bogdan Damoc, Aidan Clark, Diego de Las Casas, Aurelia Guy, Jacob Menick, Roman Ring, Tom Hennigan, Saffron Huang, Loren Maggiore, Chris Jones, Albin Cassirer, Andy Brock, Michela Paganini, Geoffrey Irving, Oriol Vinyals, Simon Osindero,Karen Simonyan, Jack W. Rae‡, Erich Elsen‡ and Laurent Sifre",2021/12/08,2021,Improving language models by retrieving from trillions of tokens,315,,https://arxiv.org/abs/2112.04426,7.50E+09,,7.50E+21,,,,0.00,2.00E+12,0.00E+00,,2.00E+12,MassiveText,21.53,,,1.0,0,0,0,Transformer,GPT,,OpenAI,Industry,,1,,
GPT3-6.7B + muP,"Greg Yang, Edward J. Hu, Igor Babuschkin, Szymon Sidor, Xiaodong Liu, David Farhi, Nick Ryder, Jakub Pachocki, Weizhu Chen, Jianfeng Gao",2022/03/07,2022,Tensor Programs V: Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer,37,,https://web.archive.org/web/20221014063419/https://arxiv.org/pdf/2203.03466.pdf,6.70E+09,4.99E+11,1.28E+22,1,,,0.00,4.99E+11,2.01E+22,1.03E+08,4.99E+11,,8.56,,,1.0,1,0,0,Transformer,GPT,https://github.com/microsoft/mup,,,,1,custom,?
Transformer-XL + RMT,"Aydar Bulatov, Yuri Kuratov, Mikhail S. Burtsev",2022/07/14,2022,Recurrent Memory Transformer,19,,https://web.archive.org/web/20220715153256/https://arxiv.org/pdf/2207.06881.pdf,2.47E+08,2.06E+08,,UNK,,,0.00,1.03E+08,#VALUE!,1.03E+08,2.06E+08,,23.99,,,0.0,1,0,0,Recurrent/Transformer,Transformer-XL,"https://github.com/booydar/transformer-xl, https://github.com/kimiyoung/transformer-xl, https://github.com/GokuMohandas/fast-weights/blob/539fb10e3c384d5f782af2560bf28631cd0eaa61/, https://github.com/kimiyoung/transformer-xl, ",,,,1,Word-level,267735
E-SPA,"Bobby He, James Martens, Guodong Zhang, Aleksandar Botev, Andrew Brock, Samuel L Smith, Yee Whye Teh",2023/02/20,2023,Deep Transformers without Shortcuts: Modifying Self-attention for Faithful Signal Propagation,3,1,https://arxiv.org/pdf/2302.10322.pdf,2.43E+08,,,UNK,,,0.00,1.03E+08,#VALUE!,,1.03E+08,,19.70,,,0.0,0,0,0,Transformer,Transformer,,,,,1,,
OPT-125M (finetuned on PTB),"Susan Zhang∗ , Stephen Roller∗ , Naman Goyal∗ , Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott† , Sam Shleifer† , Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, Luke Zettlemoyer",2022/06/21,2022,OPT: Open Pre-trained Transformer Language Models,656,,https://arxiv.org/pdf/2205.01068.pdf?fbclid=IwAR2zobF7YoESj0HLKULhilGRV-jvNXKr2bkW_b3MqPfFZ6rEyagDP654QFo,1.25E+08,,,1.666666667,,,0.00,1.80E+11,2.25E+20,9.29E+05,1.80E+11,,,,16.50,0.0,1,0,0,Transformer,OPT,,,,,1,GPT2Tokenizer,50257
Neural cache model (size=2000),"Edouard Grave, Armand Joulin, Nicolas Usunier",2016/12/13,2016,Improving Neural Language Models with a Continuous Cache,302,,https://arxiv.org/abs/1612.04426,UNK,,,,,,0.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,WikiText-103,,68.90,,0.0,1,1,0,Recurrent,LSTM,,,,,0,,
OPT-66B,"Susan Zhang∗ , Stephen Roller∗ , Naman Goyal∗ , Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott† , Sam Shleifer† , Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, Luke Zettlemoyer",2022/06/21,2022,OPT: Open Pre-trained Transformer Language Models,656,,https://arxiv.org/pdf/2205.01068.pdf?fbclid=IwAR2zobF7YoESj0HLKULhilGRV-jvNXKr2bkW_b3MqPfFZ6rEyagDP654QFo,6.60E+10,,,1.666666667,,,1.00,1.80E+11,1.19E+23,0.00E+00,1.80E+11,,,9.34,,1.0,1,0,0,Transformer,OPT,,,,,1,GPT2Tokenizer,50257
OPT-2.7B (finetuned on WT2),"Susan Zhang∗ , Stephen Roller∗ , Naman Goyal∗ , Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott† , Sam Shleifer† , Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, Luke Zettlemoyer",2022/06/21,2022,OPT: Open Pre-trained Transformer Language Models,656,,https://arxiv.org/pdf/2205.01068.pdf?fbclid=IwAR2zobF7YoESj0HLKULhilGRV-jvNXKr2bkW_b3MqPfFZ6rEyagDP654QFo,2.70E+09,,,1.666666667,,,0.00,1.80E+11,4.86E+21,2.08E+06,1.80E+11,,,10.27,,0.0,1,0,0,Transformer,OPT,,,,,1,GPT2Tokenizer,50257
OPT-6.7B,"Susan Zhang∗ , Stephen Roller∗ , Naman Goyal∗ , Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott† , Sam Shleifer† , Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, Luke Zettlemoyer",2022/06/21,2022,OPT: Open Pre-trained Transformer Language Models,656,,https://arxiv.org/pdf/2205.01068.pdf?fbclid=IwAR2zobF7YoESj0HLKULhilGRV-jvNXKr2bkW_b3MqPfFZ6rEyagDP654QFo,6.70E+09,,,1.666666667,,,0.00,1.80E+11,1.21E+22,0.00E+00,1.80E+11,,,10.86,,1.0,1,0,0,Transformer,OPT,,,,,1,GPT2Tokenizer,50257
OPT-2.7B (finetuned on PTB),"Susan Zhang∗ , Stephen Roller∗ , Naman Goyal∗ , Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott† , Sam Shleifer† , Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, Luke Zettlemoyer",2022/06/21,2022,OPT: Open Pre-trained Transformer Language Models,656,,https://arxiv.org/pdf/2205.01068.pdf?fbclid=IwAR2zobF7YoESj0HLKULhilGRV-jvNXKr2bkW_b3MqPfFZ6rEyagDP654QFo,2.70E+09,,,1.666666667,,,0.00,1.80E+11,4.86E+21,9.29E+05,1.80E+11,,,,10.80,0.0,1,0,0,Transformer,OPT,,,,,1,GPT-2 byte level BPE tokenizer,52000
OPT-1.3B (finetuned),"Susan Zhang∗ , Stephen Roller∗ , Naman Goyal∗ , Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott† , Sam Shleifer† , Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, Luke Zettlemoyer",2022/06/21,2022,OPT: Open Pre-trained Transformer Language Models,656,,https://arxiv.org/pdf/2205.01068.pdf?fbclid=IwAR2zobF7YoESj0HLKULhilGRV-jvNXKr2bkW_b3MqPfFZ6rEyagDP654QFo,1.30E+09,,,1.666666667,,,0.00,1.80E+11,2.34E+21,2.08E+06,1.80E+11,,,12.22,,0.0,1,0,0,Transformer,OPT,,,,,1,GPT-2 tokenizer,52000
OPT-2.7B,"Susan Zhang∗ , Stephen Roller∗ , Naman Goyal∗ , Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott† , Sam Shleifer† , Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, Luke Zettlemoyer",2022/06/21,2022,OPT: Open Pre-trained Transformer Language Models,656,,https://arxiv.org/pdf/2205.01068.pdf?fbclid=IwAR2zobF7YoESj0HLKULhilGRV-jvNXKr2bkW_b3MqPfFZ6rEyagDP654QFo,2.70E+09,,,1.666666667,,,1.00,1.80E+11,4.86E+21,0.00E+00,1.80E+11,,,12.47,,1.0,1,0,0,Transformer,OPT,,,,,1,GPT-2 tokenizer,52000
OPT-175B,"Susan Zhang∗ , Stephen Roller∗ , Naman Goyal∗ , Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott† , Sam Shleifer† , Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, Luke Zettlemoyer",2022/06/21,2022,OPT: Open Pre-trained Transformer Language Models,656,,https://arxiv.org/pdf/2205.01068.pdf?fbclid=IwAR2zobF7YoESj0HLKULhilGRV-jvNXKr2bkW_b3MqPfFZ6rEyagDP654QFo,1.75E+11,,4.16E+23,1.666666667,,,0.00,1.80E+11,3.15E+23,0.00E+00,1.80E+11,,,8.35,,1.0,1,0,0,Transformer,OPT,,,,,1,GPT-2 tokenizer,52000
OPT-13B,"Susan Zhang∗ , Stephen Roller∗ , Naman Goyal∗ , Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott† , Sam Shleifer† , Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, Luke Zettlemoyer",2022/06/21,2022,OPT: Open Pre-trained Transformer Language Models,656,,https://arxiv.org/pdf/2205.01068.pdf?fbclid=IwAR2zobF7YoESj0HLKULhilGRV-jvNXKr2bkW_b3MqPfFZ6rEyagDP654QFo,1.30E+10,,,1.666666667,,,1.00,1.80E+11,2.34E+22,0.00E+00,1.80E+11,,,10.13,,1.0,1,0,0,Transformer,OPT,,,,,1,GPT-2 tokenizer,52000
1-layer-LSTM,"H. T. Kung, Bradley McDanel, Sai Qian Zhang",2020/07/13,2020,Term Revealing: Furthering Quantization at Run Time on Quantized DNNs,9,,https://arxiv.org/pdf/2007.06389,8.65E+07,,,,,,0.00,2.08E+06,0.00E+00,,2.08E+06,,,86.85,,0.0,0,0,0,Recurrent,LSTM,,,,,1,,
OPT-125M (finetuned),"Susan Zhang∗ , Stephen Roller∗ , Naman Goyal∗ , Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott† , Sam Shleifer† , Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, Luke Zettlemoyer",2022/06/21,2022,OPT: Open Pre-trained Transformer Language Models,656,,https://arxiv.org/pdf/2205.01068.pdf?fbclid=IwAR2zobF7YoESj0HLKULhilGRV-jvNXKr2bkW_b3MqPfFZ6rEyagDP654QFo,1.25E+08,,,1.666666667,,,0.00,1.80E+11,2.25E+20,2.08E+06,1.80E+11,,,19.85,,0.0,1,0,0,Transformer,OPT,,,,,1,GPT-2 tokenizer,52000
WeNet (WT2),"Zhiheng Huang, Bing Xiang",2019/04/08,2019,WeNet: Weighted Networks for Recurrent Network Architecture Search,5,,https://arxiv.org/pdf/1904.03819,3.30E+07,,,,,,0.00,2.08E+06,0.00E+00,,2.08E+06,,,66.60,,0.0,0,0,0,NAS,WeNet,,,,,1,,
AFP+FPI (WT2),"Zhengxiong Wang, Anton Ragni",2021/06/04,2021,Approximate Fixed-Points in Recurrent Neural Networks,1,0,https://arxiv.org/pdf/2106.02417,8.02E+04,,,40,,,0.00,2.08E+06,4.00E+13,,2.08E+06,WikiText-2,,149.35,,0.0,0,0,0,Recurrent,AFP,,,,,1,,
OPT-1.3B,"Susan Zhang∗ , Stephen Roller∗ , Naman Goyal∗ , Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott† , Sam Shleifer† , Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, Luke Zettlemoyer",2022/06/21,2022,OPT: Open Pre-trained Transformer Language Models,656,,https://arxiv.org/pdf/2205.01068.pdf?fbclid=IwAR2zobF7YoESj0HLKULhilGRV-jvNXKr2bkW_b3MqPfFZ6rEyagDP654QFo,1.30E+09,,,1.666666667,,,0.00,1.80E+11,2.34E+21,,1.80E+11,,,16.41,,1.0,1,0,0,Transformer,OPT,,,,,1,GPT-2 tokenizer,52000
OPT-30B,"Susan Zhang∗ , Stephen Roller∗ , Naman Goyal∗ , Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott† , Sam Shleifer† , Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, Luke Zettlemoyer",2022/06/21,2022,OPT: Open Pre-trained Transformer Language Models,656,,https://arxiv.org/pdf/2205.01068.pdf?fbclid=IwAR2zobF7YoESj0HLKULhilGRV-jvNXKr2bkW_b3MqPfFZ6rEyagDP654QFo,3.00E+10,,,1.666666667,,,0.00,1.80E+11,5.40E+22,0.00E+00,1.80E+11,,,10.67,,1.0,1,0,0,Transformer,OPT,,,,,1,GPT-2 tokenizer,52000
OPT-350M,"Susan Zhang∗ , Stephen Roller∗ , Naman Goyal∗ , Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott† , Sam Shleifer† , Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, Luke Zettlemoyer",2022/06/21,2022,OPT: Open Pre-trained Transformer Language Models,656,,https://arxiv.org/pdf/2205.01068.pdf?fbclid=IwAR2zobF7YoESj0HLKULhilGRV-jvNXKr2bkW_b3MqPfFZ6rEyagDP654QFo,1.25E+08,,,1.666666667,,,0.00,1.80E+11,2.25E+20,,1.80E+11,,,25.42,,1.0,1,0,0,Transformer,OPT,,,,,1,GPT-2 tokenizer,52000
OPT-1.3B (finetuned on PTB),"Susan Zhang∗ , Stephen Roller∗ , Naman Goyal∗ , Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott† , Sam Shleifer† , Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, Luke Zettlemoyer",2022/06/21,2022,OPT: Open Pre-trained Transformer Language Models,656,,https://arxiv.org/pdf/2205.01068.pdf?fbclid=IwAR2zobF7YoESj0HLKULhilGRV-jvNXKr2bkW_b3MqPfFZ6rEyagDP654QFo,1.30E+09,,,1.666666667,,,0.00,1.80E+11,2.34E+21,9.29E+05,1.80E+11,,,,12.02,0.0,1,0,0,Transformer,OPT,,,,,1,GPT-2 byte level BPE tokenizer,52000
B2T connection (16L),"Sho Takase, Shun Kiyono, Sosuke Kobayashi, Jun Suzuki",2022/06/01,2022,On Layer Normalizations and Residual Connections in Transformers,4,1,https://web.archive.org/web/20220602013934/https://arxiv.org/pdf/2206.00330.pdf,2.47E+08,1.03E+08,2.80E+19,150,,,0.00,1.03E+08,2.29E+19,0.00E+00,1.03E+08,,19.20,,,0.0,1,0,0,Transformer,Transformer-XL,,,,,1,,300000
ADP-FAIRSEQ+NGRAMRES,"Huayang Li, Deng Cai, Jin Xu, Taro Watanabe",2022/10/26,2022,N-gram Is Back: Residual Learning of Neural Text Generation with n-gram Language Model,0,1,https://web.archive.org/web/20221027013457/https://arxiv.org/pdf/2210.14431.pdf,2.47E+08,2.02E+08,,UNK,,,0.00,1.03E+08,#VALUE!,1.01E+08,2.04E+08,,18.20,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/ghrua/NgramRes,,,,1,Word-level,260000
ADP-FAIRSEQ,"Huayang Li, Deng Cai, Jin Xu, Taro Watanabe",2022/10/26,2022,N-gram Is Back: Residual Learning of Neural Text Generation with n-gram Language Model,0,1,https://web.archive.org/web/20221027013457/https://arxiv.org/pdf/2210.14431.pdf,2.47E+08,1.01E+08,,UNK,,,0.00,1.03E+08,#VALUE!,,1.03E+08,,18.90,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/ghrua/NgramRes,,,,1,Word-level,260000
LSTM-MemoryAug (PTB),"Ke Li, Daniel Povey, Sanjeev Khudanpur",2020/09/29,2020,Neural Language Modeling With Implicit Cache Pointers,4,1,https://arxiv.org/pdf/2009.13774,1.33E+07,,,,,,1.00,9.29E+05,0.00E+00,,9.29E+05,,,,67.80,0.0,0,0,0,Recurrent,LSTM,,,,,1,,
ISS,"Wei Wen, Yuxiong He, Samyam Rajbhandari, Minjia Zhang, Wenhan Wang, Fang Liu, Bin Hu, Yiran Chen, Hai Li",2017/09/15,2017,Learning Intrinsic Sparse Structures within Long Short-Term Memory,146,,https://arxiv.org/pdf/1709.05027,1.11E+07,,,55,,,0.00,9.29E+05,3.40E+15,,9.29E+05,,,,65.40,0.0,0,0,0,Recurrent,LSTM,https://github.com/wenwei202/iss-rnns,National Tsing Hua University; Google,Industry - Academia Collaboration,,1,,
Alleviated TOI 10 (WT2),"Noémien Kocher, Christian Scuito, Lorenzo Tarantino, Alexandros Lazaridis, Andreas Fischer, Claudiu Musat",2019/09/18,2019,Alleviating Sequence Information Loss with Data Overlapping and Prime Batch Sizes,0,1,https://arxiv.org/abs/1909.08700,UNK,,,1000,,,1.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,WikiText-2,,64.73,,0.0,1,0,0,Recurrent,LSTM,https://github.com/nkcr/overlap-ml,,,,0,,
Monarch-GPT-2-Medium,"Tri Dao, Beidi Chen, Nimit Sohoni, Arjun Desai, Michael Poli, Jessica Grogan, Alexander Liu, Aniruddh Rao, Atri Rudra, Christopher Ré",2022/04/01,2022,Monarch: Expressive Structured Matrices for Efficient and Accurate Training,28,,https://arxiv.org/pdf/2204.00595,1.65E+08,,,110,,,0.00,4.00E+09,4.36E+20,,4.00E+09,,20.30,,,0.0,1,0,0,Transformer,GPT,https://github.com/HazyResearch/monarch,University of Texas at Austin,Academia,,1,?,?
Monarch-GPT-2-Small,"Tri Dao, Beidi Chen, Nimit Sohoni, Arjun Desai, Michael Poli, Jessica Grogan, Alexander Liu, Aniruddh Rao, Atri Rudra, Christopher Ré",2022/04/01,2022,Monarch: Expressive Structured Matrices for Efficient and Accurate Training,28,,https://arxiv.org/pdf/2204.00595,7.20E+07,,,110,,,0.00,4.00E+09,1.90E+20,,4.00E+09,,20.70,,,0.0,1,0,0,Transformer,GPT,https://github.com/HazyResearch/monarch,,,,1,?,?
TransformerXL + FWL,"Kevin Clark, Kelvin Guu, Ming-Wei Chang, Panupong Pasupat, Geoffrey Hinton, Mohammad Norouzi",2022/12/05,2022,Meta-Learning Fast Weight Language Models,2,1,https://web.archive.org/web/20221207113900/https://arxiv.org/pdf/2212.02475.pdf,2.57E+08,2.06E+08,,UNK,,,0.00,1.03E+08,#VALUE!,1.03E+08,2.06E+08,,16.60,,,0.0,1,0,0,Transformer,Transformer-XL,,,,,1,Word-level,260000
MemSizer,"Yizhe Zhang, Deng Cai",2022/03/23,2022,Linearizing Transformer with Key-Value Memory,0,0,https://web.archive.org/web/20220327055642/https://arxiv.org/pdf/2203.12644.pdf,3.57E+08,1.03E+08,7.30E+18,UNK,,,0.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,,20.80,,,0.0,1,0,0,Transformer,Transformer-XL,,,,,1,Word-level,268000
Stacked-LSTM+Pruning,"Liangjian Wen, Xuanyang Zhang, Haoli Bai, Zenglin Xu",2019/06/17,2019,Structured Pruning of Recurrent Neural Networks through Neuron Selection,34,,https://arxiv.org/pdf/1906.06847,6.16E+06,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,78.08,0.0,0,0,0,Recurrent,LSTM,,UC Berkeley,Academia,,1,,
DITTO,"Jin Xu, Xiaojiang Liu, Jianhao Yan, Deng Cai, Huayang Li, Jian Li",2022/06/06,2022,Learning to Break the Loop: Analyzing and Mitigating Repetitions for Neural Text Generation,14,,https://web.archive.org/web/20221011104229/https://arxiv.org/pdf/2206.02369.pdf,7.50E+08,1.03E+08,1.10E+19,7.16,,,0.00,1.03E+08,3.32E+18,0.00E+00,1.03E+08,,24.33,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/Jxu-Thu/DITTO,,,,1,?,?
LaMemo,"Haozhe Ji, Rongsheng Zhang, Zhenyu Yang, Zhipeng Hu, Minlie Huang",2022/04/15,2022,LaMemo: Language Modeling with Look-Ahead Memory,2,1,https://web.archive.org/web/20220418055451/https://arxiv.org/pdf/2204.07341.pdf,1.51E+08,1.03E+08,,79.53,,,0.00,1.03E+08,7.42E+18,0.00E+00,1.03E+08,,23.77,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/thu-coai/LaMemo,,,,1,,260000
Hybrid H3-355M,"Daniel Y. Fu, Tri Dao, Khaled K. Saab, Armin W. Thomas, Atri Rudra, Christopher Ré",2022/12/28,2022,Hungry Hungry Hippos: Towards Language Modeling with State Space Models,27,,https://arxiv.org/pdf/2212.14052,3.55E+08,,,509.02,,,0.00,1.03E+08,1.12E+20,,1.03E+08,,16.90,,,0.0,1,0,0,State Space Model,H3,https://github.com/HazyResearch/H3,,,,1,GPT 2 tokenizer,50257
Hybrid H3-125M,"Daniel Y. Fu, Tri Dao, Khaled K. Saab, Armin W. Thomas, Atri Rudra, Christopher Ré",2022/12/28,2022,Hungry Hungry Hippos: Towards Language Modeling with State Space Models,27,,https://arxiv.org/pdf/2212.14052,1.25E+08,,,509.02,,,0.00,1.03E+08,3.93E+19,,1.03E+08,,23.70,,,0.0,1,0,0,State Space Model,H3,https://github.com/HazyResearch/H3,,,,1,GPT 2 tokenizer,50257
mini-GPT-2+Active-AdamW,"Davood Wadi, Marc Fredette, Sylvain Senecal",2023/01/24,2023,Read the Signs Towards Invariance to Gradient Descent’s Hyperparameter Initialization,0,0,https://arxiv.org/pdf/2301.10133.pdf,2.98E+06,,,200,,,0.00,1.03E+08,3.68E+17,,1.03E+08,,9.50,,,0.0,0,0,0,Transformer,GPT,,Facebook AI Research,Industry,,1,,
Hybrid H3-2.7B,"Daniel Y. Fu, Tri Dao, Khaled K. Saab, Armin W. Thomas, Atri Rudra, Christopher Ré",2022/12/28,2022,Hungry Hungry Hippos: Towards Language Modeling with State Space Models,27,,https://arxiv.org/pdf/2212.14052,2.70E+09,,,509.02,,,0.00,1.03E+08,8.49E+20,,1.03E+08,,10.60,,,0.0,1,0,0,State Space Model,H3,https://github.com/HazyResearch/H3,,,,1,GPT 2 tokenizer,50257
CD-GraB (WT103),"A. Feder Cooper, Wentao Guo, Khiem Pham, Tiancheng Yuan, Charlie F. Ruan, Yucheng Lu, Christopher De Sa",2023/02/02,2023,CD-GraB: Coordinating Distributed Example Orders for Provably Accelerated Training,0,0,https://arxiv.org/pdf/2302.00845.pdf,UNK,,,30,,,0.00,1.03E+08,#VALUE!,,1.03E+08,WikiText-103,66.11,,,0.0,0,0,0,Transformer,GPT,,Facebook AI Research,Industry,,0,,
Hybrid H3-1.3B,"Daniel Y. Fu, Tri Dao, Khaled K. Saab, Armin W. Thomas, Atri Rudra, Christopher Ré",2022/12/28,2022,Hungry Hungry Hippos: Towards Language Modeling with State Space Models,27,,https://arxiv.org/pdf/2212.14052,1.30E+09,,,509.02,,,0.00,1.03E+08,4.09E+20,,1.03E+08,,12.50,,,0.0,1,0,0,State Space Model,H3,https://github.com/HazyResearch/H3,,,,1,GPT 2 tokenizer,50257
Compress-LSTM (66M),"Artem M. Grachev, Dmitry I. Ignatov, Andrey V. Savchenko",2019/02/06,2019,Compression of Recurrent Neural Networks for Efficient Language Modeling,37,,"https://arxiv.org/abs/1902.02380#:~:text=Compression%20of%20Recurrent%20Neural%20Networks%20for%20Efficient%20Language%20Modeling,-Artem%20M.&text=Recurrent%20neural%20networks%20have%20proved,real%2Dtime%20offline%20mobile%20applications.",6.60E+07,,,90,,,0.00,9.29E+05,3.31E+16,,9.29E+05,,,,78.29,0.0,0,0,0,Recurrent,LSTM,,UC Berkeley,Academia,,1,,
T2R + Pretrain,"Jungo Kasai, Hao Peng, Yizhe Zhang, Dani Yogatama, Gabriel Ilharco, Nikolaos Pappas, Yi Mao, Weizhu Chen, Noah A. Smith",2021/03/24,2021,Finetuning Pretrained Transformers into RNNs,30,,https://arxiv.org/abs/2103.13076,4.50E+08,,2.03E+19,34.47,,,1.00,1.03E+08,9.59E+18,1.03E+08,2.06E+08,WikiText-103,19.60,,,0.0,0,0,0,Transformer,ELU,https://github.com/jungokasai/T2R/,University of Washington; Facebook AI Research; Allen Institute for AI,Industry - Academia Collaboration,,1,,
GPT-NeoX-20B,"Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach",2022/04/14,2022,GPT-NeoX-20B: An Open-Source Autoregressive Language Model,235,,https://arxiv.org/abs/2204.06745,2.00E+10,,,1,,,1.00,4.73E+11,5.67E+22,,4.73E+11,,,9.20,,1.0,1,0,0,Transformer,GPT,https://github.com/EleutherAI/gpt-neox,,,,1,Pretrained GPT2 tokenizer,50257
GRU + p-tHSM (pretrain via Brown) (WT2),"Nan Jiang, Wenge Rong, Min Gao, Yikang Shen, Zhang Xiong",2017/08/19,2017,Exploration of Tree-based Hierarchical Softmax for Recurrent Language Models,5,,https://www.researchgate.net/profile/Yikang-Shen-2/publication/318830618_Exploration_of_Tree-based_Hierarchical_Softmax_for_Recurrent_Language_Models/links/5b2c050aa6fdcc8506bc6f4a/Exploration-of-Tree-based-Hierarchical-Softmax-for-Recurrent-Language-Models.pdf,2.06E+08,,,,,,0.00,2.00E+06,0.00E+00,,2.00E+06,1.50E+06,,189.58,,0.0,0,0,0,Recurrent,GRU,,,,,1,,
Decaying Fast Weights Transformer,Huanru Henry Mao,2022/10/09,2022,Fine-Tuning Pre-trained Transformers into Decaying Fast Weights,0,1,https://arxiv.org/pdf/2210.04243.pdf,2.42E+08,,1.30E+19,192.12,,,0.00,1.03E+08,2.87E+19,1.03E+08,2.06E+08,,20.50,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/jenni-ai/T2FW,,,,1,?,?
Transformer + GFM,"Hao Yu, Jianxin Wu",2022/12/01,2022,"Compressing Transformers: Features Are Low-Rank, but Weights Are Not",0,1,https://cs.nju.edu.cn/wujx/paper/AAAI2023_AFM.pdf,1.85E+08,1.03E+08,8.04E+18,UNK,,,0.00,1.03E+08,#VALUE!,,1.03E+08,,20.05,,,0.0,1,0,0,Transformer,Transformer-XL,,,,,1,Word-level,260000
Mogrifier RLSTM (PTB),Gábor Melis,2022/11/03,2022,Circling Back to Recurrent Models of Language,0,1,https://arxiv.org/pdf/2211.01848,2.40E+07,,,400,,,0.00,9.29E+05,5.35E+16,,9.29E+05,,,,42.90,0.0,1,0,0,Recurrent,LSTM,,,,,1,Word-level,10000
TransformerXL-LayerFusion-CA,"James O' Neill, Greg Ver Steeg, Aram Galstyan",2020/07/29,2020,Compressing Deep Neural Networks via Layer Fusion,5,,https://arxiv.org/pdf/2007.14917,UNK,,,,,,0.00,1.03E+08,#VALUE!,2.08E+06,1.05E+08,,,11.13,,0.0,0,0,0,Transformer,Transformer-XL,,,,,0,,
Mogrifier RLSTM (WT2),Gábor Melis,2022/11/03,2022,Circling Back to Recurrent Models of Language,0,1,https://arxiv.org/pdf/2211.01848,3.50E+07,,,250,,,0.00,2.08E+06,1.09E+17,,2.08E+06,,,38.00,,1.0,1,0,0,Recurrent,LSTM,,,,,1,Word-level,33000
Scatterbrain,"Beidi Chen, Tri Dao, Eric Winsor, Zhao Song, Atri Rudra, Christopher Ré",2021/10/28,2021,Scatterbrain: Unifying Sparse and Low-rank Attention Approximation,8,,https://web.archive.org/web/20220808053741/https://arxiv.org/pdf/2110.15343.pdf,UNK,1.03E+08,,30,,,0.00,1.03E+08,#VALUE!,0.00E+00,1.03E+08,,26.72,,,0.0,1,0,0,Transformer,Transformer,https://github.com/HazyResearch/scatterbrain,Salesforce Research,Industry,,0,,
BERT-Large-CAS (WT2),"Chenguang Wang, Mu Li, Alexander J. Smola",2019/04/20,2019,Language Models with Transformers,110,,https://arxiv.org/abs/1904.09408,3.95E+08,,,50,,,0.00,4.40E+09,5.21E+20,2.08E+06,4.40E+09,WikiText-2,,34.11,,0.0,0,0,0,Transformer,BERT,https://github.com/cgraywang/gluon-nlp-1/tree/lmtransformer/scripts/language_model,,,,1,,
CryptoGRU,"Bo Feng, Qian Lou, Lei Jiang, Geoffrey C. Fox",2020/10/22,2020,CryptoGRU: Low Latency Privacy-Preserving Text Analysis With GRU,12,,https://arxiv.org/pdf/2010.11796,,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,,0.0,0,0,0,Recurrent,GRU,,,,,0,,
BLOOM-1.7B,"Margaret Mitchell, Giada Pistilli, Yacine Jernite, Ezinwanne Ozoani, Marissa Gerchick, Nazneen Rajani, Sasha Luccioni, Irene Solaiman, Maraim Masoud, Somaieh Nikpoor, Carlos Muñoz Ferrandis, Stas Bekman, Christopher Akiki, Danish Contractor, David Lansky, Angelina McMillan-Major, Tristan Thrush, Suzana Ilić, Gérard Dupont, Shayne Longpre, Manan Dey, Stella Biderman, Douwe Kiela, Emi Baylor, Teven Le Scao, Aaron Gokaslan, Julien Launay, Niklas Muennighoff",2022/07/05,2022,BigScience Language Open-science Open-access Multilingual (BLOOM) Language Model,404,,"https://huggingface.co/bigscience/bloom-3b#:~:text=Started%2011th%20March%2C%202022%2011,Ended%205th%20July%2C%202022",1.72E+09,,,1,,,0.00,3.50E+11,3.62E+21,,3.50E+11,,,20.17,,0.5,1,0,0,Transformer,Megatron-LM GPT2,,,,,1,BLOOM tokenizer,250684
BLOOM-1B,"Margaret Mitchell, Giada Pistilli, Yacine Jernite, Ezinwanne Ozoani, Marissa Gerchick, Nazneen Rajani, Sasha Luccioni, Irene Solaiman, Maraim Masoud, Somaieh Nikpoor, Carlos Muñoz Ferrandis, Stas Bekman, Christopher Akiki, Danish Contractor, David Lansky, Angelina McMillan-Major, Tristan Thrush, Suzana Ilić, Gérard Dupont, Shayne Longpre, Manan Dey, Stella Biderman, Douwe Kiela, Emi Baylor, Teven Le Scao, Aaron Gokaslan, Julien Launay, Niklas Muennighoff",2022/07/05,2022,BigScience Language Open-science Open-access Multilingual (BLOOM) Language Model,404,,"https://huggingface.co/bigscience/bloom-3b#:~:text=Started%2011th%20March%2C%202022%2011,Ended%205th%20July%2C%202022",1.07E+09,,,1,,,0.00,3.50E+11,2.24E+21,,3.50E+11,,,23.70,,0.5,1,0,0,Transformer,Megatron-LM GPT2,,,,,1,BLOOM tokenizer,250684
BLOOM-560M,"Margaret Mitchell, Giada Pistilli, Yacine Jernite, Ezinwanne Ozoani, Marissa Gerchick, Nazneen Rajani, Sasha Luccioni, Irene Solaiman, Maraim Masoud, Somaieh Nikpoor, Carlos Muñoz Ferrandis, Stas Bekman, Christopher Akiki, Danish Contractor, David Lansky, Angelina McMillan-Major, Tristan Thrush, Suzana Ilić, Gérard Dupont, Shayne Longpre, Manan Dey, Stella Biderman, Douwe Kiela, Emi Baylor, Teven Le Scao, Aaron Gokaslan, Julien Launay, Niklas Muennighoff",2022/07/05,2022,BigScience Language Open-science Open-access Multilingual (BLOOM) Language Model,404,,"https://huggingface.co/bigscience/bloom-3b#:~:text=Started%2011th%20March%2C%202022%2011,Ended%205th%20July%2C%202022",5.60E+08,,,1,,,0.00,3.50E+11,1.18E+21,,3.50E+11,,,30.05,,0.5,1,0,0,Transformer,Megatron-LM GPT2,,,,,1,BLOOM tokenizer,250684
BLOOM-3B,"Margaret Mitchell, Giada Pistilli, Yacine Jernite, Ezinwanne Ozoani, Marissa Gerchick, Nazneen Rajani, Sasha Luccioni, Irene Solaiman, Maraim Masoud, Somaieh Nikpoor, Carlos Muñoz Ferrandis, Stas Bekman, Christopher Akiki, Danish Contractor, David Lansky, Angelina McMillan-Major, Tristan Thrush, Suzana Ilić, Gérard Dupont, Shayne Longpre, Manan Dey, Stella Biderman, Douwe Kiela, Emi Baylor, Teven Le Scao, Aaron Gokaslan, Julien Launay, Niklas Muennighoff",2022/07/05,2022,BigScience Language Open-science Open-access Multilingual (BLOOM) Language Model,404,,"https://huggingface.co/bigscience/bloom-3b#:~:text=Started%2011th%20March%2C%202022%2011,Ended%205th%20July%2C%202022",3.00E+09,,,1,,,0.00,3.50E+11,6.30E+21,,3.50E+11,,,17.57,,0.5,1,0,0,Transformer,Megatron-LM GPT2,,,,,1,BLOOM tokenizer,250684
Engin-XL(NE),"Zhongping Zhang, Yiwen Gu, Bryan A. Plummer",2021/12/11,2021,Show and Write: Entity-aware Article Generation with Image Information,0,0.5,https://arxiv.org/pdf/2112.05917,1.50E+09,,,3,,,0.00,,0.00E+00,,0.00E+00,,,16.30,,0.0,1,0,0,,,,,,,0,,
H-LSTM+wg+rcp+rcg+wp,"Hongxu Yin, Guoyang Chen, Yingmin Li, Shuai Che, Weifeng Zhang, Niraj K. Jha",2019/01/30,2019,"Hardware-Guided Symbiotic Training for Compact, Accurate, yet Execution-Efficient LSTM",10,,https://arxiv.org/pdf/1901.10997,8.00E+05,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,72.10,0.0,0,0,0,Recurrent,LSTM,,University of Liverpool; USC Information Sciences Institute,Academia,,1,,
BLOOM-7.1B,"Margaret Mitchell, Giada Pistilli, Yacine Jernite, Ezinwanne Ozoani, Marissa Gerchick, Nazneen Rajani, Sasha Luccioni, Irene Solaiman, Maraim Masoud, Somaieh Nikpoor, Carlos Muñoz Ferrandis, Stas Bekman, Christopher Akiki, Danish Contractor, David Lansky, Angelina McMillan-Major, Tristan Thrush, Suzana Ilić, Gérard Dupont, Shayne Longpre, Manan Dey, Stella Biderman, Douwe Kiela, Emi Baylor, Teven Le Scao, Aaron Gokaslan, Julien Launay, Niklas Muennighoff",2022/07/05,2022,BigScience Language Open-science Open-access Multilingual (BLOOM) Language Model,404,,"https://huggingface.co/bigscience/bloom-3b#:~:text=Started%2011th%20March%2C%202022%2011,Ended%205th%20July%2C%202022",7.07E+09,,,1,,,0.00,3.50E+11,1.48E+22,,3.50E+11,,,14.72,,0.5,1,0,0,Transformer,Megatron-LM GPT2,,,,,1,BLOOM tokenizer,250684
"Segatron-XL large, M=384 + HCP","He Bai, Tong Wang, Alessandro Sordoni, Peng Shi",2022/03/21,2022,Better Language Model with Hypernym Class Prediction,3,1,https://arxiv.org/abs/2203.10692,2.57E+08,,,167.02,,,0.00,1.03E+08,2.65E+19,,1.03E+08,WikiText-103,17.00,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/richardbaihe/robustLM,,,,1,,48935
N-gram,"Tomas Mikolov, Armand Joulin, Sumit Chopra, Michael Mathieu, Marc'Aurelio Ranzato",2014/12/24,2014,Learning Longer Memory in Recurrent Neural Networks,306,,https://arxiv.org/abs/1412.7753,UNK,,,UNK,,,0.00,9.29E+05,#VALUE!,,9.29E+05,,,,141.20,0.0,1,0,0,N-gram,N-gram,http://github.com/facebook/SCRNNs,,,,0,,
Transformer Large + HCP,"He Bai, Tong Wang, Alessandro Sordoni, Peng Shi",2022/03/21,2022,Better Language Model with Hypernym Class Prediction,3,1,https://arxiv.org/abs/2203.10692,2.57E+08,,,38.18,,,0.00,1.03E+08,6.06E+18,,1.03E+08,WikiText-103,25.30,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/richardbaihe/robustLM,,,,1,,48935
"Segatron -XL base, M=150 + HCP","He Bai, Tong Wang, Alessandro Sordoni, Peng Shi",2022/03/21,2022,Better Language Model with Hypernym Class Prediction,3,1,https://arxiv.org/abs/2203.10692,1.51E+08,,,18.64,,,0.00,1.03E+08,1.74E+18,,1.03E+08,WikiText-103,22.10,,,0.0,1,0,0,Transformer,Transformer-XL,https://github.com/richardbaihe/robustLM,,,,1,,48935
Transformer LM + MinSen,"Junhao Xu, Shoukang Hu, Jianwei Yu, Xunying Liu, Helen Meng",2021/11/29,2021,Mixed Precision of Quantization of Transformer Language Models for Speech Recognition,9,,https://arxiv.org/pdf/2112.11540,,,,,,,0.00,9.29E+05,0.00E+00,,9.29E+05,,,,56.82,0.0,0,0,0,,,,The University of Hong Kong; Sun Yat-sen University,Academia,,0,,
N-gram+Cache,"Tomas Mikolov, Armand Joulin, Sumit Chopra, Michael Mathieu, Marc'Aurelio Ranzato",2014/12/24,2014,Learning Longer Memory in Recurrent Neural Networks,306,,https://arxiv.org/abs/1412.7753,UNK,,,UNK,,,0.00,9.29E+05,#VALUE!,,9.29E+05,,,,125.00,0.0,1,1,0,N-gram,N-gram,http://github.com/facebook/SCRNNs,,,,0,,
NMST+GPT-2,"Eugene Choi, Cheolhyoung Lee, Kyunghyun Cho",2022/10/03,2022,A Non-monotonic Self-terminating Language Model,0,1,https://web.archive.org/web/20230220171748/https://arxiv.org/pdf/2210.00660.pdf,1.24E+08,4.10E+09,1.20E+20,2.98,,,0.00,4.00E+09,8.87E+18,1.03E+08,4.10E+09,,20.69,,,0.0,1,0,0,Transformer,GPT,https://github.com/nyu-dl/non-monotonic-self-terminating-lm,,,,1,Word-level,260000
Sparse Wide GPT-3 Small,"Shreyas Saxena, Vithursan Thangarasa, Abhay Gupta, Sean Lie",2023/03/21,2023,Sparse Iso-FLOP Transformations for Maximizing Training Efficiency,0,0,https://arxiv.org/pdf/2303.11525.pdf,1.30E+09,,,110,,,0.00,1.03E+08,8.84E+19,,1.03E+08,,20.40,,,0.0,1,0,0,Transformer,GPT,https://github.com/CerebrasResearch/Sparse-IFT,,,,1,?,?
SparseOPT-175B,"Elias Frantar, Dan Alistarh",2023/01/02,2023,SparseGPT: Massive Language Models Can Be Accurately Pruned in One-Shot,8,,https://arxiv.org/abs/2301.00774,8.75E+10,,,1.666666667,,,1.00,1.80E+11,1.58E+23,0.00E+00,1.80E+11,,8.21,,,1.0,0,0,0,Transformer,OPT,https://github.com/IST-DASLab/sparsegpt,,,,1,,
GPT-2+Active-SGD,"Davood Wadi, Marc Fredette, Sylvain Senecal",2023/01/24,2023,Read the Signs Towards Invariance to Gradient Descent’s Hyperparameter Initialization,0,0,https://arxiv.org/pdf/2301.10133.pdf,1.24E+08,,,200,,,0.00,2.08E+06,3.10E+17,,2.08E+06,,,20.59,,0.0,1,0,0,Transformer,GPT,,,,,1,,50257
Memformer (4 encoder + 16 decoder),"Qingyang Wu, Zhenzhong Lan, Kun Qian, Jing Gu, Alborz Geramifard, Zhou Yu",2020/10/14,2020,Memformer: A Memory-Augmented Transformer for Sequence Modeling,5,,https://arxiv.org/abs/2010.06891,7.62E+07,,1.20E+19,11.93,,,0.00,1.03E+08,5.62E+17,0.00E+00,1.03E+08,WikiText-103,22.74,,,0.0,0,0,0,Transformer,Memformer,,Microsoft,Industry,,1,,
Adaptive Inputs + LayerDrop,"Angela Fan, Edouard Grave, Armand Joulin",2019/09/25,2019,Reducing Transformer Depth on Demand with Structured Dropout,435,,https://arxiv.org/abs/1909.11556,4.23E+08,,,,,,0.00,1.03E+08,0.00E+00,1.03E+08,2.06E+08,WikiText-103,17.70,,,0.0,0,0,0,Transformer,Transformer-XL,,,,,1,,
Pythia-12b,"Stella Biderman, Hailey Schoelkopf, Quentin Anthony, Herbie Bradley, Kyle O'Brien, Eric Hallahan, Mohammad Aflah Khan, Shivanshu Purohit, USVSN Sai Prashanth, Edward Raff, Aviya Skowron, Lintang Sutawika, Oskar van der Wal",2023/04/03,2023,Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling,73,,https://arxiv.org/abs/2304.01373,1.20E+10,,,1,,,0.00,3.00E+11,2.16E+22,,3.00E+11,,,10.54,,0.5,1,0,0,Transformer,Pythia,https://github.com/EleutherAI/pythia,,,,1,https://arxiv.org/abs/2204.06745,50257
SPN-4+KN5,"W. Cheng, Stanley Kok, Hoai Vu Pham, Hai Leong Chieu, K. M. A. Chai",2014/01/01,2014,Language modeling with sum-product networks,102,,https://spn.cs.washington.edu/papers/is14.pdf,5.00E+06,,4.40E+16,UNK,,,0.00,1.01E+06,#VALUE!,,1.01E+06,Penn TreeBank,,,80.60,0.0,0,0,0,,,https://github.com/stakok/lmspn,,,,1,,
Pythia-6.9b,"Stella Biderman, Hailey Schoelkopf, Quentin Anthony, Herbie Bradley, Kyle O'Brien, Eric Hallahan, Mohammad Aflah Khan, Shivanshu Purohit, USVSN Sai Prashanth, Edward Raff, Aviya Skowron, Lintang Sutawika, Oskar van der Wal",2023/04/03,2023,Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling,73,,https://arxiv.org/abs/2304.01373,6.90E+09,,,1,,,0.00,3.00E+11,1.24E+22,,3.00E+11,,,11.41,,0.5,1,0,0,Transformer,Pythia,https://github.com/EleutherAI/pythia,,,,1,https://arxiv.org/abs/2204.06745,50257
Pythia-160m,"Stella Biderman, Hailey Schoelkopf, Quentin Anthony, Herbie Bradley, Kyle O'Brien, Eric Hallahan, Mohammad Aflah Khan, Shivanshu Purohit, USVSN Sai Prashanth, Edward Raff, Aviya Skowron, Lintang Sutawika, Oskar van der Wal",2023/04/03,2023,Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling,73,,https://arxiv.org/abs/2304.01373,1.60E+08,,,1,,,0.00,3.00E+11,2.88E+20,,3.00E+11,,,33.43,,0.5,1,0,0,Transformer,Pythia,https://github.com/EleutherAI/pythia,,,,1,https://arxiv.org/abs/2204.06745,50257
Pythia-1b,"Stella Biderman, Hailey Schoelkopf, Quentin Anthony, Herbie Bradley, Kyle O'Brien, Eric Hallahan, Mohammad Aflah Khan, Shivanshu Purohit, USVSN Sai Prashanth, Edward Raff, Aviya Skowron, Lintang Sutawika, Oskar van der Wal",2023/04/03,2023,Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling,73,,https://arxiv.org/abs/2304.01373,1.00E+09,,,1,,,0.00,3.00E+11,1.80E+21,,3.00E+11,,,16.45,,0.5,1,0,0,Transformer,Pythia,https://github.com/EleutherAI/pythia,,,,1,https://arxiv.org/abs/2204.06745,50257
FAIRSEQ Adaptive Inputs,"Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli",2019/04/01,2019,"fairseq: A Fast, Extensible Toolkit for Sequence Modeling",2428,,https://arxiv.org/abs/1904.01038,2.47E+08,,7.30E+18,,,,0.00,1.03E+08,0.00E+00,0.00E+00,1.03E+08,WikiText-103,18.70,,,0.0,0,0,0,Transformer,Transformer-XL,https://github.com/facebookresearch/fairseq,Amazon Web Services,Industry,,1,,
Pythia-1.4b,"Stella Biderman, Hailey Schoelkopf, Quentin Anthony, Herbie Bradley, Kyle O'Brien, Eric Hallahan, Mohammad Aflah Khan, Shivanshu Purohit, USVSN Sai Prashanth, Edward Raff, Aviya Skowron, Lintang Sutawika, Oskar van der Wal",2023/04/03,2023,Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling,73,,https://arxiv.org/abs/2304.01373,1.40E+09,,,1,,,0.00,3.00E+11,2.52E+21,,3.00E+11,,,14.72,,0.5,1,0,0,Transformer,Pythia,https://github.com/EleutherAI/pythia,,,,1,https://arxiv.org/abs/2204.06745,50257
Pythia-410m,"Stella Biderman, Hailey Schoelkopf, Quentin Anthony, Herbie Bradley, Kyle O'Brien, Eric Hallahan, Mohammad Aflah Khan, Shivanshu Purohit, USVSN Sai Prashanth, Edward Raff, Aviya Skowron, Lintang Sutawika, Oskar van der Wal",2023/04/03,2023,Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling,73,,https://arxiv.org/abs/2304.01373,4.10E+08,,,1,,,0.00,3.00E+11,7.38E+20,,3.00E+11,,,20.11,,0.5,1,0,0,Transformer,Pythia,https://github.com/EleutherAI/pythia,,,,1,https://arxiv.org/abs/2204.06745,50257
TPM-LVD,"Anji Liu, Honghua Zhang, Guy Van den Broeck",2022/10/10,2022,Scaling up Probabilistic Circuits by Latent Variable Distillation,7,,https://arxiv.org/pdf/2210.04398.pdf,1.12E+09,,,,,,0.00,2.08E+06,0.00E+00,,2.08E+06,,,197.50,,0.0,0,0,0,,,,,,,1,,
Pythia-2.8b,"Stella Biderman, Hailey Schoelkopf, Quentin Anthony, Herbie Bradley, Kyle O'Brien, Eric Hallahan, Mohammad Aflah Khan, Shivanshu Purohit, USVSN Sai Prashanth, Edward Raff, Aviya Skowron, Lintang Sutawika, Oskar van der Wal",2023/04/03,2023,Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling,73,,https://arxiv.org/abs/2304.01373,2.80E+09,,,1,,,0.00,3.00E+11,5.04E+21,,3.00E+11,,,12.69,,0.5,1,0,0,Transformer,Pythia,https://github.com/EleutherAI/pythia,,,,1,https://arxiv.org/abs/2204.06745,50257
WD+LR+M,"Ross M. Clarke, Elre T. Oldewage, José Miguel Hernández-Lobato",2021/10/20,2021,Scalable One-Pass Optimisation of High-Dimensional Weight-Update Hyperparameters by Implicit Differentiation,5,,https://arxiv.org/pdf/2110.10461,UNK,,,72,,,1.00,9.29E+05,#VALUE!,,9.29E+05,,,,100.00,0.0,1,0,1,Recurrent,LSTM,https://github.com/rmclarke/OptimisingWeightUpdateHyperparameters,University of Edinburgh; Toyota Technological Institute at Chicago,Industry - Academia Collaboration,,0,,
LLaMA-33B (LoRA finetuned),"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample",2023/05/23,2023,LLaMA: Open and Efficient Foundation Language Models,0,,https://arxiv.org/pdf/2305.14152.pdf,3.30E+10,,,1.09,,,0.00,1.40E+12,3.02E+23,9.29E+05,1.40E+12,,,,7.68,0.0,1,0,0,Transformer,LLaMa,https://github.com/facebookresearch/llama,,,,1,,32000
LLaMA-13B (LoRA finetuned),"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample",2023/05/23,2023,LLaMA: Open and Efficient Foundation Language Models,0,,https://arxiv.org/pdf/2305.14152.pdf,1.30E+10,,,1.09,,,0.00,1.00E+12,8.50E+22,9.29E+05,1.00E+12,,,5.54,8.64,0.0,1,0,0,Transformer,LLaMa,https://github.com/facebookresearch/llama,,,,1,,32000
SparseOPT-13B,"Elias Frantar, Dan Alistarh",2023/01/02,2023,SparseGPT: Massive Language Models Can Be Accurately Pruned in One-Shot,8,,https://arxiv.org/abs/2301.00774,6.50E+09,,,1.666666667,,,1.00,1.80E+11,1.17E+22,0.00E+00,1.80E+11,,11.17,,,1.0,0,0,0,Transformer,OPT,https://github.com/IST-DASLab/sparsegpt,,,,1,,
LLaMA-33B,"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample",2023/02/27,2023,LLaMA: Open and Efficient Foundation Language Models,963,,https://arxiv.org/abs/2302.13971,3.30E+10,,,1.09,,,0.00,1.40E+12,3.02E+23,0.00E+00,1.40E+12,,,6.90,,0.5,1,0,0,Transformer,LLaMa,https://github.com/facebookresearch/llama,,,,1,bytepair encoding (BPE) algorithm,32000
LLaMA-13B,"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample",2023/02/27,2023,LLaMA: Open and Efficient Foundation Language Models,963,,https://arxiv.org/abs/2302.13971,1.30E+10,,,1.09,,,1.00,1.00E+12,8.50E+22,0.00E+00,1.00E+12,,,13.99,,0.5,1,0,0,Transformer,LLaMa,https://github.com/facebookresearch/llama,,,,1,,32000
LLaMA-7B,"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample",2023/02/27,2023,LLaMA: Open and Efficient Foundation Language Models,963,,https://arxiv.org/abs/2302.13971,7.00E+09,,,1.09,,,1.00,1.00E+12,4.58E+22,0.00E+00,1.00E+12,,,9.49,,0.5,1,0,0,Transformer,LLaMa,https://github.com/facebookresearch/llama,,,,1,,32000
LLaMA-65B (LoRA finetuned),"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample",2023/05/23,2023,LLaMA: Open and Efficient Foundation Language Models,0,,https://arxiv.org/pdf/2305.14152.pdf,6.52E+10,,,1.09,,,0.00,1.40E+12,5.97E+23,2.08E+06,1.40E+12,,,4.27,,0.0,1,0,0,Transformer,LLaMa,https://github.com/facebookresearch/llama,,,,1,Probably Llama default tokenizer,32000
LLaMA-65B,"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample",2023/02/27,2023,LLaMA: Open and Efficient Foundation Language Models,963,,https://arxiv.org/abs/2302.13971,6.52E+10,,,1.09,,,0.00,1.40E+12,5.97E+23,0.00E+00,1.40E+12,,,4.96,,0.5,1,0,0,Transformer,LLaMa,https://github.com/facebookresearch/llama,,,,1,,32000
LLaMA-7B (LoRA finetuned),"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample",2023/05/23,2023,LLaMA: Open and Efficient Foundation Language Models,0,,https://arxiv.org/pdf/2305.14152.pdf,7.00E+09,,,1.09,,,0.00,1.00E+12,4.58E+22,9.29E+05,1.00E+12,,,6.19,9.69,0.0,1,0,0,Transformer,LLaMa,https://github.com/facebookresearch/llama,,,,1,Probably Llama default tokenizer,32000
MPT-7B,MosaicML NLP Team,2023/05/05,2023,"Introducing MPT-7B: A New Standard for Open-Source, Commercially Usable LLMs",NA,,https://www.mosaicml.com/blog/mpt-7b,7.00E+09,,,1,,,0.00,1.00E+12,4.20E+22,,1.00E+12,,,9.96,,0.5,1,0,0,Transformer,MPT,,,,,1,https://arxiv.org/abs/2204.06745,50432
Hyena-3-slim,"Michael Poli, Stefano Massaroli, Eric Nguyen, Daniel Y. Fu, Tri Dao, Stephen Baccus, Yoshua Bengio, Stefano Ermon, Christopher Ré",2023/02/21,2023,Hyena Hierarchy: Towards Larger Convolutional Language Models,21,,https://arxiv.org/pdf/2302.10866,1.25E+08,,,UNK,,,0.00,1.03E+08,#VALUE!,,1.03E+08,,18.50,,,0.0,1,0,0,Transformer,Hyena,,,,,1,GPT2Tokenizer,50257
TransformerXL+RelationLM,"Qi Liu, Dani Yogatama, Phil Blunsom",2022/01/24,2022,Relational Memory-Augmented Language Models,21,,https://arxiv.org/pdf/2201.09680,1.24E+08,,3.20E+21,,,,0.00,1.03E+08,0.00E+00,,1.03E+08,WikiText-103,18.60,,,0.0,0,0,0,Transformer,Transformer-XL,,Tianjin University; Microsoft Research; Beijing Institute of Technology,Industry - Academia Collaboration,,1,,