
# Compare whitened attention variants

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator

#########################################################################################################
# Attention + RoPE no Whitening 100K (n_embd = 256, 1.62 M parameters)
#########################################################################################################
'''
In [1]: %run standard_self_attention.py
Using: cuda
Hyper-parameters
================
n_embd = 256
n_head = 2
n_layer = 2
block_size = 256
dropout_percentage = 0.5
static = True
learning_rate = 0.0003
batch_size = 256
max_iters = 100000
eval_interval = 1000
estimate_loss_iters = 1000
ffn_fanout = 4
n_ffn = 1
random_seed = 42

Total length: 13.83 M characters
Data Source + Info
==================
Source data: DATA/dickens.txt
Total length: 13.827989 M characters
Vocab size: 93
Entropy: 4.5

Model Size
==========
1.621089 M parameters

Total iterations: 100000
iter 1000 | train_loss: 1.7506, val_loss: 1.7413 | elapsed = 00:00:36, next = 21:38:08, remain = 01:00:17
iter 2000 | train_loss: 1.6208, val_loss: 1.6246 | elapsed = 00:01:10, next = 21:38:41, remain = 00:57:34
iter 3000 | train_loss: 1.5659, val_loss: 1.5765 | elapsed = 00:01:44, next = 21:39:14, remain = 00:56:18
iter 4000 | train_loss: 1.5272, val_loss: 1.5442 | elapsed = 00:02:18, next = 21:39:48, remain = 00:55:23
iter 5000 | train_loss: 1.5031, val_loss: 1.5230 | elapsed = 00:02:52, next = 21:40:22, remain = 00:54:36
iter 6000 | train_loss: 1.4876, val_loss: 1.5104 | elapsed = 00:03:26, next = 21:40:56, remain = 00:53:54
iter 7000 | train_loss: 1.4757, val_loss: 1.5010 | elapsed = 00:04:00, next = 21:41:30, remain = 00:53:15
iter 8000 | train_loss: 1.4644, val_loss: 1.4907 | elapsed = 00:04:34, next = 21:42:04, remain = 00:52:37
iter 9000 | train_loss: 1.4549, val_loss: 1.4835 | elapsed = 00:05:08, next = 21:42:38, remain = 00:52:00
iter 10000 | train_loss: 1.4479, val_loss: 1.4788 | elapsed = 00:05:42, next = 21:43:12, remain = 00:51:23
iter 11000 | train_loss: 1.4412, val_loss: 1.4730 | elapsed = 00:06:16, next = 21:43:46, remain = 00:50:47
iter 12000 | train_loss: 1.4349, val_loss: 1.4674 | elapsed = 00:06:51, next = 21:44:20, remain = 00:50:14
iter 13000 | train_loss: 1.4281, val_loss: 1.4626 | elapsed = 00:07:25, next = 21:44:54, remain = 00:49:38
iter 14000 | train_loss: 1.4254, val_loss: 1.4612 | elapsed = 00:07:59, next = 21:45:28, remain = 00:49:02
iter 15000 | train_loss: 1.4217, val_loss: 1.4583 | elapsed = 00:08:33, next = 21:46:02, remain = 00:48:28
iter 16000 | train_loss: 1.4177, val_loss: 1.4544 | elapsed = 00:09:07, next = 21:46:36, remain = 00:47:52
iter 17000 | train_loss: 1.4126, val_loss: 1.4471 | elapsed = 00:09:41, next = 21:47:10, remain = 00:47:18
iter 18000 | train_loss: 1.4108, val_loss: 1.4473 | elapsed = 00:10:15, next = 21:47:44, remain = 00:46:43
iter 19000 | train_loss: 1.4072, val_loss: 1.4424 | elapsed = 00:10:49, next = 21:48:18, remain = 00:46:08
iter 20000 | train_loss: 1.4060, val_loss: 1.4421 | elapsed = 00:11:23, next = 21:48:53, remain = 00:45:33
iter 21000 | train_loss: 1.4026, val_loss: 1.4380 | elapsed = 00:11:57, next = 21:49:27, remain = 00:44:59
iter 22000 | train_loss: 1.4001, val_loss: 1.4391 | elapsed = 00:12:31, next = 21:50:01, remain = 00:44:24
iter 23000 | train_loss: 1.3988, val_loss: 1.4382 | elapsed = 00:13:05, next = 21:50:35, remain = 00:43:50
iter 24000 | train_loss: 1.3966, val_loss: 1.4346 | elapsed = 00:13:39, next = 21:51:09, remain = 00:43:15
iter 25000 | train_loss: 1.3934, val_loss: 1.4304 | elapsed = 00:14:13, next = 21:51:43, remain = 00:42:41
iter 26000 | train_loss: 1.3902, val_loss: 1.4289 | elapsed = 00:14:47, next = 21:52:17, remain = 00:42:07
iter 27000 | train_loss: 1.3889, val_loss: 1.4291 | elapsed = 00:15:22, next = 21:52:51, remain = 00:41:32
iter 28000 | train_loss: 1.3882, val_loss: 1.4284 | elapsed = 00:15:56, next = 21:53:25, remain = 00:40:59
iter 29000 | train_loss: 1.3875, val_loss: 1.4263 | elapsed = 00:16:30, next = 21:53:59, remain = 00:40:24
iter 30000 | train_loss: 1.3839, val_loss: 1.4242 | elapsed = 00:17:04, next = 21:54:33, remain = 00:39:50
iter 31000 | train_loss: 1.3834, val_loss: 1.4250 | elapsed = 00:17:38, next = 21:55:08, remain = 00:39:15
iter 32000 | train_loss: 1.3815, val_loss: 1.4219 | elapsed = 00:18:12, next = 21:55:42, remain = 00:38:41
iter 33000 | train_loss: 1.3810, val_loss: 1.4222 | elapsed = 00:18:46, next = 21:56:16, remain = 00:38:07
iter 34000 | train_loss: 1.3795, val_loss: 1.4194 | elapsed = 00:19:20, next = 21:56:50, remain = 00:37:33
iter 35000 | train_loss: 1.3755, val_loss: 1.4159 | elapsed = 00:19:54, next = 21:57:24, remain = 00:36:58
iter 36000 | train_loss: 1.3764, val_loss: 1.4180 | elapsed = 00:20:28, next = 21:57:58, remain = 00:36:24
iter 37000 | train_loss: 1.3767, val_loss: 1.4190 | elapsed = 00:21:02, next = 21:58:32, remain = 00:35:50
iter 38000 | train_loss: 1.3737, val_loss: 1.4139 | elapsed = 00:21:36, next = 21:59:06, remain = 00:35:15
iter 39000 | train_loss: 1.3717, val_loss: 1.4121 | elapsed = 00:22:10, next = 21:59:40, remain = 00:34:41
iter 40000 | train_loss: 1.3730, val_loss: 1.4138 | elapsed = 00:22:45, next = 22:00:14, remain = 00:34:07
iter 41000 | train_loss: 1.3704, val_loss: 1.4111 | elapsed = 00:23:19, next = 22:00:48, remain = 00:33:33
iter 42000 | train_loss: 1.3693, val_loss: 1.4104 | elapsed = 00:23:53, next = 22:01:22, remain = 00:32:59
iter 43000 | train_loss: 1.3698, val_loss: 1.4127 | elapsed = 00:24:27, next = 22:01:56, remain = 00:32:24
iter 44000 | train_loss: 1.3697, val_loss: 1.4106 | elapsed = 00:25:01, next = 22:02:30, remain = 00:31:50
iter 45000 | train_loss: 1.3696, val_loss: 1.4108 | elapsed = 00:25:35, next = 22:03:04, remain = 00:31:16
iter 46000 | train_loss: 1.3672, val_loss: 1.4076 | elapsed = 00:26:09, next = 22:03:38, remain = 00:30:42
iter 47000 | train_loss: 1.3651, val_loss: 1.4090 | elapsed = 00:26:43, next = 22:04:13, remain = 00:30:08
iter 48000 | train_loss: 1.3653, val_loss: 1.4079 | elapsed = 00:27:17, next = 22:04:47, remain = 00:29:34
iter 49000 | train_loss: 1.3671, val_loss: 1.4099 | elapsed = 00:27:51, next = 22:05:21, remain = 00:29:00
iter 50000 | train_loss: 1.3649, val_loss: 1.4073 | elapsed = 00:28:26, next = 22:05:55, remain = 00:28:26
iter 51000 | train_loss: 1.3623, val_loss: 1.4054 | elapsed = 00:29:00, next = 22:06:29, remain = 00:27:52
iter 52000 | train_loss: 1.3637, val_loss: 1.4061 | elapsed = 00:29:34, next = 22:07:03, remain = 00:27:17
iter 53000 | train_loss: 1.3630, val_loss: 1.4072 | elapsed = 00:30:08, next = 22:07:38, remain = 00:26:43
iter 54000 | train_loss: 1.3600, val_loss: 1.4029 | elapsed = 00:30:42, next = 22:08:12, remain = 00:26:09
iter 55000 | train_loss: 1.3614, val_loss: 1.4051 | elapsed = 00:31:16, next = 22:08:46, remain = 00:25:35
iter 56000 | train_loss: 1.3592, val_loss: 1.4022 | elapsed = 00:31:50, next = 22:09:20, remain = 00:25:01
iter 57000 | train_loss: 1.3589, val_loss: 1.4033 | elapsed = 00:32:24, next = 22:09:54, remain = 00:24:27
iter 58000 | train_loss: 1.3597, val_loss: 1.4031 | elapsed = 00:32:59, next = 22:10:28, remain = 00:23:53
iter 59000 | train_loss: 1.3577, val_loss: 1.4020 | elapsed = 00:33:33, next = 22:11:02, remain = 00:23:18
iter 60000 | train_loss: 1.3594, val_loss: 1.4012 | elapsed = 00:34:07, next = 22:11:36, remain = 00:22:44
iter 61000 | train_loss: 1.3557, val_loss: 1.3984 | elapsed = 00:34:41, next = 22:12:10, remain = 00:22:10
iter 62000 | train_loss: 1.3591, val_loss: 1.4035 | elapsed = 00:35:15, next = 22:12:44, remain = 00:21:36
iter 63000 | train_loss: 1.3587, val_loss: 1.4039 | elapsed = 00:35:49, next = 22:13:18, remain = 00:21:02
iter 64000 | train_loss: 1.3564, val_loss: 1.4001 | elapsed = 00:36:23, next = 22:13:53, remain = 00:20:28
iter 65000 | train_loss: 1.3569, val_loss: 1.4012 | elapsed = 00:36:57, next = 22:14:27, remain = 00:19:54
iter 66000 | train_loss: 1.3556, val_loss: 1.4008 | elapsed = 00:37:31, next = 22:15:01, remain = 00:19:19
iter 67000 | train_loss: 1.3549, val_loss: 1.3990 | elapsed = 00:38:05, next = 22:15:35, remain = 00:18:45
iter 68000 | train_loss: 1.3540, val_loss: 1.3984 | elapsed = 00:38:39, next = 22:16:09, remain = 00:18:11
iter 69000 | train_loss: 1.3551, val_loss: 1.3995 | elapsed = 00:39:14, next = 22:16:43, remain = 00:17:37
iter 70000 | train_loss: 1.3524, val_loss: 1.3973 | elapsed = 00:39:48, next = 22:17:17, remain = 00:17:03
iter 71000 | train_loss: 1.3532, val_loss: 1.3982 | elapsed = 00:40:22, next = 22:17:51, remain = 00:16:29
iter 72000 | train_loss: 1.3511, val_loss: 1.3964 | elapsed = 00:40:56, next = 22:18:25, remain = 00:15:55
iter 73000 | train_loss: 1.3542, val_loss: 1.3984 | elapsed = 00:41:30, next = 22:19:00, remain = 00:15:21
iter 74000 | train_loss: 1.3494, val_loss: 1.3949 | elapsed = 00:42:04, next = 22:19:34, remain = 00:14:47
iter 75000 | train_loss: 1.3514, val_loss: 1.3958 | elapsed = 00:42:38, next = 22:20:08, remain = 00:14:12
iter 76000 | train_loss: 1.3483, val_loss: 1.3934 | elapsed = 00:43:12, next = 22:20:42, remain = 00:13:38
iter 77000 | train_loss: 1.3492, val_loss: 1.3949 | elapsed = 00:43:46, next = 22:21:16, remain = 00:13:04
iter 78000 | train_loss: 1.3501, val_loss: 1.3947 | elapsed = 00:44:21, next = 22:21:50, remain = 00:12:30
iter 79000 | train_loss: 1.3516, val_loss: 1.3946 | elapsed = 00:44:55, next = 22:22:24, remain = 00:11:56
iter 80000 | train_loss: 1.3493, val_loss: 1.3926 | elapsed = 00:45:29, next = 22:22:58, remain = 00:11:22
iter 81000 | train_loss: 1.3482, val_loss: 1.3929 | elapsed = 00:46:03, next = 22:23:32, remain = 00:10:48
iter 82000 | train_loss: 1.3481, val_loss: 1.3936 | elapsed = 00:46:37, next = 22:24:06, remain = 00:10:14
iter 83000 | train_loss: 1.3484, val_loss: 1.3935 | elapsed = 00:47:11, next = 22:24:40, remain = 00:09:39
iter 84000 | train_loss: 1.3501, val_loss: 1.3945 | elapsed = 00:47:45, next = 22:25:15, remain = 00:09:05
iter 85000 | train_loss: 1.3477, val_loss: 1.3932 | elapsed = 00:48:19, next = 22:25:49, remain = 00:08:31
iter 86000 | train_loss: 1.3466, val_loss: 1.3918 | elapsed = 00:48:53, next = 22:26:23, remain = 00:07:57
iter 87000 | train_loss: 1.3480, val_loss: 1.3922 | elapsed = 00:49:27, next = 22:26:57, remain = 00:07:23
iter 88000 | train_loss: 1.3459, val_loss: 1.3926 | elapsed = 00:50:01, next = 22:27:31, remain = 00:06:49
iter 89000 | train_loss: 1.3486, val_loss: 1.3939 | elapsed = 00:50:36, next = 22:28:05, remain = 00:06:15
iter 90000 | train_loss: 1.3461, val_loss: 1.3923 | elapsed = 00:51:10, next = 22:28:39, remain = 00:05:41
iter 91000 | train_loss: 1.3460, val_loss: 1.3914 | elapsed = 00:51:44, next = 22:29:13, remain = 00:05:07
iter 92000 | train_loss: 1.3455, val_loss: 1.3907 | elapsed = 00:52:18, next = 22:29:47, remain = 00:04:32
iter 93000 | train_loss: 1.3441, val_loss: 1.3902 | elapsed = 00:52:52, next = 22:30:22, remain = 00:03:58
iter 94000 | train_loss: 1.3468, val_loss: 1.3921 | elapsed = 00:53:26, next = 22:30:56, remain = 00:03:24
iter 95000 | train_loss: 1.3441, val_loss: 1.3900 | elapsed = 00:54:00, next = 22:31:30, remain = 00:02:50
iter 96000 | train_loss: 1.3435, val_loss: 1.3898 | elapsed = 00:54:34, next = 22:32:04, remain = 00:02:16
iter 97000 | train_loss: 1.3440, val_loss: 1.3891 | elapsed = 00:55:09, next = 22:32:38, remain = 00:01:42
iter 98000 | train_loss: 1.3426, val_loss: 1.3878 | elapsed = 00:55:43, next = 22:33:12, remain = 00:01:08
iter 99000 | train_loss: 1.3444, val_loss: 1.3912 | elapsed = 00:56:17, next = 22:33:46, remain = 00:00:34
iter 100000 | train_loss: 1.3423, val_loss: 1.3895 | elapsed = 00:56:51, next = 22:34:20, remain = 00:00:00

--------------------------------------------------------------------------------
PROMPT:


OUTPUT:
 
wantacity. If you know, I can disch I was bows a sound that passed,
but I am been the very drawn, I am not it should mistaken that I canted
you? I shouldn’t have no word known her been before the door in the latter’s
cheers so do!”

The streflected to have put been for word requiring over to our parts of a raise
slanless with a hundrink one with steps in such a showing or-benefully
being into a
privation which the standays the lady fetch was a the close of magistrance as word the
yestering of the further that Gabellanter from hands mind of oction who too much brought her
arms. The woman’s larging off pointion for the best of dat to to be in the
dispose: that the first-bye; or Miss Peech; who had got all, directly
much
looked at all genius? That is, he don’t it. Every gentleman rence uncommonstranage pretty,
not the stage buriedly creased a very variously—but the great. I
with the perception with the stranger that he made at the young relapped
at the pocket. Old numbered of their secret
'''
# Validation-loss curve for the run logged in the string above.
# Result is a (100, 2) float array: column 0 = training iteration,
# column 1 = val_loss.  Evaluations happen every 1000 iterations
# (eval_interval), so the iteration column is generated from the position
# of each loss value instead of being spelled out row by row.
rope_no_whitening_100k = np.array([
    [1000 * checkpoint, val_loss]
    for checkpoint, val_loss in enumerate(
        (
            # iters 1k-10k
            1.7413, 1.6246, 1.5765, 1.5442, 1.5230, 1.5104, 1.5010, 1.4907, 1.4835, 1.4788,
            # iters 11k-20k
            1.4730, 1.4674, 1.4626, 1.4612, 1.4583, 1.4544, 1.4471, 1.4473, 1.4424, 1.4421,
            # iters 21k-30k
            1.4380, 1.4391, 1.4382, 1.4346, 1.4304, 1.4289, 1.4291, 1.4284, 1.4263, 1.4242,
            # iters 31k-40k
            1.4250, 1.4219, 1.4222, 1.4194, 1.4159, 1.4180, 1.4190, 1.4139, 1.4121, 1.4138,
            # iters 41k-50k
            1.4111, 1.4104, 1.4127, 1.4106, 1.4108, 1.4076, 1.4090, 1.4079, 1.4099, 1.4073,
            # iters 51k-60k
            1.4054, 1.4061, 1.4072, 1.4029, 1.4051, 1.4022, 1.4033, 1.4031, 1.4020, 1.4012,
            # iters 61k-70k
            1.3984, 1.4035, 1.4039, 1.4001, 1.4012, 1.4008, 1.3990, 1.3984, 1.3995, 1.3973,
            # iters 71k-80k
            1.3982, 1.3964, 1.3984, 1.3949, 1.3958, 1.3934, 1.3949, 1.3947, 1.3946, 1.3926,
            # iters 81k-90k
            1.3929, 1.3936, 1.3935, 1.3945, 1.3932, 1.3918, 1.3922, 1.3926, 1.3939, 1.3923,
            # iters 91k-100k
            1.3914, 1.3907, 1.3902, 1.3921, 1.3900, 1.3898, 1.3891, 1.3878, 1.3912, 1.3895,
        ),
        start=1,
    )
])


#########################################################################################################
# Attention + RoPE no Whitening 100K -- Equivalent Capacity (n_embd = 276, 1.88 M parameters)
#########################################################################################################
'''
In [1]: %run standard_self_attention.py
Using: cuda
Hyper-parameters
================
n_embd = 276
n_head = 2
n_layer = 2
block_size = 256
dropout_percentage = 0.5
static = True
learning_rate = 0.0003
batch_size = 256
max_iters = 100000
eval_interval = 1000
estimate_loss_iters = 1000
ffn_fanout = 4
n_ffn = 1
random_seed = 42

Total length: 13.83 M characters
Data Source + Info
==================
Source data: DATA/dickens.txt
Total length: 13.827989 M characters
Vocab size: 93
Entropy: 4.5

Model Size
==========
1.880209 M parameters

Total iterations: 100000
iter 1000 | train_loss: 1.7236, val_loss: 1.7158 | elapsed = 00:01:02, next = 11:39:04, remain = 01:42:41
iter 2000 | train_loss: 1.6002, val_loss: 1.6045 | elapsed = 00:01:44, next = 11:39:37, remain = 01:25:39
iter 3000 | train_loss: 1.5450, val_loss: 1.5556 | elapsed = 00:02:27, next = 11:40:15, remain = 01:19:14
iter 4000 | train_loss: 1.5071, val_loss: 1.5265 | elapsed = 00:03:09, next = 11:40:56, remain = 01:15:38
iter 5000 | train_loss: 1.4832, val_loss: 1.5068 | elapsed = 00:03:51, next = 11:41:37, remain = 01:13:12
iter 6000 | train_loss: 1.4650, val_loss: 1.4884 | elapsed = 00:04:33, next = 11:42:18, remain = 01:11:21
iter 7000 | train_loss: 1.4509, val_loss: 1.4801 | elapsed = 00:05:15, next = 11:43:00, remain = 01:09:49
iter 8000 | train_loss: 1.4388, val_loss: 1.4684 | elapsed = 00:05:57, next = 11:43:41, remain = 01:08:30
iter 9000 | train_loss: 1.4294, val_loss: 1.4617 | elapsed = 00:06:39, next = 11:44:23, remain = 01:07:19
iter 10000 | train_loss: 1.4212, val_loss: 1.4540 | elapsed = 00:07:21, next = 11:45:05, remain = 01:06:14
iter 11000 | train_loss: 1.4164, val_loss: 1.4496 | elapsed = 00:08:03, next = 11:45:47, remain = 01:05:13
iter 12000 | train_loss: 1.4074, val_loss: 1.4429 | elapsed = 00:08:45, next = 11:46:29, remain = 01:04:16
iter 13000 | train_loss: 1.4020, val_loss: 1.4364 | elapsed = 00:09:27, next = 11:47:11, remain = 01:03:21
iter 14000 | train_loss: 1.3991, val_loss: 1.4346 | elapsed = 00:10:10, next = 11:47:53, remain = 01:02:27
iter 15000 | train_loss: 1.3974, val_loss: 1.4358 | elapsed = 00:10:52, next = 11:48:35, remain = 01:01:35
iter 16000 | train_loss: 1.3875, val_loss: 1.4276 | elapsed = 00:11:34, next = 11:49:17, remain = 01:00:44
iter 17000 | train_loss: 1.3870, val_loss: 1.4275 | elapsed = 00:12:16, next = 11:49:59, remain = 00:59:55
iter 18000 | train_loss: 1.3849, val_loss: 1.4244 | elapsed = 00:12:58, next = 11:50:41, remain = 00:59:06
iter 19000 | train_loss: 1.3826, val_loss: 1.4232 | elapsed = 00:13:40, next = 11:51:23, remain = 00:58:17
iter 20000 | train_loss: 1.3789, val_loss: 1.4186 | elapsed = 00:14:22, next = 11:52:05, remain = 00:57:30
iter 21000 | train_loss: 1.3807, val_loss: 1.4229 | elapsed = 00:15:04, next = 11:52:47, remain = 00:56:43
iter 22000 | train_loss: 1.3737, val_loss: 1.4165 | elapsed = 00:15:46, next = 11:53:29, remain = 00:55:56
iter 23000 | train_loss: 1.3709, val_loss: 1.4135 | elapsed = 00:16:28, next = 11:54:11, remain = 00:55:10
iter 24000 | train_loss: 1.3682, val_loss: 1.4103 | elapsed = 00:17:11, next = 11:54:53, remain = 00:54:25
iter 25000 | train_loss: 1.3685, val_loss: 1.4116 | elapsed = 00:17:53, next = 11:55:35, remain = 00:53:39
iter 26000 | train_loss: 1.3690, val_loss: 1.4104 | elapsed = 00:18:35, next = 11:56:18, remain = 00:52:54
iter 27000 | train_loss: 1.3658, val_loss: 1.4113 | elapsed = 00:19:17, next = 11:57:00, remain = 00:52:09
iter 28000 | train_loss: 1.3628, val_loss: 1.4057 | elapsed = 00:19:59, next = 11:57:42, remain = 00:51:24
iter 29000 | train_loss: 1.3611, val_loss: 1.4050 | elapsed = 00:20:41, next = 11:58:24, remain = 00:50:39
iter 30000 | train_loss: 1.3613, val_loss: 1.4054 | elapsed = 00:21:23, next = 11:59:06, remain = 00:49:55
iter 31000 | train_loss: 1.3598, val_loss: 1.4055 | elapsed = 00:22:05, next = 11:59:48, remain = 00:49:11
iter 32000 | train_loss: 1.3591, val_loss: 1.4056 | elapsed = 00:22:47, next = 12:00:30, remain = 00:48:26
iter 33000 | train_loss: 1.3560, val_loss: 1.3995 | elapsed = 00:23:30, next = 12:01:12, remain = 00:47:42
iter 34000 | train_loss: 1.3563, val_loss: 1.4020 | elapsed = 00:24:12, next = 12:01:54, remain = 00:46:59
iter 35000 | train_loss: 1.3548, val_loss: 1.4011 | elapsed = 00:24:54, next = 12:02:36, remain = 00:46:15
iter 36000 | train_loss: 1.3519, val_loss: 1.3992 | elapsed = 00:25:36, next = 12:03:18, remain = 00:45:31
iter 37000 | train_loss: 1.3524, val_loss: 1.3987 | elapsed = 00:26:18, next = 12:04:01, remain = 00:44:48
iter 38000 | train_loss: 1.3522, val_loss: 1.3972 | elapsed = 00:27:00, next = 12:04:43, remain = 00:44:04
iter 39000 | train_loss: 1.3502, val_loss: 1.3978 | elapsed = 00:27:43, next = 12:05:25, remain = 00:43:21
iter 40000 | train_loss: 1.3497, val_loss: 1.3973 | elapsed = 00:28:25, next = 12:06:07, remain = 00:42:37
iter 41000 | train_loss: 1.3498, val_loss: 1.3965 | elapsed = 00:29:07, next = 12:06:49, remain = 00:41:54
iter 42000 | train_loss: 1.3508, val_loss: 1.3977 | elapsed = 00:29:49, next = 12:07:31, remain = 00:41:10
iter 43000 | train_loss: 1.3485, val_loss: 1.3954 | elapsed = 00:30:31, next = 12:08:13, remain = 00:40:27
iter 44000 | train_loss: 1.3471, val_loss: 1.3934 | elapsed = 00:31:13, next = 12:08:55, remain = 00:39:44
iter 45000 | train_loss: 1.3471, val_loss: 1.3969 | elapsed = 00:31:55, next = 12:09:38, remain = 00:39:01
iter 46000 | train_loss: 1.3447, val_loss: 1.3902 | elapsed = 00:32:37, next = 12:10:20, remain = 00:38:18
iter 47000 | train_loss: 1.3467, val_loss: 1.3931 | elapsed = 00:33:19, next = 12:11:02, remain = 00:37:35
iter 48000 | train_loss: 1.3437, val_loss: 1.3906 | elapsed = 00:34:02, next = 12:11:44, remain = 00:36:52
iter 49000 | train_loss: 1.3423, val_loss: 1.3900 | elapsed = 00:34:44, next = 12:12:26, remain = 00:36:09
iter 50000 | train_loss: 1.3404, val_loss: 1.3888 | elapsed = 00:35:26, next = 12:13:08, remain = 00:35:26
iter 51000 | train_loss: 1.3416, val_loss: 1.3887 | elapsed = 00:36:08, next = 12:13:50, remain = 00:34:43
iter 52000 | train_loss: 1.3408, val_loss: 1.3891 | elapsed = 00:36:50, next = 12:14:32, remain = 00:34:00
iter 53000 | train_loss: 1.3400, val_loss: 1.3886 | elapsed = 00:37:32, next = 12:15:15, remain = 00:33:17
iter 54000 | train_loss: 1.3382, val_loss: 1.3868 | elapsed = 00:38:14, next = 12:15:57, remain = 00:32:34
iter 55000 | train_loss: 1.3388, val_loss: 1.3895 | elapsed = 00:38:56, next = 12:16:39, remain = 00:31:52
iter 56000 | train_loss: 1.3388, val_loss: 1.3879 | elapsed = 00:39:39, next = 12:17:21, remain = 00:31:09
iter 57000 | train_loss: 1.3360, val_loss: 1.3846 | elapsed = 00:40:21, next = 12:18:03, remain = 00:30:26
iter 58000 | train_loss: 1.3382, val_loss: 1.3882 | elapsed = 00:41:03, next = 12:18:45, remain = 00:29:43
iter 59000 | train_loss: 1.3375, val_loss: 1.3859 | elapsed = 00:41:45, next = 12:19:27, remain = 00:29:01
iter 60000 | train_loss: 1.3371, val_loss: 1.3854 | elapsed = 00:42:27, next = 12:20:09, remain = 00:28:18
iter 61000 | train_loss: 1.3356, val_loss: 1.3845 | elapsed = 00:43:09, next = 12:20:51, remain = 00:27:35
iter 62000 | train_loss: 1.3336, val_loss: 1.3813 | elapsed = 00:43:51, next = 12:21:33, remain = 00:26:52
iter 63000 | train_loss: 1.3338, val_loss: 1.3830 | elapsed = 00:44:33, next = 12:22:15, remain = 00:26:10
iter 64000 | train_loss: 1.3335, val_loss: 1.3819 | elapsed = 00:45:15, next = 12:22:57, remain = 00:25:27
iter 65000 | train_loss: 1.3328, val_loss: 1.3816 | elapsed = 00:45:57, next = 12:23:40, remain = 00:24:45
iter 66000 | train_loss: 1.3320, val_loss: 1.3814 | elapsed = 00:46:39, next = 12:24:22, remain = 00:24:02
iter 67000 | train_loss: 1.3325, val_loss: 1.3843 | elapsed = 00:47:22, next = 12:25:04, remain = 00:23:19
iter 68000 | train_loss: 1.3327, val_loss: 1.3838 | elapsed = 00:48:04, next = 12:25:46, remain = 00:22:37
iter 69000 | train_loss: 1.3314, val_loss: 1.3809 | elapsed = 00:48:46, next = 12:26:28, remain = 00:21:54
iter 70000 | train_loss: 1.3298, val_loss: 1.3811 | elapsed = 00:49:28, next = 12:27:10, remain = 00:21:12
iter 71000 | train_loss: 1.3309, val_loss: 1.3815 | elapsed = 00:50:10, next = 12:27:52, remain = 00:20:29
iter 72000 | train_loss: 1.3327, val_loss: 1.3822 | elapsed = 00:50:52, next = 12:28:34, remain = 00:19:47
iter 73000 | train_loss: 1.3306, val_loss: 1.3802 | elapsed = 00:51:34, next = 12:29:16, remain = 00:19:04
iter 74000 | train_loss: 1.3290, val_loss: 1.3797 | elapsed = 00:52:16, next = 12:29:58, remain = 00:18:22
iter 75000 | train_loss: 1.3279, val_loss: 1.3789 | elapsed = 00:52:58, next = 12:30:40, remain = 00:17:39
iter 76000 | train_loss: 1.3309, val_loss: 1.3816 | elapsed = 00:53:40, next = 12:31:23, remain = 00:16:57
iter 77000 | train_loss: 1.3288, val_loss: 1.3798 | elapsed = 00:54:23, next = 12:32:05, remain = 00:16:14
iter 78000 | train_loss: 1.3285, val_loss: 1.3794 | elapsed = 00:55:05, next = 12:32:47, remain = 00:15:32
iter 79000 | train_loss: 1.3277, val_loss: 1.3788 | elapsed = 00:55:47, next = 12:33:29, remain = 00:14:49
iter 80000 | train_loss: 1.3294, val_loss: 1.3807 | elapsed = 00:56:29, next = 12:34:11, remain = 00:14:07
iter 81000 | train_loss: 1.3270, val_loss: 1.3782 | elapsed = 00:57:11, next = 12:34:54, remain = 00:13:25
iter 82000 | train_loss: 1.3257, val_loss: 1.3772 | elapsed = 00:57:53, next = 12:35:36, remain = 00:12:42
iter 83000 | train_loss: 1.3265, val_loss: 1.3762 | elapsed = 00:58:36, next = 12:36:18, remain = 00:12:00
iter 84000 | train_loss: 1.3257, val_loss: 1.3762 | elapsed = 00:59:18, next = 12:37:00, remain = 00:11:17
iter 85000 | train_loss: 1.3254, val_loss: 1.3752 | elapsed = 01:00:00, next = 12:37:42, remain = 00:10:35
iter 86000 | train_loss: 1.3297, val_loss: 1.3811 | elapsed = 01:00:42, next = 12:38:24, remain = 00:09:52
iter 87000 | train_loss: 1.3254, val_loss: 1.3767 | elapsed = 01:01:24, next = 12:39:06, remain = 00:09:10
iter 88000 | train_loss: 1.3253, val_loss: 1.3752 | elapsed = 01:02:06, next = 12:39:48, remain = 00:08:28
iter 89000 | train_loss: 1.3239, val_loss: 1.3749 | elapsed = 01:02:48, next = 12:40:30, remain = 00:07:45
iter 90000 | train_loss: 1.3247, val_loss: 1.3757 | elapsed = 01:03:30, next = 12:41:12, remain = 00:07:03
iter 91000 | train_loss: 1.3248, val_loss: 1.3770 | elapsed = 01:04:12, next = 12:41:54, remain = 00:06:21
iter 92000 | train_loss: 1.3247, val_loss: 1.3757 | elapsed = 01:04:54, next = 12:42:36, remain = 00:05:38
iter 93000 | train_loss: 1.3233, val_loss: 1.3732 | elapsed = 01:05:36, next = 12:43:19, remain = 00:04:56
iter 94000 | train_loss: 1.3241, val_loss: 1.3744 | elapsed = 01:06:19, next = 12:44:01, remain = 00:04:13
iter 95000 | train_loss: 1.3243, val_loss: 1.3769 | elapsed = 01:07:01, next = 12:44:43, remain = 00:03:31
iter 96000 | train_loss: 1.3248, val_loss: 1.3772 | elapsed = 01:07:43, next = 12:45:25, remain = 00:02:49
iter 97000 | train_loss: 1.3202, val_loss: 1.3728 | elapsed = 01:08:25, next = 12:46:07, remain = 00:02:06
iter 98000 | train_loss: 1.3246, val_loss: 1.3780 | elapsed = 01:09:07, next = 12:46:49, remain = 00:01:24
iter 99000 | train_loss: 1.3234, val_loss: 1.3734 | elapsed = 01:09:49, next = 12:47:31, remain = 00:00:42
iter 100000 | train_loss: 1.3215, val_loss: 1.3751 | elapsed = 01:10:31, next = 12:48:13, remain = 00:00:00

--------------------------------------------------------------------------------
PROMPT:


OUTPUT:
 
want to to be slipp those can disch with his
small tear the passenspondence, blowly they courred, poor stopped by a public could
presenttainly aunt for me, and went our oking of cabief to the
road back. One did the escents deted.

One gazement, it’s the heap of the Borows. But for the state would
begin interposed rendered. The deparation of him from his brutation of
the Dora’s changed, ‘No, now, sir,’ he said, turned, and warmly a teeth
in which he blonge to get before a word the yes, 'and good me from my mother for
one dotable open the floor; you have some disposition to crong a gremended that he was girling
for the best of dat the bexerious shadow could all the child.

“Not him with a curios together penty walking related his arms (night,
impressed on the incording of his characters with a grasage probable which
she glance or more seen a very visitors;—but the great chair-not
before we all in the venthuman that he made at the young lanchance
entrooperition of if my familiar idea to-d

'''
# Validation-loss curve for the n_embd = 276 run logged in the string above.
# Result is a (100, 2) float array: column 0 = training iteration,
# column 1 = val_loss.  Evaluations happen every 1000 iterations
# (eval_interval), so the iteration column is generated from the position
# of each loss value instead of being spelled out row by row.
equiv_capacity_rope_no_whitening_100k = np.array([
    [1000 * checkpoint, val_loss]
    for checkpoint, val_loss in enumerate(
        (
            # iters 1k-10k
            1.7158, 1.6045, 1.5556, 1.5265, 1.5068, 1.4884, 1.4801, 1.4684, 1.4617, 1.4540,
            # iters 11k-20k
            1.4496, 1.4429, 1.4364, 1.4346, 1.4358, 1.4276, 1.4275, 1.4244, 1.4232, 1.4186,
            # iters 21k-30k
            1.4229, 1.4165, 1.4135, 1.4103, 1.4116, 1.4104, 1.4113, 1.4057, 1.4050, 1.4054,
            # iters 31k-40k
            1.4055, 1.4056, 1.3995, 1.4020, 1.4011, 1.3992, 1.3987, 1.3972, 1.3978, 1.3973,
            # iters 41k-50k
            1.3965, 1.3977, 1.3954, 1.3934, 1.3969, 1.3902, 1.3931, 1.3906, 1.3900, 1.3888,
            # iters 51k-60k
            1.3887, 1.3891, 1.3886, 1.3868, 1.3895, 1.3879, 1.3846, 1.3882, 1.3859, 1.3854,
            # iters 61k-70k
            1.3845, 1.3813, 1.3830, 1.3819, 1.3816, 1.3814, 1.3843, 1.3838, 1.3809, 1.3811,
            # iters 71k-80k
            1.3815, 1.3822, 1.3802, 1.3797, 1.3789, 1.3816, 1.3798, 1.3794, 1.3788, 1.3807,
            # iters 81k-90k
            1.3782, 1.3772, 1.3762, 1.3762, 1.3752, 1.3811, 1.3767, 1.3752, 1.3749, 1.3757,
            # iters 91k-100k
            1.3770, 1.3757, 1.3732, 1.3744, 1.3769, 1.3772, 1.3728, 1.3780, 1.3734, 1.3751,
        ),
        start=1,
    )
])

#########################################################################################################
# Whitened Attention + RoPE 100K -- one whitening filter per decoder block + whitened to residual stream
#########################################################################################################
'''

In [1]: %run whitened_self_attention.py
Using: cuda
Hyper-parameters
================
n_embd = 256
n_head = 2
n_layer = 2
block_size = 256
dropout_percentage = 0.5
learning_rate = 0.0003
batch_size = 256
max_iters = 100000
eval_interval = 1000
estimate_loss_iters = 1000
ffn_fanout = 4
n_ffn = 1
random_seed = 42

Total length: 13.83 M characters
Data Source + Info
==================
Source data: DATA/dickens.txt
Total length: 13.827989 M characters
Vocab size: 93
Entropy: 4.5

Model Size
==========
1.883233 M parameters

Total iterations: 100000
iter 1000 | train_loss: 1.5255, val_loss: 1.5307 | elapsed = 00:09:18, next = 23:40:24, remain = 15:21:33
iter 2000 | train_loss: 1.3704, val_loss: 1.3952 | elapsed = 00:14:10, next = 23:43:03, remain = 11:34:38
iter 3000 | train_loss: 1.3053, val_loss: 1.3396 | elapsed = 00:19:02, next = 23:47:10, remain = 10:15:44
iter 4000 | train_loss: 1.2653, val_loss: 1.3061 | elapsed = 00:23:54, next = 23:51:40, remain = 09:33:51
iter 5000 | train_loss: 1.2414, val_loss: 1.2840 | elapsed = 00:28:45, next = 23:56:18, remain = 09:06:28
iter 6000 | train_loss: 1.2223, val_loss: 1.2685 | elapsed = 00:33:36, next = 00:01:00, remain = 08:46:37
iter 7000 | train_loss: 1.2070, val_loss: 1.2549 | elapsed = 00:38:28, next = 00:05:46, remain = 08:31:15
iter 8000 | train_loss: 1.1999, val_loss: 1.2526 | elapsed = 00:43:19, next = 00:10:32, remain = 08:18:19
iter 9000 | train_loss: 1.1877, val_loss: 1.2406 | elapsed = 00:48:12, next = 00:15:20, remain = 08:07:22
iter 10000 | train_loss: 1.1784, val_loss: 1.2358 | elapsed = 00:53:04, next = 00:20:10, remain = 07:57:38
iter 11000 | train_loss: 1.1728, val_loss: 1.2321 | elapsed = 00:57:56, next = 00:24:59, remain = 07:48:47
iter 12000 | train_loss: 1.1654, val_loss: 1.2260 | elapsed = 01:02:47, next = 00:29:48, remain = 07:40:28
iter 13000 | train_loss: 1.1597, val_loss: 1.2187 | elapsed = 01:07:38, next = 00:34:38, remain = 07:32:41
iter 14000 | train_loss: 1.1573, val_loss: 1.2197 | elapsed = 01:12:30, next = 00:39:28, remain = 07:25:25
iter 15000 | train_loss: 1.1508, val_loss: 1.2122 | elapsed = 01:17:21, next = 00:44:18, remain = 07:18:23
iter 16000 | train_loss: 1.1435, val_loss: 1.2077 | elapsed = 01:22:13, next = 00:49:09, remain = 07:11:43
iter 17000 | train_loss: 1.1407, val_loss: 1.2058 | elapsed = 01:27:06, next = 00:54:00, remain = 07:05:15
iter 18000 | train_loss: 1.1359, val_loss: 1.2017 | elapsed = 01:31:58, next = 00:58:52, remain = 06:58:58
iter 19000 | train_loss: 1.1325, val_loss: 1.1990 | elapsed = 01:36:49, next = 01:03:42, remain = 06:52:45
iter 20000 | train_loss: 1.1305, val_loss: 1.1981 | elapsed = 01:41:40, next = 01:08:32, remain = 06:46:41
iter 21000 | train_loss: 1.1263, val_loss: 1.1965 | elapsed = 01:46:31, next = 01:13:23, remain = 06:40:43
iter 22000 | train_loss: 1.1236, val_loss: 1.1924 | elapsed = 01:51:23, next = 01:18:14, remain = 06:34:56
iter 23000 | train_loss: 1.1208, val_loss: 1.1940 | elapsed = 01:56:15, next = 01:23:06, remain = 06:29:13
iter 24000 | train_loss: 1.1183, val_loss: 1.1906 | elapsed = 02:01:07, next = 01:27:57, remain = 06:23:34
iter 25000 | train_loss: 1.1155, val_loss: 1.1882 | elapsed = 02:05:59, next = 01:32:49, remain = 06:17:59
iter 26000 | train_loss: 1.1135, val_loss: 1.1853 | elapsed = 02:10:50, next = 01:37:40, remain = 06:12:24
iter 27000 | train_loss: 1.1118, val_loss: 1.1859 | elapsed = 02:15:41, next = 01:42:30, remain = 06:06:53
iter 28000 | train_loss: 1.1095, val_loss: 1.1830 | elapsed = 02:20:32, next = 01:47:21, remain = 06:01:24
iter 29000 | train_loss: 1.1067, val_loss: 1.1815 | elapsed = 02:25:25, next = 01:52:13, remain = 05:56:01
iter 30000 | train_loss: 1.1068, val_loss: 1.1799 | elapsed = 02:30:17, next = 01:57:05, remain = 05:50:40
iter 31000 | train_loss: 1.1041, val_loss: 1.1811 | elapsed = 02:35:09, next = 02:01:57, remain = 05:45:21
iter 32000 | train_loss: 1.1026, val_loss: 1.1787 | elapsed = 02:40:00, next = 02:06:47, remain = 05:40:01
iter 33000 | train_loss: 1.1035, val_loss: 1.1784 | elapsed = 02:44:52, next = 02:11:39, remain = 05:34:44
iter 34000 | train_loss: 1.0998, val_loss: 1.1761 | elapsed = 02:49:43, next = 02:16:30, remain = 05:29:28
iter 35000 | train_loss: 1.0980, val_loss: 1.1764 | elapsed = 02:54:34, next = 02:21:21, remain = 05:24:13
iter 36000 | train_loss: 1.0956, val_loss: 1.1745 | elapsed = 02:59:26, next = 02:26:13, remain = 05:19:01
iter 37000 | train_loss: 1.0961, val_loss: 1.1734 | elapsed = 03:04:17, next = 02:31:04, remain = 05:13:48
iter 38000 | train_loss: 1.0952, val_loss: 1.1732 | elapsed = 03:09:10, next = 02:35:56, remain = 05:08:38
iter 39000 | train_loss: 1.0930, val_loss: 1.1731 | elapsed = 03:14:02, next = 02:40:48, remain = 05:03:29
iter 40000 | train_loss: 1.0914, val_loss: 1.1722 | elapsed = 03:18:54, next = 02:45:40, remain = 04:58:21
iter 41000 | train_loss: 1.0894, val_loss: 1.1702 | elapsed = 03:23:45, next = 02:50:30, remain = 04:53:12
iter 42000 | train_loss: 1.0885, val_loss: 1.1701 | elapsed = 03:28:36, next = 02:55:21, remain = 04:48:04
iter 43000 | train_loss: 1.0878, val_loss: 1.1664 | elapsed = 03:33:28, next = 03:00:13, remain = 04:42:58
iter 44000 | train_loss: 1.0863, val_loss: 1.1706 | elapsed = 03:38:19, next = 03:05:04, remain = 04:37:52
iter 45000 | train_loss: 1.0860, val_loss: 1.1704 | elapsed = 03:43:11, next = 03:09:56, remain = 04:32:47
iter 46000 | train_loss: 1.0846, val_loss: 1.1684 | elapsed = 03:48:03, next = 03:14:48, remain = 04:27:43
iter 47000 | train_loss: 1.0841, val_loss: 1.1671 | elapsed = 03:52:55, next = 03:19:40, remain = 04:22:40
iter 48000 | train_loss: 1.0840, val_loss: 1.1671 | elapsed = 03:57:46, next = 03:24:31, remain = 04:17:35
iter 49000 | train_loss: 1.0826, val_loss: 1.1674 | elapsed = 04:02:38, next = 03:29:22, remain = 04:12:32
iter 50000 | train_loss: 1.0828, val_loss: 1.1667 | elapsed = 04:07:30, next = 03:34:14, remain = 04:07:30
iter 51000 | train_loss: 1.0802, val_loss: 1.1645 | elapsed = 04:12:21, next = 03:39:05, remain = 04:02:27
iter 52000 | train_loss: 1.0798, val_loss: 1.1645 | elapsed = 04:17:13, next = 03:43:57, remain = 03:57:26
iter 53000 | train_loss: 1.0790, val_loss: 1.1651 | elapsed = 04:22:05, next = 03:48:49, remain = 03:52:25
iter 54000 | train_loss: 1.0794, val_loss: 1.1680 | elapsed = 04:26:57, next = 03:53:41, remain = 03:47:24
iter 55000 | train_loss: 1.0770, val_loss: 1.1634 | elapsed = 04:31:48, next = 03:58:32, remain = 03:42:23
iter 56000 | train_loss: 1.0751, val_loss: 1.1618 | elapsed = 04:36:39, next = 04:03:23, remain = 03:37:22
iter 57000 | train_loss: 1.0755, val_loss: 1.1625 | elapsed = 04:41:31, next = 04:08:15, remain = 03:32:22
iter 58000 | train_loss: 1.0742, val_loss: 1.1642 | elapsed = 04:46:22, next = 04:13:06, remain = 03:27:22
iter 59000 | train_loss: 1.0756, val_loss: 1.1623 | elapsed = 04:51:14, next = 04:17:58, remain = 03:22:23
iter 60000 | train_loss: 1.0736, val_loss: 1.1606 | elapsed = 04:56:06, next = 04:22:50, remain = 03:17:24
iter 61000 | train_loss: 1.0730, val_loss: 1.1618 | elapsed = 05:00:59, next = 04:27:42, remain = 03:12:25
iter 62000 | train_loss: 1.0714, val_loss: 1.1612 | elapsed = 05:05:50, next = 04:32:33, remain = 03:07:26
iter 63000 | train_loss: 1.0713, val_loss: 1.1621 | elapsed = 05:10:41, next = 04:37:24, remain = 03:02:27
iter 64000 | train_loss: 1.0729, val_loss: 1.1638 | elapsed = 05:15:32, next = 04:42:15, remain = 02:57:29
iter 65000 | train_loss: 1.0710, val_loss: 1.1610 | elapsed = 05:20:24, next = 04:47:07, remain = 02:52:31
iter 66000 | train_loss: 1.0690, val_loss: 1.1598 | elapsed = 05:25:16, next = 04:51:59, remain = 02:47:33
iter 67000 | train_loss: 1.0694, val_loss: 1.1611 | elapsed = 05:30:08, next = 04:56:51, remain = 02:42:36
iter 68000 | train_loss: 1.0674, val_loss: 1.1624 | elapsed = 05:34:59, next = 05:01:42, remain = 02:37:38
iter 69000 | train_loss: 1.0685, val_loss: 1.1579 | elapsed = 05:39:51, next = 05:06:34, remain = 02:32:41
iter 70000 | train_loss: 1.0665, val_loss: 1.1589 | elapsed = 05:44:42, next = 05:11:25, remain = 02:27:43
iter 71000 | train_loss: 1.0659, val_loss: 1.1591 | elapsed = 05:49:33, next = 05:16:16, remain = 02:22:46
iter 72000 | train_loss: 1.0649, val_loss: 1.1584 | elapsed = 05:54:25, next = 05:21:08, remain = 02:17:49
iter 73000 | train_loss: 1.0651, val_loss: 1.1595 | elapsed = 05:59:16, next = 05:25:59, remain = 02:12:53
iter 74000 | train_loss: 1.0634, val_loss: 1.1594 | elapsed = 06:04:08, next = 05:30:51, remain = 02:07:56
iter 75000 | train_loss: 1.0651, val_loss: 1.1603 | elapsed = 06:09:01, next = 05:35:43, remain = 02:03:00
iter 76000 | train_loss: 1.0639, val_loss: 1.1576 | elapsed = 06:13:53, next = 05:40:35, remain = 01:58:04
iter 77000 | train_loss: 1.0636, val_loss: 1.1595 | elapsed = 06:18:44, next = 05:45:26, remain = 01:53:07
iter 78000 | train_loss: 1.0630, val_loss: 1.1579 | elapsed = 06:23:35, next = 05:50:17, remain = 01:48:11
iter 79000 | train_loss: 1.0615, val_loss: 1.1576 | elapsed = 06:28:27, next = 05:55:09, remain = 01:43:15
iter 80000 | train_loss: 1.0619, val_loss: 1.1598 | elapsed = 06:33:18, next = 06:00:00, remain = 01:38:19
iter 81000 | train_loss: 1.0628, val_loss: 1.1593 | elapsed = 06:38:10, next = 06:04:53, remain = 01:33:24
iter 82000 | train_loss: 1.0602, val_loss: 1.1574 | elapsed = 06:43:02, next = 06:09:45, remain = 01:28:28
iter 83000 | train_loss: 1.0610, val_loss: 1.1561 | elapsed = 06:47:55, next = 06:14:37, remain = 01:23:32
iter 84000 | train_loss: 1.0614, val_loss: 1.1583 | elapsed = 06:52:46, next = 06:19:28, remain = 01:18:37
iter 85000 | train_loss: 1.0610, val_loss: 1.1582 | elapsed = 06:57:37, next = 06:24:19, remain = 01:13:41
iter 86000 | train_loss: 1.0598, val_loss: 1.1589 | elapsed = 07:02:29, next = 06:29:11, remain = 01:08:46
iter 87000 | train_loss: 1.0596, val_loss: 1.1581 | elapsed = 07:07:20, next = 06:34:02, remain = 01:03:51
iter 88000 | train_loss: 1.0584, val_loss: 1.1577 | elapsed = 07:12:12, next = 06:38:54, remain = 00:58:56
iter 89000 | train_loss: 1.0568, val_loss: 1.1555 | elapsed = 07:17:05, next = 06:43:47, remain = 00:54:01
iter 90000 | train_loss: 1.0584, val_loss: 1.1586 | elapsed = 07:21:57, next = 06:48:39, remain = 00:49:06
iter 91000 | train_loss: 1.0578, val_loss: 1.1586 | elapsed = 07:26:48, next = 06:53:30, remain = 00:44:11
iter 92000 | train_loss: 1.0568, val_loss: 1.1531 | elapsed = 07:31:39, next = 06:58:21, remain = 00:39:16
iter 93000 | train_loss: 1.0556, val_loss: 1.1529 | elapsed = 07:36:31, next = 07:03:13, remain = 00:34:21
iter 94000 | train_loss: 1.0562, val_loss: 1.1534 | elapsed = 07:41:22, next = 07:08:04, remain = 00:29:26
iter 95000 | train_loss: 1.0557, val_loss: 1.1557 | elapsed = 07:46:14, next = 07:12:56, remain = 00:24:32
iter 96000 | train_loss: 1.0563, val_loss: 1.1546 | elapsed = 07:51:06, next = 07:17:48, remain = 00:19:37
iter 97000 | train_loss: 1.0539, val_loss: 1.1553 | elapsed = 07:55:58, next = 07:22:40, remain = 00:14:43
iter 98000 | train_loss: 1.0549, val_loss: 1.1548 | elapsed = 08:00:49, next = 07:27:31, remain = 00:09:48
iter 99000 | train_loss: 1.0547, val_loss: 1.1562 | elapsed = 08:05:40, next = 07:32:22, remain = 00:04:54
iter 100000 | train_loss: 1.0564, val_loss: 1.1554 | elapsed = 08:10:32, next = 07:37:13, remain = 00:00:00

--------------------------------------------------------------------------------
PROMPT:


OUTPUT:
 
wantanity. If you hate, he said, when Mr. Stryver looked at me, and
Mortimer, and the very decent post, they were always more than him at
it, in embrace as possible, and keep my cabin to Lowtl’s Head, and writ
that escape she would not much, well. Mr. Bumble was crying with her
compassion, and was pushing out of slantern with a handkerchief with
steady and tationance and steaming as if he could not throw herself
about. Now, for my remarkable woman.

The rysterics into a coolness of some point of rougher in the
company, who felt the mat word at last, without attracted his
arms. The whiskers were full from being delicious, no doubt,
forced it into it, and that the first subgence into the waist-swellers,
and the chefstone-seat was almost in a miniately possible to be thorough
acquainted by the houseal rose in the morning, and ropried itself alse-
brain-stand, we went and choose the patient, and it was very various
taste. It was a small look passing to open again. [If my famous
knowledge, 

'''
# Validation loss of the whitened attention + RoPE run at batch size 256,
# transcribed from the val_loss column of the training log above.
# The loss is evaluated every 1,000 iterations, so the iteration column is
# generated instead of typed out. The result is a (100, 2) float array whose
# rows are [iteration, val_loss].
white_rope_100k = np.column_stack((
    np.arange(1000, 100001, 1000),
    [
        # iterations 1K-10K
        1.5307, 1.3952, 1.3396, 1.3061, 1.2840, 1.2685, 1.2549, 1.2526, 1.2406, 1.2358,
        # iterations 11K-20K
        1.2321, 1.2260, 1.2187, 1.2197, 1.2122, 1.2077, 1.2058, 1.2017, 1.1990, 1.1981,
        # iterations 21K-30K
        1.1965, 1.1924, 1.1940, 1.1906, 1.1882, 1.1853, 1.1859, 1.1830, 1.1815, 1.1799,
        # iterations 31K-40K
        1.1811, 1.1787, 1.1784, 1.1761, 1.1764, 1.1745, 1.1734, 1.1732, 1.1731, 1.1722,
        # iterations 41K-50K
        1.1702, 1.1701, 1.1664, 1.1706, 1.1704, 1.1684, 1.1671, 1.1671, 1.1674, 1.1667,
        # iterations 51K-60K
        1.1645, 1.1645, 1.1651, 1.1680, 1.1634, 1.1618, 1.1625, 1.1642, 1.1623, 1.1606,
        # iterations 61K-70K
        1.1618, 1.1612, 1.1621, 1.1638, 1.1610, 1.1598, 1.1611, 1.1624, 1.1579, 1.1589,
        # iterations 71K-80K
        1.1591, 1.1584, 1.1595, 1.1594, 1.1603, 1.1576, 1.1595, 1.1579, 1.1576, 1.1598,
        # iterations 81K-90K
        1.1593, 1.1574, 1.1561, 1.1583, 1.1582, 1.1589, 1.1581, 1.1577, 1.1555, 1.1586,
        # iterations 91K-100K
        1.1586, 1.1531, 1.1529, 1.1534, 1.1557, 1.1546, 1.1553, 1.1548, 1.1562, 1.1554,
    ],
))

#########################################################################################################
# Half Batch Whitened Attention + RoPE 100K -- same as above but with batch size 128 instead of 256
#########################################################################################################
'''
In [1]: %run whitened_self_attention.py
Using: cuda
Hyper-parameters
================
n_embd = 256
n_head = 2
n_layer = 2
block_size = 256
dropout_percentage = 0.5
learning_rate = 0.0003
batch_size = 128
max_iters = 100000
eval_interval = 1000
estimate_loss_iters = 1000
ffn_fanout = 4
n_ffn = 1
random_seed = 42

Total length: 13.83 M characters
Data Source + Info
==================
Source data: DATA/dickens.txt
Total length: 13.827989 M characters
Vocab size: 93
Entropy: 4.5

Model Size
==========
1.883233 M parameters

Total iterations: 100000
iter 1000 | train_loss: 1.5573, val_loss: 1.5578 | elapsed = 00:03:00, next = 07:48:31, remain = 04:58:28
iter 2000 | train_loss: 1.3993, val_loss: 1.4194 | elapsed = 00:05:19, next = 07:50:28, remain = 04:20:48
iter 3000 | train_loss: 1.3284, val_loss: 1.3580 | elapsed = 00:07:37, next = 07:52:39, remain = 04:06:29
iter 4000 | train_loss: 1.2876, val_loss: 1.3234 | elapsed = 00:09:55, next = 07:54:54, remain = 03:58:15
iter 5000 | train_loss: 1.2613, val_loss: 1.3022 | elapsed = 00:12:13, next = 07:57:10, remain = 03:52:20
iter 6000 | train_loss: 1.2430, val_loss: 1.2872 | elapsed = 00:14:32, next = 07:59:27, remain = 03:47:41
iter 7000 | train_loss: 1.2266, val_loss: 1.2752 | elapsed = 00:16:50, next = 08:01:44, remain = 03:43:43
iter 8000 | train_loss: 1.2158, val_loss: 1.2637 | elapsed = 00:19:08, next = 08:04:01, remain = 03:40:07
iter 9000 | train_loss: 1.2036, val_loss: 1.2571 | elapsed = 00:21:26, next = 08:06:19, remain = 03:36:50
iter 10000 | train_loss: 1.1974, val_loss: 1.2536 | elapsed = 00:23:44, next = 08:08:37, remain = 03:33:43
iter 11000 | train_loss: 1.1885, val_loss: 1.2458 | elapsed = 00:26:03, next = 08:10:54, remain = 03:30:47
iter 12000 | train_loss: 1.1825, val_loss: 1.2390 | elapsed = 00:28:21, next = 08:13:12, remain = 03:27:57
iter 13000 | train_loss: 1.1780, val_loss: 1.2334 | elapsed = 00:30:39, next = 08:15:30, remain = 03:25:10
iter 14000 | train_loss: 1.1720, val_loss: 1.2318 | elapsed = 00:32:57, next = 08:17:48, remain = 03:22:29
iter 15000 | train_loss: 1.1671, val_loss: 1.2279 | elapsed = 00:35:16, next = 08:20:06, remain = 03:19:50
iter 16000 | train_loss: 1.1623, val_loss: 1.2224 | elapsed = 00:37:34, next = 08:22:24, remain = 03:17:15
iter 17000 | train_loss: 1.1580, val_loss: 1.2210 | elapsed = 00:39:52, next = 08:24:43, remain = 03:14:41
iter 18000 | train_loss: 1.1558, val_loss: 1.2182 | elapsed = 00:42:10, next = 08:27:00, remain = 03:12:08
iter 19000 | train_loss: 1.1532, val_loss: 1.2151 | elapsed = 00:44:28, next = 08:29:19, remain = 03:09:38
iter 20000 | train_loss: 1.1491, val_loss: 1.2123 | elapsed = 00:46:46, next = 08:31:37, remain = 03:07:07
iter 21000 | train_loss: 1.1447, val_loss: 1.2100 | elapsed = 00:49:05, next = 08:33:54, remain = 03:04:39
iter 22000 | train_loss: 1.1419, val_loss: 1.2078 | elapsed = 00:51:23, next = 08:36:13, remain = 03:02:12
iter 23000 | train_loss: 1.1389, val_loss: 1.2058 | elapsed = 00:53:41, next = 08:38:31, remain = 02:59:45
iter 24000 | train_loss: 1.1341, val_loss: 1.2009 | elapsed = 00:56:00, next = 08:40:49, remain = 02:57:20
iter 25000 | train_loss: 1.1329, val_loss: 1.1995 | elapsed = 00:58:18, next = 08:43:07, remain = 02:54:54
iter 26000 | train_loss: 1.1304, val_loss: 1.1961 | elapsed = 01:00:36, next = 08:45:25, remain = 02:52:29
iter 27000 | train_loss: 1.1266, val_loss: 1.1954 | elapsed = 01:02:54, next = 08:47:44, remain = 02:50:05
iter 28000 | train_loss: 1.1272, val_loss: 1.1979 | elapsed = 01:05:12, next = 08:50:01, remain = 02:47:40
iter 29000 | train_loss: 1.1238, val_loss: 1.1943 | elapsed = 01:07:30, next = 08:52:20, remain = 02:45:17
iter 30000 | train_loss: 1.1219, val_loss: 1.1925 | elapsed = 01:09:49, next = 08:54:38, remain = 02:42:54
iter 31000 | train_loss: 1.1215, val_loss: 1.1931 | elapsed = 01:12:07, next = 08:56:56, remain = 02:40:31
iter 32000 | train_loss: 1.1183, val_loss: 1.1903 | elapsed = 01:14:25, next = 08:59:14, remain = 02:38:09
iter 33000 | train_loss: 1.1160, val_loss: 1.1896 | elapsed = 01:16:43, next = 09:01:32, remain = 02:35:46
iter 34000 | train_loss: 1.1139, val_loss: 1.1882 | elapsed = 01:19:01, next = 09:03:50, remain = 02:33:24
iter 35000 | train_loss: 1.1131, val_loss: 1.1865 | elapsed = 01:21:20, next = 09:06:09, remain = 02:31:03
iter 36000 | train_loss: 1.1107, val_loss: 1.1871 | elapsed = 01:23:38, next = 09:08:27, remain = 02:28:41
iter 37000 | train_loss: 1.1100, val_loss: 1.1865 | elapsed = 01:25:56, next = 09:10:45, remain = 02:26:19
iter 38000 | train_loss: 1.1103, val_loss: 1.1839 | elapsed = 01:28:14, next = 09:13:03, remain = 02:23:58
iter 39000 | train_loss: 1.1088, val_loss: 1.1846 | elapsed = 01:30:32, next = 09:15:21, remain = 02:21:37
iter 40000 | train_loss: 1.1068, val_loss: 1.1847 | elapsed = 01:32:51, next = 09:17:40, remain = 02:19:16
iter 41000 | train_loss: 1.1067, val_loss: 1.1822 | elapsed = 01:35:09, next = 09:19:58, remain = 02:16:55
iter 42000 | train_loss: 1.1040, val_loss: 1.1824 | elapsed = 01:37:27, next = 09:22:16, remain = 02:14:35
iter 43000 | train_loss: 1.1021, val_loss: 1.1799 | elapsed = 01:39:45, next = 09:24:34, remain = 02:12:14
iter 44000 | train_loss: 1.1022, val_loss: 1.1793 | elapsed = 01:42:03, next = 09:26:52, remain = 02:09:54
iter 45000 | train_loss: 1.1013, val_loss: 1.1790 | elapsed = 01:44:22, next = 09:29:10, remain = 02:07:33
iter 46000 | train_loss: 1.0991, val_loss: 1.1789 | elapsed = 01:46:40, next = 09:31:28, remain = 02:05:13
iter 47000 | train_loss: 1.0978, val_loss: 1.1763 | elapsed = 01:48:58, next = 09:33:47, remain = 02:02:53
iter 48000 | train_loss: 1.0966, val_loss: 1.1781 | elapsed = 01:51:16, next = 09:36:05, remain = 02:00:32
iter 49000 | train_loss: 1.0966, val_loss: 1.1772 | elapsed = 01:53:34, next = 09:38:23, remain = 01:58:12
iter 50000 | train_loss: 1.0953, val_loss: 1.1763 | elapsed = 01:55:53, next = 09:40:41, remain = 01:55:53
iter 51000 | train_loss: 1.0938, val_loss: 1.1748 | elapsed = 01:58:11, next = 09:42:59, remain = 01:53:32
iter 52000 | train_loss: 1.0933, val_loss: 1.1759 | elapsed = 02:00:29, next = 09:45:18, remain = 01:51:13
iter 53000 | train_loss: 1.0921, val_loss: 1.1718 | elapsed = 02:02:47, next = 09:47:36, remain = 01:48:53
iter 54000 | train_loss: 1.0913, val_loss: 1.1743 | elapsed = 02:05:05, next = 09:49:54, remain = 01:46:33
iter 55000 | train_loss: 1.0905, val_loss: 1.1731 | elapsed = 02:07:24, next = 09:52:12, remain = 01:44:14
iter 56000 | train_loss: 1.0899, val_loss: 1.1706 | elapsed = 02:09:41, next = 09:54:30, remain = 01:41:54
iter 57000 | train_loss: 1.0889, val_loss: 1.1707 | elapsed = 02:12:00, next = 09:56:48, remain = 01:39:34
iter 58000 | train_loss: 1.0900, val_loss: 1.1703 | elapsed = 02:14:18, next = 09:59:07, remain = 01:37:15
iter 59000 | train_loss: 1.0877, val_loss: 1.1693 | elapsed = 02:16:36, next = 10:01:25, remain = 01:34:55
iter 60000 | train_loss: 1.0866, val_loss: 1.1711 | elapsed = 02:18:54, next = 10:03:43, remain = 01:32:36
iter 61000 | train_loss: 1.0852, val_loss: 1.1679 | elapsed = 02:21:12, next = 10:06:01, remain = 01:30:17
iter 62000 | train_loss: 1.0849, val_loss: 1.1688 | elapsed = 02:23:31, next = 10:08:19, remain = 01:27:57
iter 63000 | train_loss: 1.0848, val_loss: 1.1702 | elapsed = 02:25:49, next = 10:10:37, remain = 01:25:38
iter 64000 | train_loss: 1.0847, val_loss: 1.1680 | elapsed = 02:28:07, next = 10:12:55, remain = 01:23:19
iter 65000 | train_loss: 1.0822, val_loss: 1.1679 | elapsed = 02:30:25, next = 10:15:14, remain = 01:20:59
iter 66000 | train_loss: 1.0846, val_loss: 1.1689 | elapsed = 02:32:43, next = 10:17:32, remain = 01:18:40
iter 67000 | train_loss: 1.0828, val_loss: 1.1669 | elapsed = 02:35:01, next = 10:19:50, remain = 01:16:21
iter 68000 | train_loss: 1.0837, val_loss: 1.1667 | elapsed = 02:37:20, next = 10:22:08, remain = 01:14:02
iter 69000 | train_loss: 1.0839, val_loss: 1.1699 | elapsed = 02:39:38, next = 10:24:26, remain = 01:11:43
iter 70000 | train_loss: 1.0819, val_loss: 1.1676 | elapsed = 02:41:56, next = 10:26:44, remain = 01:09:24
iter 71000 | train_loss: 1.0800, val_loss: 1.1648 | elapsed = 02:44:14, next = 10:29:02, remain = 01:07:05
iter 72000 | train_loss: 1.0799, val_loss: 1.1638 | elapsed = 02:46:32, next = 10:31:21, remain = 01:04:46
iter 73000 | train_loss: 1.0791, val_loss: 1.1653 | elapsed = 02:48:51, next = 10:33:39, remain = 01:02:27
iter 74000 | train_loss: 1.0790, val_loss: 1.1671 | elapsed = 02:51:09, next = 10:35:57, remain = 01:00:08
iter 75000 | train_loss: 1.0791, val_loss: 1.1642 | elapsed = 02:53:27, next = 10:38:16, remain = 00:57:49
iter 76000 | train_loss: 1.0774, val_loss: 1.1671 | elapsed = 02:55:45, next = 10:40:34, remain = 00:55:30
iter 77000 | train_loss: 1.0776, val_loss: 1.1652 | elapsed = 02:58:03, next = 10:42:52, remain = 00:53:11
iter 78000 | train_loss: 1.0757, val_loss: 1.1646 | elapsed = 03:00:22, next = 10:45:10, remain = 00:50:52
iter 79000 | train_loss: 1.0769, val_loss: 1.1661 | elapsed = 03:02:40, next = 10:47:28, remain = 00:48:33
iter 80000 | train_loss: 1.0750, val_loss: 1.1640 | elapsed = 03:04:58, next = 10:49:46, remain = 00:46:14
iter 81000 | train_loss: 1.0743, val_loss: 1.1646 | elapsed = 03:07:16, next = 10:52:05, remain = 00:43:55
iter 82000 | train_loss: 1.0747, val_loss: 1.1642 | elapsed = 03:09:34, next = 10:54:23, remain = 00:41:36
iter 83000 | train_loss: 1.0747, val_loss: 1.1622 | elapsed = 03:11:53, next = 10:56:41, remain = 00:39:18
iter 84000 | train_loss: 1.0726, val_loss: 1.1630 | elapsed = 03:14:11, next = 10:58:59, remain = 00:36:59
iter 85000 | train_loss: 1.0727, val_loss: 1.1612 | elapsed = 03:16:29, next = 11:01:17, remain = 00:34:40
iter 86000 | train_loss: 1.0731, val_loss: 1.1643 | elapsed = 03:18:47, next = 11:03:36, remain = 00:32:21
iter 87000 | train_loss: 1.0722, val_loss: 1.1616 | elapsed = 03:21:05, next = 11:05:54, remain = 00:30:02
iter 88000 | train_loss: 1.0728, val_loss: 1.1639 | elapsed = 03:23:24, next = 11:08:12, remain = 00:27:44
iter 89000 | train_loss: 1.0724, val_loss: 1.1628 | elapsed = 03:25:42, next = 11:10:30, remain = 00:25:25
iter 90000 | train_loss: 1.0714, val_loss: 1.1613 | elapsed = 03:28:00, next = 11:12:49, remain = 00:23:06
iter 91000 | train_loss: 1.0714, val_loss: 1.1614 | elapsed = 03:30:19, next = 11:15:07, remain = 00:20:48
iter 92000 | train_loss: 1.0702, val_loss: 1.1607 | elapsed = 03:32:37, next = 11:17:25, remain = 00:18:29
iter 93000 | train_loss: 1.0702, val_loss: 1.1603 | elapsed = 03:34:56, next = 11:19:44, remain = 00:16:10
iter 94000 | train_loss: 1.0705, val_loss: 1.1617 | elapsed = 03:37:14, next = 11:22:03, remain = 00:13:52
iter 95000 | train_loss: 1.0689, val_loss: 1.1619 | elapsed = 03:39:33, next = 11:24:21, remain = 00:11:33
iter 96000 | train_loss: 1.0690, val_loss: 1.1588 | elapsed = 03:41:51, next = 11:26:39, remain = 00:09:14
iter 97000 | train_loss: 1.0677, val_loss: 1.1596 | elapsed = 03:44:09, next = 11:28:58, remain = 00:06:55
iter 98000 | train_loss: 1.0685, val_loss: 1.1621 | elapsed = 03:46:28, next = 11:31:16, remain = 00:04:37
iter 99000 | train_loss: 1.0684, val_loss: 1.1612 | elapsed = 03:48:46, next = 11:33:34, remain = 00:02:18
iter 100000 | train_loss: 1.0676, val_loss: 1.1607 | elapsed = 03:51:04, next = 11:35:52, remain = 00:00:00

--------------------------------------------------------------------------------
PROMPT:


OUTPUT:
 
wantant to be slowly at her a dress with her small perstone of hers,
or follows with her company, portentous was almost to conjure himself,
‘I am now afraid of you? And oken of my being betwixt and solitary, and
that it on me, do! One of the wellers or Cheapson was Borrioboola and
private, one of these inquests of bellows. The departing of sudden
defiances though I had brought her. It very well, and we are an oound
to. Now, for my teeth, who was succeeded to cause from a word that Nicholas
would refer me to death to undertake him; but it would be shown in
the brother, than I not known him than I could give it for more money
down to Society that I would take it that he had come into the
waist-coach to leave the head-crown table in my mouth, and
began placed the great company coarse! When I resolved, I should not
think how but it may be the liking from the black form of my
overduards, and pale in the very variety of the darkness of his relapsing
entreaties of flowing them out?--if you ha

'''
# Validation loss of the whitened attention + RoPE run at batch size 128,
# transcribed from the val_loss column of the training log above.
# The loss is evaluated every 1,000 iterations, so the iteration column is
# generated instead of typed out. The result is a (100, 2) float array whose
# rows are [iteration, val_loss].
white_rope_half_batch_100k = np.column_stack((
    np.arange(1000, 100001, 1000),
    [
        # iterations 1K-10K
        1.5578, 1.4194, 1.3580, 1.3234, 1.3022, 1.2872, 1.2752, 1.2637, 1.2571, 1.2536,
        # iterations 11K-20K
        1.2458, 1.2390, 1.2334, 1.2318, 1.2279, 1.2224, 1.2210, 1.2182, 1.2151, 1.2123,
        # iterations 21K-30K
        1.2100, 1.2078, 1.2058, 1.2009, 1.1995, 1.1961, 1.1954, 1.1979, 1.1943, 1.1925,
        # iterations 31K-40K
        1.1931, 1.1903, 1.1896, 1.1882, 1.1865, 1.1871, 1.1865, 1.1839, 1.1846, 1.1847,
        # iterations 41K-50K
        1.1822, 1.1824, 1.1799, 1.1793, 1.1790, 1.1789, 1.1763, 1.1781, 1.1772, 1.1763,
        # iterations 51K-60K
        1.1748, 1.1759, 1.1718, 1.1743, 1.1731, 1.1706, 1.1707, 1.1703, 1.1693, 1.1711,
        # iterations 61K-70K
        1.1679, 1.1688, 1.1702, 1.1680, 1.1679, 1.1689, 1.1669, 1.1667, 1.1699, 1.1676,
        # iterations 71K-80K
        1.1648, 1.1638, 1.1653, 1.1671, 1.1642, 1.1671, 1.1652, 1.1646, 1.1661, 1.1640,
        # iterations 81K-90K
        1.1646, 1.1642, 1.1622, 1.1630, 1.1612, 1.1643, 1.1616, 1.1639, 1.1628, 1.1613,
        # iterations 91K-100K
        1.1614, 1.1607, 1.1603, 1.1617, 1.1619, 1.1588, 1.1596, 1.1621, 1.1612, 1.1607,
    ],
))

############################################################
# Plot the results
############################################################

from matplotlib.ticker import FuncFormatter


def _format_thousands(value, _pos):
    """Render a tick position in thousands, e.g. 20000 -> '20K'."""
    return f'{value / 1000:,.0f}K'


# One entry per run: (result array, legend label, line colour).
# Column 0 of each array is plotted on the x axis and column 1 on the y axis.
curves = (
    (rope_no_whitening_100k, 'Standard Attention + RoPE', '#1f77b4'),
    (equiv_capacity_rope_no_whitening_100k, 'Equivalent Capacity Attention + RoPE', '#ff7f0e'),
    (white_rope_half_batch_100k, 'Whitened + RoPE @ Half Batch (128)', '#d62728'),
    (white_rope_100k, 'Whitened + RoPE @ Full Batch (256)', '#2ca02c'),
)

plt.figure(figsize=(12, 8))
for series, legend_label, line_color in curves:
    plt.plot(
        series[:, 0],
        series[:, 1],
        marker='o',
        linestyle='-',
        label=legend_label,
        color=line_color,
    )

# Title and axis labels
plt.title('Whitened vs. Standard Attention', fontsize=24)
plt.xlabel('Training Iterations', fontsize=19)
plt.ylabel('Mean Cross-Entropy Loss', fontsize=19)

# Ticks every 10,000 iterations, labelled in thousands ("10K", "20K", ...)
plt.xticks(np.linspace(0, 100000, 11))
plt.tick_params(axis='both', which='major', labelsize=15)
plt.gca().xaxis.set_major_formatter(FuncFormatter(_format_thousands))

plt.legend(fontsize=18)
plt.grid(True)

# Interactive mode is switched on before show(), as in the original script.
plt.ion()
plt.show()





