{
    "name": "rw_original-d=96_l=8_h=4-8.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 1691089920,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 8.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "338217984",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=96_l=8_h=4-8.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 5.06333327293396,
            "data_time": 0.12513233721256256,
            "batch_time": 1.30666384100914,
            "samples_per_second": 383111.2452772229,
            "samples_per_second_per_gpu": 47888.90565965286,
            "loss_sequences_lower_95": 4.945051116943359,
            "loss_sequences_upper_95": 5.184049746195475,
            "loss_tokens_lower_95": 5.047727750142416,
            "loss_tokens_upper_95": 5.078654085795085,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.610637699767132,
            "data_time": 0.018631419240812047,
            "batch_time": 0.0639631137697292,
            "samples_per_second": 4673469.291262419,
            "samples_per_second_per_gpu": 584183.6614078024,
            "loss_sequences_lower_95": 4.608406018011976,
            "loss_sequences_upper_95": 4.612875601845316,
            "loss_tokens_lower_95": 4.599029958333333,
            "loss_tokens_upper_95": 4.62215378125,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.094798022386979,
            "data_time": 0.08783990144729614,
            "batch_time": 0.1323184370994568,
            "samples_per_second": 4152290.11099594,
            "samples_per_second_per_gpu": 519036.2638744925,
            "loss_sequences_lower_95": 4.034808355837452,
            "loss_sequences_upper_95": 4.1695582362583705,
            "loss_tokens_lower_95": 4.0802116250000005,
            "loss_tokens_upper_95": 4.1092508854166665,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.7879720457804575,
            "data_time": 0.013212584351238451,
            "batch_time": 0.05740693368409809,
            "samples_per_second": 5323604.332001544,
            "samples_per_second_per_gpu": 665450.541500193,
            "loss_sequences_lower_95": 4.746231153350515,
            "loss_sequences_upper_95": 4.830440701514175,
            "loss_tokens_lower_95": 4.7747378750000005,
            "loss_tokens_upper_95": 4.801074041666666,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.670819779763153,
            "data_time": 0.09720669686794281,
            "batch_time": 0.1419704258441925,
            "samples_per_second": 4100217.3395262067,
            "samples_per_second_per_gpu": 512527.16744077584,
            "loss_sequences_lower_95": 4.611113917706212,
            "loss_sequences_upper_95": 4.745596891508083,
            "loss_tokens_lower_95": 4.65875134375,
            "loss_tokens_upper_95": 4.682925197916667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.030854915292145,
            "data_time": 0.03297693530718485,
            "batch_time": 0.07623253017663956,
            "samples_per_second": 4950528.611352661,
            "samples_per_second_per_gpu": 618816.0764190826,
            "loss_sequences_lower_95": 4.9810920305595685,
            "loss_sequences_upper_95": 5.086656561358132,
            "loss_tokens_lower_95": 5.017498677083333,
            "loss_tokens_upper_95": 5.04424065625,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.3936818723289335,
            "data_time": 0.013135024905204773,
            "batch_time": 0.055629965662956235,
            "samples_per_second": 5219031.744898682,
            "samples_per_second_per_gpu": 652378.9681123353,
            "loss_sequences_lower_95": 5.357423120615433,
            "loss_sequences_upper_95": 5.429437260841836,
            "loss_tokens_lower_95": 5.375844270833333,
            "loss_tokens_upper_95": 5.4118964375,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.9437385106710865,
            "data_time": 0.012962611882310165,
            "batch_time": 0.056323040472833735,
            "samples_per_second": 5364885.654741597,
            "samples_per_second_per_gpu": 670610.7068426997,
            "loss_sequences_lower_95": 4.917706080251963,
            "loss_sequences_upper_95": 4.9720535115346856,
            "loss_tokens_lower_95": 4.931452395833333,
            "loss_tokens_upper_95": 4.9560236875,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.790127374292389,
            "data_time": 0.09523296356201172,
            "batch_time": 0.13968616724014282,
            "samples_per_second": 4117318.0978618334,
            "samples_per_second_per_gpu": 514664.7622327292,
            "loss_sequences_lower_95": 4.700808430493363,
            "loss_sequences_upper_95": 4.900048803313961,
            "loss_tokens_lower_95": 4.77753415625,
            "loss_tokens_upper_95": 4.802625906249999,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.959321904088197,
            "data_time": 0.09543170779943466,
            "batch_time": 0.14157944917678833,
            "samples_per_second": 4019134.0715628406,
            "samples_per_second_per_gpu": 502391.7589453551,
            "loss_sequences_lower_95": 5.840249766474185,
            "loss_sequences_upper_95": 6.102148956178206,
            "loss_tokens_lower_95": 5.945801947916667,
            "loss_tokens_upper_95": 5.97255875,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.9985160779279925,
            "data_time": 0.010547641022451993,
            "batch_time": 0.053686531453297055,
            "samples_per_second": 5371593.97987764,
            "samples_per_second_per_gpu": 671449.247484705,
            "loss_sequences_lower_95": 4.987467010736434,
            "loss_sequences_upper_95": 5.010171426399548,
            "loss_tokens_lower_95": 4.985990343749999,
            "loss_tokens_upper_95": 5.01119115625,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.843744571881213,
            "data_time": 0.021851341426372527,
            "batch_time": 0.06390133202075958,
            "samples_per_second": 5095494.943589303,
            "samples_per_second_per_gpu": 636936.8679486628,
            "loss_sequences_lower_95": 4.8187965097550505,
            "loss_sequences_upper_95": 4.8697545964734745,
            "loss_tokens_lower_95": 4.830974697916667,
            "loss_tokens_upper_95": 4.856365958333333,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.0974173236328495,
            "data_time": 0.09301674365997314,
            "batch_time": 0.14405512809753418,
            "samples_per_second": 4194822.882786854,
            "samples_per_second_per_gpu": 524352.8603483568,
            "loss_sequences_lower_95": 5.001657878264452,
            "loss_sequences_upper_95": 5.214556414312088,
            "loss_tokens_lower_95": 5.083996645833333,
            "loss_tokens_upper_95": 5.110818197916667,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.667437903992756,
            "data_time": 0.0935932993888855,
            "batch_time": 0.1394834741950035,
            "samples_per_second": 4085008.760921964,
            "samples_per_second_per_gpu": 510626.0951152455,
            "loss_sequences_lower_95": 4.583492165621818,
            "loss_sequences_upper_95": 4.7690976269133465,
            "loss_tokens_lower_95": 4.654571979166667,
            "loss_tokens_upper_95": 4.6806447604166665,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.754093311049721,
            "data_time": 0.14103274047374725,
            "batch_time": 0.16141743957996368,
            "samples_per_second": 946733.8201244238,
            "samples_per_second_per_gpu": 118341.72751555298,
            "loss_sequences_lower_95": 5.672369991649281,
            "loss_sequences_upper_95": 5.841401048140092,
            "loss_tokens_lower_95": 5.727395274422385,
            "loss_tokens_upper_95": 5.780678168210116,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.1652630689192796,
            "data_time": 0.09456060081720352,
            "batch_time": 0.12927141040563583,
            "samples_per_second": 3306602.0872676424,
            "samples_per_second_per_gpu": 413325.2609084553,
            "loss_sequences_lower_95": 5.088966000392903,
            "loss_sequences_upper_95": 5.243417224939641,
            "loss_tokens_lower_95": 5.151210354166667,
            "loss_tokens_upper_95": 5.179448833333333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.340780672108592,
            "data_time": 0.09124821424484253,
            "batch_time": 0.12727603316307068,
            "samples_per_second": 3774479.522890689,
            "samples_per_second_per_gpu": 471809.9403613361,
            "loss_sequences_lower_95": 6.248283784810975,
            "loss_sequences_upper_95": 6.4630733600707035,
            "loss_tokens_lower_95": 6.329176291666666,
            "loss_tokens_upper_95": 6.352586875,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.5417471596452055,
            "data_time": 0.15148071944713593,
            "batch_time": 0.18056243658065796,
            "samples_per_second": 2267078.3289473597,
            "samples_per_second_per_gpu": 283384.79111841996,
            "loss_sequences_lower_95": 5.381708651683369,
            "loss_sequences_upper_95": 5.82519244835025,
            "loss_tokens_lower_95": 5.526888725405834,
            "loss_tokens_upper_95": 5.55642508835089,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.213560610546921,
            "data_time": 0.026086169210347263,
            "batch_time": 0.07048832747069272,
            "samples_per_second": 4516120.873198954,
            "samples_per_second_per_gpu": 564515.1091498693,
            "loss_sequences_lower_95": 5.198038098316657,
            "loss_sequences_upper_95": 5.228736321726784,
            "loss_tokens_lower_95": 5.197946603826912,
            "loss_tokens_upper_95": 5.228718170265988,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.300448142777067,
            "data_time": 0.028160186484456062,
            "batch_time": 0.07182868085801601,
            "samples_per_second": 4481535.3225683365,
            "samples_per_second_per_gpu": 560191.9153210421,
            "loss_sequences_lower_95": 4.307498253429347,
            "loss_sequences_upper_95": 4.333065898616436,
            "loss_tokens_lower_95": 4.2886570559827994,
            "loss_tokens_upper_95": 4.309828041975936,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.7143082643489995,
            "data_time": 0.046873274776670665,
            "batch_time": 0.0887672934267256,
            "samples_per_second": 4380928.534139425,
            "samples_per_second_per_gpu": 547616.0667674281,
            "loss_sequences_lower_95": 7.15642199401423,
            "loss_sequences_upper_95": 7.41232564184651,
            "loss_tokens_lower_95": 6.586018600991035,
            "loss_tokens_upper_95": 6.777748905933554,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.4277406865755715,
            "data_time": 0.041872020810842514,
            "batch_time": 0.08612631137172382,
            "samples_per_second": 4542259.971645995,
            "samples_per_second_per_gpu": 567782.4964557494,
            "loss_sequences_lower_95": 6.818187304687499,
            "loss_sequences_upper_95": 6.998170149739583,
            "loss_tokens_lower_95": 6.333056505503144,
            "loss_tokens_upper_95": 6.460797292649372,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.474834510826359,
            "data_time": 0.07420348127683003,
            "batch_time": 0.11463285982608795,
            "samples_per_second": 3889318.197308993,
            "samples_per_second_per_gpu": 486164.7746636241,
            "loss_sequences_lower_95": 4.5801844437073544,
            "loss_sequences_upper_95": 4.648902662854198,
            "loss_tokens_lower_95": 4.449433390201013,
            "loss_tokens_upper_95": 4.484283787367309,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.481307222626426,
            "data_time": 0.3598563075065613,
            "batch_time": 0.4024905264377594,
            "samples_per_second": 2512557.2237548004,
            "samples_per_second_per_gpu": 314069.65296935005,
            "loss_sequences_lower_95": 4.484883408979936,
            "loss_sequences_upper_95": 4.617851278131658,
            "loss_tokens_lower_95": 4.441048991295475,
            "loss_tokens_upper_95": 4.504908805615742,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.599509499997509,
            "data_time": 0.336921364068985,
            "batch_time": 0.3839019238948822,
            "samples_per_second": 2706731.241004246,
            "samples_per_second_per_gpu": 338341.4051255307,
            "loss_sequences_lower_95": 4.648015684789541,
            "loss_sequences_upper_95": 4.853885934012276,
            "loss_tokens_lower_95": 4.550896559749492,
            "loss_tokens_upper_95": 4.656627996466655,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.737292736371359,
            "data_time": 0.18853545188903809,
            "batch_time": 0.22032135725021362,
            "samples_per_second": 2713042.2228357457,
            "samples_per_second_per_gpu": 339130.2778544682,
            "loss_sequences_lower_95": 4.733786580403646,
            "loss_sequences_upper_95": 4.840682779947917,
            "loss_tokens_lower_95": 4.627906178779743,
            "loss_tokens_upper_95": 4.843350276310453,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.663700055749883,
            "data_time": 0.0256094541400671,
            "batch_time": 0.06991050001233816,
            "samples_per_second": 4500443.954548499,
            "samples_per_second_per_gpu": 562555.4943185623,
            "loss_sequences_lower_95": 8.742525051055559,
            "loss_sequences_upper_95": 8.817349682133015,
            "loss_tokens_lower_95": 8.607919521059296,
            "loss_tokens_upper_95": 8.686292756163695,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.526689911530877,
            "data_time": 0.047333750128746035,
            "batch_time": 0.08974845856428146,
            "samples_per_second": 4425736.992047848,
            "samples_per_second_per_gpu": 553217.124005981,
            "loss_sequences_lower_95": 6.759555598300715,
            "loss_sequences_upper_95": 7.057968170474274,
            "loss_tokens_lower_95": 5.380234942665758,
            "loss_tokens_upper_95": 5.5258645482977125,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.203956415628817,
            "data_time": 0.08956789672374725,
            "batch_time": 0.1317744106054306,
            "samples_per_second": 4213467.039718958,
            "samples_per_second_per_gpu": 526683.3799648697,
            "loss_sequences_lower_95": 5.977706232168568,
            "loss_sequences_upper_95": 6.305626212370681,
            "loss_tokens_lower_95": 5.100824098688589,
            "loss_tokens_upper_95": 5.268298328026682,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.540911572164596,
            "data_time": 0.3552882969379425,
            "batch_time": 0.3979909420013428,
            "samples_per_second": 2390714.7486394765,
            "samples_per_second_per_gpu": 298839.34357993456,
            "loss_sequences_lower_95": 5.467242891494542,
            "loss_sequences_upper_95": 5.61668918557363,
            "loss_tokens_lower_95": 5.464172739525364,
            "loss_tokens_upper_95": 5.617218965155893,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.664859795570374,
            "data_time": 0.31222429871559143,
            "batch_time": 0.33740128576755524,
            "samples_per_second": 1593510.1426128158,
            "samples_per_second_per_gpu": 199188.76782660198,
            "loss_sequences_lower_95": 4.600688316345215,
            "loss_sequences_upper_95": 5.0097571487426755,
            "loss_tokens_lower_95": 4.4169130710881594,
            "loss_tokens_upper_95": 4.890405000471684,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.381789448317388,
            "data_time": 0.053381336852908134,
            "batch_time": 0.09708563052117825,
            "samples_per_second": 4540602.715068906,
            "samples_per_second_per_gpu": 567575.3393836132,
            "loss_sequences_lower_95": 5.339369087648734,
            "loss_sequences_upper_95": 5.424411476158681,
            "loss_tokens_lower_95": 5.3389949052824335,
            "loss_tokens_upper_95": 5.42437290593814,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.72281888896183,
            "data_time": 0.0779606580734253,
            "batch_time": 0.1216256707906723,
            "samples_per_second": 4386183.179179328,
            "samples_per_second_per_gpu": 548272.897397416,
            "loss_sequences_lower_95": 5.674361085384034,
            "loss_sequences_upper_95": 5.769892152228578,
            "loss_tokens_lower_95": 5.673536036036037,
            "loss_tokens_upper_95": 5.7706737321483415,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.774195661482536,
            "data_time": 0.05377841368317604,
            "batch_time": 0.09524586237967014,
            "samples_per_second": 4210557.074951021,
            "samples_per_second_per_gpu": 526319.6343688776,
            "loss_sequences_lower_95": 4.983977016245579,
            "loss_sequences_upper_95": 5.100956027058963,
            "loss_tokens_lower_95": 4.74492400662407,
            "loss_tokens_upper_95": 4.80810422937683,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.383690349578857,
            "data_time": 0.21135875582695007,
            "batch_time": 0.25669655203819275,
            "samples_per_second": 3391872.4020114774,
            "samples_per_second_per_gpu": 423984.0502514347,
            "loss_sequences_lower_95": 6.984820654296875,
            "loss_sequences_upper_95": 7.485214892578125,
            "loss_tokens_lower_95": 6.14932403261499,
            "loss_tokens_upper_95": 6.494983887688203,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.18323341012001,
            "data_time": 0.14944593608379364,
            "batch_time": 0.16635537147521973,
            "samples_per_second": 770382.1155927645,
            "samples_per_second_per_gpu": 96297.76444909556,
            "loss_sequences_lower_95": 4.871235907077789,
            "loss_sequences_upper_95": 5.62543978691101,
            "loss_tokens_lower_95": 4.658731999890557,
            "loss_tokens_upper_95": 5.50296473009833,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.884316427954312,
            "data_time": 0.331530898809433,
            "batch_time": 0.3669385761022568,
            "samples_per_second": 2403912.5147581846,
            "samples_per_second_per_gpu": 300489.06434477307,
            "loss_sequences_lower_95": 5.563850481756803,
            "loss_sequences_upper_95": 6.0857366276883536,
            "loss_tokens_lower_95": 4.6391384395698285,
            "loss_tokens_upper_95": 5.0398934313785935,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.663319333769339,
            "data_time": 0.04802305830849542,
            "batch_time": 0.09259167148007287,
            "samples_per_second": 4537237.637672357,
            "samples_per_second_per_gpu": 567154.7047090447,
            "loss_sequences_lower_95": 5.6447751346453146,
            "loss_sequences_upper_95": 5.682303804370085,
            "loss_tokens_lower_95": 5.645150710172974,
            "loss_tokens_upper_95": 5.682094114344419,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.844569927184484,
            "data_time": 0.037244791785875954,
            "batch_time": 0.08039792804490953,
            "samples_per_second": 4369160.652032125,
            "samples_per_second_per_gpu": 546145.0815040156,
            "loss_sequences_lower_95": 5.972020053548904,
            "loss_sequences_upper_95": 6.1997262289564325,
            "loss_tokens_lower_95": 5.7018872079213,
            "loss_tokens_upper_95": 5.929007537652661,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.207420914164393,
            "data_time": 0.17091941088438034,
            "batch_time": 0.20053132623434067,
            "samples_per_second": 1903499.8598076666,
            "samples_per_second_per_gpu": 237937.48247595833,
            "loss_sequences_lower_95": 4.120529700198889,
            "loss_sequences_upper_95": 4.481922269828154,
            "loss_tokens_lower_95": 4.001615348201897,
            "loss_tokens_upper_95": 4.334432286095773,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.513092274729813,
            "data_time": 0.08338010013103485,
            "batch_time": 0.12834837436676025,
            "samples_per_second": 4268983.857327277,
            "samples_per_second_per_gpu": 533622.9821659096,
            "loss_sequences_lower_95": 4.5720076298280885,
            "loss_sequences_upper_95": 4.705877796344712,
            "loss_tokens_lower_95": 4.43305471179505,
            "loss_tokens_upper_95": 4.585266439540784,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.375805566950542,
            "data_time": 0.32141102850437164,
            "batch_time": 0.35634882748126984,
            "samples_per_second": 2070778.5996117848,
            "samples_per_second_per_gpu": 258847.3249514731,
            "loss_sequences_lower_95": 4.213597544228159,
            "loss_sequences_upper_95": 4.7014369220268435,
            "loss_tokens_lower_95": 4.185179090888815,
            "loss_tokens_upper_95": 4.5780092223822395,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.204227435204906,
            "data_time": 0.028191371023841786,
            "batch_time": 0.07232299755359518,
            "samples_per_second": 4431410.685708646,
            "samples_per_second_per_gpu": 553926.3357135807,
            "loss_sequences_lower_95": 5.197495202165434,
            "loss_sequences_upper_95": 5.21076015007689,
            "loss_tokens_lower_95": 5.197457450474469,
            "loss_tokens_upper_95": 5.21098521462105,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.1182626080744473,
            "data_time": 0.29841703176498413,
            "batch_time": 0.3255063593387604,
            "samples_per_second": 1514657.199503715,
            "samples_per_second_per_gpu": 189332.14993796436,
            "loss_sequences_lower_95": 2.996171784632414,
            "loss_sequences_upper_95": 3.3309246396555485,
            "loss_tokens_lower_95": 2.883173788142912,
            "loss_tokens_upper_95": 3.2672240461273345,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.977803175829242,
            "data_time": 0.02409351021051407,
            "batch_time": 0.06810206055641174,
            "samples_per_second": 4461070.549499379,
            "samples_per_second_per_gpu": 557633.8186874223,
            "loss_sequences_lower_95": 6.894379893049004,
            "loss_sequences_upper_95": 6.94339336019392,
            "loss_tokens_lower_95": 5.870599903288201,
            "loss_tokens_upper_95": 5.920462729690522,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.124026101112365,
            "data_time": 0.10412833467125893,
            "batch_time": 0.1492183618247509,
            "samples_per_second": 4244800.780841875,
            "samples_per_second_per_gpu": 530600.0976052344,
            "loss_sequences_lower_95": 6.1636361328125,
            "loss_sequences_upper_95": 6.404560815429687,
            "loss_tokens_lower_95": 5.999865486634764,
            "loss_tokens_upper_95": 6.221830646842372,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.249188470840454,
            "data_time": 0.3247164934873581,
            "batch_time": 0.3675343692302704,
            "samples_per_second": 2402371.7097814926,
            "samples_per_second_per_gpu": 300296.4637226866,
            "loss_sequences_lower_95": 5.110111508576766,
            "loss_sequences_upper_95": 5.384341762376868,
            "loss_tokens_lower_95": 5.110775624150815,
            "loss_tokens_upper_95": 5.3837710836659305,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.051693018277486,
            "data_time": 0.06766083836555481,
            "batch_time": 0.10824097941319148,
            "samples_per_second": 3966801.5901059024,
            "samples_per_second_per_gpu": 495850.1987632378,
            "loss_sequences_lower_95": 7.927199762517756,
            "loss_sequences_upper_95": 8.176610551313921,
            "loss_tokens_lower_95": 7.926792842980587,
            "loss_tokens_upper_95": 8.176773663145124,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.164906356493632,
            "data_time": 0.06651796648899715,
            "batch_time": 0.11071739097436269,
            "samples_per_second": 4492793.19956234,
            "samples_per_second_per_gpu": 561599.1499452925,
            "loss_sequences_lower_95": 2.328848836263021,
            "loss_sequences_upper_95": 2.441089599609375,
            "loss_tokens_lower_95": 2.111411115227341,
            "loss_tokens_upper_95": 2.1961963144632857,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.3142008849552695,
            "data_time": 0.3375861346721649,
            "batch_time": 0.3782302886247635,
            "samples_per_second": 2326954.6005483214,
            "samples_per_second_per_gpu": 290869.3250685402,
            "loss_sequences_lower_95": 6.02387923467727,
            "loss_sequences_upper_95": 6.608935139973958,
            "loss_tokens_lower_95": 6.018224661690849,
            "loss_tokens_upper_95": 6.608031805129278,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.0169942304492,
            "data_time": 0.1478671133518219,
            "batch_time": 0.16466516256332397,
            "samples_per_second": 839665.0641231969,
            "samples_per_second_per_gpu": 104958.13301539961,
            "loss_sequences_lower_95": 3.685827147960663,
            "loss_sequences_upper_95": 4.974603760242462,
            "loss_tokens_lower_95": 3.4354108546935405,
            "loss_tokens_upper_95": 4.024440930553318,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.625982395648956,
            "data_time": 0.09070733189582825,
            "batch_time": 0.13449524715542793,
            "samples_per_second": 4453734.686685955,
            "samples_per_second_per_gpu": 556716.8358357444,
            "loss_sequences_lower_95": 7.723084423828125,
            "loss_sequences_upper_95": 8.061915917968749,
            "loss_tokens_lower_95": 7.463638207553406,
            "loss_tokens_upper_95": 7.758611244679304,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.778900572776794,
            "data_time": 0.09555618464946747,
            "batch_time": 0.13965773582458496,
            "samples_per_second": 4311079.694548827,
            "samples_per_second_per_gpu": 538884.9618186033,
            "loss_sequences_lower_95": 8.081986853027344,
            "loss_sequences_upper_95": 8.345902294921876,
            "loss_tokens_lower_95": 7.644784853060841,
            "loss_tokens_upper_95": 7.8741504835378064,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.413012979976131,
            "data_time": 0.04038334637880325,
            "batch_time": 0.08406590049465497,
            "samples_per_second": 4571588.008946043,
            "samples_per_second_per_gpu": 571448.5011182554,
            "loss_sequences_lower_95": 5.401752402612199,
            "loss_sequences_upper_95": 5.424605651098411,
            "loss_tokens_lower_95": 5.4014355681544375,
            "loss_tokens_upper_95": 5.424458446496292,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.172659741385558,
            "data_time": 0.12347996731599171,
            "batch_time": 0.16431079804897308,
            "samples_per_second": 3847169.979658484,
            "samples_per_second_per_gpu": 480896.2474573105,
            "loss_sequences_lower_95": 5.096713158392137,
            "loss_sequences_upper_95": 5.2477460963751685,
            "loss_tokens_lower_95": 5.095955628660234,
            "loss_tokens_upper_95": 5.2471469111523135,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.9490792207717895,
            "data_time": 0.09077180176973343,
            "batch_time": 0.13558753207325935,
            "samples_per_second": 4328907.908413285,
            "samples_per_second_per_gpu": 541113.4885516607,
            "loss_sequences_lower_95": 7.893615832519531,
            "loss_sequences_upper_95": 8.0044216796875,
            "loss_tokens_lower_95": 7.892482373046875,
            "loss_tokens_upper_95": 8.002944189453125,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.664386831326877,
            "data_time": 0.030343901898179735,
            "batch_time": 0.07426854435886655,
            "samples_per_second": 4426031.930575826,
            "samples_per_second_per_gpu": 553253.9913219783,
            "loss_sequences_lower_95": 5.521856180153146,
            "loss_sequences_upper_95": 5.6131798611932355,
            "loss_tokens_lower_95": 4.571951515377725,
            "loss_tokens_upper_95": 4.6346720636224035,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.369673789437138,
            "data_time": 0.20618111746651785,
            "batch_time": 0.23885766948972428,
            "samples_per_second": 1845727.759545997,
            "samples_per_second_per_gpu": 230715.96994324963,
            "loss_sequences_lower_95": 5.234334074561276,
            "loss_sequences_upper_95": 5.502655598654676,
            "loss_tokens_lower_95": 5.231155794058273,
            "loss_tokens_upper_95": 5.503348199645085,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.2358391331691365,
            "data_time": 0.17202825844287872,
            "batch_time": 0.21776919066905975,
            "samples_per_second": 3863723.6131840944,
            "samples_per_second_per_gpu": 482965.4516480118,
            "loss_sequences_lower_95": 5.144557662664675,
            "loss_sequences_upper_95": 5.326337064855239,
            "loss_tokens_lower_95": 5.145764387542126,
            "loss_tokens_upper_95": 5.327501101026348,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.278364399867325,
            "data_time": 0.028265022672712803,
            "batch_time": 0.07209415128454566,
            "samples_per_second": 4491316.675711282,
            "samples_per_second_per_gpu": 561414.5844639102,
            "loss_sequences_lower_95": 5.926345583975323,
            "loss_sequences_upper_95": 6.014974716780972,
            "loss_tokens_lower_95": 5.183387661193496,
            "loss_tokens_upper_95": 5.2624345402032855,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.11414452961513,
            "data_time": 0.32680246233940125,
            "batch_time": 0.36447788774967194,
            "samples_per_second": 2245768.0701039503,
            "samples_per_second_per_gpu": 280721.0087629938,
            "loss_sequences_lower_95": 5.070045576146041,
            "loss_sequences_upper_95": 5.159600030808222,
            "loss_tokens_lower_95": 5.069421822684151,
            "loss_tokens_upper_95": 5.158449477745743,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.790367817076704,
            "data_time": 0.04340552595945505,
            "batch_time": 0.08758779901724595,
            "samples_per_second": 4475605.317875595,
            "samples_per_second_per_gpu": 559450.6647344494,
            "loss_sequences_lower_95": 6.768655524297592,
            "loss_sequences_upper_95": 6.812204373327599,
            "loss_tokens_lower_95": 6.7690219890816135,
            "loss_tokens_upper_95": 6.8116568711773695,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.517147869739718,
            "data_time": 0.2964777648448944,
            "batch_time": 0.33749136328697205,
            "samples_per_second": 2582230.23020251,
            "samples_per_second_per_gpu": 322778.77877531375,
            "loss_sequences_lower_95": 5.3637013259443265,
            "loss_sequences_upper_95": 5.670492287052488,
            "loss_tokens_lower_95": 5.361880967223529,
            "loss_tokens_upper_95": 5.67157509813031,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.45191216468811,
            "data_time": 0.2709283381700516,
            "batch_time": 0.2906177043914795,
            "samples_per_second": 1450527.1599573458,
            "samples_per_second_per_gpu": 181315.89499466823,
            "loss_sequences_lower_95": 5.208745676676432,
            "loss_sequences_upper_95": 6.037349573771158,
            "loss_tokens_lower_95": 4.678600162929959,
            "loss_tokens_upper_95": 6.045821401807997,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.895801067352295,
            "data_time": 0.2819478511810303,
            "batch_time": 0.3028194308280945,
            "samples_per_second": 1193535.4982406916,
            "samples_per_second_per_gpu": 149191.93728008645,
            "loss_sequences_lower_95": 4.787777315775553,
            "loss_sequences_upper_95": 5.825122884114584,
            "loss_tokens_lower_95": 4.0300160097272215,
            "loss_tokens_upper_95": 5.5106495375043885,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.801842282762935,
            "data_time": 0.0427328741976193,
            "batch_time": 0.08585758932999202,
            "samples_per_second": 4314619.5801588185,
            "samples_per_second_per_gpu": 539327.4475198523,
            "loss_sequences_lower_95": 8.781142707566273,
            "loss_sequences_upper_95": 8.822624119799338,
            "loss_tokens_lower_95": 8.781208823292527,
            "loss_tokens_upper_95": 8.822562304399852,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.4692349101453135,
            "data_time": 0.023315984674416253,
            "batch_time": 0.06769315403882073,
            "samples_per_second": 4462711.128300887,
            "samples_per_second_per_gpu": 557838.8910376108,
            "loss_sequences_lower_95": 3.072957306127334,
            "loss_sequences_upper_95": 3.1124422597321,
            "loss_tokens_lower_95": 2.412634959507727,
            "loss_tokens_upper_95": 2.4373389094935813,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.401768154985323,
            "data_time": 0.297004371881485,
            "batch_time": 0.32614997029304504,
            "samples_per_second": 2056864.4653596778,
            "samples_per_second_per_gpu": 257108.05816995972,
            "loss_sequences_lower_95": 6.65386923241803,
            "loss_sequences_upper_95": 7.102553372120294,
            "loss_tokens_lower_95": 6.201032826988915,
            "loss_tokens_upper_95": 6.504557207707719,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 10.57881076915844,
            "data_time": 0.21428202092647552,
            "batch_time": 0.23142611980438232,
            "samples_per_second": 1038255.5269891358,
            "samples_per_second_per_gpu": 129781.94087364197,
            "loss_sequences_lower_95": 10.069981239937448,
            "loss_sequences_upper_95": 11.264377532134185,
            "loss_tokens_lower_95": 9.380768801842207,
            "loss_tokens_upper_95": 11.440942345136477,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.225642791608485,
            "data_time": 0.34256093204021454,
            "batch_time": 0.3768317550420761,
            "samples_per_second": 2377012.187252476,
            "samples_per_second_per_gpu": 297126.5234065595,
            "loss_sequences_lower_95": 6.452543119104897,
            "loss_sequences_upper_95": 6.852791688500381,
            "loss_tokens_lower_95": 6.034169437289739,
            "loss_tokens_upper_95": 6.2860168241431875,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.394203921643699,
            "data_time": 0.3416355550289154,
            "batch_time": 0.3771461099386215,
            "samples_per_second": 1594844.5805668463,
            "samples_per_second_per_gpu": 199355.5725708558,
            "loss_sequences_lower_95": 6.626730960752906,
            "loss_sequences_upper_95": 6.99640689012481,
            "loss_tokens_lower_95": 6.221910434318928,
            "loss_tokens_upper_95": 6.432380416617803,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.3877860714749595,
            "data_time": 0.3317364901304245,
            "batch_time": 0.3779798448085785,
            "samples_per_second": 2384742.8777160584,
            "samples_per_second_per_gpu": 298092.8597145073,
            "loss_sequences_lower_95": 6.6123542971727325,
            "loss_sequences_upper_95": 7.082995810159822,
            "loss_tokens_lower_95": 6.16909466085933,
            "loss_tokens_upper_95": 6.497794058187176,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.508650142972062,
            "data_time": 0.312848225235939,
            "batch_time": 0.34782421588897705,
            "samples_per_second": 1860422.159626177,
            "samples_per_second_per_gpu": 232552.76995327213,
            "loss_sequences_lower_95": 6.668776163240759,
            "loss_sequences_upper_95": 6.996967204024152,
            "loss_tokens_lower_95": 6.348214807242991,
            "loss_tokens_upper_95": 6.5366308669806275,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.156323880142307,
            "data_time": 0.3470633178949356,
            "batch_time": 0.3815254718065262,
            "samples_per_second": 1776654.8939445778,
            "samples_per_second_per_gpu": 222081.86174307222,
            "loss_sequences_lower_95": 6.222065251510336,
            "loss_sequences_upper_95": 6.454417637416294,
            "loss_tokens_lower_95": 6.026543122989178,
            "loss_tokens_upper_95": 6.174103253418397,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.606809607366237,
            "data_time": 0.29701152443885803,
            "batch_time": 0.3317244201898575,
            "samples_per_second": 2371692.063862375,
            "samples_per_second_per_gpu": 296461.5079827969,
            "loss_sequences_lower_95": 5.793837337959103,
            "loss_sequences_upper_95": 6.05635052192502,
            "loss_tokens_lower_95": 5.479482969315099,
            "loss_tokens_upper_95": 5.620913589229204,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-8.0/params.txt",
    "uuid": "c1333eeb-a483-4233-b2b5-411cd06b9d31",
    "creation_date": "2023_12_14-05_01_01"
}