{
    "name": "rw_original-d=96_l=8_h=4-32.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 6764359680,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 32.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "1352871936",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=96_l=8_h=4-32.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 4.810756309827169,
            "data_time": 0.12565410137176514,
            "batch_time": 1.2940608710050583,
            "samples_per_second": 371777.8699259998,
            "samples_per_second_per_gpu": 46472.23374074997,
            "loss_sequences_lower_95": 4.709044481913248,
            "loss_sequences_upper_95": 4.912632179260253,
            "loss_tokens_lower_95": 4.796067021687826,
            "loss_tokens_upper_95": 4.825214831034343,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.447244461612797,
            "data_time": 0.018941197993276173,
            "batch_time": 0.06397892496912612,
            "samples_per_second": 4673703.229943244,
            "samples_per_second_per_gpu": 584212.9037429055,
            "loss_sequences_lower_95": 4.4449717239320465,
            "loss_sequences_upper_95": 4.449499405875363,
            "loss_tokens_lower_95": 4.43576284375,
            "loss_tokens_upper_95": 4.4585080625,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.868409921198475,
            "data_time": 0.09441585093736649,
            "batch_time": 0.1394461691379547,
            "samples_per_second": 4084917.8823047928,
            "samples_per_second_per_gpu": 510614.7352880991,
            "loss_sequences_lower_95": 3.8236110391422193,
            "loss_sequences_upper_95": 3.921233246472417,
            "loss_tokens_lower_95": 3.85438203125,
            "loss_tokens_upper_95": 3.8822051666666666,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.561080433855352,
            "data_time": 0.013968732011945624,
            "batch_time": 0.05784641206264496,
            "samples_per_second": 5327273.542978736,
            "samples_per_second_per_gpu": 665909.192872342,
            "loss_sequences_lower_95": 4.529993264739046,
            "loss_sequences_upper_95": 4.59247962306701,
            "loss_tokens_lower_95": 4.548714125,
            "loss_tokens_upper_95": 4.57342334375,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.489631662543582,
            "data_time": 0.09351732581853867,
            "batch_time": 0.13826905190944672,
            "samples_per_second": 4099817.3516224492,
            "samples_per_second_per_gpu": 512477.16895280615,
            "loss_sequences_lower_95": 4.4417440690236765,
            "loss_sequences_upper_95": 4.546896194489085,
            "loss_tokens_lower_95": 4.477765260416667,
            "loss_tokens_upper_95": 4.5014409791666665,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.7738928477185505,
            "data_time": 0.03575227657953898,
            "batch_time": 0.0787848283847173,
            "samples_per_second": 4971121.056754331,
            "samples_per_second_per_gpu": 621390.1320942914,
            "loss_sequences_lower_95": 4.732414497671121,
            "loss_sequences_upper_95": 4.819046719652872,
            "loss_tokens_lower_95": 4.760825572916667,
            "loss_tokens_upper_95": 4.7869119375,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.829157780043933,
            "data_time": 0.013057399541139603,
            "batch_time": 0.05549030676484108,
            "samples_per_second": 5236165.338739746,
            "samples_per_second_per_gpu": 654520.6673424683,
            "loss_sequences_lower_95": 4.797036939971301,
            "loss_sequences_upper_95": 4.860707718829719,
            "loss_tokens_lower_95": 4.81297728125,
            "loss_tokens_upper_95": 4.84555440625,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.7892844051341115,
            "data_time": 0.015473648121482447,
            "batch_time": 0.058763391877475535,
            "samples_per_second": 5225175.682343955,
            "samples_per_second_per_gpu": 653146.9602929944,
            "loss_sequences_lower_95": 4.769285381217277,
            "loss_sequences_upper_95": 4.811169523069371,
            "loss_tokens_lower_95": 4.777400114583333,
            "loss_tokens_upper_95": 4.801244645833333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.599875239821953,
            "data_time": 0.0962090790271759,
            "batch_time": 0.14079006761312485,
            "samples_per_second": 4101464.555879176,
            "samples_per_second_per_gpu": 512683.069484897,
            "loss_sequences_lower_95": 4.529031384475832,
            "loss_sequences_upper_95": 4.687585213513878,
            "loss_tokens_lower_95": 4.587613510416666,
            "loss_tokens_upper_95": 4.611976427083333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.720666991863326,
            "data_time": 0.09520187973976135,
            "batch_time": 0.14000889658927917,
            "samples_per_second": 4239171.687672017,
            "samples_per_second_per_gpu": 529896.4609590021,
            "loss_sequences_lower_95": 5.6310332166347585,
            "loss_sequences_upper_95": 5.829063928174407,
            "loss_tokens_lower_95": 5.70757025,
            "loss_tokens_upper_95": 5.733583041666666,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.770436673388508,
            "data_time": 0.010898351669311523,
            "batch_time": 0.05418990757958642,
            "samples_per_second": 5366575.64650499,
            "samples_per_second_per_gpu": 670821.9558131237,
            "loss_sequences_lower_95": 4.762385949491229,
            "loss_sequences_upper_95": 4.778504128944258,
            "loss_tokens_lower_95": 4.757982385416667,
            "loss_tokens_upper_95": 4.782937958333333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.613121577671596,
            "data_time": 0.02276058942079544,
            "batch_time": 0.08420580625534058,
            "samples_per_second": 5030747.468936943,
            "samples_per_second_per_gpu": 628843.4336171179,
            "loss_sequences_lower_95": 4.597596358776292,
            "loss_sequences_upper_95": 4.629019850127551,
            "loss_tokens_lower_95": 4.6005828541666665,
            "loss_tokens_upper_95": 4.625341260416667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.888949082782737,
            "data_time": 0.09648467600345612,
            "batch_time": 0.14109450578689575,
            "samples_per_second": 4117485.581358166,
            "samples_per_second_per_gpu": 514685.69766977074,
            "loss_sequences_lower_95": 4.815391840750983,
            "loss_sequences_upper_95": 4.979315383632575,
            "loss_tokens_lower_95": 4.876218635416667,
            "loss_tokens_upper_95": 4.90179834375,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.394147570895566,
            "data_time": 0.0975152850151062,
            "batch_time": 0.14161322265863419,
            "samples_per_second": 4199820.837607646,
            "samples_per_second_per_gpu": 524977.6047009558,
            "loss_sequences_lower_95": 4.321804243969577,
            "loss_sequences_upper_95": 4.475306704767853,
            "loss_tokens_lower_95": 4.381771645833333,
            "loss_tokens_upper_95": 4.40679653125,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.633532372387973,
            "data_time": 0.1569281816482544,
            "batch_time": 0.1792983114719391,
            "samples_per_second": 1003821.4244850366,
            "samples_per_second_per_gpu": 125477.67806062958,
            "loss_sequences_lower_95": 5.563687940077348,
            "loss_sequences_upper_95": 5.7025061954151495,
            "loss_tokens_lower_95": 5.606937763907693,
            "loss_tokens_upper_95": 5.660625449093906,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.83431437828798,
            "data_time": 0.09827633947134018,
            "batch_time": 0.13265929371118546,
            "samples_per_second": 3384398.6400484857,
            "samples_per_second_per_gpu": 423049.8300060607,
            "loss_sequences_lower_95": 4.766931677281683,
            "loss_sequences_upper_95": 4.902558227183172,
            "loss_tokens_lower_95": 4.821018322916666,
            "loss_tokens_upper_95": 4.847887510416666,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.204101770094013,
            "data_time": 0.09668169170618057,
            "batch_time": 0.13326528668403625,
            "samples_per_second": 3621731.6441197335,
            "samples_per_second_per_gpu": 452716.4555149667,
            "loss_sequences_lower_95": 6.12687622714483,
            "loss_sequences_upper_95": 6.30176658932326,
            "loss_tokens_lower_95": 6.192445395833333,
            "loss_tokens_upper_95": 6.2159924375,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.3203055663187,
            "data_time": 0.16161425411701202,
            "batch_time": 0.19152429699897766,
            "samples_per_second": 2210253.3743433645,
            "samples_per_second_per_gpu": 276281.67179292056,
            "loss_sequences_lower_95": 5.185720199835105,
            "loss_sequences_upper_95": 5.552401858470478,
            "loss_tokens_lower_95": 5.305593934606333,
            "loss_tokens_upper_95": 5.334907081478932,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.232906830241208,
            "data_time": 0.027231974493373525,
            "batch_time": 0.0715928462418643,
            "samples_per_second": 4522003.07905828,
            "samples_per_second_per_gpu": 565250.384882285,
            "loss_sequences_lower_95": 5.215380615964606,
            "loss_sequences_upper_95": 5.250309409383457,
            "loss_tokens_lower_95": 5.214724103359564,
            "loss_tokens_upper_95": 5.250693775259044,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.12725002154058,
            "data_time": 0.03403317257761955,
            "batch_time": 0.07791428826749325,
            "samples_per_second": 4338347.020528078,
            "samples_per_second_per_gpu": 542293.3775660098,
            "loss_sequences_lower_95": 4.127147960985734,
            "loss_sequences_upper_95": 4.152961662579666,
            "loss_tokens_lower_95": 4.115941307192686,
            "loss_tokens_upper_95": 4.137110919290521,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.27295772253369,
            "data_time": 0.05486149423652225,
            "batch_time": 0.09700734913349152,
            "samples_per_second": 4262458.581291795,
            "samples_per_second_per_gpu": 532807.3226614743,
            "loss_sequences_lower_95": 6.673048016706277,
            "loss_sequences_upper_95": 6.929868120239431,
            "loss_tokens_lower_95": 6.145473631649094,
            "loss_tokens_upper_95": 6.3433110355052715,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.224773097515106,
            "data_time": 0.04109057908256849,
            "batch_time": 0.08490800112485886,
            "samples_per_second": 4521943.704823772,
            "samples_per_second_per_gpu": 565242.9631029716,
            "loss_sequences_lower_95": 6.620089078776041,
            "loss_sequences_upper_95": 6.808731477864583,
            "loss_tokens_lower_95": 6.125848663522013,
            "loss_tokens_upper_95": 6.258671838148585,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.40105486205349,
            "data_time": 0.07881440967321396,
            "batch_time": 0.1187820037206014,
            "samples_per_second": 3815545.5700769364,
            "samples_per_second_per_gpu": 476943.19625961705,
            "loss_sequences_lower_95": 4.526122213946408,
            "loss_sequences_upper_95": 4.597825093880307,
            "loss_tokens_lower_95": 4.373975928032136,
            "loss_tokens_upper_95": 4.409191112710935,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.028903953595595,
            "data_time": 0.3491189181804657,
            "batch_time": 0.39146481454372406,
            "samples_per_second": 2146190.608544102,
            "samples_per_second_per_gpu": 268273.82606801274,
            "loss_sequences_lower_95": 4.029511885209517,
            "loss_sequences_upper_95": 4.157260353781961,
            "loss_tokens_lower_95": 3.990345157714401,
            "loss_tokens_upper_95": 4.051577033785145,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.453768601709482,
            "data_time": 0.3614335358142853,
            "batch_time": 0.4077148586511612,
            "samples_per_second": 2541467.454041376,
            "samples_per_second_per_gpu": 317683.431755172,
            "loss_sequences_lower_95": 4.5137878417968755,
            "loss_sequences_upper_95": 4.721791543765943,
            "loss_tokens_lower_95": 4.40262928708954,
            "loss_tokens_upper_95": 4.508301614336493,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.587337042490641,
            "data_time": 0.1689414530992508,
            "batch_time": 0.20052992552518845,
            "samples_per_second": 2616792.2009236044,
            "samples_per_second_per_gpu": 327099.02511545055,
            "loss_sequences_lower_95": 4.552930836995443,
            "loss_sequences_upper_95": 4.648706461588541,
            "loss_tokens_lower_95": 4.468872251380948,
            "loss_tokens_upper_95": 4.707171683671006,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.603645780613195,
            "data_time": 0.02458294704556465,
            "batch_time": 0.06883935667574406,
            "samples_per_second": 4512254.58032158,
            "samples_per_second_per_gpu": 564031.8225401975,
            "loss_sequences_lower_95": 7.682624078847006,
            "loss_sequences_upper_95": 7.747636700488411,
            "loss_tokens_lower_95": 7.55298098631086,
            "loss_tokens_upper_95": 7.622067559516699,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.292634348576318,
            "data_time": 0.04943305402994156,
            "batch_time": 0.09189653545618057,
            "samples_per_second": 4441501.085362478,
            "samples_per_second_per_gpu": 555187.6356703098,
            "loss_sequences_lower_95": 6.476464658794981,
            "loss_sequences_upper_95": 6.762948690600668,
            "loss_tokens_lower_95": 5.14912320032215,
            "loss_tokens_upper_95": 5.290831277762404,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.006823510440136,
            "data_time": 0.07675204873085022,
            "batch_time": 0.11932598352432251,
            "samples_per_second": 4286843.511390434,
            "samples_per_second_per_gpu": 535855.4389238042,
            "loss_sequences_lower_95": 5.760989848582818,
            "loss_sequences_upper_95": 6.078971805507412,
            "loss_tokens_lower_95": 4.9028673091370205,
            "loss_tokens_upper_95": 5.066931888940989,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.4712827216544655,
            "data_time": 0.372269868850708,
            "batch_time": 0.41348135471343994,
            "samples_per_second": 2530625.18728358,
            "samples_per_second_per_gpu": 316328.1484104475,
            "loss_sequences_lower_95": 6.353095039820562,
            "loss_sequences_upper_95": 6.590582637699772,
            "loss_tokens_lower_95": 6.35286603256992,
            "loss_tokens_upper_95": 6.589408965089006,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.553444457054138,
            "data_time": 0.31780460476875305,
            "batch_time": 0.3430122137069702,
            "samples_per_second": 1316418.8983768253,
            "samples_per_second_per_gpu": 164552.36229710316,
            "loss_sequences_lower_95": 4.482492858886719,
            "loss_sequences_upper_95": 4.919396667480468,
            "loss_tokens_lower_95": 4.290572033713244,
            "loss_tokens_upper_95": 4.7887092494794,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.8344275312023495,
            "data_time": 0.06161569058895111,
            "batch_time": 0.10510281473398209,
            "samples_per_second": 4239785.373231954,
            "samples_per_second_per_gpu": 529973.1716539942,
            "loss_sequences_lower_95": 5.7792462976586485,
            "loss_sequences_upper_95": 5.890323360134659,
            "loss_tokens_lower_95": 5.778209274744914,
            "loss_tokens_upper_95": 5.890263187092183,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.074681902986193,
            "data_time": 0.08336237370967865,
            "batch_time": 0.1268195927143097,
            "samples_per_second": 4302481.8065125095,
            "samples_per_second_per_gpu": 537810.2258140637,
            "loss_sequences_lower_95": 6.019152592089604,
            "loss_sequences_upper_95": 6.127775385026362,
            "loss_tokens_lower_95": 6.019346914830313,
            "loss_tokens_upper_95": 6.130420541724252,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.588737316048573,
            "data_time": 0.051049813628196716,
            "batch_time": 0.09224198944866657,
            "samples_per_second": 4214084.507574469,
            "samples_per_second_per_gpu": 526760.5634468086,
            "loss_sequences_lower_95": 4.840122173919512,
            "loss_sequences_upper_95": 4.960704888975789,
            "loss_tokens_lower_95": 4.553004221006784,
            "loss_tokens_upper_95": 4.6140186354789865,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.160965495109558,
            "data_time": 0.18737846612930298,
            "batch_time": 0.2334325909614563,
            "samples_per_second": 3638528.2944335695,
            "samples_per_second_per_gpu": 454816.0368041962,
            "loss_sequences_lower_95": 6.7486974731445315,
            "loss_sequences_upper_95": 7.260653186035157,
            "loss_tokens_lower_95": 5.933377351893406,
            "loss_tokens_upper_95": 6.278634916213393,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.833494275808334,
            "data_time": 0.1546497493982315,
            "batch_time": 0.17186614871025085,
            "samples_per_second": 981797.9846369791,
            "samples_per_second_per_gpu": 122724.74807962238,
            "loss_sequences_lower_95": 4.539564669132233,
            "loss_sequences_upper_95": 5.228948318958283,
            "loss_tokens_lower_95": 4.320385321255388,
            "loss_tokens_upper_95": 5.17079383587015,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.708654453014505,
            "data_time": 0.32983458042144775,
            "batch_time": 0.36564384400844574,
            "samples_per_second": 2509034.8217894416,
            "samples_per_second_per_gpu": 313629.3527236802,
            "loss_sequences_lower_95": 5.438726490941541,
            "loss_sequences_upper_95": 5.976827511842223,
            "loss_tokens_lower_95": 4.456622630982684,
            "loss_tokens_upper_95": 4.853900235938381,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.324925514119533,
            "data_time": 0.055238771769735545,
            "batch_time": 0.10015934705734253,
            "samples_per_second": 4540445.848678879,
            "samples_per_second_per_gpu": 567555.7310848599,
            "loss_sequences_lower_95": 5.310026169656509,
            "loss_sequences_upper_95": 5.339944446008765,
            "loss_tokens_lower_95": 5.31020886449118,
            "loss_tokens_upper_95": 5.339919957285523,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.467225402078882,
            "data_time": 0.03303883969783783,
            "batch_time": 0.07624409383251554,
            "samples_per_second": 4407376.550380287,
            "samples_per_second_per_gpu": 550922.0687975358,
            "loss_sequences_lower_95": 5.583370787496362,
            "loss_sequences_upper_95": 5.8062794503808455,
            "loss_tokens_lower_95": 5.326644473574781,
            "loss_tokens_upper_95": 5.547021074952151,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.059572418967446,
            "data_time": 0.18556265532970428,
            "batch_time": 0.2153196781873703,
            "samples_per_second": 1840859.1381902392,
            "samples_per_second_per_gpu": 230107.3922737799,
            "loss_sequences_lower_95": 3.9895245003612922,
            "loss_sequences_upper_95": 4.360998904050052,
            "loss_tokens_lower_95": 3.8622492194727105,
            "loss_tokens_upper_95": 4.194169759265148,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.386678710732765,
            "data_time": 0.08698645532131195,
            "batch_time": 0.1324714869260788,
            "samples_per_second": 4123553.821942933,
            "samples_per_second_per_gpu": 515444.2277428666,
            "loss_sequences_lower_95": 4.463828639487347,
            "loss_sequences_upper_95": 4.6013605299951905,
            "loss_tokens_lower_95": 4.302813744300513,
            "loss_tokens_upper_95": 4.456544462567239,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.940787360435579,
            "data_time": 0.3068147450685501,
            "batch_time": 0.34142130613327026,
            "samples_per_second": 2100539.160065723,
            "samples_per_second_per_gpu": 262567.3950082154,
            "loss_sequences_lower_95": 3.815077153647818,
            "loss_sequences_upper_95": 4.331153720762671,
            "loss_tokens_lower_95": 3.74563672313099,
            "loss_tokens_upper_95": 4.149653244640855,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.253879731596649,
            "data_time": 0.027714775273792423,
            "batch_time": 0.07171203275459313,
            "samples_per_second": 4454752.798086299,
            "samples_per_second_per_gpu": 556844.0997607873,
            "loss_sequences_lower_95": 5.244087137740047,
            "loss_sequences_upper_95": 5.263527021810612,
            "loss_tokens_lower_95": 5.244147169277606,
            "loss_tokens_upper_95": 5.263597348375924,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.362815074550295,
            "data_time": 0.31122367084026337,
            "batch_time": 0.33815906941890717,
            "samples_per_second": 1579681.7043632872,
            "samples_per_second_per_gpu": 197460.2130454109,
            "loss_sequences_lower_95": 2.289700750702793,
            "loss_sequences_upper_95": 2.5781269110522222,
            "loss_tokens_lower_95": 2.1505831949786494,
            "loss_tokens_upper_95": 2.498657282787311,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.251517305609065,
            "data_time": 0.022855591277281442,
            "batch_time": 0.06697397321462631,
            "samples_per_second": 4508267.049271546,
            "samples_per_second_per_gpu": 563533.3811589433,
            "loss_sequences_lower_95": 6.131691213803721,
            "loss_sequences_upper_95": 6.180216266624083,
            "loss_tokens_lower_95": 5.149502163926499,
            "loss_tokens_upper_95": 5.197558099613153,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.4646238412857056,
            "data_time": 0.09550786018371582,
            "batch_time": 0.14005433395504951,
            "samples_per_second": 4225474.2342556575,
            "samples_per_second_per_gpu": 528184.2792819572,
            "loss_sequences_lower_95": 5.447383642578125,
            "loss_sequences_upper_95": 5.685309619140625,
            "loss_tokens_lower_95": 5.348896401295437,
            "loss_tokens_upper_95": 5.565693389585751,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.289202304508375,
            "data_time": 0.3914306312799454,
            "batch_time": 0.4353247284889221,
            "samples_per_second": 2319358.69265362,
            "samples_per_second_per_gpu": 289919.8365817025,
            "loss_sequences_lower_95": 5.153036897078804,
            "loss_sequences_upper_95": 5.421707193125849,
            "loss_tokens_lower_95": 5.1534905607804005,
            "loss_tokens_upper_95": 5.4235145104449725,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.715910587888775,
            "data_time": 0.06416759888331096,
            "batch_time": 0.10434551537036896,
            "samples_per_second": 4001387.188728024,
            "samples_per_second_per_gpu": 500173.398591003,
            "loss_sequences_lower_95": 6.63132431492661,
            "loss_sequences_upper_95": 6.8011707652698865,
            "loss_tokens_lower_95": 6.630011171283144,
            "loss_tokens_upper_95": 6.802258800159801,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 1.7903056086699167,
            "data_time": 0.07161315530538559,
            "batch_time": 0.11562098066012065,
            "samples_per_second": 4376404.3176016575,
            "samples_per_second_per_gpu": 547050.5397002072,
            "loss_sequences_lower_95": 1.9494066121419271,
            "loss_sequences_upper_95": 2.0541319946289063,
            "loss_tokens_lower_95": 1.7405390554659363,
            "loss_tokens_upper_95": 1.817946877969938,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.401378688358125,
            "data_time": 0.35991835594177246,
            "batch_time": 0.40071994066238403,
            "samples_per_second": 1993006.4568761718,
            "samples_per_second_per_gpu": 249125.80710952147,
            "loss_sequences_lower_95": 6.081961263020833,
            "loss_sequences_upper_95": 6.729144592285156,
            "loss_tokens_lower_95": 6.079639078776042,
            "loss_tokens_upper_95": 6.726317967006138,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.5187269002199173,
            "data_time": 0.146863654255867,
            "batch_time": 0.16456055641174316,
            "samples_per_second": 964310.5606568411,
            "samples_per_second_per_gpu": 120538.82008210514,
            "loss_sequences_lower_95": 3.2478528141975405,
            "loss_sequences_upper_95": 4.499534797668457,
            "loss_tokens_lower_95": 2.9082114811533506,
            "loss_tokens_upper_95": 3.515875835615335,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.543492898464203,
            "data_time": 0.09728770330548286,
            "batch_time": 0.14148874953389168,
            "samples_per_second": 4222314.178323751,
            "samples_per_second_per_gpu": 527789.2722904689,
            "loss_sequences_lower_95": 7.622193481445312,
            "loss_sequences_upper_95": 7.9427164306640625,
            "loss_tokens_lower_95": 7.390461289459073,
            "loss_tokens_upper_95": 7.672373835891233,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.057578132629395,
            "data_time": 0.0915045291185379,
            "batch_time": 0.13587898761034012,
            "samples_per_second": 4405094.773601631,
            "samples_per_second_per_gpu": 550636.8467002038,
            "loss_sequences_lower_95": 8.189906604003907,
            "loss_sequences_upper_95": 8.384929785156249,
            "loss_tokens_lower_95": 7.936987451391932,
            "loss_tokens_upper_95": 8.154582520753786,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.814299962443071,
            "data_time": 0.05052794888615608,
            "batch_time": 0.09410036851962407,
            "samples_per_second": 4265031.196773242,
            "samples_per_second_per_gpu": 533128.8995966553,
            "loss_sequences_lower_95": 5.797798298430691,
            "loss_sequences_upper_95": 5.830730192444687,
            "loss_tokens_lower_95": 5.797523269883507,
            "loss_tokens_upper_95": 5.830584706566376,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.371241026027228,
            "data_time": 0.1177508036295573,
            "batch_time": 0.1576240360736847,
            "samples_per_second": 4046738.781487483,
            "samples_per_second_per_gpu": 505842.3476859354,
            "loss_sequences_lower_95": 5.27676040954121,
            "loss_sequences_upper_95": 5.464791349771385,
            "loss_tokens_lower_95": 5.276578616656466,
            "loss_tokens_upper_95": 5.462671085949501,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.2985537214279175,
            "data_time": 0.10034210979938507,
            "batch_time": 0.1448063664138317,
            "samples_per_second": 4285545.334170805,
            "samples_per_second_per_gpu": 535693.1667713507,
            "loss_sequences_lower_95": 7.238315893554687,
            "loss_sequences_upper_95": 7.3607018798828125,
            "loss_tokens_lower_95": 7.237684240722657,
            "loss_tokens_upper_95": 7.358746875,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.173018006525067,
            "data_time": 0.028210929107098354,
            "batch_time": 0.07238027524380457,
            "samples_per_second": 4455053.067574984,
            "samples_per_second_per_gpu": 556881.633446873,
            "loss_sequences_lower_95": 5.092528336018804,
            "loss_sequences_upper_95": 5.182767256607734,
            "loss_tokens_lower_95": 4.067659164316107,
            "loss_tokens_upper_95": 4.131298871864736,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.392175323927581,
            "data_time": 0.19577510016305105,
            "batch_time": 0.22828240053994314,
            "samples_per_second": 1946670.9705260696,
            "samples_per_second_per_gpu": 243333.8713157587,
            "loss_sequences_lower_95": 5.2464537549374715,
            "loss_sequences_upper_95": 5.5360865578722604,
            "loss_tokens_lower_95": 5.244708217791657,
            "loss_tokens_upper_95": 5.5368656670869285,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.355119952968523,
            "data_time": 0.18918564170598984,
            "batch_time": 0.23551810532808304,
            "samples_per_second": 3614196.139039698,
            "samples_per_second_per_gpu": 451774.51737996226,
            "loss_sequences_lower_95": 5.25401580212163,
            "loss_sequences_upper_95": 5.455087124693628,
            "loss_tokens_lower_95": 5.256118738511029,
            "loss_tokens_upper_95": 5.453027882295496,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.835533238735531,
            "data_time": 0.029075986240059137,
            "batch_time": 0.07277910551056266,
            "samples_per_second": 4437933.150862253,
            "samples_per_second_per_gpu": 554741.6438577817,
            "loss_sequences_lower_95": 5.444320010099586,
            "loss_sequences_upper_95": 5.531498330436553,
            "loss_tokens_lower_95": 4.74566415844851,
            "loss_tokens_upper_95": 4.823396900891777,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.406575144914092,
            "data_time": 0.33780208230018616,
            "batch_time": 0.37647099792957306,
            "samples_per_second": 2432614.5928247,
            "samples_per_second_per_gpu": 304076.8241030875,
            "loss_sequences_lower_95": 5.33812339823082,
            "loss_sequences_upper_95": 5.475951066597429,
            "loss_tokens_lower_95": 5.337202808844349,
            "loss_tokens_upper_95": 5.474413594240865,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.189560717398967,
            "data_time": 0.04346082416864542,
            "batch_time": 0.08820808277680324,
            "samples_per_second": 4418167.405147707,
            "samples_per_second_per_gpu": 552270.9256434634,
            "loss_sequences_lower_95": 7.166876090046827,
            "loss_sequences_upper_95": 7.212893148533066,
            "loss_tokens_lower_95": 7.166649897864105,
            "loss_tokens_upper_95": 7.212949875764526,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.404131928693901,
            "data_time": 0.35065051913261414,
            "batch_time": 0.39042985439300537,
            "samples_per_second": 2573889.831381949,
            "samples_per_second_per_gpu": 321736.22892274364,
            "loss_sequences_lower_95": 6.176869734977056,
            "loss_sequences_upper_95": 6.630109620325773,
            "loss_tokens_lower_95": 6.176963880224135,
            "loss_tokens_upper_95": 6.6350385202944855,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.543799177805583,
            "data_time": 0.3341221958398819,
            "batch_time": 0.3544591814279556,
            "samples_per_second": 1118659.77399715,
            "samples_per_second_per_gpu": 139832.47174964374,
            "loss_sequences_lower_95": 5.324248657226563,
            "loss_sequences_upper_95": 6.229633687337239,
            "loss_tokens_lower_95": 4.7526334550645615,
            "loss_tokens_upper_95": 6.117982832590739,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.715241026878357,
            "data_time": 0.29090940952301025,
            "batch_time": 0.311021164059639,
            "samples_per_second": 1207083.9610392377,
            "samples_per_second_per_gpu": 150885.4951299047,
            "loss_sequences_lower_95": 4.644064470926921,
            "loss_sequences_upper_95": 5.66378002166748,
            "loss_tokens_lower_95": 3.865123903081658,
            "loss_tokens_upper_95": 5.354962072479591,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.177823777402448,
            "data_time": 0.041691434170518606,
            "batch_time": 0.0844922981091908,
            "samples_per_second": 4333294.336766807,
            "samples_per_second_per_gpu": 541661.7920958509,
            "loss_sequences_lower_95": 8.161934183426915,
            "loss_sequences_upper_95": 8.193723622744844,
            "loss_tokens_lower_95": 8.162501510148196,
            "loss_tokens_upper_95": 8.194125293400221,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 1.7399930081859558,
            "data_time": 0.022899246827483698,
            "batch_time": 0.0671095565026504,
            "samples_per_second": 4487795.789427694,
            "samples_per_second_per_gpu": 560974.4736784617,
            "loss_sequences_lower_95": 2.3595141950501777,
            "loss_sequences_upper_95": 2.3964250786966597,
            "loss_tokens_lower_95": 1.6861192814610815,
            "loss_tokens_upper_95": 1.7072677247630583,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.8971162968733175,
            "data_time": 0.33040285110473633,
            "batch_time": 0.3600056320428848,
            "samples_per_second": 1787619.411577093,
            "samples_per_second_per_gpu": 223452.42644713662,
            "loss_sequences_lower_95": 6.108200037197804,
            "loss_sequences_upper_95": 6.517359708050104,
            "loss_tokens_lower_95": 5.712421018618736,
            "loss_tokens_upper_95": 5.996916271819731,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.981260608982396,
            "data_time": 0.22548294067382812,
            "batch_time": 0.24314852058887482,
            "samples_per_second": 1085483.5003165668,
            "samples_per_second_per_gpu": 135685.43753957085,
            "loss_sequences_lower_95": 9.442093761547191,
            "loss_sequences_upper_95": 10.732410678348025,
            "loss_tokens_lower_95": 8.831672197506752,
            "loss_tokens_upper_95": 10.780351238486208,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.713388044659684,
            "data_time": 0.327389732003212,
            "batch_time": 0.3613665848970413,
            "samples_per_second": 2244920.82143237,
            "samples_per_second_per_gpu": 280615.1026790463,
            "loss_sequences_lower_95": 5.93225183254335,
            "loss_sequences_upper_95": 6.320296999303307,
            "loss_tokens_lower_95": 5.527881557506308,
            "loss_tokens_upper_95": 5.763661559083263,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.897724669154098,
            "data_time": 0.3259364366531372,
            "batch_time": 0.36079004406929016,
            "samples_per_second": 2155489.4544212646,
            "samples_per_second_per_gpu": 269436.1818026581,
            "loss_sequences_lower_95": 6.127208142164276,
            "loss_sequences_upper_95": 6.4780112661966465,
            "loss_tokens_lower_95": 5.731240345465141,
            "loss_tokens_upper_95": 5.9295136152991965,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.829412739451339,
            "data_time": 0.3590358644723892,
            "batch_time": 0.3947305232286453,
            "samples_per_second": 1528490.7181732235,
            "samples_per_second_per_gpu": 191061.33977165294,
            "loss_sequences_lower_95": 6.023774802975539,
            "loss_sequences_upper_95": 6.449126973966273,
            "loss_tokens_lower_95": 5.627587104433223,
            "loss_tokens_upper_95": 5.931784736760531,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.026408570568736,
            "data_time": 0.31741973757743835,
            "batch_time": 0.35166242718696594,
            "samples_per_second": 2332534.503122712,
            "samples_per_second_per_gpu": 291566.812890339,
            "loss_sequences_lower_95": 6.192603515997165,
            "loss_sequences_upper_95": 6.508177222275152,
            "loss_tokens_lower_95": 5.871709606135004,
            "loss_tokens_upper_95": 6.048592997042932,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.694128086848289,
            "data_time": 0.32567326724529266,
            "batch_time": 0.3609103858470917,
            "samples_per_second": 2206872.268716761,
            "samples_per_second_per_gpu": 275859.0335895951,
            "loss_sequences_lower_95": 5.768503717458025,
            "loss_sequences_upper_95": 5.9901980192764945,
            "loss_tokens_lower_95": 5.571044393463001,
            "loss_tokens_upper_95": 5.7067631251828015,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.25549504524324,
            "data_time": 0.3365255147218704,
            "batch_time": 0.3712879419326782,
            "samples_per_second": 2379495.4187582806,
            "samples_per_second_per_gpu": 297436.9273447851,
            "loss_sequences_lower_95": 5.509301329822075,
            "loss_sequences_upper_95": 5.789080540726824,
            "loss_tokens_lower_95": 5.13480552337628,
            "loss_tokens_upper_95": 5.26614200713909,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-32.0/params.txt",
    "uuid": "3db03c15-6ed6-4518-8206-cc510faf59e3",
    "creation_date": "2023_12_14-05_01_04"
}