{
    "name": "c4_original-d=1024_l=24_h=8-16.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 131717201920,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 2,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp",
            "--fsdp-limit-all-gathers"
        ],
        "chinchilla_multiplier": 16.0,
        "seed": 124
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--workers",
        "2",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--accum-freq",
        "2",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--logs",
        "logs/26439",
        "--train-num-samples",
        "26343440384",
        "--dataset-manifest",
        "<scrub>/openlm/scrub/datasets/original_c4/manifest.jsonl",
        "--data-key",
        "txt",
        "--name",
        "c4_original-d=1024_l=24_h=8-16.0",
        "--fsdp",
        "--fsdp-amp",
        "--fsdp-limit-all-gathers",
        "--val-data",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-data-key",
        "json",
        "txt",
        "json.gz",
        "--val-tok-ci",
        "--val-seq-ci",
        "--val-num-samples",
        "245760",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/openlm/scrub/experiments/411m_16x_c4_original"
    ],
    "results": [
        {
            "loss": 3.55536189476649,
            "data_time": 0.035706013441085815,
            "batch_time": 0.5671105831861496,
            "samples_per_second": 225657.04431246978,
            "samples_per_second_per_gpu": 112828.52215623489,
            "loss_sequences_lower_95": 3.4072206433614096,
            "loss_sequences_upper_95": 3.7089704513549804,
            "loss_tokens_lower_95": 3.5387338829040527,
            "loss_tokens_upper_95": 3.572231171925863,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.688088583583547,
            "data_time": 0.00262506720622231,
            "batch_time": 0.11806387054682406,
            "samples_per_second": 277934.7725637502,
            "samples_per_second_per_gpu": 138967.3862818751,
            "loss_sequences_lower_95": 2.6851973901898845,
            "loss_sequences_upper_95": 2.6909811990435184,
            "loss_tokens_lower_95": 2.6784577031250003,
            "loss_tokens_upper_95": 2.6981764166666666,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.548192780845019,
            "data_time": 0.01236904039978981,
            "batch_time": 0.12235806882381439,
            "samples_per_second": 270952.54827126756,
            "samples_per_second_per_gpu": 135476.27413563378,
            "loss_sequences_lower_95": 3.530830526546556,
            "loss_sequences_upper_95": 3.5661755993901467,
            "loss_tokens_lower_95": 3.52994346875,
            "loss_tokens_upper_95": 3.5665764687499997,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.648670682759629,
            "data_time": 0.003316167938081842,
            "batch_time": 0.11763935222437508,
            "samples_per_second": 279331.4639684045,
            "samples_per_second_per_gpu": 139665.73198420226,
            "loss_sequences_lower_95": 2.6391276628946523,
            "loss_sequences_upper_95": 2.6581314634342785,
            "loss_tokens_lower_95": 2.638854640625,
            "loss_tokens_upper_95": 2.658465640625,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6971738688086297,
            "data_time": 0.012533340603113174,
            "batch_time": 0.12302763387560844,
            "samples_per_second": 270247.26201168983,
            "samples_per_second_per_gpu": 135123.63100584492,
            "loss_sequences_lower_95": 2.6618302541449212,
            "loss_sequences_upper_95": 2.7307002174392982,
            "loss_tokens_lower_95": 2.6872088541666663,
            "loss_tokens_upper_95": 2.7072019895833335,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3178665318673843,
            "data_time": 0.006390971981960794,
            "batch_time": 0.12073346713314885,
            "samples_per_second": 278585.5491272242,
            "samples_per_second_per_gpu": 139292.7745636121,
            "loss_sequences_lower_95": 3.2722444888282847,
            "loss_sequences_upper_95": 3.363968154834615,
            "loss_tokens_lower_95": 3.3046928072916666,
            "loss_tokens_upper_95": 3.3307381874999997,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7905893616773643,
            "data_time": 0.0033082768514558865,
            "batch_time": 0.11728890607883403,
            "samples_per_second": 278980.1191028723,
            "samples_per_second_per_gpu": 139490.05955143616,
            "loss_sequences_lower_95": 3.7396067542251274,
            "loss_sequences_upper_95": 3.842118632413903,
            "loss_tokens_lower_95": 3.771084552083333,
            "loss_tokens_upper_95": 3.8092262395833334,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.512781335121674,
            "data_time": 0.00339358925819397,
            "batch_time": 0.11734552303949992,
            "samples_per_second": 279225.29448215157,
            "samples_per_second_per_gpu": 139612.64724107579,
            "loss_sequences_lower_95": 3.505357452552356,
            "loss_sequences_upper_95": 3.520118026014398,
            "loss_tokens_lower_95": 3.5003287291666667,
            "loss_tokens_upper_95": 3.5255417395833333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.143211293026684,
            "data_time": 0.013263560831546783,
            "batch_time": 0.13270825147628784,
            "samples_per_second": 270776.8973087446,
            "samples_per_second_per_gpu": 135388.4486543723,
            "loss_sequences_lower_95": 3.1015222068724593,
            "loss_sequences_upper_95": 3.1854061964081555,
            "loss_tokens_lower_95": 3.132102317708333,
            "loss_tokens_upper_95": 3.154343005208333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.222827525006924,
            "data_time": 0.011666689068078995,
            "batch_time": 0.1256190575659275,
            "samples_per_second": 272243.4393072813,
            "samples_per_second_per_gpu": 136121.71965364064,
            "loss_sequences_lower_95": 4.202362157045146,
            "loss_sequences_upper_95": 4.241976663340693,
            "loss_tokens_lower_95": 4.209057479166667,
            "loss_tokens_upper_95": 4.236793104166667,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0647491740579227,
            "data_time": 0.0030667356175634714,
            "batch_time": 0.11763719661081738,
            "samples_per_second": 279333.06648207415,
            "samples_per_second_per_gpu": 139666.53324103708,
            "loss_sequences_lower_95": 3.057431457276963,
            "loss_sequences_upper_95": 3.0721905392948297,
            "loss_tokens_lower_95": 3.0539503385416666,
            "loss_tokens_upper_95": 3.0754607031249996,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.871395126376138,
            "data_time": 0.004653308874744909,
            "batch_time": 0.11925872457383875,
            "samples_per_second": 277486.6022046639,
            "samples_per_second_per_gpu": 138743.30110233196,
            "loss_sequences_lower_95": 2.86324333244937,
            "loss_sequences_upper_95": 2.879606102844518,
            "loss_tokens_lower_95": 2.8601673645833334,
            "loss_tokens_upper_95": 2.882446932291667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.735032558441162,
            "data_time": 0.014178924262523651,
            "batch_time": 0.12583426013588905,
            "samples_per_second": 271824.13716725307,
            "samples_per_second_per_gpu": 135912.06858362653,
            "loss_sequences_lower_95": 3.698066429772677,
            "loss_sequences_upper_95": 3.7728458365852404,
            "loss_tokens_lower_95": 3.720186239583333,
            "loss_tokens_upper_95": 3.749915677083333,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.741969885020052,
            "data_time": 0.012281179428100586,
            "batch_time": 0.12294510751962662,
            "samples_per_second": 271879.29793329857,
            "samples_per_second_per_gpu": 135939.64896664929,
            "loss_sequences_lower_95": 2.68856971880569,
            "loss_sequences_upper_95": 2.796222085428335,
            "loss_tokens_lower_95": 2.7303778177083333,
            "loss_tokens_upper_95": 2.7532548489583335,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.326973470774564,
            "data_time": 0.07721972465515137,
            "batch_time": 0.1621418595314026,
            "samples_per_second": 206734.09537805797,
            "samples_per_second_per_gpu": 103367.04768902899,
            "loss_sequences_lower_95": 4.253583552620627,
            "loss_sequences_upper_95": 4.398532260547984,
            "loss_tokens_lower_95": 4.289201354980469,
            "loss_tokens_upper_95": 4.365053870461203,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.469391887459046,
            "data_time": 0.016314241019162266,
            "batch_time": 0.1282073584469882,
            "samples_per_second": 270699.4858843517,
            "samples_per_second_per_gpu": 135349.74294217586,
            "loss_sequences_lower_95": 3.3666922610980774,
            "loss_sequences_upper_95": 3.5767033346192827,
            "loss_tokens_lower_95": 3.454172203125,
            "loss_tokens_upper_95": 3.4843239635416667,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.596398156047811,
            "data_time": 0.014736831188201904,
            "batch_time": 0.128277858098348,
            "samples_per_second": 270808.98309748137,
            "samples_per_second_per_gpu": 135404.49154874068,
            "loss_sequences_lower_95": 5.539844490796092,
            "loss_sequences_upper_95": 5.651909031528282,
            "loss_tokens_lower_95": 5.5832868125,
            "loss_tokens_upper_95": 5.609806291666667,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2014993077418845,
            "data_time": 0.04209397733211517,
            "batch_time": 0.15302124619483948,
            "samples_per_second": 250638.39950557196,
            "samples_per_second_per_gpu": 125319.19975278598,
            "loss_sequences_lower_95": 3.161202533909532,
            "loss_sequences_upper_95": 3.2429320163414124,
            "loss_tokens_lower_95": 3.186197274630187,
            "loss_tokens_upper_95": 3.217141292134269,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.961308509056223,
            "data_time": 0.0033702103586566202,
            "batch_time": 0.1174281762238244,
            "samples_per_second": 280059.585236407,
            "samples_per_second_per_gpu": 140029.7926182035,
            "loss_sequences_lower_95": 2.94524278726321,
            "loss_sequences_upper_95": 2.9775863536622276,
            "loss_tokens_lower_95": 2.9450433437410983,
            "loss_tokens_upper_95": 2.977375122122472,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.4410770936592527,
            "data_time": 0.00317800310766621,
            "batch_time": 0.11761040615428026,
            "samples_per_second": 279382.6126155508,
            "samples_per_second_per_gpu": 139691.3063077754,
            "loss_sequences_lower_95": 2.4563118836433855,
            "loss_sequences_upper_95": 2.4805571871576877,
            "loss_tokens_lower_95": 2.42947560885029,
            "loss_tokens_upper_95": 2.447067785418162,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7243494502091443,
            "data_time": 0.005747054939839377,
            "batch_time": 0.11914509356911503,
            "samples_per_second": 276701.3646134605,
            "samples_per_second_per_gpu": 138350.68230673025,
            "loss_sequences_lower_95": 3.285302849698866,
            "loss_sequences_upper_95": 3.5639637943028166,
            "loss_tokens_lower_95": 2.5284757376455653,
            "loss_tokens_upper_95": 2.724559534712294,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0229016269048055,
            "data_time": 0.004859562249893838,
            "batch_time": 0.11885185317790255,
            "samples_per_second": 278833.05527990026,
            "samples_per_second_per_gpu": 139416.52763995013,
            "loss_sequences_lower_95": 3.2619452148437498,
            "loss_sequences_upper_95": 3.468946687825521,
            "loss_tokens_lower_95": 2.951325502407626,
            "loss_tokens_upper_95": 3.0945920855444182,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.3582228578519784,
            "data_time": 0.00723353454044887,
            "batch_time": 0.11982940492175874,
            "samples_per_second": 274283.5808635465,
            "samples_per_second_per_gpu": 137141.79043177326,
            "loss_sequences_lower_95": 2.438383408691777,
            "loss_sequences_upper_95": 2.4958682947397413,
            "loss_tokens_lower_95": 2.325072540321605,
            "loss_tokens_upper_95": 2.3577710970017427,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1272256731987,
            "data_time": 0.027482049805777415,
            "batch_time": 0.14105795110974992,
            "samples_per_second": 261514.19442465928,
            "samples_per_second_per_gpu": 130757.09721232964,
            "loss_sequences_lower_95": 3.0570218311656605,
            "loss_sequences_upper_95": 3.344807635220614,
            "loss_tokens_lower_95": 3.0670159603950418,
            "loss_tokens_upper_95": 3.1506947322508316,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.03478084486358,
            "data_time": 0.021656207740306854,
            "batch_time": 0.1315748691558838,
            "samples_per_second": 266319.94254395145,
            "samples_per_second_per_gpu": 133159.97127197572,
            "loss_sequences_lower_95": 3.040413774762835,
            "loss_sequences_upper_95": 3.2385114646444517,
            "loss_tokens_lower_95": 2.9830959080907244,
            "loss_tokens_upper_95": 3.081899880194228,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.959595878918966,
            "data_time": 0.01927924156188965,
            "batch_time": 0.12711706161499023,
            "samples_per_second": 265607.91789183207,
            "samples_per_second_per_gpu": 132803.95894591603,
            "loss_sequences_lower_95": 2.955258961995443,
            "loss_sequences_upper_95": 3.057080866495768,
            "loss_tokens_lower_95": 2.85397879809178,
            "loss_tokens_upper_95": 3.0346474929262595,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.788334383245798,
            "data_time": 0.0030522895394137226,
            "batch_time": 0.11698483660764368,
            "samples_per_second": 280912.88555719785,
            "samples_per_second_per_gpu": 140456.44277859892,
            "loss_sequences_lower_95": 3.832359699095148,
            "loss_sequences_upper_95": 3.914077993516559,
            "loss_tokens_lower_95": 3.7255657800629507,
            "loss_tokens_upper_95": 3.8050313522803285,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.948804334056899,
            "data_time": 0.0053087973594665525,
            "batch_time": 0.11818885564804077,
            "samples_per_second": 278598.2546312473,
            "samples_per_second_per_gpu": 139299.12731562366,
            "loss_sequences_lower_95": 3.977889593362005,
            "loss_sequences_upper_95": 4.280541488698837,
            "loss_tokens_lower_95": 2.7967630578646543,
            "loss_tokens_upper_95": 2.926472776657229,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.060923720992873,
            "data_time": 0.00828551923906481,
            "batch_time": 0.12113922190021824,
            "samples_per_second": 276713.883063947,
            "samples_per_second_per_gpu": 138356.9415319735,
            "loss_sequences_lower_95": 3.6266171191739547,
            "loss_sequences_upper_95": 3.983561128479629,
            "loss_tokens_lower_95": 2.926250655453372,
            "loss_tokens_upper_95": 3.091648342333212,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.392962684370067,
            "data_time": 0.030657529830932617,
            "batch_time": 0.14318648406437465,
            "samples_per_second": 263331.45554433775,
            "samples_per_second_per_gpu": 131665.72777216887,
            "loss_sequences_lower_95": 5.28996300544913,
            "loss_sequences_upper_95": 5.496834797619685,
            "loss_tokens_lower_95": 5.287334508329766,
            "loss_tokens_upper_95": 5.498686113749464,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6218276405334473,
            "data_time": 0.04553195834159851,
            "batch_time": 0.13856099545955658,
            "samples_per_second": 218544.14376371002,
            "samples_per_second_per_gpu": 109272.07188185501,
            "loss_sequences_lower_95": 2.541304237365723,
            "loss_sequences_upper_95": 2.8849526748657226,
            "loss_tokens_lower_95": 2.4176275481904765,
            "loss_tokens_upper_95": 2.810032847614322,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4166606837919815,
            "data_time": 0.005381332180364345,
            "batch_time": 0.11856543048610532,
            "samples_per_second": 278713.3589263806,
            "samples_per_second_per_gpu": 139356.6794631903,
            "loss_sequences_lower_95": 3.372619666389457,
            "loss_sequences_upper_95": 3.461893115634196,
            "loss_tokens_lower_95": 3.372261976634468,
            "loss_tokens_upper_95": 3.4610928241647745,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9345445052997485,
            "data_time": 0.007097117411784637,
            "batch_time": 0.11924176032726581,
            "samples_per_second": 274004.20974339236,
            "samples_per_second_per_gpu": 137002.10487169618,
            "loss_sequences_lower_95": 2.898513936410665,
            "loss_sequences_upper_95": 2.9699486484886877,
            "loss_tokens_lower_95": 2.898497270463823,
            "loss_tokens_upper_95": 2.970571971896435,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6301904711811535,
            "data_time": 0.006343094439342104,
            "batch_time": 0.11951017379760742,
            "samples_per_second": 277188.84489427024,
            "samples_per_second_per_gpu": 138594.42244713512,
            "loss_sequences_lower_95": 2.8976474367625307,
            "loss_sequences_upper_95": 3.026312196086099,
            "loss_tokens_lower_95": 2.570790917599627,
            "loss_tokens_upper_95": 2.621628880533618,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.416820350170135,
            "data_time": 0.014606419950723648,
            "batch_time": 0.12624994292855263,
            "samples_per_second": 273388.151641784,
            "samples_per_second_per_gpu": 136694.075820892,
            "loss_sequences_lower_95": 4.884979272460938,
            "loss_sequences_upper_95": 5.442789428710937,
            "loss_tokens_lower_95": 4.147142964936817,
            "loss_tokens_upper_95": 4.501864411955555,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.909853219985962,
            "data_time": 0.18045753240585327,
            "batch_time": 0.2952141761779785,
            "samples_per_second": 175299.45704010734,
            "samples_per_second_per_gpu": 87649.72852005367,
            "loss_sequences_lower_95": 2.703265881538391,
            "loss_sequences_upper_95": 3.1646054923534392,
            "loss_tokens_lower_95": 2.520176714316182,
            "loss_tokens_upper_95": 3.2639394825902475,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1883048545355086,
            "data_time": 0.030196090539296467,
            "batch_time": 0.13509257634480795,
            "samples_per_second": 256469.45615319602,
            "samples_per_second_per_gpu": 128234.72807659801,
            "loss_sequences_lower_95": 4.655800251577092,
            "loss_sequences_upper_95": 5.495043655921672,
            "loss_tokens_lower_95": 2.781904865251991,
            "loss_tokens_upper_95": 3.2093967442044815,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.991064887319709,
            "data_time": 0.005646431611643897,
            "batch_time": 0.11914007448487812,
            "samples_per_second": 277692.7411817101,
            "samples_per_second_per_gpu": 138846.37059085505,
            "loss_sequences_lower_95": 1.967795366426387,
            "loss_sequences_upper_95": 2.0148750595152767,
            "loss_tokens_lower_95": 1.9674723573937172,
            "loss_tokens_upper_95": 2.0145138348487333,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.8662390969134741,
            "data_time": 0.003959822100262309,
            "batch_time": 0.11797104547190111,
            "samples_per_second": 279594.5508764658,
            "samples_per_second_per_gpu": 139797.2754382329,
            "loss_sequences_lower_95": 1.8704235832737,
            "loss_sequences_upper_95": 1.9880914997180041,
            "loss_tokens_lower_95": 1.7941283156130376,
            "loss_tokens_upper_95": 1.9082419934691714,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6615799241886906,
            "data_time": 0.023127158482869465,
            "batch_time": 0.13243264622158474,
            "samples_per_second": 264159.67178435833,
            "samples_per_second_per_gpu": 132079.83589217917,
            "loss_sequences_lower_95": 2.6358553987719637,
            "loss_sequences_upper_95": 3.0339234572190503,
            "loss_tokens_lower_95": 2.51218485871914,
            "loss_tokens_upper_95": 2.7852938981104733,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1566709303535996,
            "data_time": 0.008476535975933074,
            "batch_time": 0.12366102039813995,
            "samples_per_second": 272938.20195371006,
            "samples_per_second_per_gpu": 136469.10097685503,
            "loss_sequences_lower_95": 3.2705274574382153,
            "loss_sequences_upper_95": 3.4278655885426943,
            "loss_tokens_lower_95": 3.0806168348934695,
            "loss_tokens_upper_95": 3.220224898289103,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.167617974484839,
            "data_time": 0.032241374254226685,
            "batch_time": 0.13230228424072266,
            "samples_per_second": 239796.9414339273,
            "samples_per_second_per_gpu": 119898.47071696365,
            "loss_sequences_lower_95": 2.197201389219703,
            "loss_sequences_upper_95": 2.6105604032190834,
            "loss_tokens_lower_95": 2.021398656947772,
            "loss_tokens_upper_95": 2.3348951031683316,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6697300561405846,
            "data_time": 0.003284606880273301,
            "batch_time": 0.11767703447098168,
            "samples_per_second": 279084.86495604826,
            "samples_per_second_per_gpu": 139542.43247802413,
            "loss_sequences_lower_95": 3.654458528424435,
            "loss_sequences_upper_95": 3.6848055646285505,
            "loss_tokens_lower_95": 3.654280436165358,
            "loss_tokens_upper_95": 3.68467020552548,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.706172291514943,
            "data_time": 0.04224756360054016,
            "batch_time": 0.13783282041549683,
            "samples_per_second": 229265.11580832046,
            "samples_per_second_per_gpu": 114632.55790416023,
            "loss_sequences_lower_95": 0.6935570244650239,
            "loss_sequences_upper_95": 0.8064829150449883,
            "loss_tokens_lower_95": 0.6167831811348516,
            "loss_tokens_upper_95": 0.7847691905901155,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2179850408616057,
            "data_time": 0.0030068467368823786,
            "batch_time": 0.11785444140134493,
            "samples_per_second": 278301.10491012566,
            "samples_per_second_per_gpu": 139150.55245506283,
            "loss_sequences_lower_95": 3.773852129602332,
            "loss_sequences_upper_95": 3.810548072670991,
            "loss_tokens_lower_95": 3.0969239422147004,
            "loss_tokens_upper_95": 3.132345079787234,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.050256510734558,
            "data_time": 0.008396349847316742,
            "batch_time": 0.11979823373258114,
            "samples_per_second": 275460.46716613544,
            "samples_per_second_per_gpu": 137730.23358306772,
            "loss_sequences_lower_95": 5.119343188476562,
            "loss_sequences_upper_95": 5.309834143066406,
            "loss_tokens_lower_95": 4.929892734843267,
            "loss_tokens_upper_95": 5.107844570599502,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.245465536221214,
            "data_time": 0.02437111735343933,
            "batch_time": 0.1286647617816925,
            "samples_per_second": 253333.95018523379,
            "samples_per_second_per_gpu": 126666.97509261689,
            "loss_sequences_lower_95": 2.188518772954526,
            "loss_sequences_upper_95": 2.302043881623641,
            "loss_tokens_lower_95": 2.1870396821395213,
            "loss_tokens_upper_95": 2.3045364247197693,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.158740476405982,
            "data_time": 0.007182070187159947,
            "batch_time": 0.11911217087791079,
            "samples_per_second": 277113.0083800771,
            "samples_per_second_per_gpu": 138556.50419003854,
            "loss_sequences_lower_95": 6.100438297156131,
            "loss_sequences_upper_95": 6.21741562352036,
            "loss_tokens_lower_95": 6.099817375414299,
            "loss_tokens_upper_95": 6.215925662878788,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.586125121931235,
            "data_time": 0.007098180182436679,
            "batch_time": 0.121004674028843,
            "samples_per_second": 277372.2216104425,
            "samples_per_second_per_gpu": 138686.11080522125,
            "loss_sequences_lower_95": 0.6144771301269532,
            "loss_sequences_upper_95": 0.638024100748698,
            "loss_tokens_lower_95": 0.5656366501678797,
            "loss_tokens_upper_95": 0.5969583839395134,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.675692506063552,
            "data_time": 0.026903935841151645,
            "batch_time": 0.1347293428012303,
            "samples_per_second": 262383.0283526952,
            "samples_per_second_per_gpu": 131191.5141763476,
            "loss_sequences_lower_95": 5.330826997302828,
            "loss_sequences_upper_95": 6.024994826543899,
            "loss_tokens_lower_95": 5.326342395600818,
            "loss_tokens_upper_95": 6.031485624767485,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.5822360813617706,
            "data_time": 0.17557364702224731,
            "batch_time": 0.2906060814857483,
            "samples_per_second": 175092.49815265078,
            "samples_per_second_per_gpu": 87546.24907632539,
            "loss_sequences_lower_95": 1.57463236451149,
            "loss_sequences_upper_95": 2.2575565636157986,
            "loss_tokens_lower_95": 1.3441126432123873,
            "loss_tokens_upper_95": 1.7741555093981554,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.069118962287903,
            "data_time": 0.00828871876001358,
            "batch_time": 0.1199260801076889,
            "samples_per_second": 274732.258089072,
            "samples_per_second_per_gpu": 137366.129044536,
            "loss_sequences_lower_95": 7.052022338867188,
            "loss_sequences_upper_95": 7.409125805664063,
            "loss_tokens_lower_95": 6.88387058729246,
            "loss_tokens_upper_95": 7.200509844939721,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.421777940750122,
            "data_time": 0.009127380326390266,
            "batch_time": 0.12058605812489986,
            "samples_per_second": 274640.23576783924,
            "samples_per_second_per_gpu": 137320.11788391962,
            "loss_sequences_lower_95": 6.613291198730469,
            "loss_sequences_upper_95": 6.833192651367188,
            "loss_tokens_lower_95": 6.293295375191693,
            "loss_tokens_upper_95": 6.493624983373498,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.689516180519925,
            "data_time": 0.004600176151762617,
            "batch_time": 0.1176705576003866,
            "samples_per_second": 279035.8001060516,
            "samples_per_second_per_gpu": 139517.9000530258,
            "loss_sequences_lower_95": 3.6565794056947705,
            "loss_sequences_upper_95": 3.7217179659345665,
            "loss_tokens_lower_95": 3.6575900562170003,
            "loss_tokens_upper_95": 3.7213809241954405,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.3454174977110642,
            "data_time": 0.015086395399911063,
            "batch_time": 0.12601777769270398,
            "samples_per_second": 270174.78780835285,
            "samples_per_second_per_gpu": 135087.39390417642,
            "loss_sequences_lower_95": 2.2937627508160525,
            "loss_sequences_upper_95": 2.3992958361835157,
            "loss_tokens_lower_95": 2.293827911236319,
            "loss_tokens_upper_95": 2.3992834463280652,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.011821448326111,
            "data_time": 0.008693935349583626,
            "batch_time": 0.12033634074032307,
            "samples_per_second": 274490.04758427653,
            "samples_per_second_per_gpu": 137245.02379213826,
            "loss_sequences_lower_95": 4.919129846191407,
            "loss_sequences_upper_95": 5.106043115234375,
            "loss_tokens_lower_95": 4.918987463378906,
            "loss_tokens_upper_95": 5.104851159667969,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0435946766903856,
            "data_time": 0.0032602408141167863,
            "batch_time": 0.1171047487647872,
            "samples_per_second": 280207.88552488305,
            "samples_per_second_per_gpu": 140103.94276244153,
            "loss_sequences_lower_95": 2.884836562056528,
            "loss_sequences_upper_95": 2.9718479158659528,
            "loss_tokens_lower_95": 1.8692310787735145,
            "loss_tokens_upper_95": 1.9253438651396473,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0703778791783463,
            "data_time": 0.021072851286994085,
            "batch_time": 0.12782159778806898,
            "samples_per_second": 265711.4575350841,
            "samples_per_second_per_gpu": 132855.72876754205,
            "loss_sequences_lower_95": 2.0075245814536937,
            "loss_sequences_upper_95": 2.135102149621764,
            "loss_tokens_lower_95": 2.0061181196525912,
            "loss_tokens_upper_95": 2.1350115135534486,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.9835302572624356,
            "data_time": 0.013441070914268494,
            "batch_time": 0.12705089151859283,
            "samples_per_second": 274937.0236948066,
            "samples_per_second_per_gpu": 137468.5118474033,
            "loss_sequences_lower_95": 1.9498602055568321,
            "loss_sequences_upper_95": 2.0180630792356005,
            "loss_tokens_lower_95": 1.949797114951938,
            "loss_tokens_upper_95": 2.0170670812270224,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.083339080167835,
            "data_time": 0.0033662109375,
            "batch_time": 0.11719812488555909,
            "samples_per_second": 280215.8611672696,
            "samples_per_second_per_gpu": 140107.9305836348,
            "loss_sequences_lower_95": 2.7338043721325787,
            "loss_sequences_upper_95": 2.8196532255065447,
            "loss_tokens_lower_95": 1.9464469228596895,
            "loss_tokens_upper_95": 2.0062931696690436,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3133801846277144,
            "data_time": 0.03857756654421488,
            "batch_time": 0.15197191635767618,
            "samples_per_second": 259961.48353648128,
            "samples_per_second_per_gpu": 129980.74176824064,
            "loss_sequences_lower_95": 3.17051675786417,
            "loss_sequences_upper_95": 3.4505447710632646,
            "loss_tokens_lower_95": 3.1734002330315807,
            "loss_tokens_upper_95": 3.453672265754175,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.263517484679499,
            "data_time": 0.004759911773274246,
            "batch_time": 0.11779476427337499,
            "samples_per_second": 279053.41796698835,
            "samples_per_second_per_gpu": 139526.70898349417,
            "loss_sequences_lower_95": 3.2404462472524846,
            "loss_sequences_upper_95": 3.286572512005447,
            "loss_tokens_lower_95": 3.2404008833858944,
            "loss_tokens_upper_95": 3.2865991136276755,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.3098907783193496,
            "data_time": 0.02615703855242048,
            "batch_time": 0.13255711112703597,
            "samples_per_second": 259146.91400872258,
            "samples_per_second_per_gpu": 129573.45700436129,
            "loss_sequences_lower_95": 2.232413312300895,
            "loss_sequences_upper_95": 2.388736573006343,
            "loss_tokens_lower_95": 2.232102403362978,
            "loss_tokens_upper_95": 2.3878651443037016,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.212484825650851,
            "data_time": 0.07635548710823059,
            "batch_time": 0.18780550360679626,
            "samples_per_second": 220127.31349853543,
            "samples_per_second_per_gpu": 110063.65674926771,
            "loss_sequences_lower_95": 1.1227003733317058,
            "loss_sequences_upper_95": 1.4371815077463785,
            "loss_tokens_lower_95": 1.007011114226447,
            "loss_tokens_upper_95": 1.3707882828182645,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.1458444943030675,
            "data_time": 0.09327954053878784,
            "batch_time": 0.20499667525291443,
            "samples_per_second": 216836.73214212595,
            "samples_per_second_per_gpu": 108418.36607106298,
            "loss_sequences_lower_95": 1.1283435757954914,
            "loss_sequences_upper_95": 1.4535437266031899,
            "loss_tokens_lower_95": 0.9320608117607202,
            "loss_tokens_upper_95": 1.3431298352359387,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.158473497918849,
            "data_time": 0.004762752832238512,
            "batch_time": 0.11833051328636723,
            "samples_per_second": 277578.85518999776,
            "samples_per_second_per_gpu": 138789.42759499888,
            "loss_sequences_lower_95": 4.1181833132939065,
            "loss_sequences_upper_95": 4.198967324707751,
            "loss_tokens_lower_95": 4.117865743510677,
            "loss_tokens_upper_95": 4.198075229830174,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.4055733261843297,
            "data_time": 0.0028829574911025733,
            "batch_time": 0.11682181934167833,
            "samples_per_second": 280714.52086431114,
            "samples_per_second_per_gpu": 140357.26043215557,
            "loss_sequences_lower_95": 0.5323967355111062,
            "loss_sequences_upper_95": 0.5450765010132261,
            "loss_tokens_lower_95": 0.38652798514472897,
            "loss_tokens_upper_95": 0.39348635748024013,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.2732621628468435,
            "data_time": 0.04778769612312317,
            "batch_time": 0.16162879765033722,
            "samples_per_second": 253537.89585392957,
            "samples_per_second_per_gpu": 126768.94792696479,
            "loss_sequences_lower_95": 4.547289954958939,
            "loss_sequences_upper_95": 4.9852147260050135,
            "loss_tokens_lower_95": 4.087308141810981,
            "loss_tokens_upper_95": 4.359583182207229,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.822209847939981,
            "data_time": 0.09375491738319397,
            "batch_time": 0.1687476933002472,
            "samples_per_second": 164093.38492727515,
            "samples_per_second_per_gpu": 82046.69246363758,
            "loss_sequences_lower_95": 6.335678636705554,
            "loss_sequences_upper_95": 7.759223587448532,
            "loss_tokens_lower_95": 5.918607848367573,
            "loss_tokens_upper_95": 7.517167418680073,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.0685435010165705,
            "data_time": 0.030456125736236572,
            "batch_time": 0.1303502321243286,
            "samples_per_second": 240488.8511949077,
            "samples_per_second_per_gpu": 120244.42559745385,
            "loss_sequences_lower_95": 4.242005604069408,
            "loss_sequences_upper_95": 4.601438624684404,
            "loss_tokens_lower_95": 3.857432199130046,
            "loss_tokens_upper_95": 4.071728803090833,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.141886844867614,
            "data_time": 0.030213286479314167,
            "batch_time": 0.13034798701604208,
            "samples_per_second": 239109.44259999608,
            "samples_per_second_per_gpu": 119554.72129999804,
            "loss_sequences_lower_95": 4.278331644942121,
            "loss_sequences_upper_95": 4.586080048723919,
            "loss_tokens_lower_95": 3.9667434028490387,
            "loss_tokens_upper_95": 4.148570132465258,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.401852367127814,
            "data_time": 0.02844775716463725,
            "batch_time": 0.12887991468111673,
            "samples_per_second": 238688.1140862479,
            "samples_per_second_per_gpu": 119344.05704312395,
            "loss_sequences_lower_95": 4.694032613242545,
            "loss_sequences_upper_95": 5.198850278156559,
            "loss_tokens_lower_95": 4.1393280924291975,
            "loss_tokens_upper_95": 4.4412277914862175,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.202077461452019,
            "data_time": 0.03387801845868429,
            "batch_time": 0.1338054140408834,
            "samples_per_second": 239855.5872986317,
            "samples_per_second_per_gpu": 119927.79364931585,
            "loss_sequences_lower_95": 4.283739527260385,
            "loss_sequences_upper_95": 4.579595575100038,
            "loss_tokens_lower_95": 4.0340643576743815,
            "loss_tokens_upper_95": 4.201964476398219,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.632092392962912,
            "data_time": 0.04019955226353237,
            "batch_time": 0.15116011528741746,
            "samples_per_second": 252140.6974057502,
            "samples_per_second_per_gpu": 126070.3487028751,
            "loss_sequences_lower_95": 4.7528682353333656,
            "loss_sequences_upper_95": 5.070933665399966,
            "loss_tokens_lower_95": 4.509594476637906,
            "loss_tokens_upper_95": 4.655221578860778,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.592698108859178,
            "data_time": 0.03232112526893616,
            "batch_time": 0.13309426108996072,
            "samples_per_second": 236733.60914159066,
            "samples_per_second_per_gpu": 118366.80457079533,
            "loss_sequences_lower_95": 4.8122886471632045,
            "loss_sequences_upper_95": 5.143483417790111,
            "loss_tokens_lower_95": 4.44055043041742,
            "loss_tokens_upper_95": 4.616946579264617,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/params.txt",
    "uuid": "be50c9d6-a240-4d68-ae4e-a3cb932f7550",
    "creation_date": "2024_01_26-08_19_57"
}