{
    "name": "c4_original-d=512_l=8_h=4-4.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 6313123840,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 4.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "1262624768",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=512_l=8_h=4-4.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 4.250802439451218,
            "data_time": 0.031726524233818054,
            "batch_time": 0.31660597771406174,
            "samples_per_second": 1735378.5841561505,
            "samples_per_second_per_gpu": 216922.3230195188,
            "loss_sequences_lower_95": 4.125627899169922,
            "loss_sequences_upper_95": 4.376924069722493,
            "loss_tokens_lower_95": 4.234986502329508,
            "loss_tokens_upper_95": 4.266403306325277,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.517017598344945,
            "data_time": 0.0014370234941585263,
            "batch_time": 0.015193209469639531,
            "samples_per_second": 2261874.174259702,
            "samples_per_second_per_gpu": 282734.2717824628,
            "loss_sequences_lower_95": 3.514317337556913,
            "loss_sequences_upper_95": 3.519650405813598,
            "loss_tokens_lower_95": 3.506169510416667,
            "loss_tokens_upper_95": 3.5277358229166667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7921770217467325,
            "data_time": 0.010591150283813477,
            "batch_time": 0.024501177787780763,
            "samples_per_second": 2174851.284969693,
            "samples_per_second_per_gpu": 271856.41062121163,
            "loss_sequences_lower_95": 3.7715107695910395,
            "loss_sequences_upper_95": 3.814047041912468,
            "loss_tokens_lower_95": 3.7779553854166665,
            "loss_tokens_upper_95": 3.8069898020833333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5199375798530186,
            "data_time": 0.0016940945857449581,
            "batch_time": 0.015213613643458015,
            "samples_per_second": 2312207.737485101,
            "samples_per_second_per_gpu": 289025.9671856376,
            "loss_sequences_lower_95": 3.5084047499194586,
            "loss_sequences_upper_95": 3.531788035599227,
            "loss_tokens_lower_95": 3.5087092604166665,
            "loss_tokens_upper_95": 3.530865125,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.533301052882084,
            "data_time": 0.010776622361870877,
            "batch_time": 0.02479192175238256,
            "samples_per_second": 2136735.770389319,
            "samples_per_second_per_gpu": 267091.97129866487,
            "loss_sequences_lower_95": 3.499826360574322,
            "loss_sequences_upper_95": 3.56703508614037,
            "loss_tokens_lower_95": 3.522253072916667,
            "loss_tokens_upper_95": 3.5439517395833335,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9802008516362215,
            "data_time": 0.00410333567339441,
            "batch_time": 0.017987815582233925,
            "samples_per_second": 2249654.8385260855,
            "samples_per_second_per_gpu": 281206.8548157607,
            "loss_sequences_lower_95": 3.940455504001848,
            "loss_sequences_upper_95": 4.021568255810248,
            "loss_tokens_lower_95": 3.967599385416667,
            "loss_tokens_upper_95": 3.992595291666667,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7176224789327503,
            "data_time": 0.0016779422176603782,
            "batch_time": 0.01501659234624119,
            "samples_per_second": 2342756.3538114172,
            "samples_per_second_per_gpu": 292844.54422642716,
            "loss_sequences_lower_95": 3.683518385283801,
            "loss_sequences_upper_95": 3.7510842135682396,
            "loss_tokens_lower_95": 3.702209052083333,
            "loss_tokens_upper_95": 3.7333761354166666,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.156319940027767,
            "data_time": 0.0016837489001674166,
            "batch_time": 0.015107909289926029,
            "samples_per_second": 2328787.1683828165,
            "samples_per_second_per_gpu": 291098.39604785206,
            "loss_sequences_lower_95": 4.144464557428011,
            "loss_sequences_upper_95": 4.169343995418848,
            "loss_tokens_lower_95": 4.144493656250001,
            "loss_tokens_upper_95": 4.168173458333333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8987747503489985,
            "data_time": 0.010522706168038505,
            "batch_time": 0.024169466798267668,
            "samples_per_second": 2199924.75156662,
            "samples_per_second_per_gpu": 274990.5939458275,
            "loss_sequences_lower_95": 3.8537437687075236,
            "loss_sequences_upper_95": 3.948651085830316,
            "loss_tokens_lower_95": 3.8872633125,
            "loss_tokens_upper_95": 3.91034053125,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.022491797156956,
            "data_time": 0.0114521449431777,
            "batch_time": 0.02577097062021494,
            "samples_per_second": 2140513.1891767243,
            "samples_per_second_per_gpu": 267564.14864709054,
            "loss_sequences_lower_95": 4.984070282585536,
            "loss_sequences_upper_95": 5.070107682420331,
            "loss_tokens_lower_95": 5.008955302083334,
            "loss_tokens_upper_95": 5.0361980937499995,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.879194556547711,
            "data_time": 0.0015362174780801997,
            "batch_time": 0.015889916690482477,
            "samples_per_second": 2319947.749308159,
            "samples_per_second_per_gpu": 289993.4686635199,
            "loss_sequences_lower_95": 3.87189185597463,
            "loss_sequences_upper_95": 3.8865139489045153,
            "loss_tokens_lower_95": 3.8675883541666667,
            "loss_tokens_upper_95": 3.8907338541666667,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.695164578996664,
            "data_time": 0.0028480758873449573,
            "batch_time": 0.01713742046530896,
            "samples_per_second": 2332347.641364018,
            "samples_per_second_per_gpu": 291543.45517050225,
            "loss_sequences_lower_95": 3.686519288837984,
            "loss_sequences_upper_95": 3.7037310300395667,
            "loss_tokens_lower_95": 3.6836050104166667,
            "loss_tokens_upper_95": 3.7066605312500003,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.298690658553861,
            "data_time": 0.010595404583474865,
            "batch_time": 0.0242941002600749,
            "samples_per_second": 2187079.8039733297,
            "samples_per_second_per_gpu": 273384.9754966662,
            "loss_sequences_lower_95": 4.254009093619263,
            "loss_sequences_upper_95": 4.34913155515102,
            "loss_tokens_lower_95": 4.2853532083333326,
            "loss_tokens_upper_95": 4.311619010416667,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.521507124308413,
            "data_time": 0.01090102081754768,
            "batch_time": 0.02461764347030822,
            "samples_per_second": 2189645.8496301915,
            "samples_per_second_per_gpu": 273705.73120377393,
            "loss_sequences_lower_95": 3.4624806079747965,
            "loss_sequences_upper_95": 3.5802255106069407,
            "loss_tokens_lower_95": 3.509286552083333,
            "loss_tokens_upper_95": 3.5334996979166666,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.0213304649699815,
            "data_time": 0.09039613178798131,
            "batch_time": 0.10695066622325353,
            "samples_per_second": 1112091.4418113863,
            "samples_per_second_per_gpu": 139011.43022642328,
            "loss_sequences_lower_95": 4.94987740950151,
            "loss_sequences_upper_95": 5.093561233173717,
            "loss_tokens_lower_95": 4.9922645568847654,
            "loss_tokens_upper_95": 5.050671724839644,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.120078288431418,
            "data_time": 0.01535021039572629,
            "batch_time": 0.029377310113473373,
            "samples_per_second": 2124013.2196966964,
            "samples_per_second_per_gpu": 265501.65246208705,
            "loss_sequences_lower_95": 4.030049422461507,
            "loss_sequences_upper_95": 4.209799247719456,
            "loss_tokens_lower_95": 4.106144364583333,
            "loss_tokens_upper_95": 4.13398371875,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.043200744487994,
            "data_time": 0.01404988020658493,
            "batch_time": 0.028519981851180393,
            "samples_per_second": 2097717.670624471,
            "samples_per_second_per_gpu": 262214.70882805885,
            "loss_sequences_lower_95": 5.9874333927688,
            "loss_sequences_upper_95": 6.099225062871042,
            "loss_tokens_lower_95": 6.031168729166667,
            "loss_tokens_upper_95": 6.055479239583333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.297161493145052,
            "data_time": 0.0402391143143177,
            "batch_time": 0.054707374423742294,
            "samples_per_second": 1845728.5309181623,
            "samples_per_second_per_gpu": 230716.0663647703,
            "loss_sequences_lower_95": 4.196553583614162,
            "loss_sequences_upper_95": 4.459930282342629,
            "loss_tokens_lower_95": 4.282377474425269,
            "loss_tokens_upper_95": 4.3120540431288426,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.951008572247001,
            "data_time": 0.0018739177249287146,
            "batch_time": 0.015683268020799328,
            "samples_per_second": 2249841.468680224,
            "samples_per_second_per_gpu": 281230.183585028,
            "loss_sequences_lower_95": 4.931235117193063,
            "loss_sequences_upper_95": 4.971218807305761,
            "loss_tokens_lower_95": 4.93112943535465,
            "loss_tokens_upper_95": 4.9707592206639015,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.2206352545526062,
            "data_time": 0.0022568398979818747,
            "batch_time": 0.015845950099692984,
            "samples_per_second": 2278441.124309881,
            "samples_per_second_per_gpu": 284805.14053873514,
            "loss_sequences_lower_95": 3.232001701253174,
            "loss_sequences_upper_95": 3.2579479175743127,
            "loss_tokens_lower_95": 3.194232264085003,
            "loss_tokens_upper_95": 3.213564967430094,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.076152910924938,
            "data_time": 0.0033236549891580624,
            "batch_time": 0.017008151118200303,
            "samples_per_second": 2259610.5315881856,
            "samples_per_second_per_gpu": 282451.3164485232,
            "loss_sequences_lower_95": 5.318774390997727,
            "loss_sequences_upper_95": 5.6254445619722775,
            "loss_tokens_lower_95": 4.562335215257923,
            "loss_tokens_upper_95": 4.779412773446807,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.260248119274775,
            "data_time": 0.004190286264774647,
            "batch_time": 0.01798778296785152,
            "samples_per_second": 2227607.2054393725,
            "samples_per_second_per_gpu": 278450.90067992156,
            "loss_sequences_lower_95": 5.408791764322917,
            "loss_sequences_upper_95": 5.6178296875000004,
            "loss_tokens_lower_95": 4.903214929736635,
            "loss_tokens_upper_95": 5.047538448309748,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.626623554967225,
            "data_time": 0.0047871868714667734,
            "batch_time": 0.018490347207761278,
            "samples_per_second": 2246103.5858875206,
            "samples_per_second_per_gpu": 280762.9482359401,
            "loss_sequences_lower_95": 3.6726831258292267,
            "loss_sequences_upper_95": 3.744225139487775,
            "loss_tokens_lower_95": 3.521856025201255,
            "loss_tokens_upper_95": 3.556647015051786,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.03518170335076,
            "data_time": 0.025245838931628635,
            "batch_time": 0.03972738555499485,
            "samples_per_second": 2014929.0324386458,
            "samples_per_second_per_gpu": 251866.12905483073,
            "loss_sequences_lower_95": 3.9382222886519,
            "loss_sequences_upper_95": 4.20272480357777,
            "loss_tokens_lower_95": 3.922268443578051,
            "loss_tokens_upper_95": 4.003903163735765,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9935286288358727,
            "data_time": 0.022341296076774597,
            "batch_time": 0.03653574176132679,
            "samples_per_second": 2011608.304921282,
            "samples_per_second_per_gpu": 251451.03811516025,
            "loss_sequences_lower_95": 3.982627501195791,
            "loss_sequences_upper_95": 4.206171488859216,
            "loss_tokens_lower_95": 3.861930102720887,
            "loss_tokens_upper_95": 3.9648480410566185,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.580776179631551,
            "data_time": 0.018070480762383878,
            "batch_time": 0.03291231852311354,
            "samples_per_second": 1982204.2743675727,
            "samples_per_second_per_gpu": 247775.5342959466,
            "loss_sequences_lower_95": 4.531817199707032,
            "loss_sequences_upper_95": 4.6519443359375,
            "loss_tokens_lower_95": 4.434783309351824,
            "loss_tokens_upper_95": 4.67916710927843,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.668501615197612,
            "data_time": 0.0016648435250075709,
            "batch_time": 0.015205142676578054,
            "samples_per_second": 2289471.311704343,
            "samples_per_second_per_gpu": 286183.9139630429,
            "loss_sequences_lower_95": 6.684952585225875,
            "loss_sequences_upper_95": 6.7617153475776295,
            "loss_tokens_lower_95": 6.518833242749295,
            "loss_tokens_upper_95": 6.598923779504515,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.870572017850699,
            "data_time": 0.003035153118555978,
            "batch_time": 0.016649360624735786,
            "samples_per_second": 2268115.591277038,
            "samples_per_second_per_gpu": 283514.4489096298,
            "loss_sequences_lower_95": 5.454332263060291,
            "loss_sequences_upper_95": 5.761277262369791,
            "loss_tokens_lower_95": 4.0994763172506525,
            "loss_tokens_upper_95": 4.239561391721304,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.469158022476951,
            "data_time": 0.005428745134456738,
            "batch_time": 0.019157484576508805,
            "samples_per_second": 2220961.4902012004,
            "samples_per_second_per_gpu": 277620.18627515004,
            "loss_sequences_lower_95": 4.936590065809646,
            "loss_sequences_upper_95": 5.292304664585778,
            "loss_tokens_lower_95": 4.033167043552356,
            "loss_tokens_upper_95": 4.198144090079461,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.079414861931649,
            "data_time": 0.025148083056722368,
            "batch_time": 0.04019003893647875,
            "samples_per_second": 1948103.9843310409,
            "samples_per_second_per_gpu": 243512.9980413801,
            "loss_sequences_lower_95": 5.9967901778547725,
            "loss_sequences_upper_95": 6.160294540614298,
            "loss_tokens_lower_95": 5.997858850487836,
            "loss_tokens_upper_95": 6.159482828776042,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.641266703605652,
            "data_time": 0.05413330518282377,
            "batch_time": 0.07003438014250535,
            "samples_per_second": 1639242.4905164351,
            "samples_per_second_per_gpu": 204905.3113145544,
            "loss_sequences_lower_95": 3.502021453857422,
            "loss_sequences_upper_95": 3.8802717285156247,
            "loss_tokens_lower_95": 3.331419795655607,
            "loss_tokens_upper_95": 3.792798812402169,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.07678464637686,
            "data_time": 0.0034904284955046172,
            "batch_time": 0.017090110447265376,
            "samples_per_second": 2272749.9523376985,
            "samples_per_second_per_gpu": 284093.7440422123,
            "loss_sequences_lower_95": 5.029017475171123,
            "loss_sequences_upper_95": 5.124289430834986,
            "loss_tokens_lower_95": 5.028296735562468,
            "loss_tokens_upper_95": 5.124786370705924,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.213495026447068,
            "data_time": 0.005228484046400857,
            "batch_time": 0.019148126900876715,
            "samples_per_second": 2211837.141418761,
            "samples_per_second_per_gpu": 276479.64267734514,
            "loss_sequences_lower_95": 5.162905141469595,
            "loss_sequences_upper_95": 5.263027751650799,
            "loss_tokens_lower_95": 5.161417443277795,
            "loss_tokens_upper_95": 5.265401334395475,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6179528585326035,
            "data_time": 0.0036985124934054347,
            "batch_time": 0.01727987983571788,
            "samples_per_second": 2262209.450463984,
            "samples_per_second_per_gpu": 282776.181307998,
            "loss_sequences_lower_95": 3.7746603350385097,
            "loss_sequences_upper_95": 3.9029856552107423,
            "loss_tokens_lower_95": 3.4277249586973393,
            "loss_tokens_upper_95": 3.4831751889909635,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.760137508392334,
            "data_time": 0.011438335292041302,
            "batch_time": 0.025392016395926476,
            "samples_per_second": 2132121.4203013666,
            "samples_per_second_per_gpu": 266515.1775376708,
            "loss_sequences_lower_95": 5.96629677734375,
            "loss_sequences_upper_95": 6.542959338378906,
            "loss_tokens_lower_95": 5.109259206896819,
            "loss_tokens_upper_95": 5.476122594463518,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.911548748612404,
            "data_time": 0.16699659824371338,
            "batch_time": 0.18347887694835663,
            "samples_per_second": 741651.7989943431,
            "samples_per_second_per_gpu": 92706.4748742929,
            "loss_sequences_lower_95": 3.683730846643448,
            "loss_sequences_upper_95": 4.1952800989151,
            "loss_tokens_lower_95": 3.4764670185659123,
            "loss_tokens_upper_95": 4.255140186178273,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.564478965326288,
            "data_time": 0.0301200455807625,
            "batch_time": 0.04385967457548101,
            "samples_per_second": 1911914.4909456759,
            "samples_per_second_per_gpu": 238989.31136820948,
            "loss_sequences_lower_95": 6.093429091881061,
            "loss_sequences_upper_95": 6.985039493955415,
            "loss_tokens_lower_95": 3.9707922900005284,
            "loss_tokens_upper_95": 4.4487060340442675,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.0834876147353945,
            "data_time": 0.0032044460790024865,
            "batch_time": 0.017143746010131307,
            "samples_per_second": 2213302.8626574976,
            "samples_per_second_per_gpu": 276662.8578321872,
            "loss_sequences_lower_95": 3.0611449768679693,
            "loss_sequences_upper_95": 3.106276248583579,
            "loss_tokens_lower_95": 3.0605164347490716,
            "loss_tokens_upper_95": 3.106092012537885,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6862039112285965,
            "data_time": 0.0029018287007365525,
            "batch_time": 0.01645894785190119,
            "samples_per_second": 2283423.617229754,
            "samples_per_second_per_gpu": 285427.95215371926,
            "loss_sequences_lower_95": 3.6563564970496554,
            "loss_sequences_upper_95": 3.837562463613429,
            "loss_tokens_lower_95": 3.4890115357557874,
            "loss_tokens_upper_95": 3.6630187031477854,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.430274655094077,
            "data_time": 0.020534675982263353,
            "batch_time": 0.03492021560668945,
            "samples_per_second": 1948889.6823132625,
            "samples_per_second_per_gpu": 243611.2102891578,
            "loss_sequences_lower_95": 3.260965552696815,
            "loss_sequences_upper_95": 3.650314504322988,
            "loss_tokens_lower_95": 3.1790596565860603,
            "loss_tokens_upper_95": 3.4805252343388644,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.768232895900814,
            "data_time": 0.004995493963360786,
            "batch_time": 0.018660560995340348,
            "samples_per_second": 2235163.540117861,
            "samples_per_second_per_gpu": 279395.4425147326,
            "loss_sequences_lower_95": 3.792591835027994,
            "loss_sequences_upper_95": 3.9395154993648878,
            "loss_tokens_lower_95": 3.6298956905178392,
            "loss_tokens_upper_95": 3.7757354833795596,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.0987165657485405,
            "data_time": 0.03252054963793073,
            "batch_time": 0.04717144511994861,
            "samples_per_second": 1921150.137518307,
            "samples_per_second_per_gpu": 240143.76718978837,
            "loss_sequences_lower_95": 2.932628966540825,
            "loss_sequences_upper_95": 3.4153719925298924,
            "loss_tokens_lower_95": 2.8274592687410784,
            "loss_tokens_upper_95": 3.204532542392052,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.540325900177213,
            "data_time": 0.0021501212800163487,
            "batch_time": 0.015723020898024177,
            "samples_per_second": 2278267.434682952,
            "samples_per_second_per_gpu": 284783.429335369,
            "loss_sequences_lower_95": 4.523932725998325,
            "loss_sequences_upper_95": 4.556322338686487,
            "loss_tokens_lower_95": 4.524274542408482,
            "loss_tokens_upper_95": 4.556395781109347,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.1360684844475348,
            "data_time": 0.052436425469138404,
            "batch_time": 0.06753154234452681,
            "samples_per_second": 1683864.2599798196,
            "samples_per_second_per_gpu": 210483.03249747746,
            "loss_sequences_lower_95": 1.085820988775457,
            "loss_sequences_upper_95": 1.2449760992550156,
            "loss_tokens_lower_95": 0.9716611458066786,
            "loss_tokens_upper_95": 1.1989803130697447,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.1551505478797965,
            "data_time": 0.0015292721617051962,
            "batch_time": 0.01516649822727133,
            "samples_per_second": 2270120.3808819344,
            "samples_per_second_per_gpu": 283765.0476102418,
            "loss_sequences_lower_95": 6.670701958038522,
            "loss_sequences_upper_95": 6.73007304769392,
            "loss_tokens_lower_95": 5.373384792069632,
            "loss_tokens_upper_95": 5.433300809961315,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.254759831905365,
            "data_time": 0.006148577209502932,
            "batch_time": 0.019829356481158543,
            "samples_per_second": 2230373.9321206054,
            "samples_per_second_per_gpu": 278796.7415150757,
            "loss_sequences_lower_95": 6.2114366455078125,
            "loss_sequences_upper_95": 6.489868798828125,
            "loss_tokens_lower_95": 6.003125745827895,
            "loss_tokens_upper_95": 6.259860090239511,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.3318688765816065,
            "data_time": 0.024770997338375803,
            "batch_time": 0.039762286816613146,
            "samples_per_second": 1920341.838458423,
            "samples_per_second_per_gpu": 240042.72980730288,
            "loss_sequences_lower_95": 5.169407401706861,
            "loss_sequences_upper_95": 5.497011798361074,
            "loss_tokens_lower_95": 5.170363159179687,
            "loss_tokens_upper_95": 5.49756919529127,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.469331684257045,
            "data_time": 0.004756505230823195,
            "batch_time": 0.018517439264849007,
            "samples_per_second": 2230805.9378992887,
            "samples_per_second_per_gpu": 278850.7422374111,
            "loss_sequences_lower_95": 6.392382978959517,
            "loss_sequences_upper_95": 6.546701345732718,
            "loss_tokens_lower_95": 6.392799257220644,
            "loss_tokens_upper_95": 6.545040912050189,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.1650635635058084,
            "data_time": 0.004273986245723481,
            "batch_time": 0.017926390500778846,
            "samples_per_second": 2258504.6063745846,
            "samples_per_second_per_gpu": 282313.0757968231,
            "loss_sequences_lower_95": 1.2061319498697916,
            "loss_sequences_upper_95": 1.2679177368164063,
            "loss_tokens_lower_95": 1.087143152182748,
            "loss_tokens_upper_95": 1.143890444849815,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.114709812118893,
            "data_time": 0.02435521568570818,
            "batch_time": 0.03851708344050816,
            "samples_per_second": 1923321.8210027858,
            "samples_per_second_per_gpu": 240415.22762534823,
            "loss_sequences_lower_95": 5.7803997076125375,
            "loss_sequences_upper_95": 6.444708934965588,
            "loss_tokens_lower_95": 5.7843941243489585,
            "loss_tokens_upper_95": 6.4468533906482515,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.720489513128996,
            "data_time": 0.1658032387495041,
            "batch_time": 0.1821700930595398,
            "samples_per_second": 922398.7442190108,
            "samples_per_second_per_gpu": 115299.84302737635,
            "loss_sequences_lower_95": 2.504594546556473,
            "loss_sequences_upper_95": 3.7333592057228087,
            "loss_tokens_lower_95": 2.126351607804446,
            "loss_tokens_upper_95": 2.692822057979623,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.755129225730896,
            "data_time": 0.006300427137859284,
            "batch_time": 0.020191198303585962,
            "samples_per_second": 2204478.8479372775,
            "samples_per_second_per_gpu": 275559.8559921597,
            "loss_sequences_lower_95": 7.6860230468749995,
            "loss_sequences_upper_95": 8.046616918945313,
            "loss_tokens_lower_95": 7.461534095415609,
            "loss_tokens_upper_95": 7.780298347279506,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.942211472988129,
            "data_time": 0.006241623844419207,
            "batch_time": 0.020077606042226154,
            "samples_per_second": 2217252.230811376,
            "samples_per_second_per_gpu": 277156.528851422,
            "loss_sequences_lower_95": 7.024721923828125,
            "loss_sequences_upper_95": 7.243000024414063,
            "loss_tokens_lower_95": 6.7159118438399785,
            "loss_tokens_upper_95": 6.898782255414371,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.417345708542689,
            "data_time": 0.004117240395434325,
            "batch_time": 0.01788893957999239,
            "samples_per_second": 2234360.8848035084,
            "samples_per_second_per_gpu": 279295.11060043855,
            "loss_sequences_lower_95": 4.3843006856562186,
            "loss_sequences_upper_95": 4.4507039074285535,
            "loss_tokens_lower_95": 4.384749632684169,
            "loss_tokens_upper_95": 4.450588859330477,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.08711471022915,
            "data_time": 0.008951808030511677,
            "batch_time": 0.022787746706037722,
            "samples_per_second": 2174097.683930302,
            "samples_per_second_per_gpu": 271762.21049128775,
            "loss_sequences_lower_95": 4.981576345886137,
            "loss_sequences_upper_95": 5.191976352111535,
            "loss_tokens_lower_95": 4.978689382995512,
            "loss_tokens_upper_95": 5.189331504716302,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 9.647716957092285,
            "data_time": 0.006505864953237866,
            "batch_time": 0.02044989381517683,
            "samples_per_second": 2188551.5167118073,
            "samples_per_second_per_gpu": 273568.9395889759,
            "loss_sequences_lower_95": 9.596015747070313,
            "loss_sequences_upper_95": 9.699560400390626,
            "loss_tokens_lower_95": 9.595274096679688,
            "loss_tokens_upper_95": 9.699641650390626,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.025619073746446,
            "data_time": 0.0020540056246665566,
            "batch_time": 0.015646880515807066,
            "samples_per_second": 2276271.9071558574,
            "samples_per_second_per_gpu": 284533.9883944822,
            "loss_sequences_lower_95": 4.6743536763836335,
            "loss_sequences_upper_95": 4.789336334777081,
            "loss_tokens_lower_95": 3.2421122343290105,
            "loss_tokens_upper_95": 3.314364795727936,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.670815764078453,
            "data_time": 0.01977463960647583,
            "batch_time": 0.03378878320966448,
            "samples_per_second": 2038923.0183130987,
            "samples_per_second_per_gpu": 254865.37728913734,
            "loss_sequences_lower_95": 5.460059026461929,
            "loss_sequences_upper_95": 5.876789594052442,
            "loss_tokens_lower_95": 5.464450995601825,
            "loss_tokens_upper_95": 5.876191734200093,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.449996030096914,
            "data_time": 0.011690969578921795,
            "batch_time": 0.025929346680641174,
            "samples_per_second": 2127317.643071359,
            "samples_per_second_per_gpu": 265914.7053839199,
            "loss_sequences_lower_95": 5.316501393037684,
            "loss_sequences_upper_95": 5.583215080710018,
            "loss_tokens_lower_95": 5.3174955480238975,
            "loss_tokens_upper_95": 5.57923807779948,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.683258285636549,
            "data_time": 0.0020235632920032683,
            "batch_time": 0.01571839181862345,
            "samples_per_second": 2261406.5864995755,
            "samples_per_second_per_gpu": 282675.82331244694,
            "loss_sequences_lower_95": 5.22262090878664,
            "loss_sequences_upper_95": 5.345193423720719,
            "loss_tokens_lower_95": 3.9056440493593505,
            "loss_tokens_upper_95": 3.992840042950563,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.985744574713329,
            "data_time": 0.02778114875157674,
            "batch_time": 0.042935902873675026,
            "samples_per_second": 1958136.2784456366,
            "samples_per_second_per_gpu": 244767.03480570458,
            "loss_sequences_lower_95": 4.8287986957206925,
            "loss_sequences_upper_95": 5.1366766874121605,
            "loss_tokens_lower_95": 4.830811378438637,
            "loss_tokens_upper_95": 5.137526803798776,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.6152600039035905,
            "data_time": 0.0038419097334473997,
            "batch_time": 0.017470747123271117,
            "samples_per_second": 2251359.9281952665,
            "samples_per_second_per_gpu": 281419.9910244083,
            "loss_sequences_lower_95": 5.576428961797592,
            "loss_sequences_upper_95": 5.654035995436736,
            "loss_tokens_lower_95": 5.576369994743883,
            "loss_tokens_upper_95": 5.654072310421444,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.69994686298,
            "data_time": 0.02509188001806086,
            "batch_time": 0.039490979368036444,
            "samples_per_second": 1903670.9326873154,
            "samples_per_second_per_gpu": 237958.86658591442,
            "loss_sequences_lower_95": 5.498506297880006,
            "loss_sequences_upper_95": 5.8996764692288,
            "loss_tokens_lower_95": 5.496763003451154,
            "loss_tokens_upper_95": 5.904632568359375,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.48030800819397,
            "data_time": 0.08570260554552078,
            "batch_time": 0.10131567716598511,
            "samples_per_second": 1394332.611866995,
            "samples_per_second_per_gpu": 174291.57648337437,
            "loss_sequences_lower_95": 4.147905991872151,
            "loss_sequences_upper_95": 4.979225540161133,
            "loss_tokens_lower_95": 3.70028322007921,
            "loss_tokens_upper_95": 4.810169050428603,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.855557330449422,
            "data_time": 0.08465375006198883,
            "batch_time": 0.10064930468797684,
            "samples_per_second": 1405093.363041895,
            "samples_per_second_per_gpu": 175636.67038023687,
            "loss_sequences_lower_95": 3.6202591133117674,
            "loss_sequences_upper_95": 4.521990242004395,
            "loss_tokens_lower_95": 2.946261562390274,
            "loss_tokens_upper_95": 4.1529236097014355,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.3913476555617814,
            "data_time": 0.00346738973598469,
            "batch_time": 0.017094375527276932,
            "samples_per_second": 2262976.0661097183,
            "samples_per_second_per_gpu": 282872.0082637148,
            "loss_sequences_lower_95": 3.3785864509503867,
            "loss_sequences_upper_95": 3.4046773460511783,
            "loss_tokens_lower_95": 3.3783797332359167,
            "loss_tokens_upper_95": 3.4040851953700293,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 0.8746312606765058,
            "data_time": 0.0015476059894071366,
            "batch_time": 0.015252895351637897,
            "samples_per_second": 2263431.894633054,
            "samples_per_second_per_gpu": 282928.9868291317,
            "loss_sequences_lower_95": 1.0476623672895438,
            "loss_sequences_upper_95": 1.0787396206425663,
            "loss_tokens_lower_95": 0.6865517428585953,
            "loss_tokens_upper_95": 0.7011657304302084,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.571482327979381,
            "data_time": 0.04770185053348541,
            "batch_time": 0.06302956864237785,
            "samples_per_second": 1824605.8635795794,
            "samples_per_second_per_gpu": 228075.73294744742,
            "loss_sequences_lower_95": 4.565163896215243,
            "loss_sequences_upper_95": 4.953380014014056,
            "loss_tokens_lower_95": 4.233535530467023,
            "loss_tokens_upper_95": 4.439781356507922,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.962766479801488,
            "data_time": 0.13101226942879812,
            "batch_time": 0.1481699716477167,
            "samples_per_second": 930259.545883723,
            "samples_per_second_per_gpu": 116282.44323546537,
            "loss_sequences_lower_95": 6.584003799026077,
            "loss_sequences_upper_95": 7.555906450426256,
            "loss_tokens_lower_95": 6.313330671522353,
            "loss_tokens_upper_95": 7.385446336534288,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.451902719532571,
            "data_time": 0.03344295138404483,
            "batch_time": 0.04906189441680908,
            "samples_per_second": 1789633.1612216686,
            "samples_per_second_per_gpu": 223704.14515270857,
            "loss_sequences_lower_95": 4.406284146192597,
            "loss_sequences_upper_95": 4.769640294516959,
            "loss_tokens_lower_95": 4.096442043602818,
            "loss_tokens_upper_95": 4.2693117246635826,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.5928741199214285,
            "data_time": 0.031517130987984796,
            "batch_time": 0.046494293780553905,
            "samples_per_second": 1842232.7615912794,
            "samples_per_second_per_gpu": 230279.09519890993,
            "loss_sequences_lower_95": 4.5802314851342185,
            "loss_sequences_upper_95": 4.911202965713128,
            "loss_tokens_lower_95": 4.241756138131083,
            "loss_tokens_upper_95": 4.386059160882344,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.6113512276149375,
            "data_time": 0.03175572270438785,
            "batch_time": 0.046579321225484215,
            "samples_per_second": 1911493.9068527385,
            "samples_per_second_per_gpu": 238936.7383565923,
            "loss_sequences_lower_95": 4.506233420023104,
            "loss_sequences_upper_95": 4.910939314307235,
            "loss_tokens_lower_95": 4.260586514508607,
            "loss_tokens_upper_95": 4.487910574616338,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.675095171463199,
            "data_time": 0.031495247568402975,
            "batch_time": 0.045700955958593456,
            "samples_per_second": 1962782.8247441011,
            "samples_per_second_per_gpu": 245347.85309301264,
            "loss_sequences_lower_95": 4.6442713853789535,
            "loss_sequences_upper_95": 4.9391886641339555,
            "loss_tokens_lower_95": 4.357503740067051,
            "loss_tokens_upper_95": 4.492481562058873,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.928458053873192,
            "data_time": 0.03288991068616325,
            "batch_time": 0.04755735691682792,
            "samples_per_second": 1975679.9670033348,
            "samples_per_second_per_gpu": 246959.99587541685,
            "loss_sequences_lower_95": 4.8989701359908775,
            "loss_sequences_upper_95": 5.186083065056653,
            "loss_tokens_lower_95": 4.6813392187957,
            "loss_tokens_upper_95": 4.795799233145657,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.002496379177745,
            "data_time": 0.03521757750284104,
            "batch_time": 0.049978997026171,
            "samples_per_second": 1907219.3199006582,
            "samples_per_second_per_gpu": 238402.41498758228,
            "loss_sequences_lower_95": 5.034469511450791,
            "loss_sequences_upper_95": 5.34759492641542,
            "loss_tokens_lower_95": 4.654040849627035,
            "loss_tokens_upper_95": 4.785135588551085,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/params.txt",
    "uuid": "cfb61889-570f-4a32-93f9-64803af43400",
    "creation_date": "2023_12_14-04_59_29"
}