{
    "name": "c4_original-d=576_l=24_h=8-8.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 24588380160,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 2,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp",
            "--fsdp-limit-all-gathers"
        ],
        "chinchilla_multiplier": 8.0,
        "seed": 124
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--workers",
        "2",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--accum-freq",
        "2",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--logs",
        "logs/787",
        "--train-num-samples",
        "4917676032",
        "--dataset-manifest",
        "<scrub>/openlm/scrub/datasets/original_c4/manifest.jsonl",
        "--data-key",
        "txt",
        "--name",
        "c4_original-d=576_l=24_h=8-8.0",
        "--fsdp",
        "--fsdp-amp",
        "--fsdp-limit-all-gathers",
        "--val-data",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/paloma_val/00000001.tar",
        "--val-frequency",
        "5",
        "--val-data-key",
        "json",
        "txt",
        "json.gz",
        "--val-tok-ci",
        "--val-seq-ci",
        "--val-num-samples",
        "245760",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/openlm/scrub/experiments/154m_8x_c4_original/"
    ],
    "results": [
        {
            "loss": 3.8291171828905743,
            "data_time": 0.12537448108196259,
            "batch_time": 1.370915725827217,
            "samples_per_second": 309445.8722560701,
            "samples_per_second_per_gpu": 38680.73403200876,
            "loss_sequences_lower_95": 3.6981305440266925,
            "loss_sequences_upper_95": 3.964744853973389,
            "loss_tokens_lower_95": 3.8135808753967284,
            "loss_tokens_upper_95": 3.8448506355285645,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.045812595364492,
            "data_time": 0.002499308459185307,
            "batch_time": 0.07130902101242041,
            "samples_per_second": 1857421.7703649134,
            "samples_per_second_per_gpu": 232177.72129561417,
            "loss_sequences_lower_95": 3.042996778822398,
            "loss_sequences_upper_95": 3.0487220759582465,
            "loss_tokens_lower_95": 3.035299010416667,
            "loss_tokens_upper_95": 3.0564594166666663,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.61814690463397,
            "data_time": 0.033717602491378784,
            "batch_time": 0.1220007948577404,
            "samples_per_second": 1533110.005836133,
            "samples_per_second_per_gpu": 191638.7507295166,
            "loss_sequences_lower_95": 3.5977095312001754,
            "loss_sequences_upper_95": 3.638616650639748,
            "loss_tokens_lower_95": 3.6018036354166667,
            "loss_tokens_upper_95": 3.6349296041666666,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9940204391282856,
            "data_time": 0.005172850270020335,
            "batch_time": 0.0733733090915178,
            "samples_per_second": 1846281.2993710001,
            "samples_per_second_per_gpu": 230785.16242137502,
            "loss_sequences_lower_95": 2.9835912985260955,
            "loss_sequences_upper_95": 3.004083103455219,
            "loss_tokens_lower_95": 2.98394828125,
            "loss_tokens_upper_95": 3.00423959375,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.054557358902972,
            "data_time": 0.03370348736643791,
            "batch_time": 0.10055570676922798,
            "samples_per_second": 1657342.9808304224,
            "samples_per_second_per_gpu": 207167.8726038028,
            "loss_sequences_lower_95": 3.019232438780868,
            "loss_sequences_upper_95": 3.0886976937412487,
            "loss_tokens_lower_95": 3.044140302083333,
            "loss_tokens_upper_95": 3.06505,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5664414241150233,
            "data_time": 0.012213471035162607,
            "batch_time": 0.08051685243844986,
            "samples_per_second": 1765371.95849244,
            "samples_per_second_per_gpu": 220671.494811555,
            "loss_sequences_lower_95": 3.5294849263495283,
            "loss_sequences_upper_95": 3.6048009565134262,
            "loss_tokens_lower_95": 3.55366859375,
            "loss_tokens_upper_95": 3.57911590625,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4804343433769382,
            "data_time": 0.00524060084269597,
            "batch_time": 0.0726092408100764,
            "samples_per_second": 1829422.4351106004,
            "samples_per_second_per_gpu": 228677.80438882505,
            "loss_sequences_lower_95": 3.446764568718112,
            "loss_sequences_upper_95": 3.513481784119898,
            "loss_tokens_lower_95": 3.4640578958333332,
            "loss_tokens_upper_95": 3.4970960416666665,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.781166592043732,
            "data_time": 0.0052335823052807855,
            "batch_time": 0.07260861130137193,
            "samples_per_second": 1830530.6984924169,
            "samples_per_second_per_gpu": 228816.3373115521,
            "loss_sequences_lower_95": 3.7725204004417536,
            "loss_sequences_upper_95": 3.789998650196335,
            "loss_tokens_lower_95": 3.7691370729166667,
            "loss_tokens_upper_95": 3.7935883541666664,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.454491141850386,
            "data_time": 0.03444143012166023,
            "batch_time": 0.10128596052527428,
            "samples_per_second": 1660912.0979308235,
            "samples_per_second_per_gpu": 207614.01224135293,
            "loss_sequences_lower_95": 3.411789262973196,
            "loss_sequences_upper_95": 3.498658727630367,
            "loss_tokens_lower_95": 3.4428332604166667,
            "loss_tokens_upper_95": 3.4658359375,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.492144754281628,
            "data_time": 0.03547793626785278,
            "batch_time": 0.10397340729832649,
            "samples_per_second": 1670640.59597386,
            "samples_per_second_per_gpu": 208830.0744967325,
            "loss_sequences_lower_95": 4.465638413259635,
            "loss_sequences_upper_95": 4.516781652397789,
            "loss_tokens_lower_95": 4.47860653125,
            "loss_tokens_upper_95": 4.5056335625,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4220319154966004,
            "data_time": 0.00416860105174003,
            "batch_time": 0.07347648219498198,
            "samples_per_second": 1850235.889241055,
            "samples_per_second_per_gpu": 231279.48615513186,
            "loss_sequences_lower_95": 3.413770200403419,
            "loss_sequences_upper_95": 3.4306331877612375,
            "loss_tokens_lower_95": 3.410703421875,
            "loss_tokens_upper_95": 3.432888895833333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2097876305284228,
            "data_time": 0.009772771283199913,
            "batch_time": 0.08013410944687693,
            "samples_per_second": 1812103.405047431,
            "samples_per_second_per_gpu": 226512.92563092886,
            "loss_sequences_lower_95": 3.200532925123972,
            "loss_sequences_upper_95": 3.218792350924745,
            "loss_tokens_lower_95": 3.198435421875,
            "loss_tokens_upper_95": 3.2212631927083333,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.957554719994566,
            "data_time": 0.03441665694117546,
            "batch_time": 0.10162162780761719,
            "samples_per_second": 1655076.6168292086,
            "samples_per_second_per_gpu": 206884.57710365107,
            "loss_sequences_lower_95": 3.9197375438759825,
            "loss_sequences_upper_95": 3.995819159888834,
            "loss_tokens_lower_95": 3.9436386979166667,
            "loss_tokens_upper_95": 3.9713393229166667,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.048355293856619,
            "data_time": 0.03439907357096672,
            "batch_time": 0.10107610002160072,
            "samples_per_second": 1653550.8967270693,
            "samples_per_second_per_gpu": 206693.86209088366,
            "loss_sequences_lower_95": 2.991671187328952,
            "loss_sequences_upper_95": 3.1035719551039813,
            "loss_tokens_lower_95": 3.03689703125,
            "loss_tokens_upper_95": 3.0604112083333335,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.569727344946428,
            "data_time": 0.11698190867900848,
            "batch_time": 0.15618804097175598,
            "samples_per_second": 674475.988959925,
            "samples_per_second_per_gpu": 84309.49861999063,
            "loss_sequences_lower_95": 4.494591548226096,
            "loss_sequences_upper_95": 4.64326303655451,
            "loss_tokens_lower_95": 4.5379302631724965,
            "loss_tokens_upper_95": 4.6017712939869275,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7184225498066006,
            "data_time": 0.04330650965372721,
            "batch_time": 0.10677364468574524,
            "samples_per_second": 1553854.6448869507,
            "samples_per_second_per_gpu": 194231.83061086884,
            "loss_sequences_lower_95": 3.6287189895140535,
            "loss_sequences_upper_95": 3.8101961185911306,
            "loss_tokens_lower_95": 3.7043230104166667,
            "loss_tokens_upper_95": 3.7328549791666665,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.805011503929198,
            "data_time": 0.04425795376300812,
            "batch_time": 0.1126635770003001,
            "samples_per_second": 1611719.980178727,
            "samples_per_second_per_gpu": 201464.99752234088,
            "loss_sequences_lower_95": 5.746146298209721,
            "loss_sequences_upper_95": 5.862225100232932,
            "loss_tokens_lower_95": 5.7920495625,
            "loss_tokens_upper_95": 5.817754625,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6195893483083754,
            "data_time": 0.1261603832244873,
            "batch_time": 0.1945391744375229,
            "samples_per_second": 1102213.257999843,
            "samples_per_second_per_gpu": 137776.65724998037,
            "loss_sequences_lower_95": 3.577881197069512,
            "loss_sequences_upper_95": 3.6614270069560066,
            "loss_tokens_lower_95": 3.6045884679575435,
            "loss_tokens_upper_95": 3.6345657411168832,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.943197520433574,
            "data_time": 0.004739308357238769,
            "batch_time": 0.07290126938711514,
            "samples_per_second": 1844124.856286334,
            "samples_per_second_per_gpu": 230515.60703579176,
            "loss_sequences_lower_95": 4.920931263130252,
            "loss_sequences_upper_95": 4.966068382464214,
            "loss_tokens_lower_95": 4.920559568504308,
            "loss_tokens_upper_95": 4.965474864469093,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.7598654392634083,
            "data_time": 0.005076812603805638,
            "batch_time": 0.07307775262035901,
            "samples_per_second": 1835652.1722684917,
            "samples_per_second_per_gpu": 229456.52153356146,
            "loss_sequences_lower_95": 2.776927505064666,
            "loss_sequences_upper_95": 2.80253853249944,
            "loss_tokens_lower_95": 2.7487393759252328,
            "loss_tokens_upper_95": 2.7675314997698726,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7085470491048014,
            "data_time": 0.010610995923771578,
            "batch_time": 0.07750034770544838,
            "samples_per_second": 1802715.2067416979,
            "samples_per_second_per_gpu": 225339.40084271223,
            "loss_sequences_lower_95": 4.319476252048152,
            "loss_sequences_upper_95": 4.620774522005639,
            "loss_tokens_lower_95": 3.5001812876477993,
            "loss_tokens_upper_95": 3.7207261197079013,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9048052674134572,
            "data_time": 0.013431745891769728,
            "batch_time": 0.08045150091250737,
            "samples_per_second": 1785422.551162245,
            "samples_per_second_per_gpu": 223177.81889528062,
            "loss_sequences_lower_95": 4.145943294270833,
            "loss_sequences_upper_95": 4.349687589518228,
            "loss_tokens_lower_95": 3.803305584217767,
            "loss_tokens_upper_95": 3.948488649764151,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.8900291131607414,
            "data_time": 0.015816100619056007,
            "batch_time": 0.08093722977421501,
            "samples_per_second": 1736244.3322822212,
            "samples_per_second_per_gpu": 217030.54153527765,
            "loss_sequences_lower_95": 2.9935273396779047,
            "loss_sequences_upper_95": 3.0587915196393456,
            "loss_tokens_lower_95": 2.8562060321584712,
            "loss_tokens_upper_95": 2.8880321009986125,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4926415302536706,
            "data_time": 0.0746341273188591,
            "batch_time": 0.13693098723888397,
            "samples_per_second": 1393938.6703015757,
            "samples_per_second_per_gpu": 174242.33378769696,
            "loss_sequences_lower_95": 3.417751333063299,
            "loss_sequences_upper_95": 3.697790236039595,
            "loss_tokens_lower_95": 3.419376102237227,
            "loss_tokens_upper_95": 3.502679026944976,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4668139447971265,
            "data_time": 0.07869688421487808,
            "batch_time": 0.14562712609767914,
            "samples_per_second": 1460496.0073138746,
            "samples_per_second_per_gpu": 182562.00091423432,
            "loss_sequences_lower_95": 3.5026741121253187,
            "loss_sequences_upper_95": 3.7049832215601084,
            "loss_tokens_lower_95": 3.400884360982989,
            "loss_tokens_upper_95": 3.5031158624957683,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.005932370821635,
            "data_time": 0.051633551716804504,
            "batch_time": 0.10867903629938762,
            "samples_per_second": 1427341.1141261181,
            "samples_per_second_per_gpu": 178417.63926576477,
            "loss_sequences_lower_95": 4.014237935384115,
            "loss_sequences_upper_95": 4.129956288655599,
            "loss_tokens_lower_95": 3.8883118053007184,
            "loss_tokens_upper_95": 4.10217875137612,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.738803966045028,
            "data_time": 0.004036154949440146,
            "batch_time": 0.07227006282821391,
            "samples_per_second": 1846609.5396784206,
            "samples_per_second_per_gpu": 230826.19245980258,
            "loss_sequences_lower_95": 4.79655432649722,
            "loss_sequences_upper_95": 4.8802845482352,
            "loss_tokens_lower_95": 4.665988085834846,
            "loss_tokens_upper_95": 4.7495936876612594,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6542540767578164,
            "data_time": 0.009671660630326522,
            "batch_time": 0.07669909376847117,
            "samples_per_second": 1812669.389418453,
            "samples_per_second_per_gpu": 226583.67367730662,
            "loss_sequences_lower_95": 4.820957571729666,
            "loss_sequences_upper_95": 5.147111917103983,
            "loss_tokens_lower_95": 3.454270509586456,
            "loss_tokens_upper_95": 3.5939933146269727,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5521490173128276,
            "data_time": 0.016914303600788116,
            "batch_time": 0.08139166682958603,
            "samples_per_second": 1668982.4945915327,
            "samples_per_second_per_gpu": 208622.8118239416,
            "loss_sequences_lower_95": 4.2706031877433075,
            "loss_sequences_upper_95": 4.640428500126653,
            "loss_tokens_lower_95": 3.418589216972713,
            "loss_tokens_upper_95": 3.5829261644381254,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.0555795055546175,
            "data_time": 0.07688725739717484,
            "batch_time": 0.13871517777442932,
            "samples_per_second": 1385674.5378527394,
            "samples_per_second_per_gpu": 173209.31723159243,
            "loss_sequences_lower_95": 5.9744762263885915,
            "loss_sequences_upper_95": 6.1377472899275825,
            "loss_tokens_lower_95": 5.9742279192084045,
            "loss_tokens_upper_95": 6.137398511634025,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0739033102989195,
            "data_time": 0.1385830044746399,
            "batch_time": 0.19781991839408875,
            "samples_per_second": 992900.8702007389,
            "samples_per_second_per_gpu": 124112.60877509236,
            "loss_sequences_lower_95": 2.9985436630249023,
            "loss_sequences_upper_95": 3.345636749267578,
            "loss_tokens_lower_95": 2.845564780465605,
            "loss_tokens_upper_95": 3.267185130656725,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.9110456720955415,
            "data_time": 0.01137025747448206,
            "batch_time": 0.07740283384919167,
            "samples_per_second": 1768383.4052598535,
            "samples_per_second_per_gpu": 221047.92565748168,
            "loss_sequences_lower_95": 4.842435976182671,
            "loss_sequences_upper_95": 4.980920485122665,
            "loss_tokens_lower_95": 4.840986900368635,
            "loss_tokens_upper_95": 4.98143280560309,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.108002040544364,
            "data_time": 0.016723905503749848,
            "batch_time": 0.08252357095479965,
            "samples_per_second": 1762860.2990502107,
            "samples_per_second_per_gpu": 220357.53738127634,
            "loss_sequences_lower_95": 5.045353210099305,
            "loss_sequences_upper_95": 5.169461267019861,
            "loss_tokens_lower_95": 5.045827124843238,
            "loss_tokens_upper_95": 5.168328232573838,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.010885913820858,
            "data_time": 0.01225073238213857,
            "batch_time": 0.07830515702565512,
            "samples_per_second": 1777052.4680861006,
            "samples_per_second_per_gpu": 222131.55851076258,
            "loss_sequences_lower_95": 3.2888069426792876,
            "loss_sequences_upper_95": 3.4173129109744798,
            "loss_tokens_lower_95": 2.9448426773433,
            "loss_tokens_upper_95": 2.997872756514556,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.88338848733902,
            "data_time": 0.04047020524740219,
            "batch_time": 0.10796593129634857,
            "samples_per_second": 1654550.8664003215,
            "samples_per_second_per_gpu": 206818.85830004018,
            "loss_sequences_lower_95": 5.341282092285156,
            "loss_sequences_upper_95": 5.927958764648437,
            "loss_tokens_lower_95": 4.571886100238873,
            "loss_tokens_upper_95": 4.935147654181833,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.859313875436783,
            "data_time": 0.11581981182098389,
            "batch_time": 0.15479286015033722,
            "samples_per_second": 507926.9926627036,
            "samples_per_second_per_gpu": 63490.87408283795,
            "loss_sequences_lower_95": 3.6043762266635895,
            "loss_sequences_upper_95": 4.172616744041443,
            "loss_tokens_lower_95": 3.37097835321536,
            "loss_tokens_upper_95": 4.200420414716348,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8795989614793625,
            "data_time": 0.07491527497768402,
            "batch_time": 0.12686306238174438,
            "samples_per_second": 1222799.0661900854,
            "samples_per_second_per_gpu": 152849.88327376067,
            "loss_sequences_lower_95": 5.148533446213294,
            "loss_sequences_upper_95": 5.980291204342897,
            "loss_tokens_lower_95": 3.21122202566807,
            "loss_tokens_upper_95": 3.6452296518284952,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.9391531654714549,
            "data_time": 0.010307240817281935,
            "batch_time": 0.07837071435319053,
            "samples_per_second": 1814855.6345320928,
            "samples_per_second_per_gpu": 226856.9543165116,
            "loss_sequences_lower_95": 1.9194114088439276,
            "loss_sequences_upper_95": 1.9590838977685396,
            "loss_tokens_lower_95": 1.919240963703787,
            "loss_tokens_upper_95": 1.9588662617068042,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.5576654828538334,
            "data_time": 0.008104492978351871,
            "batch_time": 0.07557176017179722,
            "samples_per_second": 1804338.6162186752,
            "samples_per_second_per_gpu": 225542.3270273344,
            "loss_sequences_lower_95": 2.579311112193443,
            "loss_sequences_upper_95": 2.7252828013566126,
            "loss_tokens_lower_95": 2.4641487725146964,
            "loss_tokens_upper_95": 2.608192689167597,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0204123243307457,
            "data_time": 0.05486025412877401,
            "batch_time": 0.1104707419872284,
            "samples_per_second": 1185767.9043322017,
            "samples_per_second_per_gpu": 148220.9880415252,
            "loss_sequences_lower_95": 2.97360109322237,
            "loss_sequences_upper_95": 3.395233092814575,
            "loss_tokens_lower_95": 2.8330951927107426,
            "loss_tokens_upper_95": 3.1252202629491648,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4195340934849763,
            "data_time": 0.01719014495611191,
            "batch_time": 0.08512037694454193,
            "samples_per_second": 1780324.3486983678,
            "samples_per_second_per_gpu": 222540.54358729598,
            "loss_sequences_lower_95": 3.520336374525207,
            "loss_sequences_upper_95": 3.6741308246904594,
            "loss_tokens_lower_95": 3.3261025602385907,
            "loss_tokens_upper_95": 3.4694393884476384,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.591987199899627,
            "data_time": 0.07961127161979675,
            "batch_time": 0.13070034980773926,
            "samples_per_second": 1100947.2721799146,
            "samples_per_second_per_gpu": 137618.40902248933,
            "loss_sequences_lower_95": 2.520575514072325,
            "loss_sequences_upper_95": 2.946281819227265,
            "loss_tokens_lower_95": 2.4033187691967015,
            "loss_tokens_upper_95": 2.742944395678273,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.3722937522255965,
            "data_time": 0.006137208779904808,
            "batch_time": 0.07408459874227194,
            "samples_per_second": 1817903.0928973313,
            "samples_per_second_per_gpu": 227237.88661216642,
            "loss_sequences_lower_95": 4.357399917483496,
            "loss_sequences_upper_95": 4.386815937406231,
            "loss_tokens_lower_95": 4.357286808924284,
            "loss_tokens_upper_95": 4.387158144519529,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.7859117990558587,
            "data_time": 0.13979679346084595,
            "batch_time": 0.20101632177829742,
            "samples_per_second": 987138.9460838984,
            "samples_per_second_per_gpu": 123392.3682604873,
            "loss_sequences_lower_95": 0.7682740683694488,
            "loss_sequences_upper_95": 0.8830250453023077,
            "loss_tokens_lower_95": 0.6832696967383612,
            "loss_tokens_upper_95": 0.8610526937057198,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.135388917867992,
            "data_time": 0.003656888735732905,
            "batch_time": 0.07186381782975085,
            "samples_per_second": 1843615.3266006894,
            "samples_per_second_per_gpu": 230451.91582508618,
            "loss_sequences_lower_95": 4.9139694297038785,
            "loss_sequences_upper_95": 4.961668990353118,
            "loss_tokens_lower_95": 3.9649261484526113,
            "loss_tokens_upper_95": 4.012143641199226,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.40714235496521,
            "data_time": 0.02074199914932251,
            "batch_time": 0.08819663152098656,
            "samples_per_second": 1750440.265720216,
            "samples_per_second_per_gpu": 218805.033215027,
            "loss_sequences_lower_95": 6.464721911621094,
            "loss_sequences_upper_95": 6.7639570190429685,
            "loss_tokens_lower_95": 6.217225480577509,
            "loss_tokens_upper_95": 6.487184297660479,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.838464811573858,
            "data_time": 0.07831205427646637,
            "batch_time": 0.14229589700698853,
            "samples_per_second": 1411734.8899061645,
            "samples_per_second_per_gpu": 176466.86123827056,
            "loss_sequences_lower_95": 4.670041490637738,
            "loss_sequences_upper_95": 5.007849943741508,
            "loss_tokens_lower_95": 4.673713830035665,
            "loss_tokens_upper_95": 5.0014904519786,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.810129754471056,
            "data_time": 0.014839669520204718,
            "batch_time": 0.08000983026894656,
            "samples_per_second": 1734179.6795450123,
            "samples_per_second_per_gpu": 216772.45994312654,
            "loss_sequences_lower_95": 6.7145993134469695,
            "loss_sequences_upper_95": 6.902050836736505,
            "loss_tokens_lower_95": 6.719819761334044,
            "loss_tokens_upper_95": 6.903619569720644,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.8928615085283915,
            "data_time": 0.015159357339143753,
            "batch_time": 0.08249430855115254,
            "samples_per_second": 1779936.6645474194,
            "samples_per_second_per_gpu": 222492.08306842743,
            "loss_sequences_lower_95": 0.9411831542968749,
            "loss_sequences_upper_95": 0.9849209004720052,
            "loss_tokens_lower_95": 0.8599723053283814,
            "loss_tokens_upper_95": 0.9111395300307623,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.20783219110398,
            "data_time": 0.07729891687631607,
            "batch_time": 0.13684386759996414,
            "samples_per_second": 1383870.505516987,
            "samples_per_second_per_gpu": 172983.81318962338,
            "loss_sequences_lower_95": 5.8762362089611235,
            "loss_sequences_upper_95": 6.537742193312872,
            "loss_tokens_lower_95": 5.882821015857515,
            "loss_tokens_upper_95": 6.5329680815197175,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.1123099736869335,
            "data_time": 0.12643294036388397,
            "batch_time": 0.16689659655094147,
            "samples_per_second": 511425.04653208185,
            "samples_per_second_per_gpu": 63928.13081651023,
            "loss_sequences_lower_95": 1.9253934800624848,
            "loss_sequences_upper_95": 2.7722416520118713,
            "loss_tokens_lower_95": 1.6498588703588113,
            "loss_tokens_upper_95": 2.123512050982603,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.019595039844513,
            "data_time": 0.020977530628442764,
            "batch_time": 0.08827422931790352,
            "samples_per_second": 1752659.1550061048,
            "samples_per_second_per_gpu": 219082.3943757631,
            "loss_sequences_lower_95": 7.053328393554688,
            "loss_sequences_upper_95": 7.386949157714843,
            "loss_tokens_lower_95": 6.838622458211057,
            "loss_tokens_upper_95": 7.13444138476893,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.627200457572937,
            "data_time": 0.022235799580812454,
            "batch_time": 0.08965650200843811,
            "samples_per_second": 1744888.2437720336,
            "samples_per_second_per_gpu": 218111.0304715042,
            "loss_sequences_lower_95": 6.791097546386719,
            "loss_sequences_upper_95": 6.985755029296875,
            "loss_tokens_lower_95": 6.513901711942915,
            "loss_tokens_upper_95": 6.694624187257449,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.91219400680189,
            "data_time": 0.012075417985518774,
            "batch_time": 0.07911078756054242,
            "samples_per_second": 1764651.9103444905,
            "samples_per_second_per_gpu": 220581.48879306132,
            "loss_sequences_lower_95": 4.871149027431487,
            "loss_sequences_upper_95": 4.952135915369385,
            "loss_tokens_lower_95": 4.871453863560174,
            "loss_tokens_upper_95": 4.952885328056906,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.01993349235728,
            "data_time": 0.02840256690979004,
            "batch_time": 0.09219329959743626,
            "samples_per_second": 1571378.4234806031,
            "samples_per_second_per_gpu": 196422.3029350754,
            "loss_sequences_lower_95": 3.9179256128642233,
            "loss_sequences_upper_95": 4.121452469833069,
            "loss_tokens_lower_95": 3.9170923939132107,
            "loss_tokens_upper_95": 4.123114266813076,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 8.202286833286285,
            "data_time": 0.021324805915355682,
            "batch_time": 0.08855347707867622,
            "samples_per_second": 1753613.8890897422,
            "samples_per_second_per_gpu": 219201.73613621778,
            "loss_sequences_lower_95": 8.112280200195313,
            "loss_sequences_upper_95": 8.293392041015625,
            "loss_tokens_lower_95": 8.108964562988282,
            "loss_tokens_upper_95": 8.293983105468751,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.5484647104733376,
            "data_time": 0.00505597799657339,
            "batch_time": 0.07309095830802458,
            "samples_per_second": 1838688.596301129,
            "samples_per_second_per_gpu": 229836.07453764114,
            "loss_sequences_lower_95": 3.501755179384461,
            "loss_sequences_upper_95": 3.5960312943472093,
            "loss_tokens_lower_95": 2.35824985153484,
            "loss_tokens_upper_95": 2.4195819714307043,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.519920616897185,
            "data_time": 0.05442599274895408,
            "batch_time": 0.112319908358834,
            "samples_per_second": 1236796.3157751895,
            "samples_per_second_per_gpu": 154599.5394718987,
            "loss_sequences_lower_95": 4.366941491881414,
            "loss_sequences_upper_95": 4.6743488938061155,
            "loss_tokens_lower_95": 4.363291407343167,
            "loss_tokens_upper_95": 4.669649209549178,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.921816540699379,
            "data_time": 0.03784098103642464,
            "batch_time": 0.10639302060008049,
            "samples_per_second": 1673975.0168438095,
            "samples_per_second_per_gpu": 209246.8771054762,
            "loss_sequences_lower_95": 4.801619849111519,
            "loss_sequences_upper_95": 5.040792356004902,
            "loss_tokens_lower_95": 4.801347943474265,
            "loss_tokens_upper_95": 5.0400259698606,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.938533702560285,
            "data_time": 0.0061044033084596905,
            "batch_time": 0.07387894273750366,
            "samples_per_second": 1828468.3033828875,
            "samples_per_second_per_gpu": 228558.53792286094,
            "loss_sequences_lower_95": 3.7941465274352533,
            "loss_sequences_upper_95": 3.893080241802894,
            "loss_tokens_lower_95": 2.741185539786267,
            "loss_tokens_upper_95": 2.8141512380508305,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.924024375027449,
            "data_time": 0.07807138562202454,
            "batch_time": 0.13246780633926392,
            "samples_per_second": 1330935.9883190275,
            "samples_per_second_per_gpu": 166366.99853987843,
            "loss_sequences_lower_95": 4.732434986255787,
            "loss_sequences_upper_95": 5.11359290067481,
            "loss_tokens_lower_95": 4.730219870269614,
            "loss_tokens_upper_95": 5.116268832090671,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4657839636554777,
            "data_time": 0.01234923073878655,
            "batch_time": 0.07979539839121011,
            "samples_per_second": 1793016.2879789309,
            "samples_per_second_per_gpu": 224127.03599736636,
            "loss_sequences_lower_95": 3.436272263833142,
            "loss_sequences_upper_95": 3.495119546779434,
            "loss_tokens_lower_95": 3.43596510954224,
            "loss_tokens_upper_95": 3.4950579964640673,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.756140740172377,
            "data_time": 0.08364169299602509,
            "batch_time": 0.1417854055762291,
            "samples_per_second": 1374561.736485139,
            "samples_per_second_per_gpu": 171820.21706064238,
            "loss_sequences_lower_95": 4.5707715377066895,
            "loss_sequences_upper_95": 4.936611508860171,
            "loss_tokens_lower_95": 4.570293315405984,
            "loss_tokens_upper_95": 4.939552558972998,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.7094119985898335,
            "data_time": 0.12695850431919098,
            "batch_time": 0.16774098575115204,
            "samples_per_second": 845963.8423510843,
            "samples_per_second_per_gpu": 105745.48029388554,
            "loss_sequences_lower_95": 1.609413242340088,
            "loss_sequences_upper_95": 2.015697816212972,
            "loss_tokens_lower_95": 1.4419528166453044,
            "loss_tokens_upper_95": 1.9481873406304253,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.6813159922758738,
            "data_time": 0.12799960374832153,
            "batch_time": 0.1727292239665985,
            "samples_per_second": 785306.6665511117,
            "samples_per_second_per_gpu": 98163.33331888897,
            "loss_sequences_lower_95": 1.6502214749654134,
            "loss_sequences_upper_95": 2.182388916015625,
            "loss_tokens_lower_95": 1.3379797946201282,
            "loss_tokens_upper_95": 1.9415087024817306,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.859142949689997,
            "data_time": 0.01065531759350388,
            "batch_time": 0.07803870351226241,
            "samples_per_second": 1798930.7063574833,
            "samples_per_second_per_gpu": 224866.3382946854,
            "loss_sequences_lower_95": 5.834743130983063,
            "loss_sequences_upper_95": 5.883322944472569,
            "loss_tokens_lower_95": 5.834747718957106,
            "loss_tokens_upper_95": 5.883023187246871,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.5708875718160904,
            "data_time": 0.0029328295908036847,
            "batch_time": 0.07127564786821278,
            "samples_per_second": 1852961.0965646731,
            "samples_per_second_per_gpu": 231620.13707058414,
            "loss_sequences_lower_95": 0.7258351703010668,
            "loss_sequences_upper_95": 0.7405411276862648,
            "loss_tokens_lower_95": 0.5453152393619373,
            "loss_tokens_upper_95": 0.5540349364655904,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.353227243648739,
            "data_time": 0.15852969884872437,
            "batch_time": 0.22843855619430542,
            "samples_per_second": 1086594.2487189518,
            "samples_per_second_per_gpu": 135824.28108986898,
            "loss_sequences_lower_95": 4.606019892655019,
            "loss_sequences_upper_95": 4.9905205914354696,
            "loss_tokens_lower_95": 4.1770754531864105,
            "loss_tokens_upper_95": 4.3992054844863056,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.844281931181212,
            "data_time": 0.12097571790218353,
            "batch_time": 0.1585550159215927,
            "samples_per_second": 544267.5547335701,
            "samples_per_second_per_gpu": 68033.44434169626,
            "loss_sequences_lower_95": 6.455440758370065,
            "loss_sequences_upper_95": 7.47856496862463,
            "loss_tokens_lower_95": 6.12747501326196,
            "loss_tokens_upper_95": 7.316808742947049,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.201674439558169,
            "data_time": 0.07846002280712128,
            "batch_time": 0.12951112538576126,
            "samples_per_second": 1099571.892307524,
            "samples_per_second_per_gpu": 137446.4865384405,
            "loss_sequences_lower_95": 4.330952211705649,
            "loss_sequences_upper_95": 4.67396922227813,
            "loss_tokens_lower_95": 3.966122693702691,
            "loss_tokens_upper_95": 4.149241205188183,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.290369717086234,
            "data_time": 0.07991129159927368,
            "batch_time": 0.13057884573936462,
            "samples_per_second": 1110972.4682107707,
            "samples_per_second_per_gpu": 138871.55852634634,
            "loss_sequences_lower_95": 4.417557507026486,
            "loss_sequences_upper_95": 4.728968169049518,
            "loss_tokens_lower_95": 4.08878527856368,
            "loss_tokens_upper_95": 4.241529250835581,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.421178063241447,
            "data_time": 0.07724740356206894,
            "batch_time": 0.12852177768945694,
            "samples_per_second": 1097805.9671293767,
            "samples_per_second_per_gpu": 137225.7458911721,
            "loss_sequences_lower_95": 4.6042817092523345,
            "loss_sequences_upper_95": 5.025245889803259,
            "loss_tokens_lower_95": 4.18852938476001,
            "loss_tokens_upper_95": 4.435166119514628,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.370438439090077,
            "data_time": 0.0775023028254509,
            "batch_time": 0.12835906445980072,
            "samples_per_second": 1105450.575450283,
            "samples_per_second_per_gpu": 138181.32193128538,
            "loss_sequences_lower_95": 4.450951552972561,
            "loss_sequences_upper_95": 4.735855325838414,
            "loss_tokens_lower_95": 4.184753579588323,
            "loss_tokens_upper_95": 4.3264200275932145,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.766353604216013,
            "data_time": 0.08399270474910736,
            "batch_time": 0.13495232909917831,
            "samples_per_second": 1062214.2596980047,
            "samples_per_second_per_gpu": 132776.7824622506,
            "loss_sequences_lower_95": 4.862569489094041,
            "loss_sequences_upper_95": 5.175013249557211,
            "loss_tokens_lower_95": 4.5854234265318805,
            "loss_tokens_upper_95": 4.703606443313103,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.6913051140017625,
            "data_time": 0.07673320919275284,
            "batch_time": 0.12791166454553604,
            "samples_per_second": 1107686.753857146,
            "samples_per_second_per_gpu": 138460.84423214325,
            "loss_sequences_lower_95": 4.880050603354849,
            "loss_sequences_upper_95": 5.208391487307665,
            "loss_tokens_lower_95": 4.490641976529536,
            "loss_tokens_upper_95": 4.626617797336498,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/params.txt",
    "uuid": "dec1155d-e531-41d9-9b7f-4831780bec8e",
    "creation_date": "2024_01_25-08_33_30"
}