{
    "name": "c4_original-d=96_l=8_h=4-16.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 3382179840,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 16.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "676435968",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=96_l=8_h=4-16.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 5.4597341855367025,
            "data_time": 0.1284843236207962,
            "batch_time": 1.2185916006565094,
            "samples_per_second": 382230.16047288105,
            "samples_per_second_per_gpu": 47778.77005911013,
            "loss_sequences_lower_95": 5.311852480570475,
            "loss_sequences_upper_95": 5.611358248392741,
            "loss_tokens_lower_95": 5.443257395426433,
            "loss_tokens_upper_95": 5.476219825744629,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.506403529656214,
            "data_time": 0.018765379060679616,
            "batch_time": 0.06366843911888322,
            "samples_per_second": 4706879.403630953,
            "samples_per_second_per_gpu": 588359.9254538692,
            "loss_sequences_lower_95": 4.50406867838151,
            "loss_sequences_upper_95": 4.50877400947364,
            "loss_tokens_lower_95": 4.49476015625,
            "loss_tokens_upper_95": 4.51787759375,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.904681214507745,
            "data_time": 0.09441205859184265,
            "batch_time": 0.13982073217630386,
            "samples_per_second": 4009131.3960458217,
            "samples_per_second_per_gpu": 501141.4245057277,
            "loss_sequences_lower_95": 4.866490926937181,
            "loss_sequences_upper_95": 4.951683673469388,
            "loss_tokens_lower_95": 4.889010947916667,
            "loss_tokens_upper_95": 4.920568489583333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.658284392700982,
            "data_time": 0.012957588622444555,
            "batch_time": 0.05664872495751632,
            "samples_per_second": 5405506.740531367,
            "samples_per_second_per_gpu": 675688.3425664209,
            "loss_sequences_lower_95": 4.629036928157217,
            "loss_sequences_upper_95": 4.687892286163016,
            "loss_tokens_lower_95": 4.6461233125,
            "loss_tokens_upper_95": 4.67051128125,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.5447602718765046,
            "data_time": 0.09253650158643723,
            "batch_time": 0.13771482557058334,
            "samples_per_second": 4100521.945502955,
            "samples_per_second_per_gpu": 512565.2431878694,
            "loss_sequences_lower_95": 4.497595165120609,
            "loss_sequences_upper_95": 4.599630407888874,
            "loss_tokens_lower_95": 4.532905520833333,
            "loss_tokens_upper_95": 4.556467802083334,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.178088220911396,
            "data_time": 0.03356420745452245,
            "batch_time": 0.07666710764169693,
            "samples_per_second": 5023222.909847171,
            "samples_per_second_per_gpu": 627902.8637308964,
            "loss_sequences_lower_95": 5.132101869874872,
            "loss_sequences_upper_95": 5.226066260529083,
            "loss_tokens_lower_95": 5.1646889895833334,
            "loss_tokens_upper_95": 5.19153003125,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.659461544095254,
            "data_time": 0.012087894976139069,
            "batch_time": 0.05446205586194992,
            "samples_per_second": 5270511.8197238725,
            "samples_per_second_per_gpu": 658813.9774654841,
            "loss_sequences_lower_95": 5.623996322943239,
            "loss_sequences_upper_95": 5.694697285554847,
            "loss_tokens_lower_95": 5.642083885416667,
            "loss_tokens_upper_95": 5.6769861666666666,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.983831690743331,
            "data_time": 0.013011461025790164,
            "batch_time": 0.05635771076930197,
            "samples_per_second": 5359105.8001696775,
            "samples_per_second_per_gpu": 669888.2250212097,
            "loss_sequences_lower_95": 4.965710150114529,
            "loss_sequences_upper_95": 5.003477789594241,
            "loss_tokens_lower_95": 4.971394958333334,
            "loss_tokens_upper_95": 4.9962190625,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.904607402599924,
            "data_time": 0.09290320426225662,
            "batch_time": 0.13825585693120956,
            "samples_per_second": 3971549.1266719117,
            "samples_per_second_per_gpu": 496443.64083398896,
            "loss_sequences_lower_95": 4.836864614099023,
            "loss_sequences_upper_95": 4.984940468392721,
            "loss_tokens_lower_95": 4.892220708333333,
            "loss_tokens_upper_95": 4.91703459375,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.968482519798128,
            "data_time": 0.09320733696222305,
            "batch_time": 0.13857460021972656,
            "samples_per_second": 4122497.6065171715,
            "samples_per_second_per_gpu": 515312.20081464644,
            "loss_sequences_lower_95": 5.888923506303267,
            "loss_sequences_upper_95": 6.064237161870059,
            "loss_tokens_lower_95": 5.955092333333334,
            "loss_tokens_upper_95": 5.98181565625,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.041699747067535,
            "data_time": 0.009755654067828738,
            "batch_time": 0.05371590780800786,
            "samples_per_second": 5360023.357568189,
            "samples_per_second_per_gpu": 670002.9196960236,
            "loss_sequences_lower_95": 5.03302992132897,
            "loss_sequences_upper_95": 5.050601970415925,
            "loss_tokens_lower_95": 5.028863458333333,
            "loss_tokens_upper_95": 5.054391916666667,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.817399153258591,
            "data_time": 0.0218635156750679,
            "batch_time": 0.06423543393611908,
            "samples_per_second": 5116679.728046402,
            "samples_per_second_per_gpu": 639584.9660058003,
            "loss_sequences_lower_95": 4.800694331463452,
            "loss_sequences_upper_95": 4.834409043432425,
            "loss_tokens_lower_95": 4.804665666666667,
            "loss_tokens_upper_95": 4.82972428125,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.270039186999957,
            "data_time": 0.09130723029375076,
            "batch_time": 0.14540693163871765,
            "samples_per_second": 4203727.3759457795,
            "samples_per_second_per_gpu": 525465.9219932224,
            "loss_sequences_lower_95": 5.197879963037209,
            "loss_sequences_upper_95": 5.354493982651655,
            "loss_tokens_lower_95": 5.2566155625,
            "loss_tokens_upper_95": 5.2837921041666664,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.7833958118850495,
            "data_time": 0.09418753534555435,
            "batch_time": 0.13791077584028244,
            "samples_per_second": 4185027.6932932176,
            "samples_per_second_per_gpu": 523128.4616616522,
            "loss_sequences_lower_95": 4.711544370942591,
            "loss_sequences_upper_95": 4.8644105122676935,
            "loss_tokens_lower_95": 4.770383302083333,
            "loss_tokens_upper_95": 4.7969290625,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.084366289052096,
            "data_time": 0.14031442999839783,
            "batch_time": 0.16349390149116516,
            "samples_per_second": 991830.7746690535,
            "samples_per_second_per_gpu": 123978.84683363169,
            "loss_sequences_lower_95": 6.01523959419944,
            "loss_sequences_upper_95": 6.152494326504794,
            "loss_tokens_lower_95": 6.053747471896084,
            "loss_tokens_upper_95": 6.1151965574784715,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.542251910829683,
            "data_time": 0.09183740615844727,
            "batch_time": 0.12689831107854843,
            "samples_per_second": 3372006.1472762786,
            "samples_per_second_per_gpu": 421500.7684095348,
            "loss_sequences_lower_95": 5.427585176962805,
            "loss_sequences_upper_95": 5.65613548345538,
            "loss_tokens_lower_95": 5.52728371875,
            "loss_tokens_upper_95": 5.557373770833333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.744653138133026,
            "data_time": 0.09225772321224213,
            "batch_time": 0.12938381731510162,
            "samples_per_second": 3682910.722689879,
            "samples_per_second_per_gpu": 460363.8403362349,
            "loss_sequences_lower_95": 6.670299858153652,
            "loss_sequences_upper_95": 6.833374152271603,
            "loss_tokens_lower_95": 6.7320785625,
            "loss_tokens_upper_95": 6.757236510416667,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.679859071481423,
            "data_time": 0.15834477543830872,
            "batch_time": 0.18846885859966278,
            "samples_per_second": 2128249.2485397854,
            "samples_per_second_per_gpu": 266031.1560674732,
            "loss_sequences_lower_95": 5.560028388851978,
            "loss_sequences_upper_95": 5.883044258492892,
            "loss_tokens_lower_95": 5.664886749767866,
            "loss_tokens_upper_95": 5.694620613973649,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.539003178564305,
            "data_time": 0.027501450885425913,
            "batch_time": 0.07168040763248097,
            "samples_per_second": 4537263.764527483,
            "samples_per_second_per_gpu": 567157.9705659354,
            "loss_sequences_lower_95": 5.51723775381445,
            "loss_sequences_upper_95": 5.560234934148447,
            "loss_tokens_lower_95": 5.517496965020118,
            "loss_tokens_upper_95": 5.560557738058146,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.204738486242873,
            "data_time": 0.027434442564845084,
            "batch_time": 0.07127091996371746,
            "samples_per_second": 4471327.83630914,
            "samples_per_second_per_gpu": 558915.9795386425,
            "loss_sequences_lower_95": 4.229963187814305,
            "loss_sequences_upper_95": 4.255977929804198,
            "loss_tokens_lower_95": 4.192819400603827,
            "loss_tokens_upper_95": 4.2137837656237656,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.705346383854754,
            "data_time": 0.04715105394522349,
            "batch_time": 0.08876793914371067,
            "samples_per_second": 4398076.067406645,
            "samples_per_second_per_gpu": 549759.5084258306,
            "loss_sequences_lower_95": 7.160773636318345,
            "loss_sequences_upper_95": 7.439760993593529,
            "loss_tokens_lower_95": 6.569431453084047,
            "loss_tokens_upper_95": 6.777419720373183,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.556182384967804,
            "data_time": 0.04722507670521736,
            "batch_time": 0.09135432913899422,
            "samples_per_second": 4438057.107385414,
            "samples_per_second_per_gpu": 554757.1384231767,
            "loss_sequences_lower_95": 6.979082942708334,
            "loss_sequences_upper_95": 7.1780388671874995,
            "loss_tokens_lower_95": 6.4552186394457545,
            "loss_tokens_upper_95": 6.589739018278301,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.677665566024317,
            "data_time": 0.06379447380701701,
            "batch_time": 0.10406408210595448,
            "samples_per_second": 4017135.7939435854,
            "samples_per_second_per_gpu": 502141.97424294817,
            "loss_sequences_lower_95": 4.7782383338170495,
            "loss_sequences_upper_95": 4.853415932702331,
            "loss_tokens_lower_95": 4.651127144637499,
            "loss_tokens_upper_95": 4.687989702311812,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.615166965397921,
            "data_time": 0.3699048012495041,
            "batch_time": 0.41188162565231323,
            "samples_per_second": 2531931.366077828,
            "samples_per_second_per_gpu": 316491.4207597285,
            "loss_sequences_lower_95": 5.546456506902521,
            "loss_sequences_upper_95": 5.846102253306996,
            "loss_tokens_lower_95": 5.559834932102187,
            "loss_tokens_upper_95": 5.655392475183916,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.866186198409723,
            "data_time": 0.3327002674341202,
            "batch_time": 0.3777769058942795,
            "samples_per_second": 2566980.3521870943,
            "samples_per_second_per_gpu": 320872.5440233868,
            "loss_sequences_lower_95": 4.883732200155452,
            "loss_sequences_upper_95": 5.114687039122289,
            "loss_tokens_lower_95": 4.819384983814319,
            "loss_tokens_upper_95": 4.931158029895904,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.143062364260356,
            "data_time": 0.17271307855844498,
            "batch_time": 0.2046569064259529,
            "samples_per_second": 2429949.2369981906,
            "samples_per_second_per_gpu": 303743.65462477383,
            "loss_sequences_lower_95": 5.134539235432943,
            "loss_sequences_upper_95": 5.246975891113281,
            "loss_tokens_lower_95": 5.034426272549058,
            "loss_tokens_upper_95": 5.244868640876081,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.956623159661713,
            "data_time": 0.02489014230668545,
            "batch_time": 0.06895898822695017,
            "samples_per_second": 4505825.9668533895,
            "samples_per_second_per_gpu": 563228.2458566737,
            "loss_sequences_lower_95": 8.031173051430788,
            "loss_sequences_upper_95": 8.10316165773707,
            "loss_tokens_lower_95": 7.903352797702948,
            "loss_tokens_upper_95": 7.977543980541354,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.388237532541808,
            "data_time": 0.04286163598299027,
            "batch_time": 0.08524787127971649,
            "samples_per_second": 4434575.352344395,
            "samples_per_second_per_gpu": 554321.9190430493,
            "loss_sequences_lower_95": 6.665335648148147,
            "loss_sequences_upper_95": 6.973014405118897,
            "loss_tokens_lower_95": 5.241094200141207,
            "loss_tokens_upper_95": 5.3899723617733875,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.148541750354571,
            "data_time": 0.07271155416965484,
            "batch_time": 0.11477379202842712,
            "samples_per_second": 4401299.952695097,
            "samples_per_second_per_gpu": 550162.4940868871,
            "loss_sequences_lower_95": 5.913585352979016,
            "loss_sequences_upper_95": 6.266822268124733,
            "loss_tokens_lower_95": 5.045970119834523,
            "loss_tokens_upper_95": 5.221550416654061,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.147888279396649,
            "data_time": 0.33575259149074554,
            "batch_time": 0.37690015137195587,
            "samples_per_second": 2363920.97365666,
            "samples_per_second_per_gpu": 295490.1217070825,
            "loss_sequences_lower_95": 6.024479974894763,
            "loss_sequences_upper_95": 6.269168954788277,
            "loss_tokens_lower_95": 6.0248288507330905,
            "loss_tokens_upper_95": 6.271090823656892,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.834408226013184,
            "data_time": 0.2932904362678528,
            "batch_time": 0.31913015246391296,
            "samples_per_second": 1663147.0584022382,
            "samples_per_second_per_gpu": 207893.38230027977,
            "loss_sequences_lower_95": 4.775031028747559,
            "loss_sequences_upper_95": 5.258454330444335,
            "loss_tokens_lower_95": 4.573327199973446,
            "loss_tokens_upper_95": 5.060906338222551,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.986111016698113,
            "data_time": 0.05514606833457947,
            "batch_time": 0.09836394339799881,
            "samples_per_second": 4450265.353630342,
            "samples_per_second_per_gpu": 556283.1692037927,
            "loss_sequences_lower_95": 4.9325991330883765,
            "loss_sequences_upper_95": 5.039544171711073,
            "loss_tokens_lower_95": 4.931837399345094,
            "loss_tokens_upper_95": 5.039009523733367,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.604302869089112,
            "data_time": 0.07643924057483673,
            "batch_time": 0.11964736580848694,
            "samples_per_second": 4341000.900317309,
            "samples_per_second_per_gpu": 542625.1125396637,
            "loss_sequences_lower_95": 5.549871221318976,
            "loss_sequences_upper_95": 5.657932650779331,
            "loss_tokens_lower_95": 5.548676951765331,
            "loss_tokens_upper_95": 5.657916074810606,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.575522835547828,
            "data_time": 0.05396673269569874,
            "batch_time": 0.09486174397170544,
            "samples_per_second": 4258689.921993252,
            "samples_per_second_per_gpu": 532336.2402491565,
            "loss_sequences_lower_95": 4.829410098867656,
            "loss_sequences_upper_95": 4.953791458744219,
            "loss_tokens_lower_95": 4.536657881376881,
            "loss_tokens_upper_95": 4.595455475262209,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.288805410385132,
            "data_time": 0.18174321204423904,
            "batch_time": 0.22634808719158173,
            "samples_per_second": 3833956.4950473127,
            "samples_per_second_per_gpu": 479244.5618809141,
            "loss_sequences_lower_95": 6.878932141113281,
            "loss_sequences_upper_95": 7.38514033203125,
            "loss_tokens_lower_95": 6.0618688536668595,
            "loss_tokens_upper_95": 6.407126563404823,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.822877570986748,
            "data_time": 0.14877715706825256,
            "batch_time": 0.16651369631290436,
            "samples_per_second": 915367.3108890827,
            "samples_per_second_per_gpu": 114420.91386113534,
            "loss_sequences_lower_95": 4.537006294727325,
            "loss_sequences_upper_95": 5.210825550556183,
            "loss_tokens_lower_95": 4.295467701177487,
            "loss_tokens_upper_95": 5.166669376417138,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.954096618740038,
            "data_time": 0.3268144428730011,
            "batch_time": 0.36265020072460175,
            "samples_per_second": 2525048.0996914506,
            "samples_per_second_per_gpu": 315631.0124614313,
            "loss_sequences_lower_95": 5.994919191557785,
            "loss_sequences_upper_95": 6.665175111266389,
            "loss_tokens_lower_95": 4.682124928987281,
            "loss_tokens_upper_95": 5.116532118667207,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.925210096576736,
            "data_time": 0.046245397792922124,
            "batch_time": 0.09090542627705468,
            "samples_per_second": 4580749.587296009,
            "samples_per_second_per_gpu": 572593.6984120011,
            "loss_sequences_lower_95": 4.900454449268239,
            "loss_sequences_upper_95": 4.9501734902318155,
            "loss_tokens_lower_95": 4.900815689933514,
            "loss_tokens_upper_95": 4.95002866972477,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.782261417328143,
            "data_time": 0.032110504451252166,
            "batch_time": 0.07513137871310824,
            "samples_per_second": 4438325.370470367,
            "samples_per_second_per_gpu": 554790.6713087959,
            "loss_sequences_lower_95": 5.903166768751213,
            "loss_sequences_upper_95": 6.13404819363417,
            "loss_tokens_lower_95": 5.63970990433034,
            "loss_tokens_upper_95": 5.867759009794819,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.232240337155241,
            "data_time": 0.1635267212986946,
            "batch_time": 0.19320335239171982,
            "samples_per_second": 2056953.2509901137,
            "samples_per_second_per_gpu": 257119.1563737642,
            "loss_sequences_lower_95": 4.158569771902902,
            "loss_sequences_upper_95": 4.544235285385188,
            "loss_tokens_lower_95": 4.031852706640986,
            "loss_tokens_upper_95": 4.359943298848,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.512829312773888,
            "data_time": 0.08206996321678162,
            "batch_time": 0.12735503017902375,
            "samples_per_second": 4270047.625048082,
            "samples_per_second_per_gpu": 533755.9531310103,
            "loss_sequences_lower_95": 4.592692709232562,
            "loss_sequences_upper_95": 4.73420557565546,
            "loss_tokens_lower_95": 4.429098279385191,
            "loss_tokens_upper_95": 4.581389433885947,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.462522777115426,
            "data_time": 0.30683137476444244,
            "batch_time": 0.34127750992774963,
            "samples_per_second": 2490307.7047695415,
            "samples_per_second_per_gpu": 311288.4630961927,
            "loss_sequences_lower_95": 4.296801832245618,
            "loss_sequences_upper_95": 4.813693488516458,
            "loss_tokens_lower_95": 4.258435426995183,
            "loss_tokens_upper_95": 4.670805042758462,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.243004951793734,
            "data_time": 0.02863532398188002,
            "batch_time": 0.0725469204325661,
            "samples_per_second": 4425069.394829152,
            "samples_per_second_per_gpu": 553133.674353644,
            "loss_sequences_lower_95": 5.229711430567364,
            "loss_sequences_upper_95": 5.256307413826515,
            "loss_tokens_lower_95": 5.229719342305962,
            "loss_tokens_upper_95": 5.256339256132476,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.6233301891863925,
            "data_time": 0.33344440162181854,
            "batch_time": 0.3601889908313751,
            "samples_per_second": 1600257.5050315997,
            "samples_per_second_per_gpu": 200032.18812894996,
            "loss_sequences_lower_95": 2.509861288718807,
            "loss_sequences_upper_95": 2.8489431584923013,
            "loss_tokens_lower_95": 2.408623455441962,
            "loss_tokens_upper_95": 2.757957822395567,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.3360627602231325,
            "data_time": 0.024767024219036104,
            "batch_time": 0.06914151002963384,
            "samples_per_second": 4436785.627439261,
            "samples_per_second_per_gpu": 554598.2034299077,
            "loss_sequences_lower_95": 6.218980474891903,
            "loss_sequences_upper_95": 6.26590078165946,
            "loss_tokens_lower_95": 5.236611943907157,
            "loss_tokens_upper_95": 5.285839156189555,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.57263455581665,
            "data_time": 0.0979914590716362,
            "batch_time": 0.14273301512002945,
            "samples_per_second": 4383248.67026205,
            "samples_per_second_per_gpu": 547906.0837827562,
            "loss_sequences_lower_95": 8.43254443359375,
            "loss_sequences_upper_95": 8.964633935546875,
            "loss_tokens_lower_95": 8.283018763896946,
            "loss_tokens_upper_95": 8.79825721734822,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.366010456499846,
            "data_time": 0.32507045567035675,
            "batch_time": 0.3673303872346878,
            "samples_per_second": 2664445.3861471843,
            "samples_per_second_per_gpu": 333055.67326839804,
            "loss_sequences_lower_95": 5.190488493546195,
            "loss_sequences_upper_95": 5.540626777980639,
            "loss_tokens_lower_95": 5.190405684761379,
            "loss_tokens_upper_95": 5.542016455608866,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.339569307818557,
            "data_time": 0.06415109833081563,
            "batch_time": 0.10412078847487767,
            "samples_per_second": 4012536.4091057666,
            "samples_per_second_per_gpu": 501567.0511382208,
            "loss_sequences_lower_95": 9.227781205610796,
            "loss_sequences_upper_95": 9.45179522890033,
            "loss_tokens_lower_95": 9.226002197265625,
            "loss_tokens_upper_95": 9.451472371419271,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 1.5666267006397248,
            "data_time": 0.06577366590499878,
            "batch_time": 0.10990522553523381,
            "samples_per_second": 4468623.933538029,
            "samples_per_second_per_gpu": 558577.9916922536,
            "loss_sequences_lower_95": 1.6867464111328125,
            "loss_sequences_upper_95": 1.7736854573567706,
            "loss_tokens_lower_95": 1.5234322713460384,
            "loss_tokens_upper_95": 1.593647881027411,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.116884778794788,
            "data_time": 0.3316149264574051,
            "batch_time": 0.3718954622745514,
            "samples_per_second": 2386410.77194891,
            "samples_per_second_per_gpu": 298301.34649361373,
            "loss_sequences_lower_95": 5.792438267299107,
            "loss_sequences_upper_95": 6.443832717168899,
            "loss_tokens_lower_95": 5.797090497698103,
            "loss_tokens_upper_95": 6.436218392508371,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.7266193330287933,
            "data_time": 0.1438566893339157,
            "batch_time": 0.1626797318458557,
            "samples_per_second": 811460.3596867747,
            "samples_per_second_per_gpu": 101432.54496084683,
            "loss_sequences_lower_95": 3.4164841771125793,
            "loss_sequences_upper_95": 4.601454269886017,
            "loss_tokens_lower_95": 3.2009984281874195,
            "loss_tokens_upper_95": 3.780004184368959,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.3119618906974795,
            "data_time": 0.09319644421339035,
            "batch_time": 0.13772781193256378,
            "samples_per_second": 4379129.577597407,
            "samples_per_second_per_gpu": 547391.1971996758,
            "loss_sequences_lower_95": 7.350062145996094,
            "loss_sequences_upper_95": 7.68740673828125,
            "loss_tokens_lower_95": 7.1557933826737,
            "loss_tokens_upper_95": 7.450737564938928,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.213774075508118,
            "data_time": 0.10143210738897324,
            "batch_time": 0.14659149944782257,
            "samples_per_second": 4193162.6534263715,
            "samples_per_second_per_gpu": 524145.33167829644,
            "loss_sequences_lower_95": 7.458967419433593,
            "loss_sequences_upper_95": 7.687064050292968,
            "loss_tokens_lower_95": 7.110746038491331,
            "loss_tokens_upper_95": 7.2882863357536305,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.484455736859149,
            "data_time": 0.03977858026822408,
            "batch_time": 0.08374372124671936,
            "samples_per_second": 4544327.39963483,
            "samples_per_second_per_gpu": 568040.9249543537,
            "loss_sequences_lower_95": 5.453535496721003,
            "loss_sequences_upper_95": 5.516719401478168,
            "loss_tokens_lower_95": 5.452904659607568,
            "loss_tokens_upper_95": 5.516285153630992,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.595017537177067,
            "data_time": 0.11141105492909749,
            "batch_time": 0.15208238859971365,
            "samples_per_second": 3928353.7909987806,
            "samples_per_second_per_gpu": 491044.2238748476,
            "loss_sequences_lower_95": 5.474329382080453,
            "loss_sequences_upper_95": 5.7123611848658316,
            "loss_tokens_lower_95": 5.478003426594302,
            "loss_tokens_upper_95": 5.7099875454529085,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.031406893253326,
            "data_time": 0.09562284126877785,
            "batch_time": 0.13972989469766617,
            "samples_per_second": 4262152.473819952,
            "samples_per_second_per_gpu": 532769.059227494,
            "loss_sequences_lower_95": 7.960159326171874,
            "loss_sequences_upper_95": 8.10398740234375,
            "loss_tokens_lower_95": 7.9601373046875,
            "loss_tokens_upper_95": 8.101404968261718,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.519047367764512,
            "data_time": 0.02858462326583408,
            "batch_time": 0.07249008544853755,
            "samples_per_second": 4497589.310526155,
            "samples_per_second_per_gpu": 562198.6638157694,
            "loss_sequences_lower_95": 5.523988661896879,
            "loss_sequences_upper_95": 5.626170535344725,
            "loss_tokens_lower_95": 4.409867734371668,
            "loss_tokens_upper_95": 4.478853395736468,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.742761598594153,
            "data_time": 0.20411044359207153,
            "batch_time": 0.23655911002840316,
            "samples_per_second": 1867985.9324851285,
            "samples_per_second_per_gpu": 233498.24156064107,
            "loss_sequences_lower_95": 5.563032531738282,
            "loss_sequences_upper_95": 5.91957173133964,
            "loss_tokens_lower_95": 5.560922104565065,
            "loss_tokens_upper_95": 5.9180929838721426,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.693788314333149,
            "data_time": 0.18704332411289215,
            "batch_time": 0.2327028214931488,
            "samples_per_second": 3874097.2108529704,
            "samples_per_second_per_gpu": 484262.1513566213,
            "loss_sequences_lower_95": 5.563155912511489,
            "loss_sequences_upper_95": 5.822458627738205,
            "loss_tokens_lower_95": 5.566395491057751,
            "loss_tokens_upper_95": 5.823357208850338,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.403500714039842,
            "data_time": 0.028439773712307215,
            "batch_time": 0.07198215555399656,
            "samples_per_second": 4473324.247488907,
            "samples_per_second_per_gpu": 559165.5309361133,
            "loss_sequences_lower_95": 6.090847555449862,
            "loss_sequences_upper_95": 6.194311006592133,
            "loss_tokens_lower_95": 5.2970537936035855,
            "loss_tokens_upper_95": 5.38272253753162,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.757624381433719,
            "data_time": 0.31289108097553253,
            "batch_time": 0.35073111951351166,
            "samples_per_second": 2074025.9480400686,
            "samples_per_second_per_gpu": 259253.24350500858,
            "loss_sequences_lower_95": 5.652440049913195,
            "loss_sequences_upper_95": 5.869275935743221,
            "loss_tokens_lower_95": 5.648330462420428,
            "loss_tokens_upper_95": 5.867293536473834,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.0613535774592595,
            "data_time": 0.04587457844844231,
            "batch_time": 0.08982395667296189,
            "samples_per_second": 4478200.2293511275,
            "samples_per_second_per_gpu": 559775.0286688909,
            "loss_sequences_lower_95": 7.0319612480886855,
            "loss_sequences_upper_95": 7.091183133839832,
            "loss_tokens_lower_95": 7.031091838685015,
            "loss_tokens_upper_95": 7.090843905294342,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.297486932532301,
            "data_time": 0.3413132280111313,
            "batch_time": 0.3812626600265503,
            "samples_per_second": 2313820.170283114,
            "samples_per_second_per_gpu": 289227.52128538926,
            "loss_sequences_lower_95": 6.061383841801615,
            "loss_sequences_upper_95": 6.528699397114875,
            "loss_tokens_lower_95": 6.063849351012591,
            "loss_tokens_upper_95": 6.535344510865443,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.9113500118255615,
            "data_time": 0.30747365951538086,
            "batch_time": 0.3271985650062561,
            "samples_per_second": 1175988.9445491266,
            "samples_per_second_per_gpu": 146998.61806864082,
            "loss_sequences_lower_95": 5.6250033315022785,
            "loss_sequences_upper_95": 6.582567138671875,
            "loss_tokens_lower_95": 5.034734853108724,
            "loss_tokens_upper_95": 6.5572350607977965,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.784592580795288,
            "data_time": 0.2880585193634033,
            "batch_time": 0.3082168847322464,
            "samples_per_second": 1362294.8424693556,
            "samples_per_second_per_gpu": 170286.85530866944,
            "loss_sequences_lower_95": 4.62711924235026,
            "loss_sequences_upper_95": 5.713254470825195,
            "loss_tokens_lower_95": 3.8758836296167267,
            "loss_tokens_upper_95": 5.407592319102769,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.92888595345094,
            "data_time": 0.04849738308361599,
            "batch_time": 0.09107374187026705,
            "samples_per_second": 4174903.4731436223,
            "samples_per_second_per_gpu": 521862.9341429528,
            "loss_sequences_lower_95": 8.887012150220913,
            "loss_sequences_upper_95": 8.970791806655008,
            "loss_tokens_lower_95": 8.887992639106223,
            "loss_tokens_upper_95": 8.970830912302098,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.4012875738352126,
            "data_time": 0.022654460377047676,
            "batch_time": 0.06672884380713301,
            "samples_per_second": 4506497.165112087,
            "samples_per_second_per_gpu": 563312.1456390108,
            "loss_sequences_lower_95": 3.131099999519165,
            "loss_sequences_upper_95": 3.170645765393131,
            "loss_tokens_lower_95": 2.338740838402235,
            "loss_tokens_upper_95": 2.364009387046009,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.084663071970302,
            "data_time": 0.33353592455387115,
            "batch_time": 0.36294570565223694,
            "samples_per_second": 1864381.1696699623,
            "samples_per_second_per_gpu": 233047.64620874528,
            "loss_sequences_lower_95": 6.240994418887642,
            "loss_sequences_upper_95": 6.682554794671967,
            "loss_tokens_lower_95": 5.890159699513326,
            "loss_tokens_upper_95": 6.14199043635624,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.870453370584023,
            "data_time": 0.2194891721010208,
            "batch_time": 0.23625445365905762,
            "samples_per_second": 1100537.6372081463,
            "samples_per_second_per_gpu": 137567.20465101828,
            "loss_sequences_lower_95": 8.380796360325169,
            "loss_sequences_upper_95": 9.625385903023385,
            "loss_tokens_lower_95": 8.039326571240837,
            "loss_tokens_upper_95": 9.322884642047647,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.995609751561793,
            "data_time": 0.307154580950737,
            "batch_time": 0.3415740579366684,
            "samples_per_second": 2321915.627808358,
            "samples_per_second_per_gpu": 290239.45347604476,
            "loss_sequences_lower_95": 6.1305397033691404,
            "loss_sequences_upper_95": 6.501623088557546,
            "loss_tokens_lower_95": 5.813251320700168,
            "loss_tokens_upper_95": 6.031770116037636,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.077924891215999,
            "data_time": 0.2881374806165695,
            "batch_time": 0.3219750225543976,
            "samples_per_second": 2375273.568987512,
            "samples_per_second_per_gpu": 296909.196123439,
            "loss_sequences_lower_95": 6.220096662567883,
            "loss_sequences_upper_95": 6.574320797803925,
            "loss_tokens_lower_95": 5.923880005240706,
            "loss_tokens_upper_95": 6.102751324835816,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.164902751038714,
            "data_time": 0.34086233377456665,
            "batch_time": 0.37537989020347595,
            "samples_per_second": 1531299.9577067888,
            "samples_per_second_per_gpu": 191412.4947133486,
            "loss_sequences_lower_95": 6.2781879890255805,
            "loss_sequences_upper_95": 6.70626388177639,
            "loss_tokens_lower_95": 5.957892326543631,
            "loss_tokens_upper_95": 6.2424270919440055,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.095093258997289,
            "data_time": 0.30983754992485046,
            "batch_time": 0.3446328192949295,
            "samples_per_second": 2080660.9008699516,
            "samples_per_second_per_gpu": 260082.61260874395,
            "loss_sequences_lower_95": 6.200989811594893,
            "loss_sequences_upper_95": 6.531304633908156,
            "loss_tokens_lower_95": 5.949029522001558,
            "loss_tokens_upper_95": 6.117665494639555,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.410459832375094,
            "data_time": 0.32377779483795166,
            "batch_time": 0.35779932141304016,
            "samples_per_second": 2251791.941111067,
            "samples_per_second_per_gpu": 281473.99263888336,
            "loss_sequences_lower_95": 6.497296104668091,
            "loss_sequences_upper_95": 6.7648531895987,
            "loss_tokens_lower_95": 6.301522523490055,
            "loss_tokens_upper_95": 6.436312141251097,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.297765467225052,
            "data_time": 0.3410339802503586,
            "batch_time": 0.3766828924417496,
            "samples_per_second": 2001442.7234186134,
            "samples_per_second_per_gpu": 250180.34042732668,
            "loss_sequences_lower_95": 6.539862004722036,
            "loss_sequences_upper_95": 6.849985932140815,
            "loss_tokens_lower_95": 6.159161133401145,
            "loss_tokens_upper_95": 6.310604456280139,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-16.0/params.txt",
    "uuid": "04d70a3a-0dca-4429-a2d0-f785868132e8",
    "creation_date": "2023_12_14-04_59_14"
}