{
    "name": "rpj-d=96_l=8_h=4-0.5",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 105693120,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.5
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "21138624",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=96_l=8_h=4-0.5",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 5.929919397830963,
            "data_time": 0.1405930072069168,
            "batch_time": 1.2389139235019684,
            "samples_per_second": 360351.00951380585,
            "samples_per_second_per_gpu": 45043.87618922573,
            "loss_sequences_lower_95": 5.842115491231282,
            "loss_sequences_upper_95": 6.019068094889323,
            "loss_tokens_lower_95": 5.916148999532064,
            "loss_tokens_upper_95": 5.943634173075358,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.723234933817179,
            "data_time": 0.018367723406515393,
            "batch_time": 0.06349784630002507,
            "samples_per_second": 4673993.952039395,
            "samples_per_second_per_gpu": 584249.2440049244,
            "loss_sequences_lower_95": 5.720854326227367,
            "loss_sequences_upper_95": 5.725621091865786,
            "loss_tokens_lower_95": 5.711515395833334,
            "loss_tokens_upper_95": 5.734736791666667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.112812482094278,
            "data_time": 0.09665849059820175,
            "batch_time": 0.14140090346336365,
            "samples_per_second": 4091223.0106671844,
            "samples_per_second_per_gpu": 511402.87633339805,
            "loss_sequences_lower_95": 6.095127575932717,
            "loss_sequences_upper_95": 6.130867633430325,
            "loss_tokens_lower_95": 6.100724041666667,
            "loss_tokens_upper_95": 6.125062333333333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.692095179705276,
            "data_time": 0.012941678103647734,
            "batch_time": 0.056763757216302974,
            "samples_per_second": 5387393.323521402,
            "samples_per_second_per_gpu": 673424.1654401752,
            "loss_sequences_lower_95": 5.681340598824098,
            "loss_sequences_upper_95": 5.702984193782217,
            "loss_tokens_lower_95": 5.680789604166667,
            "loss_tokens_upper_95": 5.703674229166666,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.70279351854033,
            "data_time": 0.09467767924070358,
            "batch_time": 0.13917271047830582,
            "samples_per_second": 4150231.340625634,
            "samples_per_second_per_gpu": 518778.91757820424,
            "loss_sequences_lower_95": 5.671244498134388,
            "loss_sequences_upper_95": 5.735202231475147,
            "loss_tokens_lower_95": 5.690965791666667,
            "loss_tokens_upper_95": 5.7144955625,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.806968866279545,
            "data_time": 0.03270996113618215,
            "batch_time": 0.07555678238471349,
            "samples_per_second": 5034632.807807327,
            "samples_per_second_per_gpu": 629329.1009759159,
            "loss_sequences_lower_95": 5.781787872833107,
            "loss_sequences_upper_95": 5.831661088996643,
            "loss_tokens_lower_95": 5.795051604166667,
            "loss_tokens_upper_95": 5.8190994895833335,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.909981049518196,
            "data_time": 0.012586326152086259,
            "batch_time": 0.05497004389762879,
            "samples_per_second": 5248280.183355363,
            "samples_per_second_per_gpu": 656035.0229194204,
            "loss_sequences_lower_95": 5.884797472895409,
            "loss_sequences_upper_95": 5.935476891342475,
            "loss_tokens_lower_95": 5.897146572916667,
            "loss_tokens_upper_95": 5.922994312499999,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.50220742400404,
            "data_time": 0.01550683457600443,
            "batch_time": 0.05891103728821403,
            "samples_per_second": 5217643.858442874,
            "samples_per_second_per_gpu": 652205.4823053592,
            "loss_sequences_lower_95": 5.493044011780104,
            "loss_sequences_upper_95": 5.511512915166885,
            "loss_tokens_lower_95": 5.49081871875,
            "loss_tokens_upper_95": 5.51390340625,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.85970981237365,
            "data_time": 0.0911426767706871,
            "batch_time": 0.13499848544597626,
            "samples_per_second": 4184209.895148925,
            "samples_per_second_per_gpu": 523026.2368936156,
            "loss_sequences_lower_95": 5.818080114349117,
            "loss_sequences_upper_95": 5.9009021790047,
            "loss_tokens_lower_95": 5.847793229166667,
            "loss_tokens_upper_95": 5.8715825937499995,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.092465847377249,
            "data_time": 0.09241044521331787,
            "batch_time": 0.13773828744888306,
            "samples_per_second": 4194913.078639795,
            "samples_per_second_per_gpu": 524364.1348299744,
            "loss_sequences_lower_95": 6.071186593398746,
            "loss_sequences_upper_95": 6.11639821651896,
            "loss_tokens_lower_95": 6.0802289895833335,
            "loss_tokens_upper_95": 6.104123447916666,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.032625463150357,
            "data_time": 0.009985546613561696,
            "batch_time": 0.05321179124815711,
            "samples_per_second": 5424972.355011685,
            "samples_per_second_per_gpu": 678121.5443764606,
            "loss_sequences_lower_95": 6.025647285485474,
            "loss_sequences_upper_95": 6.039549228814067,
            "loss_tokens_lower_95": 6.020881916666667,
            "loss_tokens_upper_95": 6.0447350208333335,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.916538996380699,
            "data_time": 0.023822066187858582,
            "batch_time": 0.07301723957061768,
            "samples_per_second": 5122190.382247805,
            "samples_per_second_per_gpu": 640273.7977809756,
            "loss_sequences_lower_95": 5.907025159194737,
            "loss_sequences_upper_95": 5.9264447207055655,
            "loss_tokens_lower_95": 5.9045786145833326,
            "loss_tokens_upper_95": 5.928289583333333,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.63714630270101,
            "data_time": 0.09704400599002838,
            "batch_time": 0.1450122520327568,
            "samples_per_second": 4119572.8774176603,
            "samples_per_second_per_gpu": 514946.60967720754,
            "loss_sequences_lower_95": 5.5925416153294565,
            "loss_sequences_upper_95": 5.683547484608741,
            "loss_tokens_lower_95": 5.625627489583334,
            "loss_tokens_upper_95": 5.648464822916667,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.21742560081715,
            "data_time": 0.0948231965303421,
            "batch_time": 0.13856983184814453,
            "samples_per_second": 4242946.339861441,
            "samples_per_second_per_gpu": 530368.2924826802,
            "loss_sequences_lower_95": 6.165405062114148,
            "loss_sequences_upper_95": 6.271345722019794,
            "loss_tokens_lower_95": 6.2057946875,
            "loss_tokens_upper_95": 6.229678375,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.312733119184321,
            "data_time": 0.1438269317150116,
            "batch_time": 0.18631121516227722,
            "samples_per_second": 764023.6856607499,
            "samples_per_second_per_gpu": 95502.96070759374,
            "loss_sequences_lower_95": 7.273652666265314,
            "loss_sequences_upper_95": 7.352445897189054,
            "loss_tokens_lower_95": 7.2896961558948865,
            "loss_tokens_upper_95": 7.335911646756259,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.6960347275692245,
            "data_time": 0.09159748256206512,
            "batch_time": 0.1269758716225624,
            "samples_per_second": 3289356.717026998,
            "samples_per_second_per_gpu": 411169.5896283747,
            "loss_sequences_lower_95": 5.606003412630399,
            "loss_sequences_upper_95": 5.7906803942977865,
            "loss_tokens_lower_95": 5.68398996875,
            "loss_tokens_upper_95": 5.708011666666667,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.984834644599451,
            "data_time": 0.09229157119989395,
            "batch_time": 0.12862840294837952,
            "samples_per_second": 3783856.0064458065,
            "samples_per_second_per_gpu": 472982.0008057258,
            "loss_sequences_lower_95": 6.937029803837195,
            "loss_sequences_upper_95": 7.032064295947395,
            "loss_tokens_lower_95": 6.9741528229166665,
            "loss_tokens_upper_95": 6.995380999999999,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.9164092462570945,
            "data_time": 0.15728037059307098,
            "batch_time": 0.18743760883808136,
            "samples_per_second": 2142367.228252981,
            "samples_per_second_per_gpu": 267795.9035316226,
            "loss_sequences_lower_95": 6.871740072281635,
            "loss_sequences_upper_95": 6.965351354880411,
            "loss_tokens_lower_95": 6.903556410992731,
            "loss_tokens_upper_95": 6.929033085557281,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.99820060119417,
            "data_time": 0.027243557301434605,
            "batch_time": 0.07176649597558109,
            "samples_per_second": 4526327.402324837,
            "samples_per_second_per_gpu": 565790.9252906046,
            "loss_sequences_lower_95": 4.982434351521329,
            "loss_sequences_upper_95": 5.013322782767768,
            "loss_tokens_lower_95": 4.982557684090585,
            "loss_tokens_upper_95": 5.013562409868609,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.498192874304067,
            "data_time": 0.027702018991112708,
            "batch_time": 0.07163595184683799,
            "samples_per_second": 4473448.285793247,
            "samples_per_second_per_gpu": 559181.0357241558,
            "loss_sequences_lower_95": 5.474840552492033,
            "loss_sequences_upper_95": 5.501983660812089,
            "loss_tokens_lower_95": 5.48562280820765,
            "loss_tokens_upper_95": 5.508513653513909,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.808538365487958,
            "data_time": 0.054056409332487315,
            "batch_time": 0.09580404063065846,
            "samples_per_second": 4290223.472178582,
            "samples_per_second_per_gpu": 536277.9340223228,
            "loss_sequences_lower_95": 8.292066640588097,
            "loss_sequences_upper_95": 8.521548587328766,
            "loss_tokens_lower_95": 7.683060817250417,
            "loss_tokens_upper_95": 7.858133832573863,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.376882871468862,
            "data_time": 0.04040980835755666,
            "batch_time": 0.08413356790939967,
            "samples_per_second": 4611734.6461570235,
            "samples_per_second_per_gpu": 576466.8307696279,
            "loss_sequences_lower_95": 7.750953662109374,
            "loss_sequences_upper_95": 7.900090462239583,
            "loss_tokens_lower_95": 7.284460335593553,
            "loss_tokens_upper_95": 7.39713444624607,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.526397939700805,
            "data_time": 0.07232335458199184,
            "batch_time": 0.11265759666760762,
            "samples_per_second": 3965853.8545997743,
            "samples_per_second_per_gpu": 495731.7318249718,
            "loss_sequences_lower_95": 6.581530308235406,
            "loss_sequences_upper_95": 6.650524023141347,
            "loss_tokens_lower_95": 6.505260134344852,
            "loss_tokens_upper_95": 6.541420958462395,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.878196402029558,
            "data_time": 0.40178264677524567,
            "batch_time": 0.44400830566883087,
            "samples_per_second": 2429097.117595683,
            "samples_per_second_per_gpu": 303637.13969946036,
            "loss_sequences_lower_95": 4.861913146972656,
            "loss_sequences_upper_95": 4.9782535067471585,
            "loss_tokens_lower_95": 4.852227361621989,
            "loss_tokens_upper_95": 4.903600497266452,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.882943898804334,
            "data_time": 0.4143722653388977,
            "batch_time": 0.4615001827478409,
            "samples_per_second": 2229676.6647349168,
            "samples_per_second_per_gpu": 278709.5830918646,
            "loss_sequences_lower_95": 5.852047268614477,
            "loss_sequences_upper_95": 6.047927694515306,
            "loss_tokens_lower_95": 5.836200570201422,
            "loss_tokens_upper_95": 5.935360157836832,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.4300959094365435,
            "data_time": 0.1725155934691429,
            "batch_time": 0.2044602930545807,
            "samples_per_second": 2561144.6689246683,
            "samples_per_second_per_gpu": 320143.08361558354,
            "loss_sequences_lower_95": 5.3719632466634115,
            "loss_sequences_upper_95": 5.493990509033203,
            "loss_tokens_lower_95": 5.327387704365923,
            "loss_tokens_upper_95": 5.538674757006142,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.848601393434254,
            "data_time": 0.02403442785143852,
            "batch_time": 0.0682830061763525,
            "samples_per_second": 4542356.707707341,
            "samples_per_second_per_gpu": 567794.5884634176,
            "loss_sequences_lower_95": 9.916637191821268,
            "loss_sequences_upper_95": 9.986381295058068,
            "loss_tokens_lower_95": 9.798373145515791,
            "loss_tokens_upper_95": 9.8695086288762,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.895122741007243,
            "data_time": 0.049443022906780244,
            "batch_time": 0.09217684417963028,
            "samples_per_second": 4371726.856860433,
            "samples_per_second_per_gpu": 546465.8571075541,
            "loss_sequences_lower_95": 8.07502124927662,
            "loss_sequences_upper_95": 8.327478335602114,
            "loss_tokens_lower_95": 6.7452698585447886,
            "loss_tokens_upper_95": 6.894045496636581,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.473374539267895,
            "data_time": 0.08457975387573242,
            "batch_time": 0.12700130343437194,
            "samples_per_second": 4327414.8224708615,
            "samples_per_second_per_gpu": 540926.8528088577,
            "loss_sequences_lower_95": 7.212474398564153,
            "loss_sequences_upper_95": 7.509015371693686,
            "loss_tokens_lower_95": 6.363401626532752,
            "loss_tokens_upper_95": 6.536808606985116,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.041959307509471,
            "data_time": 0.37140925228595734,
            "batch_time": 0.41447581350803375,
            "samples_per_second": 2229415.0435003797,
            "samples_per_second_per_gpu": 278676.88043754746,
            "loss_sequences_lower_95": 6.01116889013003,
            "loss_sequences_upper_95": 6.073193080675656,
            "loss_tokens_lower_95": 6.011785791127105,
            "loss_tokens_upper_95": 6.073319164258704,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.669845514297485,
            "data_time": 0.31188875436782837,
            "batch_time": 0.33764098584651947,
            "samples_per_second": 1782231.2578190404,
            "samples_per_second_per_gpu": 222778.90722738006,
            "loss_sequences_lower_95": 5.591023132324219,
            "loss_sequences_upper_95": 6.065431335449219,
            "loss_tokens_lower_95": 5.3927479643301375,
            "loss_tokens_upper_95": 5.923739050795225,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.793584173020955,
            "data_time": 0.05289324373006821,
            "batch_time": 0.0966768879443407,
            "samples_per_second": 4465469.534109257,
            "samples_per_second_per_gpu": 558183.6917636571,
            "loss_sequences_lower_95": 4.751520806178831,
            "loss_sequences_upper_95": 4.836807122908937,
            "loss_tokens_lower_95": 4.750229247297211,
            "loss_tokens_upper_95": 4.836193897633859,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.127897928036401,
            "data_time": 0.0780326783657074,
            "batch_time": 0.12104220688343048,
            "samples_per_second": 4506343.838192408,
            "samples_per_second_per_gpu": 563292.979774051,
            "loss_sequences_lower_95": 5.081178497389435,
            "loss_sequences_upper_95": 5.173632532568079,
            "loss_tokens_lower_95": 5.080043433436988,
            "loss_tokens_upper_95": 5.174726996394477,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.148477541090226,
            "data_time": 0.05181045085191727,
            "batch_time": 0.09308167919516563,
            "samples_per_second": 4226403.994720062,
            "samples_per_second_per_gpu": 528300.4993400078,
            "loss_sequences_lower_95": 6.308703241358644,
            "loss_sequences_upper_95": 6.419576319050088,
            "loss_tokens_lower_95": 6.120164713916493,
            "loss_tokens_upper_95": 6.181181172009937,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.426458155632019,
            "data_time": 0.18773489445447922,
            "batch_time": 0.2329661101102829,
            "samples_per_second": 3729448.1493679727,
            "samples_per_second_per_gpu": 466181.0186709966,
            "loss_sequences_lower_95": 8.148602404785157,
            "loss_sequences_upper_95": 8.621316796875,
            "loss_tokens_lower_95": 7.183354101174719,
            "loss_tokens_upper_95": 7.519146070095343,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.821379840373993,
            "data_time": 0.16671982407569885,
            "batch_time": 0.18339861929416656,
            "samples_per_second": 738481.4720215825,
            "samples_per_second_per_gpu": 92310.18400269782,
            "loss_sequences_lower_95": 5.516910040378571,
            "loss_sequences_upper_95": 6.2772247910499575,
            "loss_tokens_lower_95": 5.230628704202586,
            "loss_tokens_upper_95": 6.204799538097162,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.405445233158682,
            "data_time": 0.3215042054653168,
            "batch_time": 0.35684557259082794,
            "samples_per_second": 2210929.1397348884,
            "samples_per_second_per_gpu": 276366.14246686106,
            "loss_sequences_lower_95": 7.4145635495240665,
            "loss_sequences_upper_95": 8.017275088956985,
            "loss_tokens_lower_95": 6.105340803247428,
            "loss_tokens_upper_95": 6.549127644535654,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.184339587060371,
            "data_time": 0.04851860801378886,
            "batch_time": 0.09292070733176337,
            "samples_per_second": 4595021.821207588,
            "samples_per_second_per_gpu": 574377.7276509485,
            "loss_sequences_lower_95": 5.168150580903233,
            "loss_sequences_upper_95": 5.200543039571047,
            "loss_tokens_lower_95": 5.1677828234218,
            "loss_tokens_upper_95": 5.2000263232443205,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.786379878182746,
            "data_time": 0.03811807930469513,
            "batch_time": 0.08122502125444866,
            "samples_per_second": 4287454.312416927,
            "samples_per_second_per_gpu": 535931.7890521159,
            "loss_sequences_lower_95": 8.830480139754755,
            "loss_sequences_upper_95": 8.973601637547302,
            "loss_tokens_lower_95": 8.703861658511439,
            "loss_tokens_upper_95": 8.843857108577515,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.916741754545834,
            "data_time": 0.1812499836087227,
            "batch_time": 0.2113271877169609,
            "samples_per_second": 1930325.7097076497,
            "samples_per_second_per_gpu": 241290.7137134562,
            "loss_sequences_lower_95": 4.800745634980253,
            "loss_sequences_upper_95": 5.159895240867531,
            "loss_tokens_lower_95": 4.692195878924317,
            "loss_tokens_upper_95": 5.040298989831464,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.38681633376473,
            "data_time": 0.08857414722442628,
            "batch_time": 0.1333507239818573,
            "samples_per_second": 4195296.201606587,
            "samples_per_second_per_gpu": 524412.0252008233,
            "loss_sequences_lower_95": 5.418526853156447,
            "loss_sequences_upper_95": 5.552074347468182,
            "loss_tokens_lower_95": 5.30057104857434,
            "loss_tokens_upper_95": 5.46339817813393,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.263838613905558,
            "data_time": 0.36573487520217896,
            "batch_time": 0.40182968974113464,
            "samples_per_second": 1747195.288054694,
            "samples_per_second_per_gpu": 218399.41100683674,
            "loss_sequences_lower_95": 7.0613074884182065,
            "loss_sequences_upper_95": 7.60457224031774,
            "loss_tokens_lower_95": 7.100236055707075,
            "loss_tokens_upper_95": 7.462842354455547,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.932545129812629,
            "data_time": 0.028732054285868582,
            "batch_time": 0.07268908031308165,
            "samples_per_second": 4437969.498015216,
            "samples_per_second_per_gpu": 554746.187251902,
            "loss_sequences_lower_95": 4.924458309630676,
            "loss_sequences_upper_95": 4.940603814122199,
            "loss_tokens_lower_95": 4.92461108432624,
            "loss_tokens_upper_95": 4.940557847897705,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.651712394455104,
            "data_time": 0.32387131452560425,
            "batch_time": 0.35118623077869415,
            "samples_per_second": 1331925.31594916,
            "samples_per_second_per_gpu": 166490.664493645,
            "loss_sequences_lower_95": 6.5296442568880835,
            "loss_sequences_upper_95": 6.883708250175402,
            "loss_tokens_lower_95": 6.3998098476882355,
            "loss_tokens_upper_95": 6.814696665893679,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.548682278688349,
            "data_time": 0.022444690863291424,
            "batch_time": 0.06685905347267787,
            "samples_per_second": 4514056.674905993,
            "samples_per_second_per_gpu": 564257.0843632491,
            "loss_sequences_lower_95": 6.915520075832023,
            "loss_sequences_upper_95": 6.961558794385482,
            "loss_tokens_lower_95": 6.487486992263056,
            "loss_tokens_upper_95": 6.5291726063829785,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.0094540762901305,
            "data_time": 0.10243593901395798,
            "batch_time": 0.1466020978987217,
            "samples_per_second": 4221802.937588233,
            "samples_per_second_per_gpu": 527725.3671985291,
            "loss_sequences_lower_95": 7.096418347167969,
            "loss_sequences_upper_95": 7.283684606933594,
            "loss_tokens_lower_95": 6.907023905011359,
            "loss_tokens_upper_95": 7.069572978598705,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.995291446602863,
            "data_time": 0.33071471750736237,
            "batch_time": 0.37462690472602844,
            "samples_per_second": 2244448.2487449707,
            "samples_per_second_per_gpu": 280556.03109312133,
            "loss_sequences_lower_95": 4.884047559655231,
            "loss_sequences_upper_95": 5.10575415569803,
            "loss_tokens_lower_95": 4.8859987209154205,
            "loss_tokens_upper_95": 5.103324239979619,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.814401868617896,
            "data_time": 0.07505647093057632,
            "batch_time": 0.11530528962612152,
            "samples_per_second": 4002278.0377869415,
            "samples_per_second_per_gpu": 500284.7547233677,
            "loss_sequences_lower_95": 8.739837073123816,
            "loss_sequences_upper_95": 8.887105435458096,
            "loss_tokens_lower_95": 8.741828021425189,
            "loss_tokens_upper_95": 8.88711055871212,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.919967932701111,
            "data_time": 0.06594426929950714,
            "batch_time": 0.11023221164941788,
            "samples_per_second": 4435447.145859315,
            "samples_per_second_per_gpu": 554430.8932324144,
            "loss_sequences_lower_95": 5.027735896809896,
            "loss_sequences_upper_95": 5.116299348958333,
            "loss_tokens_lower_95": 4.860594534688876,
            "loss_tokens_upper_95": 4.963146360106542,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.4893165247780935,
            "data_time": 0.35867540538311005,
            "batch_time": 0.4006222039461136,
            "samples_per_second": 1691209.7072221716,
            "samples_per_second_per_gpu": 211401.21340277145,
            "loss_sequences_lower_95": 6.132692551385788,
            "loss_sequences_upper_95": 6.848602832612538,
            "loss_tokens_lower_95": 6.12991206577846,
            "loss_tokens_upper_95": 6.851159595307849,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.34742334485054,
            "data_time": 0.1582740843296051,
            "batch_time": 0.17603574693202972,
            "samples_per_second": 870551.9593626433,
            "samples_per_second_per_gpu": 108818.99492033041,
            "loss_sequences_lower_95": 7.183124756813049,
            "loss_sequences_upper_95": 8.558902072906493,
            "loss_tokens_lower_95": 6.940771396283022,
            "loss_tokens_upper_95": 7.522530152625644,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.1737497334480285,
            "data_time": 0.09342624992132187,
            "batch_time": 0.13847175240516663,
            "samples_per_second": 4306270.507204592,
            "samples_per_second_per_gpu": 538283.813400574,
            "loss_sequences_lower_95": 7.294824963378907,
            "loss_sequences_upper_95": 7.612026586914062,
            "loss_tokens_lower_95": 7.015967499140757,
            "loss_tokens_upper_95": 7.301361073656937,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.389142551422119,
            "data_time": 0.1093028225004673,
            "batch_time": 0.15461577475070953,
            "samples_per_second": 3941102.146481212,
            "samples_per_second_per_gpu": 492637.7683101515,
            "loss_sequences_lower_95": 7.710385876464844,
            "loss_sequences_upper_95": 7.96356650390625,
            "loss_tokens_lower_95": 7.2763817112387335,
            "loss_tokens_upper_95": 7.46824152243991,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.5049284021605835,
            "data_time": 0.03938850139578184,
            "batch_time": 0.08390360449751218,
            "samples_per_second": 4523844.827714424,
            "samples_per_second_per_gpu": 565480.603464303,
            "loss_sequences_lower_95": 5.48885072867342,
            "loss_sequences_upper_95": 5.521150009559379,
            "loss_tokens_lower_95": 5.48828621793025,
            "loss_tokens_upper_95": 5.521605143993044,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.011857130926692,
            "data_time": 0.12636839350064596,
            "batch_time": 0.16680294275283813,
            "samples_per_second": 3882817.983012562,
            "samples_per_second_per_gpu": 485352.24787657027,
            "loss_sequences_lower_95": 4.9439804803757434,
            "loss_sequences_upper_95": 5.078087769492248,
            "loss_tokens_lower_95": 4.943608278429819,
            "loss_tokens_upper_95": 5.077472908266129,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.06047984981537,
            "data_time": 0.10405688732862473,
            "batch_time": 0.14884230121970177,
            "samples_per_second": 4153905.5544467033,
            "samples_per_second_per_gpu": 519238.1943058379,
            "loss_sequences_lower_95": 8.975979516601562,
            "loss_sequences_upper_95": 9.145638305664063,
            "loss_tokens_lower_95": 8.974320849609375,
            "loss_tokens_upper_95": 9.143634643554687,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.405411719193183,
            "data_time": 0.027827115285964238,
            "batch_time": 0.07205452911910556,
            "samples_per_second": 4444604.30069753,
            "samples_per_second_per_gpu": 555575.5375871912,
            "loss_sequences_lower_95": 8.002916198557239,
            "loss_sequences_upper_95": 8.078480387446783,
            "loss_tokens_lower_95": 7.316325785548991,
            "loss_tokens_upper_95": 7.372961578349614,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.064781430052288,
            "data_time": 0.20914049659456527,
            "batch_time": 0.24173534767968313,
            "samples_per_second": 1822351.8366327693,
            "samples_per_second_per_gpu": 227793.97957909617,
            "loss_sequences_lower_95": 4.948526171783903,
            "loss_sequences_upper_95": 5.179671444110016,
            "loss_tokens_lower_95": 4.945705789594507,
            "loss_tokens_upper_95": 5.180671350279851,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.102319742651547,
            "data_time": 0.1732226088643074,
            "batch_time": 0.21941803395748138,
            "samples_per_second": 3589996.2470350233,
            "samples_per_second_per_gpu": 448749.5308793779,
            "loss_sequences_lower_95": 5.017859413296569,
            "loss_sequences_upper_95": 5.184978625727635,
            "loss_tokens_lower_95": 5.0179698629940255,
            "loss_tokens_upper_95": 5.185620165058211,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.195823909494791,
            "data_time": 0.028890301939100027,
            "batch_time": 0.07323246402665973,
            "samples_per_second": 4448664.196008004,
            "samples_per_second_per_gpu": 556083.0245010005,
            "loss_sequences_lower_95": 7.601598269369285,
            "loss_sequences_upper_95": 7.68541956589393,
            "loss_tokens_lower_95": 7.115388565687209,
            "loss_tokens_upper_95": 7.186254387854236,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.566551697947991,
            "data_time": 0.37342916429042816,
            "batch_time": 0.4118163287639618,
            "samples_per_second": 1848769.0028523505,
            "samples_per_second_per_gpu": 231096.1253565438,
            "loss_sequences_lower_95": 5.49652272380849,
            "loss_sequences_upper_95": 5.636298673115079,
            "loss_tokens_lower_95": 5.497759848296957,
            "loss_tokens_upper_95": 5.636527619538484,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.012046245843264,
            "data_time": 0.05038958558669457,
            "batch_time": 0.09447509394242214,
            "samples_per_second": 4339377.241684207,
            "samples_per_second_per_gpu": 542422.1552105258,
            "loss_sequences_lower_95": 8.986974746750764,
            "loss_sequences_upper_95": 9.038303961797592,
            "loss_tokens_lower_95": 8.986639505088878,
            "loss_tokens_upper_95": 9.037291726395258,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.047792298122517,
            "data_time": 0.3469160497188568,
            "batch_time": 0.3872913420200348,
            "samples_per_second": 2075694.5750705204,
            "samples_per_second_per_gpu": 259461.82188381505,
            "loss_sequences_lower_95": 4.908116364710539,
            "loss_sequences_upper_95": 5.183842557147869,
            "loss_tokens_lower_95": 4.909508640326342,
            "loss_tokens_upper_95": 5.186428699678587,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.565490754445394,
            "data_time": 0.30279767513275146,
            "batch_time": 0.32324352860450745,
            "samples_per_second": 1287842.6511916725,
            "samples_per_second_per_gpu": 160980.33139895907,
            "loss_sequences_lower_95": 8.33796412150065,
            "loss_sequences_upper_95": 9.01602180480957,
            "loss_tokens_lower_95": 8.013556967841255,
            "loss_tokens_upper_95": 9.037488683064778,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.48245325088501,
            "data_time": 0.31268230080604553,
            "batch_time": 0.3338385820388794,
            "samples_per_second": 1137696.2160821538,
            "samples_per_second_per_gpu": 142212.02701026923,
            "loss_sequences_lower_95": 8.256634114583333,
            "loss_sequences_upper_95": 9.16140719095866,
            "loss_tokens_lower_95": 7.832079820954398,
            "loss_tokens_upper_95": 8.977194796787218,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.933235195423963,
            "data_time": 0.042404254632336755,
            "batch_time": 0.08514647824423653,
            "samples_per_second": 4319036.11079186,
            "samples_per_second_per_gpu": 539879.5138489825,
            "loss_sequences_lower_95": 8.900112743349595,
            "loss_sequences_upper_95": 8.96556294010033,
            "loss_tokens_lower_95": 8.90088516257824,
            "loss_tokens_upper_95": 8.9661124470729,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.302319552033787,
            "data_time": 0.022813561636808137,
            "batch_time": 0.06715190098275264,
            "samples_per_second": 4487681.610302855,
            "samples_per_second_per_gpu": 560960.2012878569,
            "loss_sequences_lower_95": 7.931592274370854,
            "loss_sequences_upper_95": 7.96196945201908,
            "loss_tokens_lower_95": 7.228528707076422,
            "loss_tokens_upper_95": 7.258218859408707,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.924020617026982,
            "data_time": 0.3436796963214874,
            "batch_time": 0.37319016456604004,
            "samples_per_second": 1689639.3138405846,
            "samples_per_second_per_gpu": 211204.91423007307,
            "loss_sequences_lower_95": 4.703603525236836,
            "loss_sequences_upper_95": 5.01347510870986,
            "loss_tokens_lower_95": 4.807706771696911,
            "loss_tokens_upper_95": 4.994596438125614,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.785124173035493,
            "data_time": 0.2132250964641571,
            "batch_time": 0.23217223584651947,
            "samples_per_second": 1015967.0070473597,
            "samples_per_second_per_gpu": 126995.87588091996,
            "loss_sequences_lower_95": 5.4819896594898125,
            "loss_sequences_upper_95": 6.1139490797713,
            "loss_tokens_lower_95": 5.3486039055718315,
            "loss_tokens_upper_95": 6.231003655327691,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.944768989958415,
            "data_time": 0.3413204848766327,
            "batch_time": 0.37622158229351044,
            "samples_per_second": 1694107.8845781109,
            "samples_per_second_per_gpu": 211763.48557226386,
            "loss_sequences_lower_95": 4.794014516690882,
            "loss_sequences_upper_95": 5.028646711023843,
            "loss_tokens_lower_95": 4.848299064668313,
            "loss_tokens_upper_95": 5.008233325339045,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.925412628708816,
            "data_time": 0.30773238837718964,
            "batch_time": 0.341573029756546,
            "samples_per_second": 2367522.7155348854,
            "samples_per_second_per_gpu": 295940.3394418607,
            "loss_sequences_lower_95": 4.869681446726729,
            "loss_sequences_upper_95": 5.074878860101467,
            "loss_tokens_lower_95": 4.844186687324089,
            "loss_tokens_upper_95": 4.975071842108156,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.985284342998412,
            "data_time": 0.31475630402565,
            "batch_time": 0.3489890992641449,
            "samples_per_second": 2119018.1693335418,
            "samples_per_second_per_gpu": 264877.2711666927,
            "loss_sequences_lower_95": 4.611413592827029,
            "loss_sequences_upper_95": 4.902960819151343,
            "loss_tokens_lower_95": 4.871740055797154,
            "loss_tokens_upper_95": 5.080970115727429,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.920398325454898,
            "data_time": 0.330726757645607,
            "batch_time": 0.3662111163139343,
            "samples_per_second": 2116612.7571880533,
            "samples_per_second_per_gpu": 264576.59464850667,
            "loss_sequences_lower_95": 4.87480185904154,
            "loss_sequences_upper_95": 5.083927945392888,
            "loss_tokens_lower_95": 4.845837877695434,
            "loss_tokens_upper_95": 4.967221525673555,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.761032717568534,
            "data_time": 0.3446143716573715,
            "batch_time": 0.3786010295152664,
            "samples_per_second": 2261090.8406758867,
            "samples_per_second_per_gpu": 282636.35508448584,
            "loss_sequences_lower_95": 4.553703109077785,
            "loss_sequences_upper_95": 4.712330646396424,
            "loss_tokens_lower_95": 4.72582484827435,
            "loss_tokens_upper_95": 4.827228916075606,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.9400411916942133,
            "data_time": 0.33625905215740204,
            "batch_time": 0.3734079599380493,
            "samples_per_second": 2081557.1023330442,
            "samples_per_second_per_gpu": 260194.63779163052,
            "loss_sequences_lower_95": 3.865947034882336,
            "loss_sequences_upper_95": 4.051889949891626,
            "loss_tokens_lower_95": 3.887484860043701,
            "loss_tokens_upper_95": 3.9740712472686863,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.5/params.txt",
    "uuid": "ad840351-877c-457c-be08-df34086d14af",
    "creation_date": "2023_12_14-05_54_21"
}