{
    "name": "rpj-d=512_l=8_h=4-32.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 50504990720,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 32.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "10100998144",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=512_l=8_h=4-32.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 2.934960514307022,
            "data_time": 0.029289204627275467,
            "batch_time": 0.32968437299132347,
            "samples_per_second": 1696584.713481605,
            "samples_per_second_per_gpu": 212073.08918520063,
            "loss_sequences_lower_95": 2.865236968994141,
            "loss_sequences_upper_95": 3.001039568583171,
            "loss_tokens_lower_95": 2.922976328531901,
            "loss_tokens_upper_95": 2.9469799423217773,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.425380424022153,
            "data_time": 0.001395223208198272,
            "batch_time": 0.015268124229971994,
            "samples_per_second": 2244809.129779768,
            "samples_per_second_per_gpu": 280601.141222471,
            "loss_sequences_lower_95": 3.422760451593548,
            "loss_sequences_upper_95": 3.4279565053706516,
            "loss_tokens_lower_95": 3.4146357135416667,
            "loss_tokens_upper_95": 3.4362029062499997,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.796895701058057,
            "data_time": 0.008852696418762207,
            "batch_time": 0.022467069625854492,
            "samples_per_second": 2216527.5399218113,
            "samples_per_second_per_gpu": 277065.9424902264,
            "loss_sequences_lower_95": 2.770595478914222,
            "loss_sequences_upper_95": 2.8232373420559633,
            "loss_tokens_lower_95": 2.785490375,
            "loss_tokens_upper_95": 2.8084611458333337,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.2666985637133883,
            "data_time": 0.0014731446771245253,
            "batch_time": 0.014863062944067152,
            "samples_per_second": 2331913.8103798884,
            "samples_per_second_per_gpu": 291489.22629748605,
            "loss_sequences_lower_95": 3.254120429284794,
            "loss_sequences_upper_95": 3.278757938345683,
            "loss_tokens_lower_95": 3.2557326302083336,
            "loss_tokens_upper_95": 3.2772788124999996,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.419863925203529,
            "data_time": 0.009360874791544273,
            "batch_time": 0.023383552809635482,
            "samples_per_second": 2168083.7752944324,
            "samples_per_second_per_gpu": 271010.47191180405,
            "loss_sequences_lower_95": 3.3856310935709963,
            "loss_sequences_upper_95": 3.453096309747327,
            "loss_tokens_lower_95": 3.4088782864583336,
            "loss_tokens_upper_95": 3.4305971510416664,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.194121310210082,
            "data_time": 0.0034632339425708938,
            "batch_time": 0.01708258332117744,
            "samples_per_second": 2294284.626992144,
            "samples_per_second_per_gpu": 286785.578374018,
            "loss_sequences_lower_95": 3.1509927278955643,
            "loss_sequences_upper_95": 3.236966309589565,
            "loss_tokens_lower_95": 3.183047901041667,
            "loss_tokens_upper_95": 3.2051258125,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.8235626687322344,
            "data_time": 0.0014991024190022155,
            "batch_time": 0.014903001742495215,
            "samples_per_second": 2335094.5395636903,
            "samples_per_second_per_gpu": 291886.8174454613,
            "loss_sequences_lower_95": 1.8008355837452168,
            "loss_sequences_upper_95": 1.8461069136639032,
            "loss_tokens_lower_95": 1.8135905937499999,
            "loss_tokens_upper_95": 1.8339373020833334,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.793302975899262,
            "data_time": 0.0014564195271567389,
            "batch_time": 0.014986988192869,
            "samples_per_second": 2315729.5569712594,
            "samples_per_second_per_gpu": 289466.1946214074,
            "loss_sequences_lower_95": 3.7838652855039268,
            "loss_sequences_upper_95": 3.803024511207461,
            "loss_tokens_lower_95": 3.782321604166667,
            "loss_tokens_upper_95": 3.8039859166666665,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.592705002645167,
            "data_time": 0.010529823719509064,
            "batch_time": 0.024792650389292882,
            "samples_per_second": 2124876.060655174,
            "samples_per_second_per_gpu": 265609.50758189673,
            "loss_sequences_lower_95": 3.5525021560793,
            "loss_sequences_upper_95": 3.6376659920545125,
            "loss_tokens_lower_95": 3.5815283333333334,
            "loss_tokens_upper_95": 3.6039293229166667,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.236646078320831,
            "data_time": 0.009513774886727333,
            "batch_time": 0.023271854035556316,
            "samples_per_second": 2217818.7340217405,
            "samples_per_second_per_gpu": 277227.34175271756,
            "loss_sequences_lower_95": 4.210145321570837,
            "loss_sequences_upper_95": 4.265320174024981,
            "loss_tokens_lower_95": 4.224394875000001,
            "loss_tokens_upper_95": 4.24906865625,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.3378560507119444,
            "data_time": 0.0012259392517499643,
            "batch_time": 0.014570749959216506,
            "samples_per_second": 2349333.521838223,
            "samples_per_second_per_gpu": 293666.6902297779,
            "loss_sequences_lower_95": 3.3293834500693777,
            "loss_sequences_upper_95": 3.346183296110559,
            "loss_tokens_lower_95": 3.327116130208333,
            "loss_tokens_upper_95": 3.3486217031249996,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.265041669673197,
            "data_time": 0.002452636142257449,
            "batch_time": 0.015773781729578277,
            "samples_per_second": 2346654.006326928,
            "samples_per_second_per_gpu": 293331.750790866,
            "loss_sequences_lower_95": 3.2546501366211342,
            "loss_sequences_upper_95": 3.275388347303207,
            "loss_tokens_lower_95": 3.2545222552083337,
            "loss_tokens_upper_95": 3.2756677552083335,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7572644085719666,
            "data_time": 0.00895689504419862,
            "batch_time": 0.022487544259534996,
            "samples_per_second": 2220740.8729505274,
            "samples_per_second_per_gpu": 277592.6091188159,
            "loss_sequences_lower_95": 3.7227660266187566,
            "loss_sequences_upper_95": 3.7898500392209686,
            "loss_tokens_lower_95": 3.7460198020833335,
            "loss_tokens_upper_95": 3.768572927083333,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.139467164115362,
            "data_time": 0.00872606965175188,
            "batch_time": 0.02273633565560755,
            "samples_per_second": 2160047.674478627,
            "samples_per_second_per_gpu": 270005.95930982835,
            "loss_sequences_lower_95": 3.0770247138930436,
            "loss_sequences_upper_95": 3.19977256635056,
            "loss_tokens_lower_95": 3.128226854166667,
            "loss_tokens_upper_95": 3.1505979791666667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.000277735970237,
            "data_time": 0.07282396725245885,
            "batch_time": 0.08916138751166207,
            "samples_per_second": 1174004.5457168603,
            "samples_per_second_per_gpu": 146750.56821460754,
            "loss_sequences_lower_95": 3.9380777359008787,
            "loss_sequences_upper_95": 4.062356645410711,
            "loss_tokens_lower_95": 3.97991229837591,
            "loss_tokens_upper_95": 4.02107552615079,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.807121952143077,
            "data_time": 0.012568070129914717,
            "batch_time": 0.026627428152344444,
            "samples_per_second": 2169175.9196474473,
            "samples_per_second_per_gpu": 271146.9899559309,
            "loss_sequences_lower_95": 2.7103778449856506,
            "loss_sequences_upper_95": 2.9033273502272,
            "loss_tokens_lower_95": 2.7962633802083334,
            "loss_tokens_upper_95": 2.8180010677083334,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.796414470924237,
            "data_time": 0.010949773093064627,
            "batch_time": 0.024675199141105015,
            "samples_per_second": 2206317.2045364766,
            "samples_per_second_per_gpu": 275789.6505670596,
            "loss_sequences_lower_95": 5.747439003493981,
            "loss_sequences_upper_95": 5.843676677291185,
            "loss_tokens_lower_95": 5.784954072916667,
            "loss_tokens_upper_95": 5.807528625,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.564429099442529,
            "data_time": 0.03158371150493622,
            "batch_time": 0.04594622924923897,
            "samples_per_second": 1875801.3558501715,
            "samples_per_second_per_gpu": 234475.16948127144,
            "loss_sequences_lower_95": 3.520881427702357,
            "loss_sequences_upper_95": 3.6096870797579403,
            "loss_tokens_lower_95": 3.5520184751416815,
            "loss_tokens_upper_95": 3.5767531598200564,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.6942963877594925,
            "data_time": 0.0021098286999656833,
            "batch_time": 0.015749010178928765,
            "samples_per_second": 2271005.018325273,
            "samples_per_second_per_gpu": 283875.6272906591,
            "loss_sequences_lower_95": 4.67662084894869,
            "loss_sequences_upper_95": 4.71262419493751,
            "loss_tokens_lower_95": 4.675952583098918,
            "loss_tokens_upper_95": 4.712353480852086,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.20232875801562,
            "data_time": 0.0021228023869976115,
            "batch_time": 0.01579605147337458,
            "samples_per_second": 2264199.755687623,
            "samples_per_second_per_gpu": 283024.96946095285,
            "loss_sequences_lower_95": 3.1900865145685,
            "loss_sequences_upper_95": 3.215473137821151,
            "loss_tokens_lower_95": 3.1875875697251868,
            "loss_tokens_upper_95": 3.206909775651484,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.019129322056056,
            "data_time": 0.002942354919548646,
            "batch_time": 0.016769535017957336,
            "samples_per_second": 2239028.9759813524,
            "samples_per_second_per_gpu": 279878.62199766905,
            "loss_sequences_lower_95": 4.273520625442844,
            "loss_sequences_upper_95": 4.566217303377421,
            "loss_tokens_lower_95": 3.4740935443099237,
            "loss_tokens_upper_95": 3.685557653951111,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.39396617658933,
            "data_time": 0.0043233052847233226,
            "batch_time": 0.018076481020196954,
            "samples_per_second": 2241779.204770477,
            "samples_per_second_per_gpu": 280222.4005963096,
            "loss_sequences_lower_95": 4.515530289713542,
            "loss_sequences_upper_95": 4.716646435546875,
            "loss_tokens_lower_95": 4.10601752898978,
            "loss_tokens_upper_95": 4.249825189170597,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.99245238376441,
            "data_time": 0.004265988754111358,
            "batch_time": 0.018073086645088943,
            "samples_per_second": 2226983.2349795364,
            "samples_per_second_per_gpu": 278372.90437244205,
            "loss_sequences_lower_95": 3.034026724791509,
            "loss_sequences_upper_95": 3.0928986359221002,
            "loss_tokens_lower_95": 2.899703366738054,
            "loss_tokens_upper_95": 2.930341056095328,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.2049513827670704,
            "data_time": 0.021073205130440847,
            "batch_time": 0.03509195148944855,
            "samples_per_second": 2086767.7131651037,
            "samples_per_second_per_gpu": 260845.96414563796,
            "loss_sequences_lower_95": 2.1850681790438564,
            "loss_sequences_upper_95": 2.2912231930819424,
            "loss_tokens_lower_95": 2.139398713846619,
            "loss_tokens_upper_95": 2.184991926112315,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.384683573975855,
            "data_time": 0.017997510731220245,
            "batch_time": 0.03189564868807793,
            "samples_per_second": 2036150.4942285344,
            "samples_per_second_per_gpu": 254518.8117785668,
            "loss_sequences_lower_95": 3.370299695073342,
            "loss_sequences_upper_95": 3.5638873913823343,
            "loss_tokens_lower_95": 3.2502245564806196,
            "loss_tokens_upper_95": 3.3431313341020648,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6027711645762124,
            "data_time": 0.015644284395071175,
            "batch_time": 0.030570759223057672,
            "samples_per_second": 1952511.1241393597,
            "samples_per_second_per_gpu": 244063.89051741996,
            "loss_sequences_lower_95": 3.574837534586589,
            "loss_sequences_upper_95": 3.6749428710937497,
            "loss_tokens_lower_95": 3.4654720575169966,
            "loss_tokens_upper_95": 3.6794185077304156,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.138710484553719,
            "data_time": 0.0016662261462824022,
            "batch_time": 0.015306750827740877,
            "samples_per_second": 2274009.12220925,
            "samples_per_second_per_gpu": 284251.14027615625,
            "loss_sequences_lower_95": 6.152663414309704,
            "loss_sequences_upper_95": 6.2284075163931405,
            "loss_tokens_lower_95": 5.993647122413876,
            "loss_tokens_upper_95": 6.071294129855702,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.4469529428466,
            "data_time": 0.002686271131438697,
            "batch_time": 0.016432005487032386,
            "samples_per_second": 2257963.153300451,
            "samples_per_second_per_gpu": 282245.3941625564,
            "loss_sequences_lower_95": 4.948990607984139,
            "loss_sequences_upper_95": 5.238498099484428,
            "loss_tokens_lower_95": 3.7527861523260104,
            "loss_tokens_upper_95": 3.887220945713414,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.089833583339489,
            "data_time": 0.004446037315033577,
            "batch_time": 0.017992533944748545,
            "samples_per_second": 2251517.1215391066,
            "samples_per_second_per_gpu": 281439.6401923883,
            "loss_sequences_lower_95": 4.500415580671395,
            "loss_sequences_upper_95": 4.825554128392971,
            "loss_tokens_lower_95": 3.698575326403174,
            "loss_tokens_upper_95": 3.8521200607806954,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.630282127693908,
            "data_time": 0.02119898796081543,
            "batch_time": 0.03564507620675223,
            "samples_per_second": 2004133.0111696539,
            "samples_per_second_per_gpu": 250516.62639620673,
            "loss_sequences_lower_95": 5.547925571110695,
            "loss_sequences_upper_95": 5.712774031129602,
            "loss_tokens_lower_95": 5.546699210389019,
            "loss_tokens_upper_95": 5.712612747819456,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5005926871299744,
            "data_time": 0.044010703380291276,
            "batch_time": 0.06022587647804847,
            "samples_per_second": 1672464.4567774248,
            "samples_per_second_per_gpu": 209058.0570971781,
            "loss_sequences_lower_95": 3.356619560241699,
            "loss_sequences_upper_95": 3.7164561614990235,
            "loss_tokens_lower_95": 3.1954622326681994,
            "loss_tokens_upper_95": 3.6544478305550507,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.305022308623901,
            "data_time": 0.0030243143225983852,
            "batch_time": 0.016703444268318286,
            "samples_per_second": 2260772.6653447347,
            "samples_per_second_per_gpu": 282596.58316809183,
            "loss_sequences_lower_95": 4.266524047226842,
            "loss_sequences_upper_95": 4.343559647777796,
            "loss_tokens_lower_95": 4.2660772348987654,
            "loss_tokens_upper_95": 4.344135552272581,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.714968069765433,
            "data_time": 0.004566074778944205,
            "batch_time": 0.018163633968857333,
            "samples_per_second": 2264388.1160149467,
            "samples_per_second_per_gpu": 283048.51450186834,
            "loss_sequences_lower_95": 4.66497859720516,
            "loss_sequences_upper_95": 4.764768498301213,
            "loss_tokens_lower_95": 4.662533961740505,
            "loss_tokens_upper_95": 4.76574594058725,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7080791929211787,
            "data_time": 0.0032633401405219127,
            "batch_time": 0.016811686252157322,
            "samples_per_second": 2265848.0693169464,
            "samples_per_second_per_gpu": 283231.0086646183,
            "loss_sequences_lower_95": 3.875580684093189,
            "loss_sequences_upper_95": 4.00599726530621,
            "loss_tokens_lower_95": 3.5066116916255723,
            "loss_tokens_upper_95": 3.5635068990306387,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.584428046226502,
            "data_time": 0.010139827616512775,
            "batch_time": 0.023895379155874252,
            "samples_per_second": 2167028.336758377,
            "samples_per_second_per_gpu": 270878.5420947971,
            "loss_sequences_lower_95": 5.7890689697265625,
            "loss_sequences_upper_95": 6.3475760620117185,
            "loss_tokens_lower_95": 4.947718763572345,
            "loss_tokens_upper_95": 5.305358846324867,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8723871260881424,
            "data_time": 0.13709111511707306,
            "batch_time": 0.1535506695508957,
            "samples_per_second": 1048984.881470962,
            "samples_per_second_per_gpu": 131123.11018387025,
            "loss_sequences_lower_95": 3.6806276559829714,
            "loss_sequences_upper_95": 4.121543896198273,
            "loss_tokens_lower_95": 3.463108843222432,
            "loss_tokens_upper_95": 4.218104570761494,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.806412398130044,
            "data_time": 0.024287135043042772,
            "batch_time": 0.03929663972651705,
            "samples_per_second": 1790980.857918889,
            "samples_per_second_per_gpu": 223872.60723986113,
            "loss_sequences_lower_95": 5.193067940898325,
            "loss_sequences_upper_95": 5.916500012627964,
            "loss_tokens_lower_95": 3.5527582936249296,
            "loss_tokens_upper_95": 4.002702047698527,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.9796366283288425,
            "data_time": 0.002639454272058275,
            "batch_time": 0.016081207949254248,
            "samples_per_second": 2288143.497517959,
            "samples_per_second_per_gpu": 286017.9371897449,
            "loss_sequences_lower_95": 2.951537200674421,
            "loss_sequences_upper_95": 3.0073241014259775,
            "loss_tokens_lower_95": 2.9513835488682285,
            "loss_tokens_upper_95": 3.008025037799667,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.977811105116613,
            "data_time": 0.0028410679525213275,
            "batch_time": 0.016390191245828436,
            "samples_per_second": 2284039.1529606683,
            "samples_per_second_per_gpu": 285504.89412008354,
            "loss_sequences_lower_95": 2.9542398693722105,
            "loss_sequences_upper_95": 3.108103028765101,
            "loss_tokens_lower_95": 2.809411745700704,
            "loss_tokens_upper_95": 2.96210283758516,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.3593426877325707,
            "data_time": 0.016724260316954717,
            "batch_time": 0.030276364750332303,
            "samples_per_second": 2078461.1423213987,
            "samples_per_second_per_gpu": 259807.64279017484,
            "loss_sequences_lower_95": 3.2181289449279564,
            "loss_sequences_upper_95": 3.6180099878555687,
            "loss_tokens_lower_95": 3.083578505326376,
            "loss_tokens_upper_95": 3.3834117229507545,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7443999677731847,
            "data_time": 0.004530150443315506,
            "batch_time": 0.018053795769810678,
            "samples_per_second": 2259145.294777726,
            "samples_per_second_per_gpu": 282393.1618472157,
            "loss_sequences_lower_95": 3.7814212838212806,
            "loss_sequences_upper_95": 3.9319128119296316,
            "loss_tokens_lower_95": 3.60198744996533,
            "loss_tokens_upper_95": 3.7470452048794964,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.847699184243272,
            "data_time": 0.029044696262904575,
            "batch_time": 0.0441261473156157,
            "samples_per_second": 1866388.1253104636,
            "samples_per_second_per_gpu": 233298.51566380795,
            "loss_sequences_lower_95": 2.7074093702362805,
            "loss_sequences_upper_95": 3.169855620221394,
            "loss_tokens_lower_95": 2.597066250127447,
            "loss_tokens_upper_95": 2.960110125144971,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.826321864275962,
            "data_time": 0.002126728826363564,
            "batch_time": 0.01571270874677255,
            "samples_per_second": 2273751.9317447064,
            "samples_per_second_per_gpu": 284218.9914680883,
            "loss_sequences_lower_95": 3.814454658509827,
            "loss_sequences_upper_95": 3.8376906826677835,
            "loss_tokens_lower_95": 3.8146495803066864,
            "loss_tokens_upper_95": 3.8379643311474796,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.023256594694934,
            "data_time": 0.04042016809636896,
            "batch_time": 0.0551826997236772,
            "samples_per_second": 1811508.4853663277,
            "samples_per_second_per_gpu": 226438.56067079096,
            "loss_sequences_lower_95": 0.9801609502255337,
            "loss_sequences_upper_95": 1.112762362285725,
            "loss_tokens_lower_95": 0.8786899437801865,
            "loss_tokens_upper_95": 1.0799381220963007,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.1496174487833954,
            "data_time": 0.0016070893354407899,
            "batch_time": 0.01522167003264671,
            "samples_per_second": 2271262.0440642377,
            "samples_per_second_per_gpu": 283907.7555080297,
            "loss_sequences_lower_95": 5.57619463074882,
            "loss_sequences_upper_95": 5.6273445382108225,
            "loss_tokens_lower_95": 4.494897171179884,
            "loss_tokens_upper_95": 4.545852224371374,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.028765433788299,
            "data_time": 0.0051712289689079165,
            "batch_time": 0.019023812952495757,
            "samples_per_second": 2218883.4543043855,
            "samples_per_second_per_gpu": 277360.4317880482,
            "loss_sequences_lower_95": 6.04938690185547,
            "loss_sequences_upper_95": 6.303178979492188,
            "loss_tokens_lower_95": 5.745517404413186,
            "loss_tokens_upper_95": 5.984218074034464,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.453144495383553,
            "data_time": 0.02098280696545617,
            "batch_time": 0.034805261482626706,
            "samples_per_second": 2072013.2371594761,
            "samples_per_second_per_gpu": 259001.65464493452,
            "loss_sequences_lower_95": 5.283651693592901,
            "loss_sequences_upper_95": 5.624872662088145,
            "loss_tokens_lower_95": 5.281382034965183,
            "loss_tokens_upper_95": 5.623239480723505,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.1063593669371174,
            "data_time": 0.004182495266558176,
            "batch_time": 0.017864580973085152,
            "samples_per_second": 2244210.4271345334,
            "samples_per_second_per_gpu": 280526.3033918167,
            "loss_sequences_lower_95": 7.012093209931344,
            "loss_sequences_upper_95": 7.19818466648911,
            "loss_tokens_lower_95": 7.01510566480232,
            "loss_tokens_upper_95": 7.196162331321022,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 0.9303318688074748,
            "data_time": 0.0037394248424692357,
            "batch_time": 0.01752614404292817,
            "samples_per_second": 2234370.744714822,
            "samples_per_second_per_gpu": 279296.34308935277,
            "loss_sequences_lower_95": 0.9582512166341146,
            "loss_sequences_upper_95": 1.0005544901529948,
            "loss_tokens_lower_95": 0.8722038073041716,
            "loss_tokens_upper_95": 0.922340881665166,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.125010158902123,
            "data_time": 0.021958270243235996,
            "batch_time": 0.036408185958862305,
            "samples_per_second": 1940974.4401528605,
            "samples_per_second_per_gpu": 242621.80501910756,
            "loss_sequences_lower_95": 5.773942289806548,
            "loss_sequences_upper_95": 6.475829177129836,
            "loss_tokens_lower_95": 5.780091334751674,
            "loss_tokens_upper_95": 6.481158708844866,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.4408850744366646,
            "data_time": 0.14507247507572174,
            "batch_time": 0.16406439244747162,
            "samples_per_second": 926036.393149101,
            "samples_per_second_per_gpu": 115754.54914363762,
            "loss_sequences_lower_95": 2.1940170228481293,
            "loss_sequences_upper_95": 3.2509622514247893,
            "loss_tokens_lower_95": 1.8678601168603013,
            "loss_tokens_upper_95": 2.3968682546713915,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.541789635181427,
            "data_time": 0.005142131495097327,
            "batch_time": 0.018879220126167176,
            "samples_per_second": 2225102.2410028186,
            "samples_per_second_per_gpu": 278137.7801253523,
            "loss_sequences_lower_95": 7.473272509765625,
            "loss_sequences_upper_95": 7.831195886230469,
            "loss_tokens_lower_95": 7.242611338039341,
            "loss_tokens_upper_95": 7.554481632858767,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.537551785469055,
            "data_time": 0.00531269113222758,
            "batch_time": 0.018996157816478183,
            "samples_per_second": 2234196.2434147005,
            "samples_per_second_per_gpu": 279274.53042683756,
            "loss_sequences_lower_95": 6.613791931152344,
            "loss_sequences_upper_95": 6.823506311035156,
            "loss_tokens_lower_95": 6.317422569400976,
            "loss_tokens_upper_95": 6.501770673588507,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.175862727531282,
            "data_time": 0.003779928500835712,
            "batch_time": 0.017388267421403456,
            "samples_per_second": 2255047.871667504,
            "samples_per_second_per_gpu": 281880.983958438,
            "loss_sequences_lower_95": 4.157546236944246,
            "loss_sequences_upper_95": 4.194293812397858,
            "loss_tokens_lower_95": 4.157729862122339,
            "loss_tokens_upper_95": 4.193760754300411,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.784240030839513,
            "data_time": 0.007972439970494757,
            "batch_time": 0.021683756318337247,
            "samples_per_second": 2189712.4640288665,
            "samples_per_second_per_gpu": 273714.0580036083,
            "loss_sequences_lower_95": 4.683266354046658,
            "loss_sequences_upper_95": 4.883350003150202,
            "loss_tokens_lower_95": 4.681546193956414,
            "loss_tokens_upper_95": 4.88199945734027,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.11787021446228,
            "data_time": 0.005836616905908736,
            "batch_time": 0.01948014090931605,
            "samples_per_second": 2235618.183767106,
            "samples_per_second_per_gpu": 279452.27297088824,
            "loss_sequences_lower_95": 4.970366442871094,
            "loss_sequences_upper_95": 5.268056762695313,
            "loss_tokens_lower_95": 4.970691430664062,
            "loss_tokens_upper_95": 5.272481115722656,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.058685666615938,
            "data_time": 0.0030466484437592766,
            "batch_time": 0.016569604089444887,
            "samples_per_second": 2288686.818317475,
            "samples_per_second_per_gpu": 286085.85228968435,
            "loss_sequences_lower_95": 3.5558019546032402,
            "loss_sequences_upper_95": 3.648066166035951,
            "loss_tokens_lower_95": 2.442741461670328,
            "loss_tokens_upper_95": 2.5062174451395407,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.058329043103687,
            "data_time": 0.017513551030840194,
            "batch_time": 0.03195773192814418,
            "samples_per_second": 1975260.3625535956,
            "samples_per_second_per_gpu": 246907.54531919945,
            "loss_sequences_lower_95": 4.896964377787576,
            "loss_sequences_upper_95": 5.219989503319583,
            "loss_tokens_lower_95": 4.896364536570079,
            "loss_tokens_upper_95": 5.219503579210879,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.03021325971566,
            "data_time": 0.010208050720393658,
            "batch_time": 0.024278828874230385,
            "samples_per_second": 2159161.475590921,
            "samples_per_second_per_gpu": 269895.1844488651,
            "loss_sequences_lower_95": 4.920389248717065,
            "loss_sequences_upper_95": 5.137283791934743,
            "loss_tokens_lower_95": 4.9204898011450675,
            "loss_tokens_upper_95": 5.136353999119179,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.298933380299398,
            "data_time": 0.002357621346750567,
            "batch_time": 0.01599398372351572,
            "samples_per_second": 2272772.4380109333,
            "samples_per_second_per_gpu": 284096.55475136667,
            "loss_sequences_lower_95": 3.5898049651897,
            "loss_sequences_upper_95": 3.675018808278138,
            "loss_tokens_lower_95": 2.7988877920844484,
            "loss_tokens_upper_95": 2.8689173943478026,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.306740309195543,
            "data_time": 0.024730590482552845,
            "batch_time": 0.039854496717453,
            "samples_per_second": 1955742.4442166705,
            "samples_per_second_per_gpu": 244467.8055270838,
            "loss_sequences_lower_95": 4.2124610699043075,
            "loss_sequences_upper_95": 4.398618797019676,
            "loss_tokens_lower_95": 4.212883326616237,
            "loss_tokens_upper_95": 4.398237658303882,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.422005232248102,
            "data_time": 0.003462946313059228,
            "batch_time": 0.017266124304979858,
            "samples_per_second": 2232565.5907112104,
            "samples_per_second_per_gpu": 279070.6988389013,
            "loss_sequences_lower_95": 5.383290074899656,
            "loss_sequences_upper_95": 5.460929182793387,
            "loss_tokens_lower_95": 5.382499447510512,
            "loss_tokens_upper_95": 5.460887596760321,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.447973117087651,
            "data_time": 0.022280970486727628,
            "batch_time": 0.03626423965800892,
            "samples_per_second": 1961553.5642375725,
            "samples_per_second_per_gpu": 245194.19552969656,
            "loss_sequences_lower_95": 5.28000629017654,
            "loss_sequences_upper_95": 5.61412221667836,
            "loss_tokens_lower_95": 5.27672619865936,
            "loss_tokens_upper_95": 5.617616049757281,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5799524704615275,
            "data_time": 0.07285207509994507,
            "batch_time": 0.08829985558986664,
            "samples_per_second": 1351553.2038934769,
            "samples_per_second_per_gpu": 168944.1504866846,
            "loss_sequences_lower_95": 3.3166468493143717,
            "loss_sequences_upper_95": 4.047053337097168,
            "loss_tokens_lower_95": 2.9462543540530737,
            "loss_tokens_upper_95": 3.8371996561686195,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.923122219244639,
            "data_time": 0.07067634910345078,
            "batch_time": 0.08610454946756363,
            "samples_per_second": 1473179.923301528,
            "samples_per_second_per_gpu": 184147.490412691,
            "loss_sequences_lower_95": 2.7823616790771486,
            "loss_sequences_upper_95": 3.477171141306559,
            "loss_tokens_lower_95": 2.2429855132370853,
            "loss_tokens_upper_95": 3.1691721541158264,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.018330449173131,
            "data_time": 0.003556633781446577,
            "batch_time": 0.01727373323558428,
            "samples_per_second": 2249601.2106307507,
            "samples_per_second_per_gpu": 281200.15132884384,
            "loss_sequences_lower_95": 4.9926051926086155,
            "loss_sequences_upper_95": 5.044314781042894,
            "loss_tokens_lower_95": 4.992492722523932,
            "loss_tokens_upper_95": 5.044526115496134,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 0.5344070899468097,
            "data_time": 0.001581884227709384,
            "batch_time": 0.015164194127271682,
            "samples_per_second": 2275764.7800794733,
            "samples_per_second_per_gpu": 284470.59750993416,
            "loss_sequences_lower_95": 0.6149932663067171,
            "loss_sequences_upper_95": 0.631636083780688,
            "loss_tokens_lower_95": 0.4461296435154896,
            "loss_tokens_upper_95": 0.45495356829866146,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.7588750593305573,
            "data_time": 0.035208672285079956,
            "batch_time": 0.05022991821169853,
            "samples_per_second": 1891707.473266722,
            "samples_per_second_per_gpu": 236463.43415834024,
            "loss_sequences_lower_95": 1.6786082830954725,
            "loss_sequences_upper_95": 1.9122228967861867,
            "loss_tokens_lower_95": 1.5612019047166696,
            "loss_tokens_upper_95": 1.6889702561035755,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5477497835417053,
            "data_time": 0.11055942944117955,
            "batch_time": 0.1271157151176816,
            "samples_per_second": 1079578.6673660441,
            "samples_per_second_per_gpu": 134947.33342075552,
            "loss_sequences_lower_95": 3.1368242676193647,
            "loss_sequences_upper_95": 4.031003426216744,
            "loss_tokens_lower_95": 3.0134727666407453,
            "loss_tokens_upper_95": 4.0152122403368535,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.5919333732709653,
            "data_time": 0.028931299845377605,
            "batch_time": 0.044005192461467925,
            "samples_per_second": 1859021.6798982436,
            "samples_per_second_per_gpu": 232377.70998728045,
            "loss_sequences_lower_95": 1.5473166930966262,
            "loss_sequences_upper_95": 1.749289238162157,
            "loss_tokens_lower_95": 1.4372882608809925,
            "loss_tokens_upper_95": 1.53858224726004,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.6539976048033411,
            "data_time": 0.02832341194152832,
            "batch_time": 0.042254263446444555,
            "samples_per_second": 1983369.5728592935,
            "samples_per_second_per_gpu": 247921.1966074117,
            "loss_sequences_lower_95": 1.6367579855569978,
            "loss_sequences_upper_95": 1.820969037311833,
            "loss_tokens_lower_95": 1.4880871085275083,
            "loss_tokens_upper_95": 1.573765751322087,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.5881159999748555,
            "data_time": 0.029268085956573486,
            "batch_time": 0.04373918544678461,
            "samples_per_second": 1914533.040277782,
            "samples_per_second_per_gpu": 239316.63003472274,
            "loss_sequences_lower_95": 1.4704479799038026,
            "loss_sequences_upper_95": 1.6936411183054851,
            "loss_tokens_lower_95": 1.4946595958075763,
            "loss_tokens_upper_95": 1.631308122034934,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.7257519378894712,
            "data_time": 0.02772200959069388,
            "batch_time": 0.04231872161229452,
            "samples_per_second": 1937005.3265652508,
            "samples_per_second_per_gpu": 242125.66582065634,
            "loss_sequences_lower_95": 1.6965028390651795,
            "loss_sequences_upper_95": 1.8671958969860543,
            "loss_tokens_lower_95": 1.5606628322898413,
            "loss_tokens_upper_95": 1.6427822077385734,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.4299167638980084,
            "data_time": 0.02878485196902428,
            "batch_time": 0.04336200525731216,
            "samples_per_second": 1940510.3742013264,
            "samples_per_second_per_gpu": 242563.7967751658,
            "loss_sequences_lower_95": 1.3775604864084945,
            "loss_sequences_upper_95": 1.4873952830059927,
            "loss_tokens_lower_95": 1.3647113242404578,
            "loss_tokens_upper_95": 1.427579473164668,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.3199987920319163,
            "data_time": 0.028933703899383545,
            "batch_time": 0.0428641296568371,
            "samples_per_second": 2011212.4124671174,
            "samples_per_second_per_gpu": 251401.55155838968,
            "loss_sequences_lower_95": 1.313193163057653,
            "loss_sequences_upper_95": 1.4427345996949732,
            "loss_tokens_lower_95": 1.1805157798137056,
            "loss_tokens_upper_95": 1.234857580220577,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-32.0/params.txt",
    "uuid": "f73d129b-b193-4720-86b2-9ec953f63565",
    "creation_date": "2023_12_14-06_36_53"
}