{
    "name": "rpj-d=512_l=8_h=4-16.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 25252495360,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 16.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "5050499072",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=512_l=8_h=4-16.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.0038157681624096,
            "data_time": 0.033404119312763214,
            "batch_time": 0.3439125046133995,
            "samples_per_second": 1711326.5194648681,
            "samples_per_second_per_gpu": 213915.81493310852,
            "loss_sequences_lower_95": 2.934239590962728,
            "loss_sequences_upper_95": 3.069486757914225,
            "loss_tokens_lower_95": 2.9917386182149253,
            "loss_tokens_upper_95": 3.0159251340230306,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.489515495205766,
            "data_time": 0.0016137341753167578,
            "batch_time": 0.015609628303868348,
            "samples_per_second": 2216001.3337857085,
            "samples_per_second_per_gpu": 277000.16672321357,
            "loss_sequences_lower_95": 3.4869277781944494,
            "loss_sequences_upper_95": 3.492069787218085,
            "loss_tokens_lower_95": 3.478672447916667,
            "loss_tokens_upper_95": 3.500416270833333,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.8369684691331822,
            "data_time": 0.009647984504699707,
            "batch_time": 0.023632064819335937,
            "samples_per_second": 2158045.1982692047,
            "samples_per_second_per_gpu": 269755.6497836506,
            "loss_sequences_lower_95": 2.8102341445611447,
            "loss_sequences_upper_95": 2.8634032331194192,
            "loss_tokens_lower_95": 2.8254227604166666,
            "loss_tokens_upper_95": 2.848643770833333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.3268790449063803,
            "data_time": 0.0016095908848862898,
            "batch_time": 0.01506134661796846,
            "samples_per_second": 2313785.579488727,
            "samples_per_second_per_gpu": 289223.19743609085,
            "loss_sequences_lower_95": 3.3140768464078607,
            "loss_sequences_upper_95": 3.3392091353898197,
            "loss_tokens_lower_95": 3.315658244791667,
            "loss_tokens_upper_95": 3.3376270364583336,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4794149991208325,
            "data_time": 0.009673648621456556,
            "batch_time": 0.023665530748101345,
            "samples_per_second": 2152397.7014649245,
            "samples_per_second_per_gpu": 269049.71268311556,
            "loss_sequences_lower_95": 3.444292728089995,
            "loss_sequences_upper_95": 3.51329954811609,
            "loss_tokens_lower_95": 3.46844859375,
            "loss_tokens_upper_95": 3.4903148385416665,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.25077247603421,
            "data_time": 0.0038484438605930495,
            "batch_time": 0.01754829967799394,
            "samples_per_second": 2273398.1834306316,
            "samples_per_second_per_gpu": 284174.77292882896,
            "loss_sequences_lower_95": 3.2078907378590245,
            "loss_sequences_upper_95": 3.293789077437224,
            "loss_tokens_lower_95": 3.239550609375,
            "loss_tokens_upper_95": 3.2618196197916665,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.8774880928409343,
            "data_time": 0.0016856578590430404,
            "batch_time": 0.015100547481711109,
            "samples_per_second": 2324257.8030497967,
            "samples_per_second_per_gpu": 290532.2253812246,
            "loss_sequences_lower_95": 1.8548654934630102,
            "loss_sequences_upper_95": 1.900150390625,
            "loss_tokens_lower_95": 1.8672432864583335,
            "loss_tokens_upper_95": 1.887957578125,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.834191818586819,
            "data_time": 0.0016080027890973188,
            "batch_time": 0.015122664019633716,
            "samples_per_second": 2304423.1001497614,
            "samples_per_second_per_gpu": 288052.8875187202,
            "loss_sequences_lower_95": 3.8248640788612565,
            "loss_sequences_upper_95": 3.843283090641361,
            "loss_tokens_lower_95": 3.8232926770833333,
            "loss_tokens_upper_95": 3.84481046875,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.642536895061896,
            "data_time": 0.01110800201930697,
            "batch_time": 0.025153844129471553,
            "samples_per_second": 2152921.6576688765,
            "samples_per_second_per_gpu": 269115.20720860956,
            "loss_sequences_lower_95": 3.59938567867124,
            "loss_sequences_upper_95": 3.688972715052163,
            "loss_tokens_lower_95": 3.6314957708333333,
            "loss_tokens_upper_95": 3.6537551875,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.222531777596757,
            "data_time": 0.010212164372205734,
            "batch_time": 0.02422195579856634,
            "samples_per_second": 2180798.339072457,
            "samples_per_second_per_gpu": 272599.79238405713,
            "loss_sequences_lower_95": 4.185000272607615,
            "loss_sequences_upper_95": 4.255254053409862,
            "loss_tokens_lower_95": 4.2105298125,
            "loss_tokens_upper_95": 4.234704708333333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4061511789460175,
            "data_time": 0.0012462606950144665,
            "batch_time": 0.014658289107272775,
            "samples_per_second": 2327139.10808575,
            "samples_per_second_per_gpu": 290892.38851071877,
            "loss_sequences_lower_95": 3.3976856927504455,
            "loss_sequences_upper_95": 3.414566385907736,
            "loss_tokens_lower_95": 3.3952147135416664,
            "loss_tokens_upper_95": 3.416994776041667,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.332469855085704,
            "data_time": 0.0025979241761041622,
            "batch_time": 0.01609685736631573,
            "samples_per_second": 2310132.1550110104,
            "samples_per_second_per_gpu": 288766.5193763763,
            "loss_sequences_lower_95": 3.3219850869836134,
            "loss_sequences_upper_95": 3.342751494331789,
            "loss_tokens_lower_95": 3.3219463854166666,
            "loss_tokens_upper_95": 3.343305890625,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.791033199302565,
            "data_time": 0.00992154321180502,
            "batch_time": 0.0241276539361524,
            "samples_per_second": 2132990.3639008757,
            "samples_per_second_per_gpu": 266623.79548760946,
            "loss_sequences_lower_95": 3.7538176066493407,
            "loss_sequences_upper_95": 3.82689748767907,
            "loss_tokens_lower_95": 3.7797776145833333,
            "loss_tokens_upper_95": 3.8022063854166666,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.193433189586321,
            "data_time": 0.009588354612251677,
            "batch_time": 0.02363718838330759,
            "samples_per_second": 2153202.869681626,
            "samples_per_second_per_gpu": 269150.35871020326,
            "loss_sequences_lower_95": 3.1290040740656027,
            "loss_sequences_upper_95": 3.2558156062046524,
            "loss_tokens_lower_95": 3.1822332447916666,
            "loss_tokens_upper_95": 3.2045689635416665,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.132287962870165,
            "data_time": 0.08176579645701818,
            "batch_time": 0.09865607534136091,
            "samples_per_second": 1147357.6084050871,
            "samples_per_second_per_gpu": 143419.7010506359,
            "loss_sequences_lower_95": 4.070530926097523,
            "loss_sequences_upper_95": 4.19324570569125,
            "loss_tokens_lower_95": 4.111529220234264,
            "loss_tokens_upper_95": 4.153900345889005,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.873358973261219,
            "data_time": 0.01393124596639113,
            "batch_time": 0.027986541390419006,
            "samples_per_second": 2138869.2616059235,
            "samples_per_second_per_gpu": 267358.65770074044,
            "loss_sequences_lower_95": 2.7746554449765397,
            "loss_sequences_upper_95": 2.9714985705673174,
            "loss_tokens_lower_95": 2.8624670833333337,
            "loss_tokens_upper_95": 2.8842158020833333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.800760077926918,
            "data_time": 0.01180023824175199,
            "batch_time": 0.025937532385190327,
            "samples_per_second": 2147218.568013516,
            "samples_per_second_per_gpu": 268402.3210016895,
            "loss_sequences_lower_95": 5.740653568962318,
            "loss_sequences_upper_95": 5.855262035681894,
            "loss_tokens_lower_95": 5.789164166666667,
            "loss_tokens_upper_95": 5.812304802083333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.624397942277252,
            "data_time": 0.03628413379192352,
            "batch_time": 0.05078079551458359,
            "samples_per_second": 1831583.6773669703,
            "samples_per_second_per_gpu": 228947.9596708713,
            "loss_sequences_lower_95": 3.5700938302962504,
            "loss_sequences_upper_95": 3.6710950069740167,
            "loss_tokens_lower_95": 3.612011224715436,
            "loss_tokens_upper_95": 3.6367403687023727,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.222777990241486,
            "data_time": 0.002091428345591169,
            "batch_time": 0.015747452006253132,
            "samples_per_second": 2261632.399720855,
            "samples_per_second_per_gpu": 282704.0499651069,
            "loss_sequences_lower_95": 5.201398650032047,
            "loss_sequences_upper_95": 5.244979778855398,
            "loss_tokens_lower_95": 5.2011557264590165,
            "loss_tokens_upper_95": 5.2447837848062955,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.2649420089774863,
            "data_time": 0.002210368396370274,
            "batch_time": 0.015844151994604972,
            "samples_per_second": 2259441.1504969657,
            "samples_per_second_per_gpu": 282430.1438121207,
            "loss_sequences_lower_95": 3.25345770420484,
            "loss_sequences_upper_95": 3.2790745956825087,
            "loss_tokens_lower_95": 3.2496286994277725,
            "loss_tokens_upper_95": 3.269193639379961,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.122339727880542,
            "data_time": 0.0031160422926461437,
            "batch_time": 0.01704568283160153,
            "samples_per_second": 2211249.1893957267,
            "samples_per_second_per_gpu": 276406.14867446583,
            "loss_sequences_lower_95": 4.359101382594769,
            "loss_sequences_upper_95": 4.6429873978691845,
            "loss_tokens_lower_95": 3.600345961866363,
            "loss_tokens_upper_95": 3.8071568337968342,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.601971467405558,
            "data_time": 0.003904260853503613,
            "batch_time": 0.017725920423548272,
            "samples_per_second": 2219357.2080077473,
            "samples_per_second_per_gpu": 277419.6510009684,
            "loss_sequences_lower_95": 4.7391890543619795,
            "loss_sequences_upper_95": 4.945679850260417,
            "loss_tokens_lower_95": 4.276260711477987,
            "loss_tokens_upper_95": 4.419608134335691,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.133806657574229,
            "data_time": 0.004692446232561253,
            "batch_time": 0.018619558210646224,
            "samples_per_second": 2199871.4988924637,
            "samples_per_second_per_gpu": 274983.93736155797,
            "loss_sequences_lower_95": 3.18055902094982,
            "loss_sequences_upper_95": 3.2417308978007724,
            "loss_tokens_lower_95": 3.036994171314894,
            "loss_tokens_upper_95": 3.068100357338754,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.269701559977098,
            "data_time": 0.02416634346757616,
            "batch_time": 0.03950160528932299,
            "samples_per_second": 1914611.3736457769,
            "samples_per_second_per_gpu": 239326.4217057221,
            "loss_sequences_lower_95": 2.251000993902033,
            "loss_sequences_upper_95": 2.360553262883967,
            "loss_tokens_lower_95": 2.2019514244054217,
            "loss_tokens_upper_95": 2.2477964781945983,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.471247406395114,
            "data_time": 0.019441617652773857,
            "batch_time": 0.03394368104636669,
            "samples_per_second": 1961751.9888507186,
            "samples_per_second_per_gpu": 245218.99860633982,
            "loss_sequences_lower_95": 3.468809864277742,
            "loss_sequences_upper_95": 3.6637964365433673,
            "loss_tokens_lower_95": 3.3335424027801284,
            "loss_tokens_upper_95": 3.4269769473066183,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.937980353037516,
            "data_time": 0.016188418253874168,
            "batch_time": 0.03016886802820059,
            "samples_per_second": 2042267.6226526373,
            "samples_per_second_per_gpu": 255283.45283157966,
            "loss_sequences_lower_95": 3.906499176025391,
            "loss_sequences_upper_95": 4.026039123535156,
            "loss_tokens_lower_95": 3.7797135889456506,
            "loss_tokens_upper_95": 4.0135638601258306,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.9827995786162775,
            "data_time": 0.001803334169225446,
            "batch_time": 0.015552037485635325,
            "samples_per_second": 2244741.7303549945,
            "samples_per_second_per_gpu": 280592.7162943743,
            "loss_sequences_lower_95": 5.993752489342921,
            "loss_sequences_upper_95": 6.071769717133754,
            "loss_tokens_lower_95": 5.84183086267858,
            "loss_tokens_upper_95": 5.921536271068254,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.598211294247045,
            "data_time": 0.002846839043918072,
            "batch_time": 0.01664691543419089,
            "samples_per_second": 2237568.5809904058,
            "samples_per_second_per_gpu": 279696.0726238007,
            "loss_sequences_lower_95": 5.118624764901621,
            "loss_sequences_upper_95": 5.421385793974905,
            "loss_tokens_lower_95": 3.8697892053034173,
            "loss_tokens_upper_95": 4.006876339336399,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.205488209431489,
            "data_time": 0.005026849943238335,
            "batch_time": 0.018778164644499083,
            "samples_per_second": 2216214.526610737,
            "samples_per_second_per_gpu": 277026.81582634215,
            "loss_sequences_lower_95": 4.635757737924622,
            "loss_sequences_upper_95": 4.973083933547088,
            "loss_tokens_lower_95": 3.780488003942804,
            "loss_tokens_upper_95": 3.937919356231345,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.0616624017828675,
            "data_time": 0.022918865084648132,
            "batch_time": 0.03748590179852077,
            "samples_per_second": 1987761.9551827689,
            "samples_per_second_per_gpu": 248470.2443978461,
            "loss_sequences_lower_95": 5.953275776344892,
            "loss_sequences_upper_95": 6.169863515357449,
            "loss_tokens_lower_95": 5.954327392578125,
            "loss_tokens_upper_95": 6.166524488196525,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5639354586601257,
            "data_time": 0.04875590251042293,
            "batch_time": 0.06355071984804593,
            "samples_per_second": 1686151.914634621,
            "samples_per_second_per_gpu": 210768.98932932763,
            "loss_sequences_lower_95": 3.428573028564453,
            "loss_sequences_upper_95": 3.8085927734375,
            "loss_tokens_lower_95": 3.2604586142333547,
            "loss_tokens_upper_95": 3.7143284997274706,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.15770383867051,
            "data_time": 0.003337825002845811,
            "batch_time": 0.01728597106621553,
            "samples_per_second": 2213865.6715864777,
            "samples_per_second_per_gpu": 276733.2089483097,
            "loss_sequences_lower_95": 5.114695466431039,
            "loss_sequences_upper_95": 5.202235173642209,
            "loss_tokens_lower_95": 5.112945625359838,
            "loss_tokens_upper_95": 5.202238509647677,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.924603433320017,
            "data_time": 0.004964762371865999,
            "batch_time": 0.018748198208956696,
            "samples_per_second": 2219000.0582188712,
            "samples_per_second_per_gpu": 277375.0072773589,
            "loss_sequences_lower_95": 4.8776736297732395,
            "loss_sequences_upper_95": 4.971242103920327,
            "loss_tokens_lower_95": 4.875709027564496,
            "loss_tokens_upper_95": 4.971771765106726,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.765954000905756,
            "data_time": 0.0035960133595806477,
            "batch_time": 0.01732767065966876,
            "samples_per_second": 2233212.820225981,
            "samples_per_second_per_gpu": 279151.60252824763,
            "loss_sequences_lower_95": 3.923297178913901,
            "loss_sequences_upper_95": 4.048752299278682,
            "loss_tokens_lower_95": 3.5807466531546472,
            "loss_tokens_upper_95": 3.6378510041709426,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.635270403385162,
            "data_time": 0.0108041288331151,
            "batch_time": 0.025035470724105835,
            "samples_per_second": 2087804.7742358954,
            "samples_per_second_per_gpu": 260975.59677948692,
            "loss_sequences_lower_95": 5.840765405273438,
            "loss_sequences_upper_95": 6.401458422851562,
            "loss_tokens_lower_95": 4.993148163467902,
            "loss_tokens_upper_95": 5.35977824404885,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.055525168776512,
            "data_time": 0.15111194550991058,
            "batch_time": 0.16833411157131195,
            "samples_per_second": 920082.4542657557,
            "samples_per_second_per_gpu": 115010.30678321946,
            "loss_sequences_lower_95": 3.817052811384201,
            "loss_sequences_upper_95": 4.378609251976013,
            "loss_tokens_lower_95": 3.61380738006241,
            "loss_tokens_upper_95": 4.345773008500022,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.483528868905429,
            "data_time": 0.028571948092034522,
            "batch_time": 0.04277237425459192,
            "samples_per_second": 1844417.3008824156,
            "samples_per_second_per_gpu": 230552.16261030195,
            "loss_sequences_lower_95": 6.06700246525907,
            "loss_sequences_upper_95": 7.0354563219793915,
            "loss_tokens_lower_95": 3.817286387962849,
            "loss_tokens_upper_95": 4.309828380425592,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.159314209636823,
            "data_time": 0.0030734334141016006,
            "batch_time": 0.0169299625688129,
            "samples_per_second": 2214720.7470258307,
            "samples_per_second_per_gpu": 276840.09337822883,
            "loss_sequences_lower_95": 3.1363003565370384,
            "loss_sequences_upper_95": 3.1823532244481076,
            "loss_tokens_lower_95": 3.135743638051551,
            "loss_tokens_upper_95": 3.1818333937018757,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.0718855134155523,
            "data_time": 0.002720332673042839,
            "batch_time": 0.016477876085156287,
            "samples_per_second": 2246721.1045311405,
            "samples_per_second_per_gpu": 280840.13806639257,
            "loss_sequences_lower_95": 3.044261837764409,
            "loss_sequences_upper_95": 3.203893827397269,
            "loss_tokens_lower_95": 2.89933143473871,
            "loss_tokens_upper_95": 3.0562081587649246,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.3619825062734305,
            "data_time": 0.018247692121399775,
            "batch_time": 0.03246174918280707,
            "samples_per_second": 1974182.1149363837,
            "samples_per_second_per_gpu": 246772.76436704796,
            "loss_sequences_lower_95": 3.2221772413987377,
            "loss_sequences_upper_95": 3.6250517010252117,
            "loss_tokens_lower_95": 3.104832883370794,
            "loss_tokens_upper_95": 3.3984733701524195,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7828694876480404,
            "data_time": 0.004604973644018173,
            "batch_time": 0.01857149489223957,
            "samples_per_second": 2179039.580980438,
            "samples_per_second_per_gpu": 272379.9476225547,
            "loss_sequences_lower_95": 3.824714334272642,
            "loss_sequences_upper_95": 3.9760068632997485,
            "loss_tokens_lower_95": 3.6372174633473064,
            "loss_tokens_upper_95": 3.783687368018785,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.886752064635114,
            "data_time": 0.029891805989401683,
            "batch_time": 0.04537318150202433,
            "samples_per_second": 1832940.9375658217,
            "samples_per_second_per_gpu": 229117.6171957277,
            "loss_sequences_lower_95": 2.740619268649962,
            "loss_sequences_upper_95": 3.1962443375005956,
            "loss_tokens_lower_95": 2.604717360429624,
            "loss_tokens_upper_95": 2.9641089384940864,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.772449987844172,
            "data_time": 0.002362803702253463,
            "batch_time": 0.01627627106794966,
            "samples_per_second": 2215107.676278476,
            "samples_per_second_per_gpu": 276888.4595348095,
            "loss_sequences_lower_95": 3.7583865112866324,
            "loss_sequences_upper_95": 3.786049836920509,
            "loss_tokens_lower_95": 3.7583324575071266,
            "loss_tokens_upper_95": 3.7865000734521903,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.103945949123901,
            "data_time": 0.04722193804654208,
            "batch_time": 0.061530841480601915,
            "samples_per_second": 1751190.3377187175,
            "samples_per_second_per_gpu": 218898.7922148397,
            "loss_sequences_lower_95": 1.0556544442778653,
            "loss_sequences_upper_95": 1.2068746788987834,
            "loss_tokens_lower_95": 0.9426419542170108,
            "loss_tokens_upper_95": 1.1646793049213953,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.648641915341344,
            "data_time": 0.0017266033930830463,
            "batch_time": 0.015607179435280722,
            "samples_per_second": 2225282.431070792,
            "samples_per_second_per_gpu": 278160.303883849,
            "loss_sequences_lower_95": 5.0271921371363995,
            "loss_sequences_upper_95": 5.074430031446541,
            "loss_tokens_lower_95": 4.06352171179884,
            "loss_tokens_upper_95": 4.10978829787234,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.281284095525741,
            "data_time": 0.005619853734970093,
            "batch_time": 0.02002934567512028,
            "samples_per_second": 2190528.3852124275,
            "samples_per_second_per_gpu": 273816.04815155343,
            "loss_sequences_lower_95": 5.285119616699219,
            "loss_sequences_upper_95": 5.525043371582031,
            "loss_tokens_lower_95": 5.031399278869393,
            "loss_tokens_upper_95": 5.262710625574004,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.030532925025277,
            "data_time": 0.022055492562762763,
            "batch_time": 0.03730883234638279,
            "samples_per_second": 1881581.7826446733,
            "samples_per_second_per_gpu": 235197.72283058416,
            "loss_sequences_lower_95": 5.843899602475374,
            "loss_sequences_upper_95": 6.212882371985393,
            "loss_tokens_lower_95": 5.84688940960428,
            "loss_tokens_upper_95": 6.213954945439878,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.697214344776038,
            "data_time": 0.004578378186168441,
            "batch_time": 0.018240309264286454,
            "samples_per_second": 2242694.9832496773,
            "samples_per_second_per_gpu": 280336.87290620967,
            "loss_sequences_lower_95": 6.61514219341856,
            "loss_sequences_upper_95": 6.77730927438447,
            "loss_tokens_lower_95": 6.617310809511127,
            "loss_tokens_upper_95": 6.776761807528409,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 0.981669172445933,
            "data_time": 0.004104162784332925,
            "batch_time": 0.01804284878233646,
            "samples_per_second": 2212673.644627859,
            "samples_per_second_per_gpu": 276584.20557848236,
            "loss_sequences_lower_95": 1.0132411661783853,
            "loss_sequences_upper_95": 1.0618386169433593,
            "loss_tokens_lower_95": 0.9192471480779812,
            "loss_tokens_upper_95": 0.9714650664953481,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.196309099878584,
            "data_time": 0.022262945771217346,
            "batch_time": 0.03741315220083509,
            "samples_per_second": 1870368.6208035126,
            "samples_per_second_per_gpu": 233796.07760043908,
            "loss_sequences_lower_95": 5.845385189964658,
            "loss_sequences_upper_95": 6.538872302827381,
            "loss_tokens_lower_95": 5.844353928338913,
            "loss_tokens_upper_95": 6.547416774204799,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.3061085902154446,
            "data_time": 0.144907146692276,
            "batch_time": 0.16210825741291046,
            "samples_per_second": 804282.6859466594,
            "samples_per_second_per_gpu": 100535.33574333243,
            "loss_sequences_lower_95": 2.0592335164546967,
            "loss_sequences_upper_95": 3.0779263257980345,
            "loss_tokens_lower_95": 1.752727635963676,
            "loss_tokens_upper_95": 2.2594317186493234,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.531399389505387,
            "data_time": 0.005646207029857333,
            "batch_time": 0.01974721087349786,
            "samples_per_second": 2171403.7706225505,
            "samples_per_second_per_gpu": 271425.4713278188,
            "loss_sequences_lower_95": 7.465870275878906,
            "loss_sequences_upper_95": 7.832107763671875,
            "loss_tokens_lower_95": 7.218478161187341,
            "loss_tokens_upper_95": 7.539558258314826,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.689411122798919,
            "data_time": 0.005696973630360195,
            "batch_time": 0.019732133263633364,
            "samples_per_second": 2173045.085066208,
            "samples_per_second_per_gpu": 271630.635633276,
            "loss_sequences_lower_95": 6.75376796875,
            "loss_sequences_upper_95": 6.972474304199219,
            "loss_tokens_lower_95": 6.465780824557148,
            "loss_tokens_upper_95": 6.6582104113201055,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.097292412538238,
            "data_time": 0.004151066968273559,
            "batch_time": 0.01789045445496422,
            "samples_per_second": 2227001.1987237995,
            "samples_per_second_per_gpu": 278375.14984047494,
            "loss_sequences_lower_95": 4.063801540434859,
            "loss_sequences_upper_95": 4.1303254329874495,
            "loss_tokens_lower_95": 4.064771088904836,
            "loss_tokens_upper_95": 4.130292687206671,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.205797583276775,
            "data_time": 0.008398456515862502,
            "batch_time": 0.02224590122879812,
            "samples_per_second": 2161959.3322376153,
            "samples_per_second_per_gpu": 270244.9165297019,
            "loss_sequences_lower_95": 5.099134182087654,
            "loss_sequences_upper_95": 5.308753941502256,
            "loss_tokens_lower_95": 5.094910877421155,
            "loss_tokens_upper_95": 5.309118202314948,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.561993608474731,
            "data_time": 0.00561941663424174,
            "batch_time": 0.019558006335818577,
            "samples_per_second": 2184267.128671891,
            "samples_per_second_per_gpu": 273033.3910839864,
            "loss_sequences_lower_95": 7.444018432617188,
            "loss_sequences_upper_95": 7.6840815917968746,
            "loss_tokens_lower_95": 7.444567260742187,
            "loss_tokens_upper_95": 7.68606181640625,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5643580078796724,
            "data_time": 0.0023099060996042984,
            "batch_time": 0.015966832389903655,
            "samples_per_second": 2259113.639249541,
            "samples_per_second_per_gpu": 282389.2049061926,
            "loss_sequences_lower_95": 4.154815497132214,
            "loss_sequences_upper_95": 4.261936642620624,
            "loss_tokens_lower_95": 2.8372033862720216,
            "loss_tokens_upper_95": 2.909080644075524,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.019977336499228,
            "data_time": 0.018333780765533447,
            "batch_time": 0.033263213293893,
            "samples_per_second": 1941614.867081076,
            "samples_per_second_per_gpu": 242701.8583851345,
            "loss_sequences_lower_95": 5.805228424072266,
            "loss_sequences_upper_95": 6.23077901583999,
            "loss_tokens_lower_95": 5.811388180860833,
            "loss_tokens_upper_95": 6.2282457380152465,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.1695314921584785,
            "data_time": 0.00992575753480196,
            "batch_time": 0.02393159084022045,
            "samples_per_second": 2152655.117932017,
            "samples_per_second_per_gpu": 269081.8897415021,
            "loss_sequences_lower_95": 6.0203992058249085,
            "loss_sequences_upper_95": 6.314546772077971,
            "loss_tokens_lower_95": 6.024139500038296,
            "loss_tokens_upper_95": 6.313673634248621,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.043043871078518,
            "data_time": 0.002297439465495341,
            "batch_time": 0.016028546726563536,
            "samples_per_second": 2245433.788592422,
            "samples_per_second_per_gpu": 280679.22357405274,
            "loss_sequences_lower_95": 4.539903178042403,
            "loss_sequences_upper_95": 4.650408570849774,
            "loss_tokens_lower_95": 3.322914845554405,
            "loss_tokens_upper_95": 3.404853727212945,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.449604411604543,
            "data_time": 0.026370331645011902,
            "batch_time": 0.04164855182170868,
            "samples_per_second": 1944098.258906409,
            "samples_per_second_per_gpu": 243012.2823633011,
            "loss_sequences_lower_95": 4.325425364983776,
            "loss_sequences_upper_95": 4.571702938988095,
            "loss_tokens_lower_95": 4.326312021729807,
            "loss_tokens_upper_95": 4.571817573668465,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.270449694580988,
            "data_time": 0.003971519633236094,
            "batch_time": 0.0179807863247118,
            "samples_per_second": 2191231.592134082,
            "samples_per_second_per_gpu": 273903.9490167603,
            "loss_sequences_lower_95": 6.229081523556957,
            "loss_sequences_upper_95": 6.312128308964067,
            "loss_tokens_lower_95": 6.229245000716743,
            "loss_tokens_upper_95": 6.311429648676414,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.2297862177913625,
            "data_time": 0.02349286729639227,
            "batch_time": 0.03805770007046786,
            "samples_per_second": 1915281.5875223225,
            "samples_per_second_per_gpu": 239410.19844029032,
            "loss_sequences_lower_95": 6.014540588971481,
            "loss_sequences_upper_95": 6.444881579945388,
            "loss_tokens_lower_95": 6.014396726737902,
            "loss_tokens_upper_95": 6.4443757288664285,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4082294503847756,
            "data_time": 0.07568836212158203,
            "batch_time": 0.09112751483917236,
            "samples_per_second": 1388558.5695098294,
            "samples_per_second_per_gpu": 173569.82118872867,
            "loss_sequences_lower_95": 3.1529440180460613,
            "loss_sequences_upper_95": 3.808831183115641,
            "loss_tokens_lower_95": 2.830333773295085,
            "loss_tokens_upper_95": 3.7347085211012097,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.9550591349601745,
            "data_time": 0.07708807289600372,
            "batch_time": 0.09339745342731476,
            "samples_per_second": 1354483.0764738517,
            "samples_per_second_per_gpu": 169310.38455923146,
            "loss_sequences_lower_95": 2.813292007446289,
            "loss_sequences_upper_95": 3.4549816767374675,
            "loss_tokens_lower_95": 2.2989095452126493,
            "loss_tokens_upper_95": 3.235596311762092,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.381539616633936,
            "data_time": 0.003680808744829076,
            "batch_time": 0.01751071190118369,
            "samples_per_second": 2221089.7661296,
            "samples_per_second_per_gpu": 277636.2207662,
            "loss_sequences_lower_95": 4.357426592415316,
            "loss_sequences_upper_95": 4.404911404639175,
            "loss_tokens_lower_95": 4.358223619868372,
            "loss_tokens_upper_95": 4.405671397505523,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 0.6495676981158792,
            "data_time": 0.0016444108858541535,
            "batch_time": 0.015372708521343164,
            "samples_per_second": 2246739.6050113253,
            "samples_per_second_per_gpu": 280842.45062641567,
            "loss_sequences_lower_95": 0.7471161654585242,
            "loss_sequences_upper_95": 0.7674077257627111,
            "loss_tokens_lower_95": 0.5519922800553453,
            "loss_tokens_upper_95": 0.5623752450644516,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.8599346594547663,
            "data_time": 0.03875276446342468,
            "batch_time": 0.05469144135713577,
            "samples_per_second": 1792183.2756110395,
            "samples_per_second_per_gpu": 224022.90945137994,
            "loss_sequences_lower_95": 1.776235358170637,
            "loss_sequences_upper_95": 2.0109201716625784,
            "loss_tokens_lower_95": 1.6443807655808615,
            "loss_tokens_upper_95": 1.7757452106218143,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8957384019284635,
            "data_time": 0.11169583456856864,
            "batch_time": 0.12834899766104563,
            "samples_per_second": 973125.068369749,
            "samples_per_second_per_gpu": 121640.63354621863,
            "loss_sequences_lower_95": 3.434903742815997,
            "loss_sequences_upper_95": 4.4082406739930855,
            "loss_tokens_lower_95": 3.3123277546447003,
            "loss_tokens_upper_95": 4.4172990869592725,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.6963474034536175,
            "data_time": 0.03048809085573469,
            "batch_time": 0.04611934366680327,
            "samples_per_second": 1808698.577270266,
            "samples_per_second_per_gpu": 226087.32215878324,
            "loss_sequences_lower_95": 1.6432905755391933,
            "loss_sequences_upper_95": 1.8522014245754335,
            "loss_tokens_lower_95": 1.5272579537689235,
            "loss_tokens_upper_95": 1.6303983496175882,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.749950242478673,
            "data_time": 0.030420297668093725,
            "batch_time": 0.045607748485746835,
            "samples_per_second": 1855788.9081960768,
            "samples_per_second_per_gpu": 231973.6135245096,
            "loss_sequences_lower_95": 1.7252222572884908,
            "loss_sequences_upper_95": 1.916808281875238,
            "loss_tokens_lower_95": 1.574321176254471,
            "loss_tokens_upper_95": 1.6631037835011213,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.6855230742111438,
            "data_time": 0.03052343357177008,
            "batch_time": 0.045422207741510304,
            "samples_per_second": 1954656.1797699907,
            "samples_per_second_per_gpu": 244332.02247124884,
            "loss_sequences_lower_95": 1.5544133977192205,
            "loss_sequences_upper_95": 1.7869375368443932,
            "loss_tokens_lower_95": 1.5861507439901255,
            "loss_tokens_upper_95": 1.7261946148951892,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.8221125275623509,
            "data_time": 0.0308983808472043,
            "batch_time": 0.04533411775316511,
            "samples_per_second": 1920834.5390228725,
            "samples_per_second_per_gpu": 240104.31737785906,
            "loss_sequences_lower_95": 1.792752968392721,
            "loss_sequences_upper_95": 1.972920348004597,
            "loss_tokens_lower_95": 1.6481981509199766,
            "loss_tokens_upper_95": 1.732075029890114,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.5029193644197831,
            "data_time": 0.03155559080618399,
            "batch_time": 0.04625245376869484,
            "samples_per_second": 1947330.1227531687,
            "samples_per_second_per_gpu": 243416.2653441461,
            "loss_sequences_lower_95": 1.4522353367775864,
            "loss_sequences_upper_95": 1.5671063061826718,
            "loss_tokens_lower_95": 1.436010569382495,
            "loss_tokens_upper_95": 1.500595257545152,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.4020204115204695,
            "data_time": 0.030412486621311734,
            "batch_time": 0.04557677393867856,
            "samples_per_second": 1843097.5756335792,
            "samples_per_second_per_gpu": 230387.1969541974,
            "loss_sequences_lower_95": 1.3867676734924317,
            "loss_sequences_upper_95": 1.51221867072873,
            "loss_tokens_lower_95": 1.254901669751168,
            "loss_tokens_upper_95": 1.3107130613321278,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-16.0/params.txt",
    "uuid": "1f714df6-e564-4cc9-aaa2-ac5c8dfe293c",
    "creation_date": "2023_12_14-06_34_53"
}