{
    "name": "rpj-d=96_l=8_h=4-32.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 6764359680,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 32.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "1352871936",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=96_l=8_h=4-32.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 4.220440765221913,
            "data_time": 0.13419055938720703,
            "batch_time": 1.3026804029941559,
            "samples_per_second": 371366.1799320887,
            "samples_per_second_per_gpu": 46420.77249151109,
            "loss_sequences_lower_95": 4.1460147412618,
            "loss_sequences_upper_95": 4.293768768310547,
            "loss_tokens_lower_95": 4.206672388712565,
            "loss_tokens_upper_95": 4.2341765085856125,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.492098042798854,
            "data_time": 0.01919414248989139,
            "batch_time": 0.06399968110163604,
            "samples_per_second": 4683851.127821855,
            "samples_per_second_per_gpu": 585481.3909777319,
            "loss_sequences_lower_95": 4.489699492788151,
            "loss_sequences_upper_95": 4.4944859737320435,
            "loss_tokens_lower_95": 4.480283322916667,
            "loss_tokens_upper_95": 4.503668489583333,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.603913103804296,
            "data_time": 0.08986596763134003,
            "batch_time": 0.13502121716737747,
            "samples_per_second": 4139743.4917960702,
            "samples_per_second_per_gpu": 517467.9364745088,
            "loss_sequences_lower_95": 3.5741473700075734,
            "loss_sequences_upper_95": 3.6336770754444356,
            "loss_tokens_lower_95": 3.59139109375,
            "loss_tokens_upper_95": 3.6167898333333333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.3623058712359555,
            "data_time": 0.013333079846281754,
            "batch_time": 0.057060258953194866,
            "samples_per_second": 5397957.462010346,
            "samples_per_second_per_gpu": 674744.6827512933,
            "loss_sequences_lower_95": 4.350540693460052,
            "loss_sequences_upper_95": 4.3743138893363405,
            "loss_tokens_lower_95": 4.3510564687499995,
            "loss_tokens_upper_95": 4.373852260416666,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.480774274425934,
            "data_time": 0.09218097478151321,
            "batch_time": 0.1364217773079872,
            "samples_per_second": 4229375.67848571,
            "samples_per_second_per_gpu": 528671.9598107138,
            "loss_sequences_lower_95": 4.448678657236507,
            "loss_sequences_upper_95": 4.5131420873575925,
            "loss_tokens_lower_95": 4.4692032500000005,
            "loss_tokens_upper_95": 4.492309625,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.286386678204255,
            "data_time": 0.03447934736808141,
            "batch_time": 0.07721680402755737,
            "samples_per_second": 4966432.956846583,
            "samples_per_second_per_gpu": 620804.1196058228,
            "loss_sequences_lower_95": 4.244518213122982,
            "loss_sequences_upper_95": 4.327451231623895,
            "loss_tokens_lower_95": 4.27429325,
            "loss_tokens_upper_95": 4.2981819479166665,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.9494086995416757,
            "data_time": 0.013327045738697052,
            "batch_time": 0.05585234761238098,
            "samples_per_second": 5168858.602278191,
            "samples_per_second_per_gpu": 646107.3252847738,
            "loss_sequences_lower_95": 2.921940065967793,
            "loss_sequences_upper_95": 2.976422682158801,
            "loss_tokens_lower_95": 2.9370101770833332,
            "loss_tokens_upper_95": 2.9619445625,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.637328761734888,
            "data_time": 0.014538092048544633,
            "batch_time": 0.05793387482040807,
            "samples_per_second": 5256180.706680626,
            "samples_per_second_per_gpu": 657022.5883350782,
            "loss_sequences_lower_95": 4.629285780022905,
            "loss_sequences_upper_95": 4.645210446662303,
            "loss_tokens_lower_95": 4.6260991562500005,
            "loss_tokens_upper_95": 4.648817125,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.61219606457687,
            "data_time": 0.09344089031219482,
            "batch_time": 0.13834842294454575,
            "samples_per_second": 4108489.130524041,
            "samples_per_second_per_gpu": 513561.14131550514,
            "loss_sequences_lower_95": 4.571476832444106,
            "loss_sequences_upper_95": 4.652812294068376,
            "loss_tokens_lower_95": 4.600106072916667,
            "loss_tokens_upper_95": 4.624281666666667,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.148532219083884,
            "data_time": 0.0940510481595993,
            "batch_time": 0.13932524621486664,
            "samples_per_second": 4120611.5659305393,
            "samples_per_second_per_gpu": 515076.4457413174,
            "loss_sequences_lower_95": 5.115962575075655,
            "loss_sequences_upper_95": 5.176672978457726,
            "loss_tokens_lower_95": 5.13627234375,
            "loss_tokens_upper_95": 5.160610031249999,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.498807880508258,
            "data_time": 0.010788522917648842,
            "batch_time": 0.05382858833362316,
            "samples_per_second": 5392929.061396814,
            "samples_per_second_per_gpu": 674116.1326746017,
            "loss_sequences_lower_95": 4.491534901436379,
            "loss_sequences_upper_95": 4.506301899539195,
            "loss_tokens_lower_95": 4.486689354166666,
            "loss_tokens_upper_95": 4.51091478125,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.427842844629824,
            "data_time": 0.025992608070373534,
            "batch_time": 0.06887025088071823,
            "samples_per_second": 4938547.716706135,
            "samples_per_second_per_gpu": 617318.4645882669,
            "loss_sequences_lower_95": 4.417742505564088,
            "loss_sequences_upper_95": 4.438039204042586,
            "loss_tokens_lower_95": 4.416031062499999,
            "loss_tokens_upper_95": 4.439575447916667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.700917752712783,
            "data_time": 0.1207430511713028,
            "batch_time": 0.21696393936872482,
            "samples_per_second": 4215905.287269247,
            "samples_per_second_per_gpu": 526988.1609086558,
            "loss_sequences_lower_95": 4.66793337932223,
            "loss_sequences_upper_95": 4.733461366850754,
            "loss_tokens_lower_95": 4.689436447916666,
            "loss_tokens_upper_95": 4.712714656249999,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.278041674503241,
            "data_time": 0.08678317815065384,
            "batch_time": 0.13092589378356934,
            "samples_per_second": 4193285.6536015915,
            "samples_per_second_per_gpu": 524160.70670019893,
            "loss_sequences_lower_95": 4.216046428486189,
            "loss_sequences_upper_95": 4.336410155752769,
            "loss_tokens_lower_95": 4.26579384375,
            "loss_tokens_upper_95": 4.290556739583333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.374213077805259,
            "data_time": 0.15102289617061615,
            "batch_time": 0.17210224270820618,
            "samples_per_second": 1141036.1067805735,
            "samples_per_second_per_gpu": 142629.5133475717,
            "loss_sequences_lower_95": 5.30065987326882,
            "loss_sequences_upper_95": 5.4447699720209295,
            "loss_tokens_lower_95": 5.350810484452682,
            "loss_tokens_upper_95": 5.398492154208096,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.9271177300211293,
            "data_time": 0.09360112249851227,
            "batch_time": 0.12951187789440155,
            "samples_per_second": 3134539.109797218,
            "samples_per_second_per_gpu": 391817.38872465224,
            "loss_sequences_lower_95": 3.81576278286147,
            "loss_sequences_upper_95": 4.036849272702942,
            "loss_tokens_lower_95": 3.915176489583333,
            "loss_tokens_upper_95": 3.9393225312499998,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.2986609162001,
            "data_time": 0.09174055606126785,
            "batch_time": 0.12859363108873367,
            "samples_per_second": 3748900.083705313,
            "samples_per_second_per_gpu": 468612.51046316413,
            "loss_sequences_lower_95": 6.246278755771768,
            "loss_sequences_upper_95": 6.347829547972666,
            "loss_tokens_lower_95": 6.2869339375,
            "loss_tokens_upper_95": 6.310383427083333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.027355448144381,
            "data_time": 0.15387128293514252,
            "batch_time": 0.18264049291610718,
            "samples_per_second": 2317094.2642883616,
            "samples_per_second_per_gpu": 289636.7830360452,
            "loss_sequences_lower_95": 4.970594512439165,
            "loss_sequences_upper_95": 5.079245758056641,
            "loss_tokens_lower_95": 5.0139149775270555,
            "loss_tokens_upper_95": 5.040451324963179,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.048656761909444,
            "data_time": 0.02755433185534044,
            "batch_time": 0.07198164517229254,
            "samples_per_second": 4530060.126986261,
            "samples_per_second_per_gpu": 566257.5158732827,
            "loss_sequences_lower_95": 5.02889179837719,
            "loss_sequences_upper_95": 5.067877199873593,
            "loss_tokens_lower_95": 5.028837399631462,
            "loss_tokens_upper_95": 5.0681853017955065,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.325247468950264,
            "data_time": 0.028372669592499733,
            "batch_time": 0.07228471264243126,
            "samples_per_second": 4468191.8309437465,
            "samples_per_second_per_gpu": 558523.9788679683,
            "loss_sequences_lower_95": 4.305794841353815,
            "loss_sequences_upper_95": 4.332337356929023,
            "loss_tokens_lower_95": 4.313697929368042,
            "loss_tokens_upper_95": 4.335441357425732,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.166242916853179,
            "data_time": 0.05601454277833303,
            "batch_time": 0.09793218639161852,
            "samples_per_second": 4336248.536783483,
            "samples_per_second_per_gpu": 542031.0670979354,
            "loss_sequences_lower_95": 6.6152485944327175,
            "loss_sequences_upper_95": 6.885694281965931,
            "loss_tokens_lower_95": 6.0318790533044435,
            "loss_tokens_upper_95": 6.236219881760782,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.24919525273641,
            "data_time": 0.04549402371048927,
            "batch_time": 0.08900834619998932,
            "samples_per_second": 4497875.741392132,
            "samples_per_second_per_gpu": 562234.4676740165,
            "loss_sequences_lower_95": 6.679634928385417,
            "loss_sequences_upper_95": 6.880525439453125,
            "loss_tokens_lower_95": 6.1456907920597486,
            "loss_tokens_upper_95": 6.284621769359276,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.3910695174320615,
            "data_time": 0.0659908081094424,
            "batch_time": 0.10567811131477356,
            "samples_per_second": 4032180.216907603,
            "samples_per_second_per_gpu": 504022.5271134504,
            "loss_sequences_lower_95": 4.514449515642769,
            "loss_sequences_upper_95": 4.587479158275682,
            "loss_tokens_lower_95": 4.363389439752525,
            "loss_tokens_upper_95": 4.399171511510664,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.120237369970842,
            "data_time": 0.33913809061050415,
            "batch_time": 0.38141000270843506,
            "samples_per_second": 2482862.8109245175,
            "samples_per_second_per_gpu": 310357.8513655647,
            "loss_sequences_lower_95": 3.129646911621094,
            "loss_sequences_upper_95": 3.2580614263361154,
            "loss_tokens_lower_95": 3.089500261387685,
            "loss_tokens_upper_95": 3.1402903371586213,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.448557633769756,
            "data_time": 0.34468917548656464,
            "batch_time": 0.39102479815483093,
            "samples_per_second": 2768872.4345855466,
            "samples_per_second_per_gpu": 346109.0543231933,
            "loss_sequences_lower_95": 4.531636663942921,
            "loss_sequences_upper_95": 4.746526003467793,
            "loss_tokens_lower_95": 4.390487785365183,
            "loss_tokens_upper_95": 4.496433093898104,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.335057676633199,
            "data_time": 0.20032191276550293,
            "batch_time": 0.23271121829748154,
            "samples_per_second": 2539999.1687743706,
            "samples_per_second_per_gpu": 317499.8960967963,
            "loss_sequences_lower_95": 4.294305318196614,
            "loss_sequences_upper_95": 4.404192118326823,
            "loss_tokens_lower_95": 4.231552474843556,
            "loss_tokens_upper_95": 4.439117582530998,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.7655472074236185,
            "data_time": 0.029615647345781326,
            "batch_time": 0.07363411765545606,
            "samples_per_second": 4425004.919654131,
            "samples_per_second_per_gpu": 553125.6149567664,
            "loss_sequences_lower_95": 7.860229430915063,
            "loss_sequences_upper_95": 7.929562994408493,
            "loss_tokens_lower_95": 7.709108679195136,
            "loss_tokens_upper_95": 7.783646980565244,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.372637987638563,
            "data_time": 0.04733304083347321,
            "batch_time": 0.0895240381360054,
            "samples_per_second": 4390311.248593914,
            "samples_per_second_per_gpu": 548788.9060742393,
            "loss_sequences_lower_95": 6.705115989402488,
            "loss_sequences_upper_95": 7.024776430483217,
            "loss_tokens_lower_95": 5.220664242112994,
            "loss_tokens_upper_95": 5.371122399381244,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.044068775893072,
            "data_time": 0.08445167541503906,
            "batch_time": 0.12662798464298247,
            "samples_per_second": 4245869.320554456,
            "samples_per_second_per_gpu": 530733.665069307,
            "loss_sequences_lower_95": 5.89824659328005,
            "loss_sequences_upper_95": 6.258011041322258,
            "loss_tokens_lower_95": 4.933570525522346,
            "loss_tokens_upper_95": 5.104300231047313,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.826312369951919,
            "data_time": 0.3245445638895035,
            "batch_time": 0.365996852517128,
            "samples_per_second": 2458013.4934578384,
            "samples_per_second_per_gpu": 307251.6866822298,
            "loss_sequences_lower_95": 6.743754654279038,
            "loss_sequences_upper_95": 6.908148346644014,
            "loss_tokens_lower_95": 6.743162919832692,
            "loss_tokens_upper_95": 6.909452715311964,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.460034489631653,
            "data_time": 0.3027117997407913,
            "batch_time": 0.32766853272914886,
            "samples_per_second": 1301545.4441232611,
            "samples_per_second_per_gpu": 162693.18051540764,
            "loss_sequences_lower_95": 4.377650047302246,
            "loss_sequences_upper_95": 4.792538352966309,
            "loss_tokens_lower_95": 4.207181031802047,
            "loss_tokens_upper_95": 4.696914438100961,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.869954745869554,
            "data_time": 0.05102437362074852,
            "batch_time": 0.09445449523627758,
            "samples_per_second": 4477802.928741965,
            "samples_per_second_per_gpu": 559725.3660927457,
            "loss_sequences_lower_95": 4.825404143943514,
            "loss_sequences_upper_95": 4.9149718301201055,
            "loss_tokens_lower_95": 4.8238419687979786,
            "loss_tokens_upper_95": 4.915770972104497,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.368261643455812,
            "data_time": 0.07116041481494903,
            "batch_time": 0.11487947702407837,
            "samples_per_second": 4423953.129070692,
            "samples_per_second_per_gpu": 552994.1411338365,
            "loss_sequences_lower_95": 5.3180991582847055,
            "loss_sequences_upper_95": 5.416516313232623,
            "loss_tokens_lower_95": 5.316670965621161,
            "loss_tokens_upper_95": 5.4186783174332005,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.865393202245171,
            "data_time": 0.055904824286699295,
            "batch_time": 0.09709158912301064,
            "samples_per_second": 4316032.726352482,
            "samples_per_second_per_gpu": 539504.0907940603,
            "loss_sequences_lower_95": 5.120891663195389,
            "loss_sequences_upper_95": 5.239255474700761,
            "loss_tokens_lower_95": 4.8264493576966885,
            "loss_tokens_upper_95": 4.887353876192146,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.283051129341126,
            "data_time": 0.18429924547672272,
            "batch_time": 0.22997979819774628,
            "samples_per_second": 3652688.118224801,
            "samples_per_second_per_gpu": 456586.0147781001,
            "loss_sequences_lower_95": 6.896156481933594,
            "loss_sequences_upper_95": 7.4253098632812495,
            "loss_tokens_lower_95": 6.04882395635134,
            "loss_tokens_upper_95": 6.406520590539171,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.864914506673813,
            "data_time": 0.15272943675518036,
            "batch_time": 0.17029137909412384,
            "samples_per_second": 815274.5580688146,
            "samples_per_second_per_gpu": 101909.31975860182,
            "loss_sequences_lower_95": 4.583208954334259,
            "loss_sequences_upper_95": 5.24386568069458,
            "loss_tokens_lower_95": 4.338561099699174,
            "loss_tokens_upper_95": 5.216041336936512,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.211451070062045,
            "data_time": 0.3394251763820648,
            "batch_time": 0.37496350705623627,
            "samples_per_second": 2187427.21984325,
            "samples_per_second_per_gpu": 273428.4024804062,
            "loss_sequences_lower_95": 6.9281825975440015,
            "loss_sequences_upper_95": 7.819843213311557,
            "loss_tokens_lower_95": 4.851672996340368,
            "loss_tokens_upper_95": 5.340495002686373,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.365185304935867,
            "data_time": 0.05483334925439623,
            "batch_time": 0.09907172289159563,
            "samples_per_second": 4423450.365785433,
            "samples_per_second_per_gpu": 552931.2957231791,
            "loss_sequences_lower_95": 4.338125976989133,
            "loss_sequences_upper_95": 4.392086313363505,
            "loss_tokens_lower_95": 4.337902432062991,
            "loss_tokens_upper_95": 4.3921626486388705,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.29218168020127,
            "data_time": 0.0373813822155907,
            "batch_time": 0.08115782411325545,
            "samples_per_second": 4290899.892569983,
            "samples_per_second_per_gpu": 536362.4865712479,
            "loss_sequences_lower_95": 5.404975778293592,
            "loss_sequences_upper_95": 5.6192995888317485,
            "loss_tokens_lower_95": 5.158522279011348,
            "loss_tokens_upper_95": 5.368173158807647,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.038063313934829,
            "data_time": 0.17486408352851868,
            "batch_time": 0.20460350066423416,
            "samples_per_second": 1889225.6678324156,
            "samples_per_second_per_gpu": 236153.20847905194,
            "loss_sequences_lower_95": 3.9553718538948033,
            "loss_sequences_upper_95": 4.332447183350504,
            "loss_tokens_lower_95": 3.8541568159726234,
            "loss_tokens_upper_95": 4.18449830117874,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.490170106609318,
            "data_time": 0.08015819489955903,
            "batch_time": 0.12467197775840759,
            "samples_per_second": 4488244.215545615,
            "samples_per_second_per_gpu": 561030.5269432019,
            "loss_sequences_lower_95": 4.534952468691422,
            "loss_sequences_upper_95": 4.673188156693962,
            "loss_tokens_lower_95": 4.405218190885548,
            "loss_tokens_upper_95": 4.561576369518511,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.103865451929046,
            "data_time": 0.3116316497325897,
            "batch_time": 0.34639422595500946,
            "samples_per_second": 2150230.57865424,
            "samples_per_second_per_gpu": 268778.82233178,
            "loss_sequences_lower_95": 3.9539413824314025,
            "loss_sequences_upper_95": 4.443153530213891,
            "loss_tokens_lower_95": 3.910544226492404,
            "loss_tokens_upper_95": 4.313092309849102,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.438431493924555,
            "data_time": 0.02765964265901093,
            "batch_time": 0.0715351713862165,
            "samples_per_second": 4484062.097230266,
            "samples_per_second_per_gpu": 560507.7621537832,
            "loss_sequences_lower_95": 4.42595701757539,
            "loss_sequences_upper_95": 4.451073847582016,
            "loss_tokens_lower_95": 4.426050327252951,
            "loss_tokens_upper_95": 4.450929658587967,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.476881006388988,
            "data_time": 0.3444901555776596,
            "batch_time": 0.3703499287366867,
            "samples_per_second": 1869060.652992706,
            "samples_per_second_per_gpu": 233632.58162408826,
            "loss_sequences_lower_95": 2.387018629648153,
            "loss_sequences_upper_95": 2.684782513368477,
            "loss_tokens_lower_95": 2.265769504012203,
            "loss_tokens_upper_95": 2.6092329011602797,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.098909059610507,
            "data_time": 0.023745968639850616,
            "batch_time": 0.06783632894357046,
            "samples_per_second": 4456350.412464086,
            "samples_per_second_per_gpu": 557043.8015580107,
            "loss_sequences_lower_95": 5.867920054949555,
            "loss_sequences_upper_95": 5.9142084316037735,
            "loss_tokens_lower_95": 5.0081767529013534,
            "loss_tokens_upper_95": 5.052901982591877,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.473369661808014,
            "data_time": 0.10324268415570259,
            "batch_time": 0.14789265766739845,
            "samples_per_second": 4257704.964441841,
            "samples_per_second_per_gpu": 532213.1205552301,
            "loss_sequences_lower_95": 6.626988623046875,
            "loss_sequences_upper_95": 6.928203625488281,
            "loss_tokens_lower_95": 6.312784169869006,
            "loss_tokens_upper_95": 6.583049880969644,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.845394076471743,
            "data_time": 0.3543998748064041,
            "batch_time": 0.3980918675661087,
            "samples_per_second": 2192189.118563776,
            "samples_per_second_per_gpu": 274023.639820472,
            "loss_sequences_lower_95": 4.689346844217051,
            "loss_sequences_upper_95": 4.999073181152344,
            "loss_tokens_lower_95": 4.69119188391644,
            "loss_tokens_upper_95": 4.999348887567935,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.36154763734702,
            "data_time": 0.06368035326401393,
            "batch_time": 0.10370070238908131,
            "samples_per_second": 4079399.455513134,
            "samples_per_second_per_gpu": 509924.93193914177,
            "loss_sequences_lower_95": 8.2757421875,
            "loss_sequences_upper_95": 8.44455096620502,
            "loss_tokens_lower_95": 8.277442700935133,
            "loss_tokens_upper_95": 8.447693573922823,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 1.5227130366166433,
            "data_time": 0.0636207362016042,
            "batch_time": 0.10769254217545192,
            "samples_per_second": 4519123.457022344,
            "samples_per_second_per_gpu": 564890.432127793,
            "loss_sequences_lower_95": 1.6645628743489584,
            "loss_sequences_upper_95": 1.7572691813151042,
            "loss_tokens_lower_95": 1.4795430555034514,
            "loss_tokens_upper_95": 1.5481130577230893,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.253971778778803,
            "data_time": 0.3333013355731964,
            "batch_time": 0.3742809146642685,
            "samples_per_second": 2337692.896082511,
            "samples_per_second_per_gpu": 292211.6120103139,
            "loss_sequences_lower_95": 5.879492579868862,
            "loss_sequences_upper_95": 6.631778782435826,
            "loss_tokens_lower_95": 5.876759134928386,
            "loss_tokens_upper_95": 6.636001383463541,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.059760242700577,
            "data_time": 0.14502809941768646,
            "batch_time": 0.16231271624565125,
            "samples_per_second": 841070.75213524,
            "samples_per_second_per_gpu": 105133.844016905,
            "loss_sequences_lower_95": 3.7333186089992525,
            "loss_sequences_upper_95": 5.070899987220764,
            "loss_tokens_lower_95": 3.4164311485683796,
            "loss_tokens_upper_95": 4.0121441147007895,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.373817569732666,
            "data_time": 0.09654900059103966,
            "batch_time": 0.14106234908103943,
            "samples_per_second": 4266663.664997415,
            "samples_per_second_per_gpu": 533332.9581246768,
            "loss_sequences_lower_95": 7.4955604492187495,
            "loss_sequences_upper_95": 7.832070471191406,
            "loss_tokens_lower_95": 7.207108086135787,
            "loss_tokens_upper_95": 7.510064191221183,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.640162657737732,
            "data_time": 0.09746983274817467,
            "batch_time": 0.14179925248026848,
            "samples_per_second": 4359233.242158064,
            "samples_per_second_per_gpu": 544904.155269758,
            "loss_sequences_lower_95": 7.803438720703125,
            "loss_sequences_upper_95": 8.043116894531249,
            "loss_tokens_lower_95": 7.521464961113546,
            "loss_tokens_upper_95": 7.738238669038714,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.378267137468001,
            "data_time": 0.03826466823617617,
            "batch_time": 0.08192899947365125,
            "samples_per_second": 4602270.8843243895,
            "samples_per_second_per_gpu": 575283.8605405487,
            "loss_sequences_lower_95": 4.352880867559399,
            "loss_sequences_upper_95": 4.404132802351345,
            "loss_tokens_lower_95": 4.352961475706085,
            "loss_tokens_upper_95": 4.403830871684336,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.52458840398012,
            "data_time": 0.12562337517738342,
            "batch_time": 0.16606099406878153,
            "samples_per_second": 3790338.3555447366,
            "samples_per_second_per_gpu": 473792.2944430921,
            "loss_sequences_lower_95": 5.42457262264785,
            "loss_sequences_upper_95": 5.621654710901497,
            "loss_tokens_lower_95": 5.425201350386425,
            "loss_tokens_upper_95": 5.62144541938184,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.05926537322998,
            "data_time": 0.08925691992044449,
            "batch_time": 0.13352108001708984,
            "samples_per_second": 4368081.939265979,
            "samples_per_second_per_gpu": 546010.2424082474,
            "loss_sequences_lower_95": 9.004816015625,
            "loss_sequences_upper_95": 9.116190332031248,
            "loss_tokens_lower_95": 9.005238037109375,
            "loss_tokens_upper_95": 9.1156275390625,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.443253554142976,
            "data_time": 0.02698706409760884,
            "batch_time": 0.07097218079226357,
            "samples_per_second": 4514814.091809265,
            "samples_per_second_per_gpu": 564351.7614761582,
            "loss_sequences_lower_95": 5.538443837955298,
            "loss_sequences_upper_95": 5.646922414557711,
            "loss_tokens_lower_95": 4.323643318367957,
            "loss_tokens_upper_95": 4.395932987725547,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.398368866585973,
            "data_time": 0.22219822236469813,
            "batch_time": 0.2555017045566014,
            "samples_per_second": 1363443.682170191,
            "samples_per_second_per_gpu": 170430.46027127389,
            "loss_sequences_lower_95": 5.239964226110658,
            "loss_sequences_upper_95": 5.5525572819496265,
            "loss_tokens_lower_95": 5.23617537768919,
            "loss_tokens_upper_95": 5.552017337172779,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.3545685450236,
            "data_time": 0.16769599169492722,
            "batch_time": 0.21394842118024826,
            "samples_per_second": 3872122.9196001915,
            "samples_per_second_per_gpu": 484015.36495002394,
            "loss_sequences_lower_95": 5.241658217486213,
            "loss_sequences_upper_95": 5.46578961540671,
            "loss_tokens_lower_95": 5.2434691784428615,
            "loss_tokens_upper_95": 5.464563622568168,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.134124700697266,
            "data_time": 0.03318002261221409,
            "batch_time": 0.07705000648275018,
            "samples_per_second": 4330969.471716573,
            "samples_per_second_per_gpu": 541371.1839645717,
            "loss_sequences_lower_95": 6.188167042465238,
            "loss_sequences_upper_95": 6.305655389284261,
            "loss_tokens_lower_95": 5.002014045143344,
            "loss_tokens_upper_95": 5.0903732282463245,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.842173636905731,
            "data_time": 0.33245979249477386,
            "batch_time": 0.37022964656352997,
            "samples_per_second": 2234975.429509215,
            "samples_per_second_per_gpu": 279371.9286886519,
            "loss_sequences_lower_95": 4.7270665486653645,
            "loss_sequences_upper_95": 4.958691826068534,
            "loss_tokens_lower_95": 4.724477778036128,
            "loss_tokens_upper_95": 4.957488472751839,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.509496078724526,
            "data_time": 0.04340176055064568,
            "batch_time": 0.08735486291922055,
            "samples_per_second": 4486864.133836476,
            "samples_per_second_per_gpu": 560858.0167295595,
            "loss_sequences_lower_95": 8.472804048404052,
            "loss_sequences_upper_95": 8.54675363149847,
            "loss_tokens_lower_95": 8.471973522314602,
            "loss_tokens_upper_95": 8.546036425482606,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.539078596726204,
            "data_time": 0.3428814262151718,
            "batch_time": 0.38341446220874786,
            "samples_per_second": 2421410.8245925843,
            "samples_per_second_per_gpu": 302676.35307407303,
            "loss_sequences_lower_95": 5.356459949086013,
            "loss_sequences_upper_95": 5.719114107298619,
            "loss_tokens_lower_95": 5.357561544770176,
            "loss_tokens_upper_95": 5.719966688433897,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.1709423939387005,
            "data_time": 0.29692038893699646,
            "batch_time": 0.31649796664714813,
            "samples_per_second": 1251238.5800755485,
            "samples_per_second_per_gpu": 156404.82250944356,
            "loss_sequences_lower_95": 5.976208101908366,
            "loss_sequences_upper_95": 6.930292282104492,
            "loss_tokens_lower_95": 5.325204096900093,
            "loss_tokens_upper_95": 6.840222771962483,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.104935359954834,
            "data_time": 0.28917260468006134,
            "batch_time": 0.3085318058729172,
            "samples_per_second": 1582992.3083498667,
            "samples_per_second_per_gpu": 197874.03854373333,
            "loss_sequences_lower_95": 5.043549677530924,
            "loss_sequences_upper_95": 6.0859787623087565,
            "loss_tokens_lower_95": 4.156488388575865,
            "loss_tokens_upper_95": 5.705341913459006,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.824236389884892,
            "data_time": 0.040992300425257,
            "batch_time": 0.0840398690530232,
            "samples_per_second": 4348679.79610978,
            "samples_per_second_per_gpu": 543584.9745137225,
            "loss_sequences_lower_95": 7.790628768179308,
            "loss_sequences_upper_95": 7.857696606912739,
            "loss_tokens_lower_95": 7.7907052823545655,
            "loss_tokens_upper_95": 7.858022525658137,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 1.6796526863670689,
            "data_time": 0.022536569287162682,
            "batch_time": 0.06700930454845512,
            "samples_per_second": 4492044.39622601,
            "samples_per_second_per_gpu": 561505.5495282513,
            "loss_sequences_lower_95": 2.2086385578102563,
            "loss_sequences_upper_95": 2.245434511877693,
            "loss_tokens_lower_95": 1.6330104496366307,
            "loss_tokens_upper_95": 1.654282798298971,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.1959325291040375,
            "data_time": 0.3375459760427475,
            "batch_time": 0.3671797513961792,
            "samples_per_second": 1360743.4048566995,
            "samples_per_second_per_gpu": 170092.92560708744,
            "loss_sequences_lower_95": 3.2246921088751845,
            "loss_sequences_upper_95": 3.5982058307317297,
            "loss_tokens_lower_95": 3.0394554458651895,
            "loss_tokens_upper_95": 3.217043252531242,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.250456165623021,
            "data_time": 0.23144175112247467,
            "batch_time": 0.25086934864521027,
            "samples_per_second": 958146.8555354236,
            "samples_per_second_per_gpu": 119768.35694192795,
            "loss_sequences_lower_95": 4.877094093528954,
            "loss_sequences_upper_95": 5.74435944428315,
            "loss_tokens_lower_95": 4.677920607578607,
            "loss_tokens_upper_95": 5.642008934491947,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.0463911149559952,
            "data_time": 0.32020388543605804,
            "batch_time": 0.35478997230529785,
            "samples_per_second": 2241067.299053779,
            "samples_per_second_per_gpu": 280133.41238172236,
            "loss_sequences_lower_95": 3.104961660431653,
            "loss_sequences_upper_95": 3.4309728575915823,
            "loss_tokens_lower_95": 2.924915944990538,
            "loss_tokens_upper_95": 3.0787995180429983,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.0760999438239307,
            "data_time": 0.3367070108652115,
            "batch_time": 0.37124235928058624,
            "samples_per_second": 2246707.1383763496,
            "samples_per_second_per_gpu": 280838.3922970437,
            "loss_sequences_lower_95": 3.212958545219607,
            "loss_sequences_upper_95": 3.505961990356445,
            "loss_tokens_lower_95": 2.9657571400038116,
            "loss_tokens_upper_95": 3.091649411428404,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.192370443809323,
            "data_time": 0.3339243233203888,
            "batch_time": 0.36860141158103943,
            "samples_per_second": 2269569.501976394,
            "samples_per_second_per_gpu": 283696.18774704926,
            "loss_sequences_lower_95": 3.0105499546702315,
            "loss_sequences_upper_95": 3.37015721390887,
            "loss_tokens_lower_95": 3.063493851699522,
            "loss_tokens_upper_95": 3.265814254611576,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.168491260307591,
            "data_time": 0.3176247328519821,
            "batch_time": 0.3517513871192932,
            "samples_per_second": 2268428.5223229853,
            "samples_per_second_per_gpu": 283553.56529037317,
            "loss_sequences_lower_95": 3.3232742216528917,
            "loss_sequences_upper_95": 3.5994919381490567,
            "loss_tokens_lower_95": 3.067020623037748,
            "loss_tokens_upper_95": 3.1812430408513435,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.75214942197622,
            "data_time": 0.3187853991985321,
            "batch_time": 0.3544749617576599,
            "samples_per_second": 1832825.6773228412,
            "samples_per_second_per_gpu": 229103.20966535516,
            "loss_sequences_lower_95": 2.7503931602335863,
            "loss_sequences_upper_95": 2.908141578058278,
            "loss_tokens_lower_95": 2.6788121755264696,
            "loss_tokens_upper_95": 2.7671985081072683,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.315426182456133,
            "data_time": 0.33090002834796906,
            "batch_time": 0.36578354239463806,
            "samples_per_second": 1834291.269978219,
            "samples_per_second_per_gpu": 229286.40874727737,
            "loss_sequences_lower_95": 2.4551232593815504,
            "loss_sequences_upper_95": 2.642241375620772,
            "loss_tokens_lower_95": 2.2402528937801383,
            "loss_tokens_upper_95": 2.318773916657248,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-32.0/params.txt",
    "uuid": "cf094ee7-2c36-47ce-88d3-32765560b19a",
    "creation_date": "2023_12_14-06_04_30"
}