{
    "name": "rpj-d=96_l=8_h=4-0.25",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 52846560,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.25
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "10569312",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=96_l=8_h=4-0.25",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 6.319455317656199,
            "data_time": 0.12504985928535461,
            "batch_time": 1.23276849091053,
            "samples_per_second": 387978.3946210456,
            "samples_per_second_per_gpu": 48497.2993276307,
            "loss_sequences_lower_95": 6.240710334777832,
            "loss_sequences_upper_95": 6.400031483968099,
            "loss_tokens_lower_95": 6.305777931213378,
            "loss_tokens_upper_95": 6.332977511088053,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.123879526201114,
            "data_time": 0.01879336726824622,
            "batch_time": 0.06376024748423333,
            "samples_per_second": 4694290.86468659,
            "samples_per_second_per_gpu": 586786.3580858237,
            "loss_sequences_lower_95": 6.121315361063071,
            "loss_sequences_upper_95": 6.1264777287821035,
            "loss_tokens_lower_95": 6.11196165625,
            "loss_tokens_upper_95": 6.135532510416667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.4980999401637485,
            "data_time": 0.08543645590543747,
            "batch_time": 0.1295011192560196,
            "samples_per_second": 4209605.066044235,
            "samples_per_second_per_gpu": 526200.6332555293,
            "loss_sequences_lower_95": 6.474890871631856,
            "loss_sequences_upper_95": 6.523871123644771,
            "loss_tokens_lower_95": 6.486115364583333,
            "loss_tokens_upper_95": 6.510518010416666,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.14621580359862,
            "data_time": 0.01303628400752419,
            "batch_time": 0.05675998879106421,
            "samples_per_second": 5366202.017063665,
            "samples_per_second_per_gpu": 670775.2521329582,
            "loss_sequences_lower_95": 6.129840931056701,
            "loss_sequences_upper_95": 6.162233156813788,
            "loss_tokens_lower_95": 6.134456010416667,
            "loss_tokens_upper_95": 6.158119197916666,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.112148100392639,
            "data_time": 0.0875343531370163,
            "batch_time": 0.13235192000865936,
            "samples_per_second": 4041931.879240508,
            "samples_per_second_per_gpu": 505241.4849050635,
            "loss_sequences_lower_95": 6.074470299373091,
            "loss_sequences_upper_95": 6.151627115222442,
            "loss_tokens_lower_95": 6.100237885416667,
            "loss_tokens_upper_95": 6.124083104166667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.255252605713119,
            "data_time": 0.03068833549817403,
            "batch_time": 0.0738327403863271,
            "samples_per_second": 5023199.723477031,
            "samples_per_second_per_gpu": 627899.9654346289,
            "loss_sequences_lower_95": 6.227168237620028,
            "loss_sequences_upper_95": 6.2831230614802855,
            "loss_tokens_lower_95": 6.243083322916667,
            "loss_tokens_upper_95": 6.26736721875,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.378683211560152,
            "data_time": 0.011956805735826493,
            "batch_time": 0.05443274155259133,
            "samples_per_second": 5254036.748632748,
            "samples_per_second_per_gpu": 656754.5935790935,
            "loss_sequences_lower_95": 6.3538373624840565,
            "loss_sequences_upper_95": 6.403686832350127,
            "loss_tokens_lower_95": 6.3659923437499994,
            "loss_tokens_upper_95": 6.3918280520833335,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.808128449504912,
            "data_time": 0.012164173941863211,
            "batch_time": 0.055248802429751345,
            "samples_per_second": 5385882.472395184,
            "samples_per_second_per_gpu": 673235.309049398,
            "loss_sequences_lower_95": 5.796051742473822,
            "loss_sequences_upper_95": 5.820682816590314,
            "loss_tokens_lower_95": 5.7967070625,
            "loss_tokens_upper_95": 5.819698416666666,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.270522970494216,
            "data_time": 0.08730696886777878,
            "batch_time": 0.13124102354049683,
            "samples_per_second": 4198747.573823089,
            "samples_per_second_per_gpu": 524843.4467278861,
            "loss_sequences_lower_95": 6.223594057656886,
            "loss_sequences_upper_95": 6.319730110478595,
            "loss_tokens_lower_95": 6.258608145833333,
            "loss_tokens_upper_95": 6.28239459375,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.455844185569069,
            "data_time": 0.0884508565068245,
            "batch_time": 0.13361288607120514,
            "samples_per_second": 4111823.161921892,
            "samples_per_second_per_gpu": 513977.8952402365,
            "loss_sequences_lower_95": 6.41552640289186,
            "loss_sequences_upper_95": 6.503059236428483,
            "loss_tokens_lower_95": 6.443891104166667,
            "loss_tokens_upper_95": 6.4675813125,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.45202708031946,
            "data_time": 0.0087798155587295,
            "batch_time": 0.05191398180764297,
            "samples_per_second": 5433770.569751643,
            "samples_per_second_per_gpu": 679221.3212189553,
            "loss_sequences_lower_95": 6.444021301562286,
            "loss_sequences_upper_95": 6.4600843561566395,
            "loss_tokens_lower_95": 6.4399818125,
            "loss_tokens_upper_95": 6.464467791666666,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.3655317339088855,
            "data_time": 0.023277734220027924,
            "batch_time": 0.06802991926670074,
            "samples_per_second": 4985923.227284168,
            "samples_per_second_per_gpu": 623240.403410521,
            "loss_sequences_lower_95": 6.352984047174354,
            "loss_sequences_upper_95": 6.378488577350583,
            "loss_tokens_lower_95": 6.353445760416667,
            "loss_tokens_upper_95": 6.377528114583334,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.940595399535452,
            "data_time": 0.110198475420475,
            "batch_time": 0.2724161297082901,
            "samples_per_second": 4029451.2314634523,
            "samples_per_second_per_gpu": 503681.40393293154,
            "loss_sequences_lower_95": 5.888251896175362,
            "loss_sequences_upper_95": 5.997883305733392,
            "loss_tokens_lower_95": 5.929384104166667,
            "loss_tokens_upper_95": 5.951870104166667,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.679653974754505,
            "data_time": 0.0848696157336235,
            "batch_time": 0.1308153122663498,
            "samples_per_second": 4124454.2789247744,
            "samples_per_second_per_gpu": 515556.7848655968,
            "loss_sequences_lower_95": 6.6227215364848515,
            "loss_sequences_upper_95": 6.740120086359153,
            "loss_tokens_lower_95": 6.668107447916666,
            "loss_tokens_upper_95": 6.691963052083333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.565634684129194,
            "data_time": 0.16319404542446136,
            "batch_time": 0.18494713306427002,
            "samples_per_second": 1059568.2595589736,
            "samples_per_second_per_gpu": 132446.0324448717,
            "loss_sequences_lower_95": 7.516688572276722,
            "loss_sequences_upper_95": 7.61624233939431,
            "loss_tokens_lower_95": 7.5448755437677555,
            "loss_tokens_upper_95": 7.586710340326483,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.178251712732343,
            "data_time": 0.08469367772340775,
            "batch_time": 0.11941072344779968,
            "samples_per_second": 3390728.235781773,
            "samples_per_second_per_gpu": 423841.0294727216,
            "loss_sequences_lower_95": 6.083066712315507,
            "loss_sequences_upper_95": 6.277681578700118,
            "loss_tokens_lower_95": 6.166015541666667,
            "loss_tokens_upper_95": 6.190321041666667,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.354055155550269,
            "data_time": 0.08498810976743698,
            "batch_time": 0.1218394786119461,
            "samples_per_second": 3752770.157585142,
            "samples_per_second_per_gpu": 469096.2696981428,
            "loss_sequences_lower_95": 7.299804252684902,
            "loss_sequences_upper_95": 7.410885869733262,
            "loss_tokens_lower_95": 7.343764083333333,
            "loss_tokens_upper_95": 7.3643153125000005,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.339651139056096,
            "data_time": 0.15399281680583954,
            "batch_time": 0.1825827807188034,
            "samples_per_second": 2118848.82571879,
            "samples_per_second_per_gpu": 264856.1032148487,
            "loss_sequences_lower_95": 7.277719416383837,
            "loss_sequences_upper_95": 7.424882807497118,
            "loss_tokens_lower_95": 7.326982754566631,
            "loss_tokens_upper_95": 7.352054383324795,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.4869110513626556,
            "data_time": 0.02736920876936479,
            "batch_time": 0.07140016474507072,
            "samples_per_second": 4527166.460351681,
            "samples_per_second_per_gpu": 565895.8075439602,
            "loss_sequences_lower_95": 5.472153656218843,
            "loss_sequences_upper_95": 5.50108387171076,
            "loss_tokens_lower_95": 5.472398415801702,
            "loss_tokens_upper_95": 5.501231267136092,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.91678694766063,
            "data_time": 0.027143371105194092,
            "batch_time": 0.0709543440490961,
            "samples_per_second": 4503636.237750128,
            "samples_per_second_per_gpu": 562954.529718766,
            "loss_sequences_lower_95": 5.894475857647879,
            "loss_sequences_upper_95": 5.922741672475603,
            "loss_tokens_lower_95": 5.903301577747018,
            "loss_tokens_upper_95": 5.92700847392862,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.313244222299524,
            "data_time": 0.04811204473177592,
            "batch_time": 0.08983508911397722,
            "samples_per_second": 4408927.998049419,
            "samples_per_second_per_gpu": 551115.9997561774,
            "loss_sequences_lower_95": 8.759644811717642,
            "loss_sequences_upper_95": 8.945030371693434,
            "loss_tokens_lower_95": 8.194591863421788,
            "loss_tokens_upper_95": 8.346998320973315,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.080463002999624,
            "data_time": 0.04086571807662646,
            "batch_time": 0.08433890342712402,
            "samples_per_second": 4634461.805834488,
            "samples_per_second_per_gpu": 579307.725729311,
            "loss_sequences_lower_95": 8.398636376953124,
            "loss_sequences_upper_95": 8.523288118489583,
            "loss_tokens_lower_95": 7.999183925412736,
            "loss_tokens_upper_95": 8.102072609571541,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.88932426921159,
            "data_time": 0.06202342361211777,
            "batch_time": 0.1016748920083046,
            "samples_per_second": 4122127.7683956106,
            "samples_per_second_per_gpu": 515265.9710494513,
            "loss_sequences_lower_95": 6.933073928520659,
            "loss_sequences_upper_95": 7.001842641649687,
            "loss_tokens_lower_95": 6.869367578969444,
            "loss_tokens_upper_95": 6.90483491231891,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.347641010717912,
            "data_time": 0.3073568791151047,
            "batch_time": 0.34841182827949524,
            "samples_per_second": 2785189.941746638,
            "samples_per_second_per_gpu": 348148.74271832977,
            "loss_sequences_lower_95": 5.3265148093483665,
            "loss_sequences_upper_95": 5.43620322487571,
            "loss_tokens_lower_95": 5.3231189534415,
            "loss_tokens_upper_95": 5.374467713897007,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.2897518391511875,
            "data_time": 0.3152691572904587,
            "batch_time": 0.36026811599731445,
            "samples_per_second": 2649046.697865425,
            "samples_per_second_per_gpu": 331130.83723317814,
            "loss_sequences_lower_95": 6.258402759785555,
            "loss_sequences_upper_95": 6.448845152562979,
            "loss_tokens_lower_95": 6.245320473827861,
            "loss_tokens_upper_95": 6.3449983999449895,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.668856608072916,
            "data_time": 0.1700103059411049,
            "batch_time": 0.2005924955010414,
            "samples_per_second": 2650434.0600449573,
            "samples_per_second_per_gpu": 331304.25750561967,
            "loss_sequences_lower_95": 5.598536580403646,
            "loss_sequences_upper_95": 5.72079805501302,
            "loss_tokens_lower_95": 5.571164638297667,
            "loss_tokens_upper_95": 5.771316962130138,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 10.49251356205129,
            "data_time": 0.024517012014985084,
            "batch_time": 0.06849399991333485,
            "samples_per_second": 4530830.5818170775,
            "samples_per_second_per_gpu": 566353.8227271347,
            "loss_sequences_lower_95": 10.550334129410462,
            "loss_sequences_upper_95": 10.603116157546381,
            "loss_tokens_lower_95": 10.453412878063931,
            "loss_tokens_upper_95": 10.50780673273329,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.229979530930118,
            "data_time": 0.04573151767253876,
            "batch_time": 0.08777973055839539,
            "samples_per_second": 4384560.92038136,
            "samples_per_second_per_gpu": 548070.11504767,
            "loss_sequences_lower_95": 8.234859273891256,
            "loss_sequences_upper_95": 8.445451483421454,
            "loss_tokens_lower_95": 7.091528597493046,
            "loss_tokens_upper_95": 7.2371721397628574,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.82335789374524,
            "data_time": 0.08079911768436432,
            "batch_time": 0.12258493900299072,
            "samples_per_second": 4342242.925648633,
            "samples_per_second_per_gpu": 542780.3657060792,
            "loss_sequences_lower_95": 7.444912896791009,
            "loss_sequences_upper_95": 7.696742618287383,
            "loss_tokens_lower_95": 6.720324206775372,
            "loss_tokens_upper_95": 6.8927147460622376,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.054416175302305,
            "data_time": 0.3206084072589874,
            "batch_time": 0.3632088154554367,
            "samples_per_second": 2227500.1220809976,
            "samples_per_second_per_gpu": 278437.5152601247,
            "loss_sequences_lower_95": 6.030524434128853,
            "loss_sequences_upper_95": 6.078677298489226,
            "loss_tokens_lower_95": 6.030262596095533,
            "loss_tokens_upper_95": 6.0790116958966545,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.976819548606873,
            "data_time": 0.3142862170934677,
            "batch_time": 0.3399171680212021,
            "samples_per_second": 1794467.9251835933,
            "samples_per_second_per_gpu": 224308.49064794916,
            "loss_sequences_lower_95": 5.877699371337891,
            "loss_sequences_upper_95": 6.38744645690918,
            "loss_tokens_lower_95": 5.689476927619074,
            "loss_tokens_upper_95": 6.238130310822898,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.443142957794752,
            "data_time": 0.047919996082782745,
            "batch_time": 0.09183202125132084,
            "samples_per_second": 4514064.654059753,
            "samples_per_second_per_gpu": 564258.0817574691,
            "loss_sequences_lower_95": 5.404195170563588,
            "loss_sequences_upper_95": 5.483246805431167,
            "loss_tokens_lower_95": 5.403456026700038,
            "loss_tokens_upper_95": 5.482807252350947,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.60998449903546,
            "data_time": 0.07085302174091339,
            "batch_time": 0.11414903402328491,
            "samples_per_second": 4389643.898839054,
            "samples_per_second_per_gpu": 548705.4873548817,
            "loss_sequences_lower_95": 5.571811215512387,
            "loss_sequences_upper_95": 5.647066233492015,
            "loss_tokens_lower_95": 5.5715075193712895,
            "loss_tokens_upper_95": 5.646904702776284,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.709990462488915,
            "data_time": 0.049179814755916595,
            "batch_time": 0.08992889896035194,
            "samples_per_second": 4294782.367757005,
            "samples_per_second_per_gpu": 536847.7959696256,
            "loss_sequences_lower_95": 6.786536614186616,
            "loss_sequences_upper_95": 6.898648552796008,
            "loss_tokens_lower_95": 6.691375801021302,
            "loss_tokens_upper_95": 6.75758160013392,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.8659845733642575,
            "data_time": 0.16101987659931183,
            "batch_time": 0.20518172532320023,
            "samples_per_second": 4023634.801783285,
            "samples_per_second_per_gpu": 502954.35022291064,
            "loss_sequences_lower_95": 8.520534765625001,
            "loss_sequences_upper_95": 8.948424609375001,
            "loss_tokens_lower_95": 7.637692291037599,
            "loss_tokens_upper_95": 7.968679052185018,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.027407199144363,
            "data_time": 0.12833428382873535,
            "batch_time": 0.14500276744365692,
            "samples_per_second": 937281.9294200081,
            "samples_per_second_per_gpu": 117160.24117750101,
            "loss_sequences_lower_95": 5.7306847214698795,
            "loss_sequences_upper_95": 6.432403242588043,
            "loss_tokens_lower_95": 5.433581911284348,
            "loss_tokens_upper_95": 6.462786391685749,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.6504423453890045,
            "data_time": 0.30789171159267426,
            "batch_time": 0.3431668132543564,
            "samples_per_second": 2422209.884917845,
            "samples_per_second_per_gpu": 302776.2356147306,
            "loss_sequences_lower_95": 7.35307722420528,
            "loss_sequences_upper_95": 7.855582331514906,
            "loss_tokens_lower_95": 6.3711317473092235,
            "loss_tokens_upper_95": 6.788069037573985,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.663427531849406,
            "data_time": 0.05306374033292135,
            "batch_time": 0.09738021592299144,
            "samples_per_second": 4491671.026033393,
            "samples_per_second_per_gpu": 561458.8782541741,
            "loss_sequences_lower_95": 5.653245150464859,
            "loss_sequences_upper_95": 5.673793322428598,
            "loss_tokens_lower_95": 5.653242217364297,
            "loss_tokens_upper_95": 5.673550493699486,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.40788600173201,
            "data_time": 0.03258244835195087,
            "batch_time": 0.07599846025307973,
            "samples_per_second": 4410423.261819712,
            "samples_per_second_per_gpu": 551302.907727464,
            "loss_sequences_lower_95": 9.458638342500729,
            "loss_sequences_upper_95": 9.580609539497623,
            "loss_tokens_lower_95": 9.334243432857502,
            "loss_tokens_upper_95": 9.454026317700738,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.169323060975406,
            "data_time": 0.17022757232189178,
            "batch_time": 0.2001781091094017,
            "samples_per_second": 1780582.4476619652,
            "samples_per_second_per_gpu": 222572.80595774564,
            "loss_sequences_lower_95": 5.046571523365957,
            "loss_sequences_upper_95": 5.418493920630151,
            "loss_tokens_lower_95": 4.949936424064813,
            "loss_tokens_upper_95": 5.307160118307701,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.679491508261087,
            "data_time": 0.07096672654151917,
            "batch_time": 0.11608650982379913,
            "samples_per_second": 4435569.36969814,
            "samples_per_second_per_gpu": 554446.1712122675,
            "loss_sequences_lower_95": 5.716986214051401,
            "loss_sequences_upper_95": 5.849000063973707,
            "loss_tokens_lower_95": 5.592388163977139,
            "loss_tokens_upper_95": 5.753553315158009,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.4927353422816205,
            "data_time": 0.2831685394048691,
            "batch_time": 0.3170717805624008,
            "samples_per_second": 2490064.902436299,
            "samples_per_second_per_gpu": 311258.11280453735,
            "loss_sequences_lower_95": 7.323925074135385,
            "loss_sequences_upper_95": 7.806404411502001,
            "loss_tokens_lower_95": 7.329321806815864,
            "loss_tokens_upper_95": 7.688459277503059,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.182197251589829,
            "data_time": 0.02778838906542261,
            "batch_time": 0.07178819964298261,
            "samples_per_second": 4438213.366023731,
            "samples_per_second_per_gpu": 554776.6707529664,
            "loss_sequences_lower_95": 5.1715882141272,
            "loss_sequences_upper_95": 5.192848100870174,
            "loss_tokens_lower_95": 5.171513736341018,
            "loss_tokens_upper_95": 5.192880656209367,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.190990971130075,
            "data_time": 0.30925452709198,
            "batch_time": 0.33555738627910614,
            "samples_per_second": 1902049.8676908612,
            "samples_per_second_per_gpu": 237756.23346135765,
            "loss_sequences_lower_95": 7.071314854297824,
            "loss_sequences_upper_95": 7.447320838113432,
            "loss_tokens_lower_95": 6.929863258760076,
            "loss_tokens_upper_95": 7.348569303452053,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.128412365451049,
            "data_time": 0.023055453101793924,
            "batch_time": 0.06727961122989655,
            "samples_per_second": 4482847.549579986,
            "samples_per_second_per_gpu": 560355.9436974983,
            "loss_sequences_lower_95": 7.477773887906184,
            "loss_sequences_upper_95": 7.5225974105738995,
            "loss_tokens_lower_95": 7.06572998065764,
            "loss_tokens_upper_95": 7.108763261605415,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.444768700122833,
            "data_time": 0.09518267959356308,
            "batch_time": 0.14038600400090218,
            "samples_per_second": 4121034.1906180605,
            "samples_per_second_per_gpu": 515129.27382725757,
            "loss_sequences_lower_95": 7.484737646484375,
            "loss_sequences_upper_95": 7.632306799316406,
            "loss_tokens_lower_95": 7.368825602402358,
            "loss_tokens_upper_95": 7.503668812385199,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.331728499868642,
            "data_time": 0.32312287390232086,
            "batch_time": 0.36590850353240967,
            "samples_per_second": 2753308.467282378,
            "samples_per_second_per_gpu": 344163.55841029726,
            "loss_sequences_lower_95": 5.222228380286175,
            "loss_sequences_upper_95": 5.44135890794837,
            "loss_tokens_lower_95": 5.223486394467561,
            "loss_tokens_upper_95": 5.43851550558339,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.701879902319474,
            "data_time": 0.06336831549803416,
            "batch_time": 0.103428415954113,
            "samples_per_second": 4077120.2898843163,
            "samples_per_second_per_gpu": 509640.03623553953,
            "loss_sequences_lower_95": 9.632793024236506,
            "loss_sequences_upper_95": 9.770099838719222,
            "loss_tokens_lower_95": 9.633279437440814,
            "loss_tokens_upper_95": 9.770825232303503,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.373428071339925,
            "data_time": 0.06430797775586446,
            "batch_time": 0.10805530349413554,
            "samples_per_second": 4587611.121406511,
            "samples_per_second_per_gpu": 573451.3901758139,
            "loss_sequences_lower_95": 5.500267301432292,
            "loss_sequences_upper_95": 5.602405110677084,
            "loss_tokens_lower_95": 5.307201302395959,
            "loss_tokens_upper_95": 5.421741899884954,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.886165503093174,
            "data_time": 0.323482945561409,
            "batch_time": 0.36641308665275574,
            "samples_per_second": 2441055.1679844367,
            "samples_per_second_per_gpu": 305131.8959980546,
            "loss_sequences_lower_95": 6.504708716982886,
            "loss_sequences_upper_95": 7.275530410039992,
            "loss_tokens_lower_95": 6.497993890671503,
            "loss_tokens_upper_95": 7.275023585728237,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.877438306808472,
            "data_time": 0.13590680062770844,
            "batch_time": 0.15273700654506683,
            "samples_per_second": 904544.6306274324,
            "samples_per_second_per_gpu": 113068.07882842905,
            "loss_sequences_lower_95": 7.713550186157226,
            "loss_sequences_upper_95": 9.108571004867553,
            "loss_tokens_lower_95": 7.392667903310245,
            "loss_tokens_upper_95": 7.96789823866382,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.205200735092163,
            "data_time": 0.09297346696257591,
            "batch_time": 0.13656499981880188,
            "samples_per_second": 4279382.237680492,
            "samples_per_second_per_gpu": 534922.7797100615,
            "loss_sequences_lower_95": 8.33177236328125,
            "loss_sequences_upper_95": 8.65376689453125,
            "loss_tokens_lower_95": 8.042133632085449,
            "loss_tokens_upper_95": 8.330721173229959,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.798921583175659,
            "data_time": 0.08540019765496254,
            "batch_time": 0.13074352964758873,
            "samples_per_second": 4359442.669111378,
            "samples_per_second_per_gpu": 544930.3336389222,
            "loss_sequences_lower_95": 8.135585412597656,
            "loss_sequences_upper_95": 8.410803369140625,
            "loss_tokens_lower_95": 7.669767101824612,
            "loss_tokens_upper_95": 7.889684600142401,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.273213685937588,
            "data_time": 0.037820011377334595,
            "batch_time": 0.08127797519167264,
            "samples_per_second": 4556247.0326386755,
            "samples_per_second_per_gpu": 569530.8790798344,
            "loss_sequences_lower_95": 5.255698649770575,
            "loss_sequences_upper_95": 5.290949626005699,
            "loss_tokens_lower_95": 5.255482892641636,
            "loss_tokens_upper_95": 5.290865555858197,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.532152602932603,
            "data_time": 0.11817799011866252,
            "batch_time": 0.157841165860494,
            "samples_per_second": 3938205.4885383993,
            "samples_per_second_per_gpu": 492275.6860672999,
            "loss_sequences_lower_95": 5.461645957841302,
            "loss_sequences_upper_95": 5.600874574722782,
            "loss_tokens_lower_95": 5.461535128873248,
            "loss_tokens_upper_95": 5.600388824884792,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.539004253387452,
            "data_time": 0.08765958249568939,
            "batch_time": 0.13180993124842644,
            "samples_per_second": 4367588.056330578,
            "samples_per_second_per_gpu": 545948.5070413223,
            "loss_sequences_lower_95": 9.460744677734375,
            "loss_sequences_upper_95": 9.617143774414062,
            "loss_tokens_lower_95": 9.458599194335937,
            "loss_tokens_upper_95": 9.615792993164062,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.775948134118245,
            "data_time": 0.028061940911270324,
            "batch_time": 0.07181587637889952,
            "samples_per_second": 4466050.17463441,
            "samples_per_second_per_gpu": 558256.2718293013,
            "loss_sequences_lower_95": 8.29925225062086,
            "loss_sequences_upper_95": 8.36754277657876,
            "loss_tokens_lower_95": 7.6937206235603375,
            "loss_tokens_upper_95": 7.7484264692552145,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.494832384052561,
            "data_time": 0.17916566133499146,
            "batch_time": 0.25451691661562237,
            "samples_per_second": 2095953.2812222291,
            "samples_per_second_per_gpu": 261994.16015277864,
            "loss_sequences_lower_95": 5.3685456233238105,
            "loss_sequences_upper_95": 5.617478555707789,
            "loss_tokens_lower_95": 5.365944364177647,
            "loss_tokens_upper_95": 5.618414192769065,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.617096088446822,
            "data_time": 0.1715441271662712,
            "batch_time": 0.21603118628263474,
            "samples_per_second": 3939225.860903174,
            "samples_per_second_per_gpu": 492403.23261289677,
            "loss_sequences_lower_95": 5.530033796721814,
            "loss_sequences_upper_95": 5.703094518324908,
            "loss_tokens_lower_95": 5.529873477711397,
            "loss_tokens_upper_95": 5.704396984623927,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.440643151123303,
            "data_time": 0.02816100511699915,
            "batch_time": 0.0715115861967206,
            "samples_per_second": 4523667.234908376,
            "samples_per_second_per_gpu": 565458.404363547,
            "loss_sequences_lower_95": 7.781240947560441,
            "loss_sequences_upper_95": 7.855699330315201,
            "loss_tokens_lower_95": 7.367044493758478,
            "loss_tokens_upper_95": 7.436120160185321,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.3492410700157205,
            "data_time": 0.30745573341846466,
            "batch_time": 0.3447531461715698,
            "samples_per_second": 2323603.518519079,
            "samples_per_second_per_gpu": 290450.43981488486,
            "loss_sequences_lower_95": 5.274373985976768,
            "loss_sequences_upper_95": 5.424690626033399,
            "loss_tokens_lower_95": 5.272858626754196,
            "loss_tokens_upper_95": 5.4242040240575395,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.857945564398342,
            "data_time": 0.05006564351228567,
            "batch_time": 0.09397263595691094,
            "samples_per_second": 4327904.190563095,
            "samples_per_second_per_gpu": 540988.0238203869,
            "loss_sequences_lower_95": 8.826861919438073,
            "loss_sequences_upper_95": 8.890084545823777,
            "loss_tokens_lower_95": 8.82629369146598,
            "loss_tokens_upper_95": 8.889113149847095,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.47756910555571,
            "data_time": 0.33720602095127106,
            "batch_time": 0.3774489760398865,
            "samples_per_second": 2007931.5435000667,
            "samples_per_second_per_gpu": 250991.44293750834,
            "loss_sequences_lower_95": 5.3335666249099285,
            "loss_sequences_upper_95": 5.619512169106493,
            "loss_tokens_lower_95": 5.331272340052337,
            "loss_tokens_upper_95": 5.623180707913,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.434871260325114,
            "data_time": 0.29485903680324554,
            "batch_time": 0.31536486744880676,
            "samples_per_second": 1277894.533642252,
            "samples_per_second_per_gpu": 159736.8167052815,
            "loss_sequences_lower_95": 9.179185918172202,
            "loss_sequences_upper_95": 9.78556744893392,
            "loss_tokens_lower_95": 9.026384480794272,
            "loss_tokens_upper_95": 9.804091050889756,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.211804151535034,
            "data_time": 0.297894224524498,
            "batch_time": 0.31818270683288574,
            "samples_per_second": 892420.8582430709,
            "samples_per_second_per_gpu": 111552.60728038386,
            "loss_sequences_lower_95": 9.01536984761556,
            "loss_sequences_upper_95": 9.764235280354818,
            "loss_tokens_lower_95": 8.71323262761148,
            "loss_tokens_upper_95": 9.591417668374737,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.898625170464719,
            "data_time": 0.04083319434097835,
            "batch_time": 0.08360274136066437,
            "samples_per_second": 4374082.255765518,
            "samples_per_second_per_gpu": 546760.2819706898,
            "loss_sequences_lower_95": 8.869045514428388,
            "loss_sequences_upper_95": 8.927996608638622,
            "loss_tokens_lower_95": 8.869591397045287,
            "loss_tokens_upper_95": 8.927946385424336,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.516145421135876,
            "data_time": 0.023017625686383145,
            "batch_time": 0.067262265880035,
            "samples_per_second": 4492119.373805374,
            "samples_per_second_per_gpu": 561514.9217256717,
            "loss_sequences_lower_95": 8.022517769524038,
            "loss_sequences_upper_95": 8.053501280089584,
            "loss_tokens_lower_95": 7.460113166177458,
            "loss_tokens_upper_95": 7.4891201771757965,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.234939864301306,
            "data_time": 0.30954286456108093,
            "batch_time": 0.3400087207555771,
            "samples_per_second": 1731511.4235097314,
            "samples_per_second_per_gpu": 216438.92793871643,
            "loss_sequences_lower_95": 4.985526690145178,
            "loss_sequences_upper_95": 5.306387629471425,
            "loss_tokens_lower_95": 5.13058123100129,
            "loss_tokens_upper_95": 5.319852237050172,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.927169129655168,
            "data_time": 0.1968236118555069,
            "batch_time": 0.21340075135231018,
            "samples_per_second": 1122664.687882587,
            "samples_per_second_per_gpu": 140333.08598532339,
            "loss_sequences_lower_95": 5.631420785027581,
            "loss_sequences_upper_95": 6.274923396754909,
            "loss_tokens_lower_95": 5.44779907980083,
            "loss_tokens_upper_95": 6.40153712519893,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.300392883579906,
            "data_time": 0.2949901819229126,
            "batch_time": 0.3288344740867615,
            "samples_per_second": 2359973.255026902,
            "samples_per_second_per_gpu": 294996.65687836276,
            "loss_sequences_lower_95": 5.129169920014172,
            "loss_sequences_upper_95": 5.367107856564405,
            "loss_tokens_lower_95": 5.207313278621741,
            "loss_tokens_upper_95": 5.368345371307296,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.283569114964183,
            "data_time": 0.32983915507793427,
            "batch_time": 0.36444544792175293,
            "samples_per_second": 2032836.852537086,
            "samples_per_second_per_gpu": 254104.60656713575,
            "loss_sequences_lower_95": 5.214174270629883,
            "loss_sequences_upper_95": 5.418811919049518,
            "loss_tokens_lower_95": 5.205674667599976,
            "loss_tokens_upper_95": 5.3383568600108475,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.293113845150645,
            "data_time": 0.3202497512102127,
            "batch_time": 0.3549506217241287,
            "samples_per_second": 1937886.3118476144,
            "samples_per_second_per_gpu": 242235.7889809518,
            "loss_sequences_lower_95": 4.904943335928569,
            "loss_sequences_upper_95": 5.204339143706531,
            "loss_tokens_lower_95": 5.182582566983539,
            "loss_tokens_upper_95": 5.394136062279867,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.279223721201827,
            "data_time": 0.3111492693424225,
            "batch_time": 0.34709373116493225,
            "samples_per_second": 1786577.9305250566,
            "samples_per_second_per_gpu": 223322.24131563207,
            "loss_sequences_lower_95": 5.228843642444145,
            "loss_sequences_upper_95": 5.434518190709555,
            "loss_tokens_lower_95": 5.20840811566029,
            "loss_tokens_upper_95": 5.329142908589491,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.246760871839819,
            "data_time": 0.31976118683815,
            "batch_time": 0.3548811376094818,
            "samples_per_second": 1847227.3903380537,
            "samples_per_second_per_gpu": 230903.4237922567,
            "loss_sequences_lower_95": 5.048082140810001,
            "loss_sequences_upper_95": 5.211030843983526,
            "loss_tokens_lower_95": 5.213073107085405,
            "loss_tokens_upper_95": 5.314772114470605,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.650932192802429,
            "data_time": 0.30599047243595123,
            "batch_time": 0.3409665524959564,
            "samples_per_second": 2070255.3550882258,
            "samples_per_second_per_gpu": 258781.91938602822,
            "loss_sequences_lower_95": 4.543318222790229,
            "loss_sequences_upper_95": 4.716318297967678,
            "loss_tokens_lower_95": 4.599684132949066,
            "loss_tokens_upper_95": 4.689414274412297,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-0.25/params.txt",
    "uuid": "3070bbe6-259e-4569-bd06-23e482f79e6c",
    "creation_date": "2023_12_14-05_51_51"
}