{
    "name": "rpj-d=512_l=8_h=4-0.25",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 394570240,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.25
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "78914048",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=512_l=8_h=4-0.25",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 5.046179107824961,
            "data_time": 0.029696691781282425,
            "batch_time": 0.3232000879943371,
            "samples_per_second": 1718974.5070774348,
            "samples_per_second_per_gpu": 214871.81338467935,
            "loss_sequences_lower_95": 4.964471270243327,
            "loss_sequences_upper_95": 5.127060610453287,
            "loss_tokens_lower_95": 5.032572580973308,
            "loss_tokens_upper_95": 5.059590657552083,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.038766634018416,
            "data_time": 0.0016122660748978726,
            "batch_time": 0.015595563172738731,
            "samples_per_second": 2219530.567181259,
            "samples_per_second_per_gpu": 277441.32089765737,
            "loss_sequences_lower_95": 5.036658868780809,
            "loss_sequences_upper_95": 5.0409210788509276,
            "loss_tokens_lower_95": 5.02734125,
            "loss_tokens_upper_95": 5.050436739583333,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.770480766101759,
            "data_time": 0.009025806427001954,
            "batch_time": 0.022968372344970702,
            "samples_per_second": 2189491.7840692257,
            "samples_per_second_per_gpu": 273686.4730086532,
            "loss_sequences_lower_95": 4.752932951012436,
            "loss_sequences_upper_95": 4.78843092315051,
            "loss_tokens_lower_95": 4.7588002395833335,
            "loss_tokens_upper_95": 4.782497010416667,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.947078046602072,
            "data_time": 0.0015148577329359557,
            "batch_time": 0.015000614778775918,
            "samples_per_second": 2313700.667996008,
            "samples_per_second_per_gpu": 289212.583499501,
            "loss_sequences_lower_95": 4.939007691688144,
            "loss_sequences_upper_95": 4.955267960695877,
            "loss_tokens_lower_95": 4.93574434375,
            "loss_tokens_upper_95": 4.958650052083333,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.0263212674018565,
            "data_time": 0.008940122042044226,
            "batch_time": 0.02260764186600765,
            "samples_per_second": 2203684.4582428476,
            "samples_per_second_per_gpu": 275460.55728035595,
            "loss_sequences_lower_95": 4.999304684019379,
            "loss_sequences_upper_95": 5.053842119578189,
            "loss_tokens_lower_95": 5.014664989583333,
            "loss_tokens_upper_95": 5.0381229375,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.000161979734938,
            "data_time": 0.003556828136029451,
            "batch_time": 0.017063615114792534,
            "samples_per_second": 2306535.8956478024,
            "samples_per_second_per_gpu": 288316.9869559753,
            "loss_sequences_lower_95": 4.971615814271223,
            "loss_sequences_upper_95": 5.02768200343357,
            "loss_tokens_lower_95": 4.9882270625,
            "loss_tokens_upper_95": 5.011992604166666,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.421683766987859,
            "data_time": 0.0014827134753129221,
            "batch_time": 0.014824077511300465,
            "samples_per_second": 2345168.9116788246,
            "samples_per_second_per_gpu": 293146.1139598531,
            "loss_sequences_lower_95": 4.398187878667091,
            "loss_sequences_upper_95": 4.444327985491071,
            "loss_tokens_lower_95": 4.408922875,
            "loss_tokens_upper_95": 4.434665,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.976967479046727,
            "data_time": 0.0014936126260665708,
            "batch_time": 0.014801672840158294,
            "samples_per_second": 2345605.954030138,
            "samples_per_second_per_gpu": 293200.74425376725,
            "loss_sequences_lower_95": 4.970045709260471,
            "loss_sequences_upper_95": 4.98430018815445,
            "loss_tokens_lower_95": 4.9658845625,
            "loss_tokens_upper_95": 4.988055020833333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.164633052135871,
            "data_time": 0.01213468351061382,
            "batch_time": 0.025882551594386027,
            "samples_per_second": 2217845.424688028,
            "samples_per_second_per_gpu": 277230.6780860035,
            "loss_sequences_lower_95": 5.127878204593814,
            "loss_sequences_upper_95": 5.20461595736868,
            "loss_tokens_lower_95": 5.152973354166667,
            "loss_tokens_upper_95": 5.1763575625,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.496669970953417,
            "data_time": 0.008979259058833122,
            "batch_time": 0.02357002440840006,
            "samples_per_second": 2220764.09860509,
            "samples_per_second_per_gpu": 277595.5123256362,
            "loss_sequences_lower_95": 5.484114261672431,
            "loss_sequences_upper_95": 5.5088442595108695,
            "loss_tokens_lower_95": 5.484497875,
            "loss_tokens_upper_95": 5.508975552083334,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.273736176105889,
            "data_time": 0.001227563621247099,
            "batch_time": 0.0146291481367233,
            "samples_per_second": 2348894.9114137064,
            "samples_per_second_per_gpu": 293611.8639267133,
            "loss_sequences_lower_95": 5.267963102345142,
            "loss_sequences_upper_95": 5.279536298093394,
            "loss_tokens_lower_95": 5.261738489583333,
            "loss_tokens_upper_95": 5.28575928125,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.086346756960143,
            "data_time": 0.00266741693863563,
            "batch_time": 0.01664152312139786,
            "samples_per_second": 2271733.7560265823,
            "samples_per_second_per_gpu": 283966.7195033228,
            "loss_sequences_lower_95": 5.077593788232767,
            "loss_sequences_upper_95": 5.095368090037354,
            "loss_tokens_lower_95": 5.07492240625,
            "loss_tokens_upper_95": 5.098131677083333,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.100017313541311,
            "data_time": 0.008778423188703333,
            "batch_time": 0.02241579911454393,
            "samples_per_second": 2208593.6003627786,
            "samples_per_second_per_gpu": 276074.20004534733,
            "loss_sequences_lower_95": 5.0650268926098185,
            "loss_sequences_upper_95": 5.136941002155172,
            "loss_tokens_lower_95": 5.088492333333333,
            "loss_tokens_upper_95": 5.1112238125,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.228656456086883,
            "data_time": 0.008823380527268368,
            "batch_time": 0.022995949741378724,
            "samples_per_second": 2159466.8976465277,
            "samples_per_second_per_gpu": 269933.36220581597,
            "loss_sequences_lower_95": 5.1844132743882065,
            "loss_sequences_upper_95": 5.273578452675503,
            "loss_tokens_lower_95": 5.2165632083333335,
            "loss_tokens_upper_95": 5.2405457916666665,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.106875798918984,
            "data_time": 0.07121884822845459,
            "batch_time": 0.08763239213398524,
            "samples_per_second": 1126690.9917981478,
            "samples_per_second_per_gpu": 140836.37397476847,
            "loss_sequences_lower_95": 6.048947802456942,
            "loss_sequences_upper_95": 6.165199279785156,
            "loss_tokens_lower_95": 6.084470471468839,
            "loss_tokens_upper_95": 6.129271559281783,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.758350422361501,
            "data_time": 0.011753988536921415,
            "batch_time": 0.025691517374732277,
            "samples_per_second": 2133052.9371768604,
            "samples_per_second_per_gpu": 266631.61714710755,
            "loss_sequences_lower_95": 4.6687933474170915,
            "loss_sequences_upper_95": 4.848859870399052,
            "loss_tokens_lower_95": 4.746219895833334,
            "loss_tokens_upper_95": 4.770367947916666,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.539036798602988,
            "data_time": 0.011710009227196375,
            "batch_time": 0.02550371860464414,
            "samples_per_second": 2197773.192940296,
            "samples_per_second_per_gpu": 274721.649117537,
            "loss_sequences_lower_95": 6.490401794916722,
            "loss_sequences_upper_95": 6.58545107074023,
            "loss_tokens_lower_95": 6.5279339895833335,
            "loss_tokens_upper_95": 6.550084583333334,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.825853488484367,
            "data_time": 0.03329316899180412,
            "batch_time": 0.04831312596797943,
            "samples_per_second": 1845794.4709031254,
            "samples_per_second_per_gpu": 230724.30886289067,
            "loss_sequences_lower_95": 5.784256381675845,
            "loss_sequences_upper_95": 5.866475539910989,
            "loss_tokens_lower_95": 5.812771356301229,
            "loss_tokens_upper_95": 5.838944994817015,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.788244844722436,
            "data_time": 0.0020151376927903682,
            "batch_time": 0.01562783213709916,
            "samples_per_second": 2269877.959848526,
            "samples_per_second_per_gpu": 283734.74498106574,
            "loss_sequences_lower_95": 4.770525379775851,
            "loss_sequences_upper_95": 4.806339157749965,
            "loss_tokens_lower_95": 4.770508341048284,
            "loss_tokens_upper_95": 4.806283799271827,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.794481016492207,
            "data_time": 0.0021602460628102537,
            "batch_time": 0.015797926836712346,
            "samples_per_second": 2264954.4139890186,
            "samples_per_second_per_gpu": 283119.30174862733,
            "loss_sequences_lower_95": 4.776934297824139,
            "loss_sequences_upper_95": 4.802929668050438,
            "loss_tokens_lower_95": 4.7823078993827774,
            "loss_tokens_upper_95": 4.804092447697523,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.924932318472558,
            "data_time": 0.0028884867021882455,
            "batch_time": 0.016514767351069617,
            "samples_per_second": 2266112.987684218,
            "samples_per_second_per_gpu": 283264.1234605273,
            "loss_sequences_lower_95": 7.09533588768009,
            "loss_sequences_upper_95": 7.371346436123495,
            "loss_tokens_lower_95": 6.466381457830742,
            "loss_tokens_upper_95": 6.661393903941804,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.860219950517019,
            "data_time": 0.004114838515190368,
            "batch_time": 0.017731363944550778,
            "samples_per_second": 2252772.0345556485,
            "samples_per_second_per_gpu": 281596.50431945606,
            "loss_sequences_lower_95": 6.995921061197917,
            "loss_sequences_upper_95": 7.1805745768229166,
            "loss_tokens_lower_95": 6.517341833726416,
            "loss_tokens_upper_95": 6.64549011153695,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.339417933333182,
            "data_time": 0.004112648927967652,
            "batch_time": 0.01778721881308347,
            "samples_per_second": 2242329.4744242774,
            "samples_per_second_per_gpu": 280291.1843030347,
            "loss_sequences_lower_95": 5.361924594419304,
            "loss_sequences_upper_95": 5.428400477249573,
            "loss_tokens_lower_95": 5.263225995587714,
            "loss_tokens_upper_95": 5.298318219033329,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.01192122372714,
            "data_time": 0.020242309996059964,
            "batch_time": 0.03533187934330532,
            "samples_per_second": 1944390.497520141,
            "samples_per_second_per_gpu": 243048.81219001763,
            "loss_sequences_lower_95": 3.969832465431907,
            "loss_sequences_upper_95": 4.108722520308061,
            "loss_tokens_lower_95": 3.95367748066361,
            "loss_tokens_upper_95": 4.006088199448252,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.126687964614557,
            "data_time": 0.01849130541086197,
            "batch_time": 0.03261866606771946,
            "samples_per_second": 1997654.6847319312,
            "samples_per_second_per_gpu": 249706.8355914914,
            "loss_sequences_lower_95": 5.092422971141582,
            "loss_sequences_upper_95": 5.293591545260682,
            "loss_tokens_lower_95": 4.997457697719194,
            "loss_tokens_upper_95": 5.101084493483412,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.048525245984395,
            "data_time": 0.0151824462108123,
            "batch_time": 0.029412592068696633,
            "samples_per_second": 2013633.8471064896,
            "samples_per_second_per_gpu": 251704.2308883112,
            "loss_sequences_lower_95": 4.993127644856771,
            "loss_sequences_upper_95": 5.164974009195963,
            "loss_tokens_lower_95": 4.8668524630137515,
            "loss_tokens_upper_95": 5.122815047029512,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 9.121907106799103,
            "data_time": 0.0017936819827576555,
            "batch_time": 0.015328881848739505,
            "samples_per_second": 2284068.5047746557,
            "samples_per_second_per_gpu": 285508.56309683196,
            "loss_sequences_lower_95": 9.137121350757837,
            "loss_sequences_upper_95": 9.212131577247675,
            "loss_tokens_lower_95": 8.974132614285201,
            "loss_tokens_upper_95": 9.052955248250036,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.47017461513028,
            "data_time": 0.002601104694724883,
            "batch_time": 0.016246324817606267,
            "samples_per_second": 2261582.074463046,
            "samples_per_second_per_gpu": 282697.75930788077,
            "loss_sequences_lower_95": 7.012184692793824,
            "loss_sequences_upper_95": 7.300466826467803,
            "loss_tokens_lower_95": 5.749414550337761,
            "loss_tokens_upper_95": 5.897797467944624,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.876417332541821,
            "data_time": 0.0047272886778857255,
            "batch_time": 0.018336267084688752,
            "samples_per_second": 2235663.797101376,
            "samples_per_second_per_gpu": 279457.974637672,
            "loss_sequences_lower_95": 6.270731986504773,
            "loss_sequences_upper_95": 6.582866660889505,
            "loss_tokens_lower_95": 5.462970209013996,
            "loss_tokens_upper_95": 5.629068663782672,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.918846687769781,
            "data_time": 0.021907348717961992,
            "batch_time": 0.036024323531559536,
            "samples_per_second": 2033599.0805039466,
            "samples_per_second_per_gpu": 254199.88506299333,
            "loss_sequences_lower_95": 5.804921621383597,
            "loss_sequences_upper_95": 6.036738189279217,
            "loss_tokens_lower_95": 5.804656843072204,
            "loss_tokens_upper_95": 6.033188632634133,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.0025362253189085,
            "data_time": 0.04434057382436899,
            "batch_time": 0.06046933852709257,
            "samples_per_second": 1583767.3201079646,
            "samples_per_second_per_gpu": 197970.91501349557,
            "loss_sequences_lower_95": 4.841432067871094,
            "loss_sequences_upper_95": 5.247838226318359,
            "loss_tokens_lower_95": 4.656439787671903,
            "loss_tokens_upper_95": 5.188981068838047,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.204227839546047,
            "data_time": 0.0031698298600553736,
            "batch_time": 0.01705574794293425,
            "samples_per_second": 2215901.411750292,
            "samples_per_second_per_gpu": 276987.6764687865,
            "loss_sequences_lower_95": 4.163872597576286,
            "loss_sequences_upper_95": 4.245572333594549,
            "loss_tokens_lower_95": 4.163608890716959,
            "loss_tokens_upper_95": 4.245306515181198,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.710059048014523,
            "data_time": 0.004680301781385213,
            "batch_time": 0.01844846171617119,
            "samples_per_second": 2226050.3853521026,
            "samples_per_second_per_gpu": 278256.2981690128,
            "loss_sequences_lower_95": 4.660134615261568,
            "loss_sequences_upper_95": 4.758785283419072,
            "loss_tokens_lower_95": 4.658192987465448,
            "loss_tokens_upper_95": 4.759946850925855,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.4417219491466735,
            "data_time": 0.0032934897919195518,
            "batch_time": 0.016892621378146806,
            "samples_per_second": 2250824.1582263643,
            "samples_per_second_per_gpu": 281353.01977829554,
            "loss_sequences_lower_95": 5.560151149347116,
            "loss_sequences_upper_95": 5.670556693756801,
            "loss_tokens_lower_95": 5.305309826669629,
            "loss_tokens_upper_95": 5.365282981895667,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.145279537200928,
            "data_time": 0.009571535512804985,
            "batch_time": 0.023326962254941463,
            "samples_per_second": 2158650.049974192,
            "samples_per_second_per_gpu": 269831.256246774,
            "loss_sequences_lower_95": 7.328022094726562,
            "loss_sequences_upper_95": 7.839081152343749,
            "loss_tokens_lower_95": 6.446870815193788,
            "loss_tokens_upper_95": 6.802167002528334,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.465308994054794,
            "data_time": 0.1386478692293167,
            "batch_time": 0.1561659574508667,
            "samples_per_second": 866326.5142715827,
            "samples_per_second_per_gpu": 108290.81428394784,
            "loss_sequences_lower_95": 5.14831314086914,
            "loss_sequences_upper_95": 5.94981062412262,
            "loss_tokens_lower_95": 4.897566152989179,
            "loss_tokens_upper_95": 5.842922482545348,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.767332403139136,
            "data_time": 0.025973223625345432,
            "batch_time": 0.040522775751479126,
            "samples_per_second": 1812144.1127292705,
            "samples_per_second_per_gpu": 226518.0140911588,
            "loss_sequences_lower_95": 7.192367974643049,
            "loss_sequences_upper_95": 7.953824203315823,
            "loss_tokens_lower_95": 5.446620936861437,
            "loss_tokens_upper_95": 5.915754914740699,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.642654350200768,
            "data_time": 0.002801325793067614,
            "batch_time": 0.016500408657723002,
            "samples_per_second": 2240487.186635073,
            "samples_per_second_per_gpu": 280060.8983293841,
            "loss_sequences_lower_95": 4.613933518235993,
            "loss_sequences_upper_95": 4.670992392710382,
            "loss_tokens_lower_95": 4.613041535690394,
            "loss_tokens_upper_95": 4.670824384710163,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.040834058126026,
            "data_time": 0.0025591508889781164,
            "batch_time": 0.016043684489050716,
            "samples_per_second": 2293313.3348278604,
            "samples_per_second_per_gpu": 286664.16685348254,
            "loss_sequences_lower_95": 7.02145325690132,
            "loss_sequences_upper_95": 7.234793445565689,
            "loss_tokens_lower_95": 6.766137027775246,
            "loss_tokens_upper_95": 6.9764504071443225,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.423706528031346,
            "data_time": 0.016895001133282978,
            "batch_time": 0.030701883965068393,
            "samples_per_second": 2033081.1674784059,
            "samples_per_second_per_gpu": 254135.14593480073,
            "loss_sequences_lower_95": 4.241352928077782,
            "loss_sequences_upper_95": 4.654994755000859,
            "loss_tokens_lower_95": 4.15286052185115,
            "loss_tokens_upper_95": 4.478071654934667,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.759975021433059,
            "data_time": 0.004295025020837784,
            "batch_time": 0.017894430086016655,
            "samples_per_second": 2246719.9699336477,
            "samples_per_second_per_gpu": 280839.99624170596,
            "loss_sequences_lower_95": 4.7844870021612325,
            "loss_sequences_upper_95": 4.93683556945849,
            "loss_tokens_lower_95": 4.636087965151707,
            "loss_tokens_upper_95": 4.793710717531308,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.6028279560368235,
            "data_time": 0.028270167963845388,
            "batch_time": 0.043020824591318764,
            "samples_per_second": 1910647.1559933734,
            "samples_per_second_per_gpu": 238830.89449917167,
            "loss_sequences_lower_95": 5.378983492967559,
            "loss_sequences_upper_95": 5.870475685305712,
            "loss_tokens_lower_95": 5.404017467405434,
            "loss_tokens_upper_95": 5.799541151037419,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.3820513799491465,
            "data_time": 0.0021386502792108456,
            "batch_time": 0.015755486831253544,
            "samples_per_second": 2265734.0024168673,
            "samples_per_second_per_gpu": 283216.7503021084,
            "loss_sequences_lower_95": 5.369316031174986,
            "loss_sequences_upper_95": 5.3947301765040505,
            "loss_tokens_lower_95": 5.369389522435737,
            "loss_tokens_upper_95": 5.394668103542584,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.929068192694951,
            "data_time": 0.04334992061961781,
            "batch_time": 0.057693945277820936,
            "samples_per_second": 1840495.257964805,
            "samples_per_second_per_gpu": 230061.90724560063,
            "loss_sequences_lower_95": 4.755971786350879,
            "loss_sequences_upper_95": 5.169018421358275,
            "loss_tokens_lower_95": 4.618148806834538,
            "loss_tokens_upper_95": 5.046762525388626,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.269049564166389,
            "data_time": 0.0016901949725027156,
            "batch_time": 0.015155643674374027,
            "samples_per_second": 2291937.3194069015,
            "samples_per_second_per_gpu": 286492.1649258627,
            "loss_sequences_lower_95": 6.61477311812107,
            "loss_sequences_upper_95": 6.662665483326782,
            "loss_tokens_lower_95": 5.707522654738878,
            "loss_tokens_upper_95": 5.755233849129594,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.276950288534165,
            "data_time": 0.005206581619050767,
            "batch_time": 0.01893059318027799,
            "samples_per_second": 2229402.2502658023,
            "samples_per_second_per_gpu": 278675.2812832253,
            "loss_sequences_lower_95": 5.2750650390625005,
            "loss_sequences_upper_95": 5.574622473144531,
            "loss_tokens_lower_95": 4.955164574949849,
            "loss_tokens_upper_95": 5.217411550476121,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.519871111538099,
            "data_time": 0.01950662620997025,
            "batch_time": 0.033751352358672576,
            "samples_per_second": 2022256.303299392,
            "samples_per_second_per_gpu": 252782.037912424,
            "loss_sequences_lower_95": 4.370526587444803,
            "loss_sequences_upper_95": 4.667949139138927,
            "loss_tokens_lower_95": 4.36985279912534,
            "loss_tokens_upper_95": 4.66748779296875,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 11.178329158551765,
            "data_time": 0.004233452570007508,
            "batch_time": 0.018244767045400227,
            "samples_per_second": 2184613.028751845,
            "samples_per_second_per_gpu": 273076.6285939806,
            "loss_sequences_lower_95": 11.010509994969224,
            "loss_sequences_upper_95": 11.343578657670454,
            "loss_tokens_lower_95": 11.011452303799716,
            "loss_tokens_upper_95": 11.34559170809659,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.3647124546368916,
            "data_time": 0.0038744683595413856,
            "batch_time": 0.017542991232364736,
            "samples_per_second": 2253689.557070592,
            "samples_per_second_per_gpu": 281711.194633824,
            "loss_sequences_lower_95": 3.4069115559895833,
            "loss_sequences_upper_95": 3.476198152669271,
            "loss_tokens_lower_95": 3.266131515106043,
            "loss_tokens_upper_95": 3.355768666841737,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.527002756936209,
            "data_time": 0.021126619407108853,
            "batch_time": 0.035267744745527,
            "samples_per_second": 1949751.7260198127,
            "samples_per_second_per_gpu": 243718.9657524766,
            "loss_sequences_lower_95": 6.153916655040923,
            "loss_sequences_upper_95": 6.898503970191592,
            "loss_tokens_lower_95": 6.155444568452381,
            "loss_tokens_upper_95": 6.907920328776042,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.686511382460594,
            "data_time": 0.13981647789478302,
            "batch_time": 0.15706199407577515,
            "samples_per_second": 746674.8426514054,
            "samples_per_second_per_gpu": 93334.35533142567,
            "loss_sequences_lower_95": 5.470014345645905,
            "loss_sequences_upper_95": 6.738216364383697,
            "loss_tokens_lower_95": 5.194957514497423,
            "loss_tokens_upper_95": 5.800530414384665,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.107753353595733,
            "data_time": 0.005468507607777913,
            "batch_time": 0.019170289947873072,
            "samples_per_second": 2220513.3070065645,
            "samples_per_second_per_gpu": 277564.16337582056,
            "loss_sequences_lower_95": 7.99657001953125,
            "loss_sequences_upper_95": 8.399462036132814,
            "loss_tokens_lower_95": 7.814461758737838,
            "loss_tokens_upper_95": 8.17256605015995,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.5175074458122255,
            "data_time": 0.005551912481822665,
            "batch_time": 0.01914999456632705,
            "samples_per_second": 2240562.377369873,
            "samples_per_second_per_gpu": 280070.29717123415,
            "loss_sequences_lower_95": 7.6487201171875006,
            "loss_sequences_upper_95": 7.9152634765625,
            "loss_tokens_lower_95": 7.219696390288559,
            "loss_tokens_upper_95": 7.443707798025162,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.709635132397336,
            "data_time": 0.003908400232576607,
            "batch_time": 0.01801821093096781,
            "samples_per_second": 2175438.7055151593,
            "samples_per_second_per_gpu": 271929.8381893949,
            "loss_sequences_lower_95": 5.68596625997842,
            "loss_sequences_upper_95": 5.733275180449631,
            "loss_tokens_lower_95": 5.686044928422519,
            "loss_tokens_upper_95": 5.733417916369846,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.550137857870755,
            "data_time": 0.007673751911728043,
            "batch_time": 0.02126757904121883,
            "samples_per_second": 2209682.5716065797,
            "samples_per_second_per_gpu": 276210.32145082246,
            "loss_sequences_lower_95": 4.4534461143013155,
            "loss_sequences_upper_95": 4.647444708336334,
            "loss_tokens_lower_95": 4.449672322778658,
            "loss_tokens_upper_95": 4.645427894885273,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.350911876678467,
            "data_time": 0.005341560121566531,
            "batch_time": 0.01897899546320476,
            "samples_per_second": 2239678.4766595084,
            "samples_per_second_per_gpu": 279959.80958243855,
            "loss_sequences_lower_95": 8.289158764648437,
            "loss_sequences_upper_95": 8.41385537109375,
            "loss_tokens_lower_95": 8.288637353515625,
            "loss_tokens_upper_95": 8.415823559570313,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.351382519168006,
            "data_time": 0.002130125826383133,
            "batch_time": 0.01566944748782931,
            "samples_per_second": 2279860.970979565,
            "samples_per_second_per_gpu": 284982.6213724456,
            "loss_sequences_lower_95": 6.794448893537133,
            "loss_sequences_upper_95": 6.890287332958845,
            "loss_tokens_lower_95": 5.810736263890074,
            "loss_tokens_upper_95": 5.877908650769952,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.061785592072046,
            "data_time": 0.017653075286320277,
            "batch_time": 0.031700542994907925,
            "samples_per_second": 2024246.9360218751,
            "samples_per_second_per_gpu": 253030.8670027344,
            "loss_sequences_lower_95": 4.917532303084188,
            "loss_sequences_upper_95": 5.205455871126546,
            "loss_tokens_lower_95": 4.919354396079903,
            "loss_tokens_upper_95": 5.204050263362144,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.969540364134545,
            "data_time": 0.009619824588298798,
            "batch_time": 0.023494350723922253,
            "samples_per_second": 2184534.2148002004,
            "samples_per_second_per_gpu": 273066.77685002505,
            "loss_sequences_lower_95": 4.868909600949755,
            "loss_sequences_upper_95": 5.070287296070772,
            "loss_tokens_lower_95": 4.86948018391927,
            "loss_tokens_upper_95": 5.0666174316406245,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.567553450227098,
            "data_time": 0.002206656419506488,
            "batch_time": 0.01583811049760655,
            "samples_per_second": 2261684.567326943,
            "samples_per_second_per_gpu": 282710.57091586787,
            "loss_sequences_lower_95": 6.880163805117124,
            "loss_sequences_upper_95": 6.974041823738726,
            "loss_tokens_lower_95": 6.0683116155552295,
            "loss_tokens_upper_95": 6.147214829572533,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.381615908688338,
            "data_time": 0.024812273681163788,
            "batch_time": 0.039447635412216187,
            "samples_per_second": 1969018.9929662405,
            "samples_per_second_per_gpu": 246127.37412078006,
            "loss_sequences_lower_95": 5.29740637723731,
            "loss_sequences_upper_95": 5.466236966249173,
            "loss_tokens_lower_95": 5.299654247142651,
            "loss_tokens_upper_95": 5.468269727595899,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 9.728362768068227,
            "data_time": 0.003868676658369537,
            "batch_time": 0.01749473686474438,
            "samples_per_second": 2257184.560091824,
            "samples_per_second_per_gpu": 282148.070011478,
            "loss_sequences_lower_95": 9.693294748662078,
            "loss_sequences_upper_95": 9.762666030557149,
            "loss_tokens_lower_95": 9.69430130984805,
            "loss_tokens_upper_95": 9.763627168147936,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.9951029992798,
            "data_time": 0.022477550940080124,
            "batch_time": 0.03730849136005748,
            "samples_per_second": 1860110.822790536,
            "samples_per_second_per_gpu": 232513.852848817,
            "loss_sequences_lower_95": 4.834567386664233,
            "loss_sequences_upper_95": 5.158137660350614,
            "loss_tokens_lower_95": 4.831348611776112,
            "loss_tokens_upper_95": 5.159112208097884,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.895058488845825,
            "data_time": 0.06896693259477615,
            "batch_time": 0.08450822532176971,
            "samples_per_second": 1429504.687951348,
            "samples_per_second_per_gpu": 178688.0859939185,
            "loss_sequences_lower_95": 6.512137896219889,
            "loss_sequences_upper_95": 7.575539143880208,
            "loss_tokens_lower_95": 5.89557622273763,
            "loss_tokens_upper_95": 7.376636886596679,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.120997540156046,
            "data_time": 0.06900017708539963,
            "batch_time": 0.0850578248500824,
            "samples_per_second": 1380651.6449687348,
            "samples_per_second_per_gpu": 172581.45562109185,
            "loss_sequences_lower_95": 5.751099230448405,
            "loss_sequences_upper_95": 6.951587168375651,
            "loss_tokens_lower_95": 4.9656602752342645,
            "loss_tokens_upper_95": 6.48115164081702,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.737975523362981,
            "data_time": 0.003455344335411211,
            "batch_time": 0.017064163725540312,
            "samples_per_second": 2269482.6678319057,
            "samples_per_second_per_gpu": 283685.3334789882,
            "loss_sequences_lower_95": 7.7169065261413845,
            "loss_sequences_upper_95": 7.760906477241347,
            "loss_tokens_lower_95": 7.715665069265464,
            "loss_tokens_upper_95": 7.760329586248159,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.165630854556688,
            "data_time": 0.0015704039916298322,
            "batch_time": 0.015175874958413956,
            "samples_per_second": 2272325.7353067687,
            "samples_per_second_per_gpu": 284040.7169133461,
            "loss_sequences_lower_95": 5.544400008761881,
            "loss_sequences_upper_95": 5.580894516023559,
            "loss_tokens_lower_95": 4.686880459383952,
            "loss_tokens_upper_95": 4.720196062612724,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.00760369413481,
            "data_time": 0.03767715021967888,
            "batch_time": 0.052881717681884766,
            "samples_per_second": 1897898.7528078395,
            "samples_per_second_per_gpu": 237237.34410097994,
            "loss_sequences_lower_95": 3.8766446932094305,
            "loss_sequences_upper_95": 4.207235789862205,
            "loss_tokens_lower_95": 3.749917582298959,
            "loss_tokens_upper_95": 3.9438056887704187,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.685734658627896,
            "data_time": 0.10405740283784412,
            "batch_time": 0.1253003506433396,
            "samples_per_second": 1016602.7516670264,
            "samples_per_second_per_gpu": 127075.3439583783,
            "loss_sequences_lower_95": 5.330799607972841,
            "loss_sequences_upper_95": 6.1219630370268945,
            "loss_tokens_lower_95": 5.133033441614222,
            "loss_tokens_upper_95": 6.150646718343099,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9193710772002617,
            "data_time": 0.028166350864228747,
            "batch_time": 0.042868029503595265,
            "samples_per_second": 1927706.59953759,
            "samples_per_second_per_gpu": 240963.32494219876,
            "loss_sequences_lower_95": 3.8234216643542775,
            "loss_sequences_upper_95": 4.096916124297351,
            "loss_tokens_lower_95": 3.6829643558794154,
            "loss_tokens_upper_95": 3.8448491215044154,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.942298742329202,
            "data_time": 0.02945254814057123,
            "batch_time": 0.04423097769419352,
            "samples_per_second": 1905296.9752537708,
            "samples_per_second_per_gpu": 238162.12190672135,
            "loss_sequences_lower_95": 3.9050080462199888,
            "loss_sequences_upper_95": 4.14923438095465,
            "loss_tokens_lower_95": 3.6923747498753956,
            "loss_tokens_upper_95": 3.826641018969157,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9655026255584342,
            "data_time": 0.026980255331311907,
            "batch_time": 0.04154716786884126,
            "samples_per_second": 1964286.515740469,
            "samples_per_second_per_gpu": 245535.81446755864,
            "loss_sequences_lower_95": 3.716632731367902,
            "loss_sequences_upper_95": 4.037161757306355,
            "loss_tokens_lower_95": 3.8080472835389414,
            "loss_tokens_upper_95": 4.024868411151344,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.012337228146995,
            "data_time": 0.028260168575105212,
            "batch_time": 0.04267779134568714,
            "samples_per_second": 1929595.5461528369,
            "samples_per_second_per_gpu": 241199.4432691046,
            "loss_sequences_lower_95": 3.985551154904249,
            "loss_sequences_upper_95": 4.2203095412835845,
            "loss_tokens_lower_95": 3.7599088199413453,
            "loss_tokens_upper_95": 3.882433349989656,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5161739165738504,
            "data_time": 0.02972768559867953,
            "batch_time": 0.04451673413500374,
            "samples_per_second": 1975818.9993655484,
            "samples_per_second_per_gpu": 246977.37492069355,
            "loss_sequences_lower_95": 3.451269597592561,
            "loss_sequences_upper_95": 3.598007789753979,
            "loss_tokens_lower_95": 3.4661974901286925,
            "loss_tokens_upper_95": 3.5643771022228723,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.11640465259552,
            "data_time": 0.02935944284711565,
            "batch_time": 0.044396831875755674,
            "samples_per_second": 1866877.2538156111,
            "samples_per_second_per_gpu": 233359.6567269514,
            "loss_sequences_lower_95": 3.081776214227444,
            "loss_sequences_upper_95": 3.2751764344006054,
            "loss_tokens_lower_95": 2.9164399440551536,
            "loss_tokens_upper_95": 3.0008985611155063,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.25/params.txt",
    "uuid": "4c95a0ba-21ed-46ca-8965-b1a27167e3c9",
    "creation_date": "2023_12_14-06_30_44"
}