{
    "name": "rw_original-d=512_l=8_h=4-4.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 6313123840,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 4.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "1262624768",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=512_l=8_h=4-4.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.7880390008290608,
            "data_time": 0.03216378763318062,
            "batch_time": 0.3354330509901047,
            "samples_per_second": 1704107.7556162481,
            "samples_per_second_per_gpu": 213013.46945203101,
            "loss_sequences_lower_95": 3.7049106915791827,
            "loss_sequences_upper_95": 3.8727920850118003,
            "loss_tokens_lower_95": 3.7736722882588705,
            "loss_tokens_upper_95": 3.8025310643513994,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.606849741176428,
            "data_time": 0.0014167358417375012,
            "batch_time": 0.015292062200249142,
            "samples_per_second": 2241420.6374586113,
            "samples_per_second_per_gpu": 280177.5796823264,
            "loss_sequences_lower_95": 3.60444129305933,
            "loss_sequences_upper_95": 3.6092014042076963,
            "loss_tokens_lower_95": 3.5958519791666665,
            "loss_tokens_upper_95": 3.6176081562499998,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.1614281391610906,
            "data_time": 0.009060787200927735,
            "batch_time": 0.023513991355895995,
            "samples_per_second": 2118870.7848046054,
            "samples_per_second_per_gpu": 264858.8481005757,
            "loss_sequences_lower_95": 3.103879793128189,
            "loss_sequences_upper_95": 3.2342152249083225,
            "loss_tokens_lower_95": 3.1484740625,
            "loss_tokens_upper_95": 3.174449640625,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7368289490335997,
            "data_time": 0.0015853941440582275,
            "batch_time": 0.015104459108490693,
            "samples_per_second": 2308657.6077709426,
            "samples_per_second_per_gpu": 288582.2009713678,
            "loss_sequences_lower_95": 3.6949081628543814,
            "loss_sequences_upper_95": 3.7807006080863403,
            "loss_tokens_lower_95": 3.723812802083333,
            "loss_tokens_upper_95": 3.7496463437500003,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6760785516556314,
            "data_time": 0.009517226086194772,
            "batch_time": 0.02315747595403299,
            "samples_per_second": 2214409.2236753264,
            "samples_per_second_per_gpu": 276801.1529594158,
            "loss_sequences_lower_95": 3.614868909909623,
            "loss_sequences_upper_95": 3.756408393067401,
            "loss_tokens_lower_95": 3.6645207291666666,
            "loss_tokens_upper_95": 3.6873499062499997,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.847668441277147,
            "data_time": 0.0038012292074120564,
            "batch_time": 0.01758680557427199,
            "samples_per_second": 2274031.804026792,
            "samples_per_second_per_gpu": 284253.975503349,
            "loss_sequences_lower_95": 3.797332618448866,
            "loss_sequences_upper_95": 3.9027967545063946,
            "loss_tokens_lower_95": 3.834658322916667,
            "loss_tokens_upper_95": 3.860443833333333,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6945325993031872,
            "data_time": 0.0014838300171334047,
            "batch_time": 0.014869978921837364,
            "samples_per_second": 2337120.1701995702,
            "samples_per_second_per_gpu": 292140.0212749463,
            "loss_sequences_lower_95": 3.660833227040816,
            "loss_sequences_upper_95": 3.7279765724649234,
            "loss_tokens_lower_95": 3.6777540520833334,
            "loss_tokens_upper_95": 3.71216665625,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.123601439510964,
            "data_time": 0.0016346493309783616,
            "batch_time": 0.014987865585205895,
            "samples_per_second": 2337512.345847793,
            "samples_per_second_per_gpu": 292189.0432309741,
            "loss_sequences_lower_95": 4.09822206315445,
            "loss_sequences_upper_95": 4.151103873527487,
            "loss_tokens_lower_95": 4.11148328125,
            "loss_tokens_upper_95": 4.13541521875,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7922910161134675,
            "data_time": 0.01274362253764319,
            "batch_time": 0.026606413107069713,
            "samples_per_second": 2172774.1267171013,
            "samples_per_second_per_gpu": 271596.76583963766,
            "loss_sequences_lower_95": 3.6997451782226562,
            "loss_sequences_upper_95": 3.9044500459500444,
            "loss_tokens_lower_95": 3.780113375,
            "loss_tokens_upper_95": 3.80434375,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.91920380818514,
            "data_time": 0.009434234350919724,
            "batch_time": 0.023363983258605003,
            "samples_per_second": 2187533.2571241674,
            "samples_per_second_per_gpu": 273441.6571405209,
            "loss_sequences_lower_95": 4.7975545649471965,
            "loss_sequences_upper_95": 5.070291282442718,
            "loss_tokens_lower_95": 4.905678395833333,
            "loss_tokens_upper_95": 4.933031583333333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8489502647306324,
            "data_time": 0.0012368477021350898,
            "batch_time": 0.014767257440054244,
            "samples_per_second": 2316575.1022401447,
            "samples_per_second_per_gpu": 289571.8877800181,
            "loss_sequences_lower_95": 3.8358115920926408,
            "loss_sequences_upper_95": 3.8626866670378237,
            "loss_tokens_lower_95": 3.8371867083333333,
            "loss_tokens_upper_95": 3.860826354166667,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.723179285747714,
            "data_time": 0.002373370997216878,
            "batch_time": 0.015968528020193336,
            "samples_per_second": 2331251.2857574103,
            "samples_per_second_per_gpu": 291406.4107196763,
            "loss_sequences_lower_95": 3.695118672070231,
            "loss_sequences_upper_95": 3.752469379213739,
            "loss_tokens_lower_95": 3.711247854166667,
            "loss_tokens_upper_95": 3.7351324999999997,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.189009997714122,
            "data_time": 0.009065266183242496,
            "batch_time": 0.022685572092712162,
            "samples_per_second": 2208939.804812504,
            "samples_per_second_per_gpu": 276117.475601563,
            "loss_sequences_lower_95": 4.096152829059965,
            "loss_sequences_upper_95": 4.301626543582816,
            "loss_tokens_lower_95": 4.175578718750001,
            "loss_tokens_upper_95": 4.202154260416666,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4596699575785466,
            "data_time": 0.009535811336866888,
            "batch_time": 0.023755192281715423,
            "samples_per_second": 2125708.4013622645,
            "samples_per_second_per_gpu": 265713.55017028307,
            "loss_sequences_lower_95": 3.3692118347540894,
            "loss_sequences_upper_95": 3.5659348870489116,
            "loss_tokens_lower_95": 3.4474871718749998,
            "loss_tokens_upper_95": 3.472138145833333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.514364730228078,
            "data_time": 0.08203752551759992,
            "batch_time": 0.09794052158083234,
            "samples_per_second": 1008688.141999324,
            "samples_per_second_per_gpu": 126086.0177499155,
            "loss_sequences_lower_95": 4.430996938185258,
            "loss_sequences_upper_95": 4.6183362527327105,
            "loss_tokens_lower_95": 4.489878420396285,
            "loss_tokens_upper_95": 4.539570530978116,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8120428480142756,
            "data_time": 0.013469144701957703,
            "batch_time": 0.0276294160972942,
            "samples_per_second": 2108459.7892623846,
            "samples_per_second_per_gpu": 263557.4736577981,
            "loss_sequences_lower_95": 3.7433600511912357,
            "loss_sequences_upper_95": 3.879456224802979,
            "loss_tokens_lower_95": 3.7983829166666667,
            "loss_tokens_upper_95": 3.8253214270833333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.671633743044569,
            "data_time": 0.012313449134429296,
            "batch_time": 0.02667482445637385,
            "samples_per_second": 2115933.195252123,
            "samples_per_second_per_gpu": 264491.64940651535,
            "loss_sequences_lower_95": 5.578973066586618,
            "loss_sequences_upper_95": 5.795433861724934,
            "loss_tokens_lower_95": 5.660088197916667,
            "loss_tokens_upper_95": 5.683124895833333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.178608102876632,
            "data_time": 0.034462105482816696,
            "batch_time": 0.049024783074855804,
            "samples_per_second": 1890654.828454972,
            "samples_per_second_per_gpu": 236331.8535568715,
            "loss_sequences_lower_95": 4.017351381895972,
            "loss_sequences_upper_95": 4.463861409171683,
            "loss_tokens_lower_95": 4.164308391633581,
            "loss_tokens_upper_95": 4.193401405459545,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.726164633656241,
            "data_time": 0.0020228569355532355,
            "batch_time": 0.01567457551836696,
            "samples_per_second": 2263568.208705263,
            "samples_per_second_per_gpu": 282946.0260881579,
            "loss_sequences_lower_95": 4.70914790505715,
            "loss_sequences_upper_95": 4.743702361108638,
            "loss_tokens_lower_95": 4.7089335230914395,
            "loss_tokens_upper_95": 4.743159903659557,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.2703146342617706,
            "data_time": 0.0020756437235577093,
            "batch_time": 0.015770929064720298,
            "samples_per_second": 2255420.743109029,
            "samples_per_second_per_gpu": 281927.5928886286,
            "loss_sequences_lower_95": 3.2679499228241387,
            "loss_sequences_upper_95": 3.2935554752072544,
            "loss_tokens_lower_95": 3.250928195079394,
            "loss_tokens_upper_95": 3.2702669628842185,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.277467234719177,
            "data_time": 0.002959804193351091,
            "batch_time": 0.01662711261242319,
            "samples_per_second": 2256416.5359550733,
            "samples_per_second_per_gpu": 282052.06699438416,
            "loss_sequences_lower_95": 5.549619673421263,
            "loss_sequences_upper_95": 5.864632626597928,
            "loss_tokens_lower_95": 4.690651037013045,
            "loss_tokens_upper_95": 4.9168831764132115,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.131739436089992,
            "data_time": 0.0038972397116904564,
            "batch_time": 0.01748816875067163,
            "samples_per_second": 2252219.781336592,
            "samples_per_second_per_gpu": 281527.472667074,
            "loss_sequences_lower_95": 5.261320597330729,
            "loss_sequences_upper_95": 5.458063614908854,
            "loss_tokens_lower_95": 4.81058984375,
            "loss_tokens_upper_95": 4.9510284984276725,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.3627222661632223,
            "data_time": 0.0044686589305756854,
            "batch_time": 0.0180513412884098,
            "samples_per_second": 2253714.6068833508,
            "samples_per_second_per_gpu": 281714.32586041884,
            "loss_sequences_lower_95": 3.4056228325347093,
            "loss_sequences_upper_95": 3.4679906097481523,
            "loss_tokens_lower_95": 3.2692184777611075,
            "loss_tokens_upper_95": 3.3007669763636143,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.794756403836337,
            "data_time": 0.02263409537928445,
            "batch_time": 0.037282896893365045,
            "samples_per_second": 1998112.9855091325,
            "samples_per_second_per_gpu": 249764.12318864156,
            "loss_sequences_lower_95": 2.7682917924360795,
            "loss_sequences_upper_95": 2.882764809348367,
            "loss_tokens_lower_95": 2.7214955154691123,
            "loss_tokens_upper_95": 2.7721673054393836,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.627833489982449,
            "data_time": 0.01993839256465435,
            "batch_time": 0.03386189974844456,
            "samples_per_second": 1996014.1337912004,
            "samples_per_second_per_gpu": 249501.76672390004,
            "loss_sequences_lower_95": 3.6131263950892856,
            "loss_sequences_upper_95": 3.808402853206712,
            "loss_tokens_lower_95": 3.501741818561696,
            "loss_tokens_upper_95": 3.5967763982629486,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.026895906925201,
            "data_time": 0.016316635486407157,
            "batch_time": 0.03057731420565874,
            "samples_per_second": 2014566.417944346,
            "samples_per_second_per_gpu": 251820.80224304326,
            "loss_sequences_lower_95": 3.9924458516438803,
            "loss_sequences_upper_95": 4.105238423665364,
            "loss_tokens_lower_95": 3.8890169914632264,
            "loss_tokens_upper_95": 4.101134755968016,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.901884629631606,
            "data_time": 0.001817177886182006,
            "batch_time": 0.015525975643608844,
            "samples_per_second": 2253802.07803734,
            "samples_per_second_per_gpu": 281725.2597546675,
            "loss_sequences_lower_95": 6.920905098174303,
            "loss_sequences_upper_95": 6.994453167289995,
            "loss_tokens_lower_95": 6.751309841458024,
            "loss_tokens_upper_95": 6.829119452225381,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.909660047854639,
            "data_time": 0.002778046683177052,
            "batch_time": 0.01651410428469613,
            "samples_per_second": 2242786.323579234,
            "samples_per_second_per_gpu": 280348.29044740426,
            "loss_sequences_lower_95": 5.492533242582071,
            "loss_sequences_upper_95": 5.794606968529698,
            "loss_tokens_lower_95": 4.152247501827174,
            "loss_tokens_upper_95": 4.29104093712747,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.429399246343573,
            "data_time": 0.004910441669257912,
            "batch_time": 0.018589119653444033,
            "samples_per_second": 2228149.292957646,
            "samples_per_second_per_gpu": 278518.66161970573,
            "loss_sequences_lower_95": 4.873786264881746,
            "loss_sequences_upper_95": 5.213180979445526,
            "loss_tokens_lower_95": 3.992043654770188,
            "loss_tokens_upper_95": 4.149806751547879,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.729832067881545,
            "data_time": 0.02191024805818285,
            "batch_time": 0.036345217909131734,
            "samples_per_second": 1982771.2650968707,
            "samples_per_second_per_gpu": 247846.40813710884,
            "loss_sequences_lower_95": 5.63242973432149,
            "loss_sequences_upper_95": 5.829427501382349,
            "loss_tokens_lower_95": 5.6329755669859445,
            "loss_tokens_upper_95": 5.827730696168665,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7061040735244752,
            "data_time": 0.04851124378351065,
            "batch_time": 0.06326723557252151,
            "samples_per_second": 1788751.7359685355,
            "samples_per_second_per_gpu": 223593.96699606694,
            "loss_sequences_lower_95": 3.555229217529297,
            "loss_sequences_upper_95": 3.9412219161987303,
            "loss_tokens_lower_95": 3.3884801165149123,
            "loss_tokens_upper_95": 3.8565516490629195,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.10929336372149,
            "data_time": 0.003315913165273842,
            "batch_time": 0.017049315029370515,
            "samples_per_second": 2251870.5259704622,
            "samples_per_second_per_gpu": 281483.8157463078,
            "loss_sequences_lower_95": 5.061098502870714,
            "loss_sequences_upper_95": 5.158198976858367,
            "loss_tokens_lower_95": 5.060470421751055,
            "loss_tokens_upper_95": 5.158381932394287,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.997062686620239,
            "data_time": 0.004456674488683704,
            "batch_time": 0.01812278971586492,
            "samples_per_second": 2245191.200369094,
            "samples_per_second_per_gpu": 280648.90004613675,
            "loss_sequences_lower_95": 4.945764530066288,
            "loss_sequences_upper_95": 5.047804234010289,
            "loss_tokens_lower_95": 4.944416317951474,
            "loss_tokens_upper_95": 5.048389761609977,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.776989667018687,
            "data_time": 0.0033418524600002213,
            "batch_time": 0.016865906396128446,
            "samples_per_second": 2257910.1586938603,
            "samples_per_second_per_gpu": 282238.76983673254,
            "loss_sequences_lower_95": 3.9223211477319095,
            "loss_sequences_upper_95": 4.051302778474394,
            "loss_tokens_lower_95": 3.610317187617337,
            "loss_tokens_upper_95": 3.67023186007472,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.721587877035141,
            "data_time": 0.010315846651792526,
            "batch_time": 0.02418240439146757,
            "samples_per_second": 2143187.824771883,
            "samples_per_second_per_gpu": 267898.47809648537,
            "loss_sequences_lower_95": 5.913242517089843,
            "loss_sequences_upper_95": 6.463766223144531,
            "loss_tokens_lower_95": 5.1083984375,
            "loss_tokens_upper_95": 5.470475723599852,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.210596516728401,
            "data_time": 0.14152564108371735,
            "batch_time": 0.15810826420783997,
            "samples_per_second": 785323.1975348326,
            "samples_per_second_per_gpu": 98165.39969185408,
            "loss_sequences_lower_95": 3.920667862892151,
            "loss_sequences_upper_95": 4.563811659812927,
            "loss_tokens_lower_95": 3.765039518509788,
            "loss_tokens_upper_95": 4.515639092456335,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.446593358598906,
            "data_time": 0.02572205979773339,
            "batch_time": 0.03983690383586478,
            "samples_per_second": 1888600.6627283036,
            "samples_per_second_per_gpu": 236075.08284103795,
            "loss_sequences_lower_95": 4.684332573550871,
            "loss_sequences_upper_95": 5.280028463780194,
            "loss_tokens_lower_95": 3.549203756132416,
            "loss_tokens_upper_95": 3.9176328433272265,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.7344141910564645,
            "data_time": 0.0029414776298734876,
            "batch_time": 0.016640702055560216,
            "samples_per_second": 2239802.6768700066,
            "samples_per_second_per_gpu": 279975.3346087508,
            "loss_sequences_lower_95": 2.7099611454834944,
            "loss_sequences_upper_95": 2.7589493217818233,
            "loss_tokens_lower_95": 2.709046258088958,
            "loss_tokens_upper_95": 2.7591227053554364,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4791966759037978,
            "data_time": 0.0027796429259211784,
            "batch_time": 0.016442219323782575,
            "samples_per_second": 2261603.852781482,
            "samples_per_second_per_gpu": 282700.48159768526,
            "loss_sequences_lower_95": 3.449866781562318,
            "loss_sequences_upper_95": 3.6216368859008106,
            "loss_tokens_lower_95": 3.2873662789372946,
            "loss_tokens_upper_95": 3.4561128789475486,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4379621744155884,
            "data_time": 0.017082821991708543,
            "batch_time": 0.031093236472871568,
            "samples_per_second": 2023809.617270973,
            "samples_per_second_per_gpu": 252976.20215887163,
            "loss_sequences_lower_95": 3.2913561419252946,
            "loss_sequences_upper_95": 3.694453625801282,
            "loss_tokens_lower_95": 3.177305005944293,
            "loss_tokens_upper_95": 3.478732810783563,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8121192857046813,
            "data_time": 0.004730609431862831,
            "batch_time": 0.018542782589793205,
            "samples_per_second": 2208288.762178179,
            "samples_per_second_per_gpu": 276036.09527227236,
            "loss_sequences_lower_95": 3.8502537366811365,
            "loss_sequences_upper_95": 3.998847451996596,
            "loss_tokens_lower_95": 3.6686041312090265,
            "loss_tokens_upper_95": 3.8135380552403766,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.055187915883413,
            "data_time": 0.029150974182855515,
            "batch_time": 0.043167244820367726,
            "samples_per_second": 1982651.1534498131,
            "samples_per_second_per_gpu": 247831.39418122664,
            "loss_sequences_lower_95": 2.8849799877259787,
            "loss_sequences_upper_95": 3.3675643455691455,
            "loss_tokens_lower_95": 2.789958810728487,
            "loss_tokens_upper_95": 3.1551195552259634,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.4743321356856365,
            "data_time": 0.0025110114730456995,
            "batch_time": 0.016145839435883727,
            "samples_per_second": 2259072.815796716,
            "samples_per_second_per_gpu": 282384.1019745895,
            "loss_sequences_lower_95": 5.463995123243399,
            "loss_sequences_upper_95": 5.484733020822914,
            "loss_tokens_lower_95": 5.464092339952365,
            "loss_tokens_upper_95": 5.484611326562187,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.346037353126748,
            "data_time": 0.04527623870156028,
            "batch_time": 0.06023573875427246,
            "samples_per_second": 1626870.9758542061,
            "samples_per_second_per_gpu": 203358.87198177577,
            "loss_sequences_lower_95": 1.2922492166167323,
            "loss_sequences_upper_95": 1.4754759297787563,
            "loss_tokens_lower_95": 1.1492630645220943,
            "loss_tokens_upper_95": 1.420335280419372,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.0157629267755794,
            "data_time": 0.001632833982952882,
            "batch_time": 0.015266942645658092,
            "samples_per_second": 2264897.764660981,
            "samples_per_second_per_gpu": 283112.2205826226,
            "loss_sequences_lower_95": 5.391628535688548,
            "loss_sequences_upper_95": 5.44033481557914,
            "loss_tokens_lower_95": 4.431375362669246,
            "loss_tokens_upper_95": 4.479028022243714,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.710262228488922,
            "data_time": 0.005667846354227218,
            "batch_time": 0.019894507196214464,
            "samples_per_second": 2223241.396021301,
            "samples_per_second_per_gpu": 277905.1745026626,
            "loss_sequences_lower_95": 5.692372033691406,
            "loss_sequences_upper_95": 5.9294741821289065,
            "loss_tokens_lower_95": 5.48009802633169,
            "loss_tokens_upper_95": 5.700249144280501,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.086641778116641,
            "data_time": 0.02174763154175322,
            "batch_time": 0.035840551731950145,
            "samples_per_second": 2023506.4633770816,
            "samples_per_second_per_gpu": 252938.3079221352,
            "loss_sequences_lower_95": 4.933933078931726,
            "loss_sequences_upper_95": 5.238714599609375,
            "loss_tokens_lower_95": 4.934151823624321,
            "loss_tokens_upper_95": 5.237778943932574,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.810391227765517,
            "data_time": 0.004544531365475023,
            "batch_time": 0.018339642918253518,
            "samples_per_second": 2223019.1270466335,
            "samples_per_second_per_gpu": 277877.3908808292,
            "loss_sequences_lower_95": 5.7314809070933945,
            "loss_sequences_upper_95": 5.887502191716974,
            "loss_tokens_lower_95": 5.730682169596354,
            "loss_tokens_upper_95": 5.889146460330847,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.4992314654986063,
            "data_time": 0.004067450444749061,
            "batch_time": 0.018088349002472897,
            "samples_per_second": 2190980.8985633063,
            "samples_per_second_per_gpu": 273872.6123204133,
            "loss_sequences_lower_95": 1.5576363199869792,
            "loss_sequences_upper_95": 1.634353535970052,
            "loss_tokens_lower_95": 1.3963278006515107,
            "loss_tokens_upper_95": 1.4693709007040316,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.231797436305455,
            "data_time": 0.02273173417363848,
            "batch_time": 0.03663541802338192,
            "samples_per_second": 1938068.0687466697,
            "samples_per_second_per_gpu": 242258.5085933337,
            "loss_sequences_lower_95": 5.9195054989769345,
            "loss_sequences_upper_95": 6.543614923386347,
            "loss_tokens_lower_95": 5.912165745326451,
            "loss_tokens_upper_95": 6.552843482607887,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.4830164462327957,
            "data_time": 0.15074777603149414,
            "batch_time": 0.16786503791809082,
            "samples_per_second": 798035.2451476861,
            "samples_per_second_per_gpu": 99754.40564346076,
            "loss_sequences_lower_95": 2.2946963369846345,
            "loss_sequences_upper_95": 3.2575667798519135,
            "loss_tokens_lower_95": 1.9371376430865415,
            "loss_tokens_upper_95": 2.4477902693601,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.683336752891541,
            "data_time": 0.005655261732283093,
            "batch_time": 0.019634244933960928,
            "samples_per_second": 2180707.3175299633,
            "samples_per_second_per_gpu": 272588.4146912454,
            "loss_sequences_lower_95": 7.624920874023437,
            "loss_sequences_upper_95": 7.967920043945313,
            "loss_tokens_lower_95": 7.373468234454315,
            "loss_tokens_upper_95": 7.675679648668835,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.517459182262421,
            "data_time": 0.005600167172295707,
            "batch_time": 0.019392023010859415,
            "samples_per_second": 2219214.3854561886,
            "samples_per_second_per_gpu": 277401.7981820236,
            "loss_sequences_lower_95": 7.660627416992187,
            "loss_sequences_upper_95": 7.9169492919921876,
            "loss_tokens_lower_95": 7.223298644255446,
            "loss_tokens_upper_95": 7.423580218961254,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.790832703520378,
            "data_time": 0.004081783167095886,
            "batch_time": 0.017788388338376048,
            "samples_per_second": 2233194.9900255585,
            "samples_per_second_per_gpu": 279149.3737531948,
            "loss_sequences_lower_95": 5.769501098673734,
            "loss_sequences_upper_95": 5.812055799923525,
            "loss_tokens_lower_95": 5.76979943638954,
            "loss_tokens_upper_95": 5.812139559063861,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.8783416088825,
            "data_time": 0.008241173003735499,
            "batch_time": 0.022115038240784243,
            "samples_per_second": 2159026.4101595934,
            "samples_per_second_per_gpu": 269878.3012699492,
            "loss_sequences_lower_95": 4.788089912754416,
            "loss_sequences_upper_95": 4.967160254506289,
            "loss_tokens_lower_95": 4.7836966847128215,
            "loss_tokens_upper_95": 4.964951756912443,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.0979342169761654,
            "data_time": 0.005537843420391991,
            "batch_time": 0.019286233754385085,
            "samples_per_second": 2239608.8134003906,
            "samples_per_second_per_gpu": 279951.1016750488,
            "loss_sequences_lower_95": 6.994534948730468,
            "loss_sequences_upper_95": 7.201758569335937,
            "loss_tokens_lower_95": 6.995583154296875,
            "loss_tokens_upper_95": 7.198779052734375,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.267218009649636,
            "data_time": 0.002202142073661934,
            "batch_time": 0.015730580805830785,
            "samples_per_second": 2277138.0120701687,
            "samples_per_second_per_gpu": 284642.2515087711,
            "loss_sequences_lower_95": 3.735437069462512,
            "loss_sequences_upper_95": 3.8167003089522233,
            "loss_tokens_lower_95": 2.6884484815896537,
            "loss_tokens_upper_95": 2.7449778160391696,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.292023965671881,
            "data_time": 0.01836909055709839,
            "batch_time": 0.03282971722739084,
            "samples_per_second": 1967033.1130459346,
            "samples_per_second_per_gpu": 245879.13913074182,
            "loss_sequences_lower_95": 5.112515907856955,
            "loss_sequences_upper_95": 5.468469625444555,
            "loss_tokens_lower_95": 5.116290385687529,
            "loss_tokens_upper_95": 5.46842998675446,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.368881805270326,
            "data_time": 0.00998599175363779,
            "batch_time": 0.024058164097368717,
            "samples_per_second": 2146828.8233257383,
            "samples_per_second_per_gpu": 268353.6029157173,
            "loss_sequences_lower_95": 5.249098091873468,
            "loss_sequences_upper_95": 5.487175508386948,
            "loss_tokens_lower_95": 5.253850767846202,
            "loss_tokens_upper_95": 5.486511122759651,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.173153704171131,
            "data_time": 0.00228869637777639,
            "batch_time": 0.01586127114254226,
            "samples_per_second": 2268657.368298969,
            "samples_per_second_per_gpu": 283582.17103737115,
            "loss_sequences_lower_95": 4.691947952876426,
            "loss_sequences_upper_95": 4.7898237734678375,
            "loss_tokens_lower_95": 3.453033741510705,
            "loss_tokens_upper_95": 3.5304726237420723,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.143953078638309,
            "data_time": 0.027790794769922893,
            "batch_time": 0.0424813578526179,
            "samples_per_second": 1978138.6780796766,
            "samples_per_second_per_gpu": 247267.33475995957,
            "loss_sequences_lower_95": 6.055655553101232,
            "loss_sequences_upper_95": 6.228183241748305,
            "loss_tokens_lower_95": 6.056963077928654,
            "loss_tokens_upper_95": 6.229582060960235,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.22547010442168,
            "data_time": 0.0036800038683545457,
            "batch_time": 0.017252006228007967,
            "samples_per_second": 2260876.0499104476,
            "samples_per_second_per_gpu": 282609.50623880595,
            "loss_sequences_lower_95": 4.1924017987265865,
            "loss_sequences_upper_95": 4.258599521275325,
            "loss_tokens_lower_95": 4.193175500525611,
            "loss_tokens_upper_95": 4.257468493167049,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.4251608165722445,
            "data_time": 0.023168377442793413,
            "batch_time": 0.03710052316839045,
            "samples_per_second": 1953604.004190679,
            "samples_per_second_per_gpu": 244200.50052383487,
            "loss_sequences_lower_95": 5.231317701617491,
            "loss_sequences_upper_95": 5.619405624241505,
            "loss_tokens_lower_95": 5.230411862864078,
            "loss_tokens_upper_95": 5.62281863018147,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.2815921942392987,
            "data_time": 0.07580707967281342,
            "batch_time": 0.09178426116704941,
            "samples_per_second": 1280382.3652823004,
            "samples_per_second_per_gpu": 160047.79566028755,
            "loss_sequences_lower_95": 2.944913298288981,
            "loss_sequences_upper_95": 3.768377736409505,
            "loss_tokens_lower_95": 2.585364405314128,
            "loss_tokens_upper_95": 3.4863025877210827,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.857818551858266,
            "data_time": 0.07434424757957458,
            "batch_time": 0.09016149491071701,
            "samples_per_second": 1282972.0515299602,
            "samples_per_second_per_gpu": 160371.50644124503,
            "loss_sequences_lower_95": 2.65489751180013,
            "loss_sequences_upper_95": 3.495084794362386,
            "loss_tokens_lower_95": 2.1295729733584974,
            "loss_tokens_upper_95": 3.090054449874364,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.774923946369379,
            "data_time": 0.003894268870002877,
            "batch_time": 0.017511582220211108,
            "samples_per_second": 2256780.8372723935,
            "samples_per_second_per_gpu": 282097.6046590492,
            "loss_sequences_lower_95": 5.7493447251242635,
            "loss_sequences_upper_95": 5.8000334102310385,
            "loss_tokens_lower_95": 5.749450766292341,
            "loss_tokens_upper_95": 5.799948870696797,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 0.7415809264724178,
            "data_time": 0.0015400750582927687,
            "batch_time": 0.015177007740123006,
            "samples_per_second": 2264961.34155116,
            "samples_per_second_per_gpu": 283120.167693895,
            "loss_sequences_lower_95": 0.8733131891300306,
            "loss_sequences_upper_95": 0.8976877830915979,
            "loss_tokens_lower_95": 0.5978645534025002,
            "loss_tokens_upper_95": 0.6098816272390105,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.130706918521191,
            "data_time": 0.03742239624261856,
            "batch_time": 0.0521821565926075,
            "samples_per_second": 1881502.992999875,
            "samples_per_second_per_gpu": 235187.87412498437,
            "loss_sequences_lower_95": 5.149816930575633,
            "loss_sequences_upper_95": 5.54442670927273,
            "loss_tokens_lower_95": 4.824605237023305,
            "loss_tokens_upper_95": 5.12835913751612,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.940661945858517,
            "data_time": 0.12092088517688569,
            "batch_time": 0.14714803014482772,
            "samples_per_second": 1166938.2019055032,
            "samples_per_second_per_gpu": 145867.2752381879,
            "loss_sequences_lower_95": 7.477026490907411,
            "loss_sequences_upper_95": 8.64260629189981,
            "loss_tokens_lower_95": 6.895035487045476,
            "loss_tokens_upper_95": 8.71057622462143,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.055803759795864,
            "data_time": 0.029531680402301606,
            "batch_time": 0.04401781445457822,
            "samples_per_second": 1917667.2623114963,
            "samples_per_second_per_gpu": 239708.40778893704,
            "loss_sequences_lower_95": 5.016465824406321,
            "loss_sequences_upper_95": 5.363212641274056,
            "loss_tokens_lower_95": 4.684129873777859,
            "loss_tokens_upper_95": 4.941122348743692,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.186005100971315,
            "data_time": 0.02952711922781808,
            "batch_time": 0.044098873933156334,
            "samples_per_second": 1947750.6406282499,
            "samples_per_second_per_gpu": 243468.83007853123,
            "loss_sequences_lower_95": 5.119887347337676,
            "loss_sequences_upper_95": 5.418248906949671,
            "loss_tokens_lower_95": 4.869505481668523,
            "loss_tokens_upper_95": 5.085447191641257,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.152565011164037,
            "data_time": 0.031141587666102817,
            "batch_time": 0.04590918052764166,
            "samples_per_second": 1935656.8829679866,
            "samples_per_second_per_gpu": 241957.11037099833,
            "loss_sequences_lower_95": 5.144347781669802,
            "loss_sequences_upper_95": 5.558081957189048,
            "loss_tokens_lower_95": 4.699098343325456,
            "loss_tokens_upper_95": 5.029972087384093,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.34194649428856,
            "data_time": 0.029335322834196546,
            "batch_time": 0.04351019575482323,
            "samples_per_second": 1967952.5276272434,
            "samples_per_second_per_gpu": 245994.06595340543,
            "loss_sequences_lower_95": 5.262901073548852,
            "loss_sequences_upper_95": 5.559341114323313,
            "loss_tokens_lower_95": 5.047965665918273,
            "loss_tokens_upper_95": 5.246443000389409,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.594409169617647,
            "data_time": 0.030819210005395205,
            "batch_time": 0.04588451208891692,
            "samples_per_second": 1981943.8563429033,
            "samples_per_second_per_gpu": 247742.98204286292,
            "loss_sequences_lower_95": 4.500869201281056,
            "loss_sequences_upper_95": 4.738006250606561,
            "loss_tokens_lower_95": 4.348445908891488,
            "loss_tokens_upper_95": 4.502679488702837,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.942645209591563,
            "data_time": 0.02949247473762149,
            "batch_time": 0.04410904078256516,
            "samples_per_second": 1918101.0842587303,
            "samples_per_second_per_gpu": 239762.6355323413,
            "loss_sequences_lower_95": 3.9385881098305306,
            "loss_sequences_upper_95": 4.194537669856373,
            "loss_tokens_lower_95": 3.70205262370969,
            "loss_tokens_upper_95": 3.8235935498606084,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-4.0/params.txt",
    "uuid": "bce0213b-a163-4aad-a1ae-87813eeeb260",
    "creation_date": "2023_12_14-05_01_12"
}