{
    "name": "rw_original-d=512_l=8_h=4-0.5",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 789140480,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.5
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "157828096",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=512_l=8_h=4-0.5",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 4.443213852246602,
            "data_time": 0.032765984535217285,
            "batch_time": 0.39755742996931076,
            "samples_per_second": 1754554.930604978,
            "samples_per_second_per_gpu": 219319.36632562225,
            "loss_sequences_lower_95": 4.351239980061848,
            "loss_sequences_upper_95": 4.5371360524495445,
            "loss_tokens_lower_95": 4.428330955505372,
            "loss_tokens_upper_95": 4.458082135518391,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.12597624003087,
            "data_time": 0.0014827450457717145,
            "batch_time": 0.015314779794768968,
            "samples_per_second": 2244065.705923993,
            "samples_per_second_per_gpu": 280508.21324049914,
            "loss_sequences_lower_95": 4.123706556474194,
            "loss_sequences_upper_95": 4.128228723646749,
            "loss_tokens_lower_95": 4.114721135416667,
            "loss_tokens_upper_95": 4.137237,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6021631377083914,
            "data_time": 0.009754410743713379,
            "batch_time": 0.023584258079528807,
            "samples_per_second": 2183684.0373291587,
            "samples_per_second_per_gpu": 272960.50466614484,
            "loss_sequences_lower_95": 3.5435850057796556,
            "loss_sequences_upper_95": 3.6753166541274713,
            "loss_tokens_lower_95": 3.588561229166667,
            "loss_tokens_upper_95": 3.6157358645833333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.284551799764338,
            "data_time": 0.0016654777879777708,
            "batch_time": 0.015053324107276765,
            "samples_per_second": 2326209.576344046,
            "samples_per_second_per_gpu": 290776.1970430057,
            "loss_sequences_lower_95": 4.2427649303318296,
            "loss_sequences_upper_95": 4.328329534471649,
            "loss_tokens_lower_95": 4.271083885416666,
            "loss_tokens_upper_95": 4.297650135416666,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.193617354585294,
            "data_time": 0.009923668021699823,
            "batch_time": 0.023832445600593232,
            "samples_per_second": 2176495.289920015,
            "samples_per_second_per_gpu": 272061.91124000185,
            "loss_sequences_lower_95": 4.133065609436657,
            "loss_sequences_upper_95": 4.272911814516771,
            "loss_tokens_lower_95": 4.181483229166667,
            "loss_tokens_upper_95": 4.205446374999999,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.442042759857074,
            "data_time": 0.0037939192160316134,
            "batch_time": 0.01727476392103278,
            "samples_per_second": 2309818.03982859,
            "samples_per_second_per_gpu": 288727.25497857376,
            "loss_sequences_lower_95": 4.392269787506374,
            "loss_sequences_upper_95": 4.496706325549158,
            "loss_tokens_lower_95": 4.428821697916667,
            "loss_tokens_upper_95": 4.454883072916666,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.4800328206529425,
            "data_time": 0.0016168500820925341,
            "batch_time": 0.01490267571578407,
            "samples_per_second": 2350074.2271545227,
            "samples_per_second_per_gpu": 293759.27839431533,
            "loss_sequences_lower_95": 4.447071607940051,
            "loss_sequences_upper_95": 4.512673070790816,
            "loss_tokens_lower_95": 4.4634348020833325,
            "loss_tokens_upper_95": 4.4974225104166665,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.54784523190004,
            "data_time": 0.0016406818894051647,
            "batch_time": 0.015271667654371721,
            "samples_per_second": 2295250.2460107883,
            "samples_per_second_per_gpu": 286906.28075134853,
            "loss_sequences_lower_95": 4.522230806200916,
            "loss_sequences_upper_95": 4.575906290903141,
            "loss_tokens_lower_95": 4.535827260416666,
            "loss_tokens_upper_95": 4.5598502708333335,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.312218863789628,
            "data_time": 0.009294699108789838,
            "batch_time": 0.02298746695594182,
            "samples_per_second": 2197471.8004004373,
            "samples_per_second_per_gpu": 274683.97505005467,
            "loss_sequences_lower_95": 4.221046559403582,
            "loss_sequences_upper_95": 4.422227850192931,
            "loss_tokens_lower_95": 4.299723177083334,
            "loss_tokens_upper_95": 4.324480947916666,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.447191704874453,
            "data_time": 0.009448465891182423,
            "batch_time": 0.023133921436965466,
            "samples_per_second": 2221001.6174480743,
            "samples_per_second_per_gpu": 277625.2021810093,
            "loss_sequences_lower_95": 5.326093711400691,
            "loss_sequences_upper_95": 5.597948688009511,
            "loss_tokens_lower_95": 5.433464822916667,
            "loss_tokens_upper_95": 5.460951458333334,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.4196802420012995,
            "data_time": 0.0013331910230323497,
            "batch_time": 0.014662759341289324,
            "samples_per_second": 2347074.6261325665,
            "samples_per_second_per_gpu": 293384.3282665708,
            "loss_sequences_lower_95": 4.407567937808346,
            "loss_sequences_upper_95": 4.432331224786299,
            "loss_tokens_lower_95": 4.407349927083334,
            "loss_tokens_upper_95": 4.432082666666666,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.311012195379026,
            "data_time": 0.002899175083309685,
            "batch_time": 0.016536670163906583,
            "samples_per_second": 2327143.586718147,
            "samples_per_second_per_gpu": 290892.9483397684,
            "loss_sequences_lower_95": 4.284298198911261,
            "loss_sequences_upper_95": 4.33947713233093,
            "loss_tokens_lower_95": 4.2984119375,
            "loss_tokens_upper_95": 4.323360989583333,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.6387274299141845,
            "data_time": 0.010658405515045046,
            "batch_time": 0.03129879005341662,
            "samples_per_second": 2147140.9735722365,
            "samples_per_second_per_gpu": 268392.62169652956,
            "loss_sequences_lower_95": 4.545046829935504,
            "loss_sequences_upper_95": 4.752970901705914,
            "loss_tokens_lower_95": 4.625218666666666,
            "loss_tokens_upper_95": 4.652161979166666,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.065476528738764,
            "data_time": 0.008961645255525749,
            "batch_time": 0.022572860299828516,
            "samples_per_second": 2206269.2259666147,
            "samples_per_second_per_gpu": 275783.65324582683,
            "loss_sequences_lower_95": 3.979349680976324,
            "loss_sequences_upper_95": 4.169049874051282,
            "loss_tokens_lower_95": 4.052920791666667,
            "loss_tokens_upper_95": 4.078363791666667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.161304289644415,
            "data_time": 0.08274907725197929,
            "batch_time": 0.0994357807295663,
            "samples_per_second": 1014856.9538943178,
            "samples_per_second_per_gpu": 126857.11923678973,
            "loss_sequences_lower_95": 5.074928266351873,
            "loss_sequences_upper_95": 5.261912129142067,
            "loss_tokens_lower_95": 5.1352270299738105,
            "loss_tokens_upper_95": 5.187614605643533,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.4556749019956445,
            "data_time": 0.01361263475634835,
            "batch_time": 0.027624517679214478,
            "samples_per_second": 2115247.111032049,
            "samples_per_second_per_gpu": 264405.88887900615,
            "loss_sequences_lower_95": 4.38406129175303,
            "loss_sequences_upper_95": 4.526789824816646,
            "loss_tokens_lower_95": 4.4419615312500005,
            "loss_tokens_upper_95": 4.469312135416667,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.085236505342348,
            "data_time": 0.012372902284065882,
            "batch_time": 0.026401097575823467,
            "samples_per_second": 2159545.399176139,
            "samples_per_second_per_gpu": 269943.17489701736,
            "loss_sequences_lower_95": 5.992333742811057,
            "loss_sequences_upper_95": 6.207550225975017,
            "loss_tokens_lower_95": 6.073422604166667,
            "loss_tokens_upper_95": 6.09692834375,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.866598473220575,
            "data_time": 0.03700854256749153,
            "batch_time": 0.05261271074414253,
            "samples_per_second": 1761960.610765069,
            "samples_per_second_per_gpu": 220245.07634563363,
            "loss_sequences_lower_95": 4.706472765813109,
            "loss_sequences_upper_95": 5.148666844602491,
            "loss_tokens_lower_95": 4.851583299480501,
            "loss_tokens_upper_95": 4.881833160900679,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.897889058485685,
            "data_time": 0.0020585458275940533,
            "batch_time": 0.015612206419671043,
            "samples_per_second": 2278590.8158925967,
            "samples_per_second_per_gpu": 284823.8519865746,
            "loss_sequences_lower_95": 4.8821613285144565,
            "loss_sequences_upper_95": 4.913913630203141,
            "loss_tokens_lower_95": 4.882010706162761,
            "loss_tokens_upper_95": 4.913578739228742,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8137284395016184,
            "data_time": 0.002128142792328148,
            "batch_time": 0.01576880123584893,
            "samples_per_second": 2262238.2884515235,
            "samples_per_second_per_gpu": 282779.78605644044,
            "loss_sequences_lower_95": 3.8181739915013195,
            "loss_sequences_upper_95": 3.8441331952673767,
            "loss_tokens_lower_95": 3.7909384842671816,
            "loss_tokens_upper_95": 3.811430401040039,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.11791459484713,
            "data_time": 0.003031421898671976,
            "batch_time": 0.016886761231202207,
            "samples_per_second": 2237474.380608685,
            "samples_per_second_per_gpu": 279684.2975760856,
            "loss_sequences_lower_95": 6.3254838066839865,
            "loss_sequences_upper_95": 6.619522600710026,
            "loss_tokens_lower_95": 5.624760524630688,
            "loss_tokens_upper_95": 5.834765229442161,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.157196439305941,
            "data_time": 0.00397855836026212,
            "batch_time": 0.01769142391833853,
            "samples_per_second": 2237105.24949339,
            "samples_per_second_per_gpu": 279638.15618667373,
            "loss_sequences_lower_95": 6.328862483723959,
            "loss_sequences_upper_95": 6.533475260416667,
            "loss_tokens_lower_95": 5.773084144064466,
            "loss_tokens_upper_95": 5.912417612519654,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.07360361018264,
            "data_time": 0.004775520902952997,
            "batch_time": 0.018392506048510338,
            "samples_per_second": 2250941.1276351796,
            "samples_per_second_per_gpu": 281367.64095439744,
            "loss_sequences_lower_95": 4.1174256805581875,
            "loss_sequences_upper_95": 4.187673360204227,
            "loss_tokens_lower_95": 3.9694820138317524,
            "loss_tokens_upper_95": 4.003209910806472,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.556103456020355,
            "data_time": 0.021780576024736677,
            "batch_time": 0.03631812334060669,
            "samples_per_second": 2008158.0296032224,
            "samples_per_second_per_gpu": 251019.7537004028,
            "loss_sequences_lower_95": 3.5284553666548297,
            "loss_sequences_upper_95": 3.6466532065651633,
            "loss_tokens_lower_95": 3.4826007051169,
            "loss_tokens_upper_95": 3.5390179066310594,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.2239567114382375,
            "data_time": 0.021077891811728477,
            "batch_time": 0.03585386462509632,
            "samples_per_second": 1923381.6579403249,
            "samples_per_second_per_gpu": 240422.7072425406,
            "loss_sequences_lower_95": 4.202027650171397,
            "loss_sequences_upper_95": 4.409161638532367,
            "loss_tokens_lower_95": 4.091044042505924,
            "loss_tokens_upper_95": 4.190610830917823,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.660454810460409,
            "data_time": 0.016925989053188227,
            "batch_time": 0.030755655887799386,
            "samples_per_second": 2066885.4270566825,
            "samples_per_second_per_gpu": 258360.67838208532,
            "loss_sequences_lower_95": 4.603961964925131,
            "loss_sequences_upper_95": 4.715868408203125,
            "loss_tokens_lower_95": 4.531933820085561,
            "loss_tokens_upper_95": 4.773992821843132,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.342063486004115,
            "data_time": 0.0018572254732783509,
            "batch_time": 0.015493154995038559,
            "samples_per_second": 2265934.602192953,
            "samples_per_second_per_gpu": 283241.82527411915,
            "loss_sequences_lower_95": 8.361607046743517,
            "loss_sequences_upper_95": 8.43912401272083,
            "loss_tokens_lower_95": 8.184677772170195,
            "loss_tokens_upper_95": 8.264732697453296,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.68948096036911,
            "data_time": 0.002903409652261926,
            "batch_time": 0.01641601324081421,
            "samples_per_second": 2277746.312247471,
            "samples_per_second_per_gpu": 284718.28903093387,
            "loss_sequences_lower_95": 6.293490158748948,
            "loss_sequences_upper_95": 6.607385767670191,
            "loss_tokens_lower_95": 4.878595741265043,
            "loss_tokens_upper_95": 5.027781052203962,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.119224926717452,
            "data_time": 0.005249432212597615,
            "batch_time": 0.018866143919326162,
            "samples_per_second": 2237869.479428341,
            "samples_per_second_per_gpu": 279733.68492854264,
            "loss_sequences_lower_95": 5.583179294208618,
            "loss_sequences_upper_95": 5.925342234328339,
            "loss_tokens_lower_95": 4.645247884272144,
            "loss_tokens_upper_95": 4.808486301339646,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.981389765325747,
            "data_time": 0.02218193028654371,
            "batch_time": 0.03648052045277187,
            "samples_per_second": 1993803.2567540146,
            "samples_per_second_per_gpu": 249225.40709425183,
            "loss_sequences_lower_95": 5.866593326934396,
            "loss_sequences_upper_95": 6.0964266703009065,
            "loss_tokens_lower_95": 5.868171448032605,
            "loss_tokens_upper_95": 6.096527879967537,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.321424703598023,
            "data_time": 0.04983043670654297,
            "batch_time": 0.0643375378388625,
            "samples_per_second": 1808535.0214818525,
            "samples_per_second_per_gpu": 226066.87768523156,
            "loss_sequences_lower_95": 4.18500691986084,
            "loss_sequences_upper_95": 4.597167846679688,
            "loss_tokens_lower_95": 3.966880096817699,
            "loss_tokens_upper_95": 4.470477971877096,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.505759662591081,
            "data_time": 0.003339114120889051,
            "batch_time": 0.017070321948981722,
            "samples_per_second": 2246698.878121989,
            "samples_per_second_per_gpu": 280837.3597652486,
            "loss_sequences_lower_95": 5.468173558245906,
            "loss_sequences_upper_95": 5.542591169156218,
            "loss_tokens_lower_95": 5.4682907307526225,
            "loss_tokens_upper_95": 5.542914311888274,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.4260655843655075,
            "data_time": 0.004977638141949453,
            "batch_time": 0.019068332130718388,
            "samples_per_second": 2181746.5209687995,
            "samples_per_second_per_gpu": 272718.31512109993,
            "loss_sequences_lower_95": 5.378458798820127,
            "loss_sequences_upper_95": 5.473343582840653,
            "loss_tokens_lower_95": 5.375927794360411,
            "loss_tokens_upper_95": 5.474611624452932,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.372208705923892,
            "data_time": 0.0034813955587135793,
            "batch_time": 0.017002059368026436,
            "samples_per_second": 2265375.693700683,
            "samples_per_second_per_gpu": 283171.96171258535,
            "loss_sequences_lower_95": 4.507414742055733,
            "loss_sequences_upper_95": 4.63015011859018,
            "loss_tokens_lower_95": 4.221108808261784,
            "loss_tokens_upper_95": 4.284356362956519,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.37416154050827,
            "data_time": 0.010601997375488281,
            "batch_time": 0.024325856007635593,
            "samples_per_second": 2149807.2279703547,
            "samples_per_second_per_gpu": 268725.90349629434,
            "loss_sequences_lower_95": 6.5465113769531245,
            "loss_sequences_upper_95": 7.100222839355468,
            "loss_tokens_lower_95": 5.698831501928566,
            "loss_tokens_upper_95": 6.060116453947097,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.879425644874573,
            "data_time": 0.14642468094825745,
            "batch_time": 0.16275332868099213,
            "samples_per_second": 832031.1822535787,
            "samples_per_second_per_gpu": 104003.89778169733,
            "loss_sequences_lower_95": 4.571094214916229,
            "loss_sequences_upper_95": 5.295796132087707,
            "loss_tokens_lower_95": 4.355208938423244,
            "loss_tokens_upper_95": 5.20456428966303,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.024878670429361,
            "data_time": 0.028131360703326287,
            "batch_time": 0.04226717796731502,
            "samples_per_second": 1888074.8095642256,
            "samples_per_second_per_gpu": 236009.3511955282,
            "loss_sequences_lower_95": 5.276262647256083,
            "loss_sequences_upper_95": 5.847855535046807,
            "loss_tokens_lower_95": 4.128234643086774,
            "loss_tokens_upper_95": 4.525481049237686,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.734622984799093,
            "data_time": 0.003008573419517941,
            "batch_time": 0.0168833802971575,
            "samples_per_second": 2215213.1047848626,
            "samples_per_second_per_gpu": 276901.6380981078,
            "loss_sequences_lower_95": 3.719104649188374,
            "loss_sequences_upper_95": 3.7499582219821703,
            "loss_tokens_lower_95": 3.7186465842070775,
            "loss_tokens_upper_95": 3.7498568220292703,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.065276346969612,
            "data_time": 0.0026139125927594634,
            "batch_time": 0.0163042349180696,
            "samples_per_second": 2259498.578293626,
            "samples_per_second_per_gpu": 282437.3222867033,
            "loss_sequences_lower_95": 5.040022499029691,
            "loss_sequences_upper_95": 5.252860003426402,
            "loss_tokens_lower_95": 4.798981258046049,
            "loss_tokens_upper_95": 5.005493315370945,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.836545195771661,
            "data_time": 0.01833240853415595,
            "batch_time": 0.03259467581907908,
            "samples_per_second": 1988550.1638854842,
            "samples_per_second_per_gpu": 248568.77048568553,
            "loss_sequences_lower_95": 3.6661473480336393,
            "loss_sequences_upper_95": 4.05316879775498,
            "loss_tokens_lower_95": 3.5925558169609304,
            "loss_tokens_upper_95": 3.905673096380666,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.163063956887682,
            "data_time": 0.004946396127343178,
            "batch_time": 0.018976974487304687,
            "samples_per_second": 2174845.48964231,
            "samples_per_second_per_gpu": 271855.68620528874,
            "loss_sequences_lower_95": 4.1853357039574295,
            "loss_sequences_upper_95": 4.322027496361978,
            "loss_tokens_lower_95": 4.033497374164776,
            "loss_tokens_upper_95": 4.183548139327513,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7893040659951,
            "data_time": 0.031013840720767065,
            "batch_time": 0.045592455636887325,
            "samples_per_second": 1945705.9191631917,
            "samples_per_second_per_gpu": 243213.23989539896,
            "loss_sequences_lower_95": 3.600027000613329,
            "loss_sequences_upper_95": 4.112119563032941,
            "loss_tokens_lower_95": 3.5164487854302613,
            "loss_tokens_upper_95": 3.908050686461307,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.652606167347819,
            "data_time": 0.002234940618407569,
            "batch_time": 0.015949679764089612,
            "samples_per_second": 2245300.9607332703,
            "samples_per_second_per_gpu": 280662.6200916588,
            "loss_sequences_lower_95": 5.646403401774105,
            "loss_sequences_upper_95": 5.658717778711992,
            "loss_tokens_lower_95": 5.646384687093669,
            "loss_tokens_upper_95": 5.65887375717331,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.2504594788967984,
            "data_time": 0.046144749901511455,
            "batch_time": 0.06107182069258257,
            "samples_per_second": 1727746.2943033057,
            "samples_per_second_per_gpu": 215968.2867879132,
            "loss_sequences_lower_95": 2.136037156188372,
            "loss_sequences_upper_95": 2.4279372280083815,
            "loss_tokens_lower_95": 1.9974062495002238,
            "loss_tokens_upper_95": 2.341859946743859,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.9988338686853835,
            "data_time": 0.001722795315780064,
            "batch_time": 0.015424987533865911,
            "samples_per_second": 2253319.092667488,
            "samples_per_second_per_gpu": 281664.886583436,
            "loss_sequences_lower_95": 6.446510764707809,
            "loss_sequences_upper_95": 6.503844564825734,
            "loss_tokens_lower_95": 5.305600834139265,
            "loss_tokens_upper_95": 5.362155065280464,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.283404522418976,
            "data_time": 0.005868675689848643,
            "batch_time": 0.019713390441167922,
            "samples_per_second": 2205229.085893624,
            "samples_per_second_per_gpu": 275653.635736703,
            "loss_sequences_lower_95": 5.251324389648437,
            "loss_sequences_upper_95": 5.490994055175781,
            "loss_tokens_lower_95": 5.082932313947095,
            "loss_tokens_upper_95": 5.30307902301455,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.217373703873676,
            "data_time": 0.02112949904748949,
            "batch_time": 0.03507530891289145,
            "samples_per_second": 2045864.9455105658,
            "samples_per_second_per_gpu": 255733.11818882072,
            "loss_sequences_lower_95": 5.092757701044497,
            "loss_sequences_upper_95": 5.342667276133662,
            "loss_tokens_lower_95": 5.09498463506284,
            "loss_tokens_upper_95": 5.338321944527004,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.775622435410817,
            "data_time": 0.004505563931292798,
            "batch_time": 0.01807482450841421,
            "samples_per_second": 2252328.4508785536,
            "samples_per_second_per_gpu": 281541.0563598192,
            "loss_sequences_lower_95": 8.641007690429689,
            "loss_sequences_upper_95": 8.909119669596356,
            "loss_tokens_lower_95": 8.639280931877368,
            "loss_tokens_upper_95": 8.912393336440578,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.610896288394928,
            "data_time": 0.004134403898360881,
            "batch_time": 0.017934356915189863,
            "samples_per_second": 2234463.878112002,
            "samples_per_second_per_gpu": 279307.9847640002,
            "loss_sequences_lower_95": 1.6771827270507813,
            "loss_sequences_upper_95": 1.771765205891927,
            "loss_tokens_lower_95": 1.4950345919617847,
            "loss_tokens_upper_95": 1.5685389155662264,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.79003407501039,
            "data_time": 0.023075346435819353,
            "batch_time": 0.03716413463865008,
            "samples_per_second": 1978856.9667347402,
            "samples_per_second_per_gpu": 247357.12084184252,
            "loss_sequences_lower_95": 5.508153715587798,
            "loss_sequences_upper_95": 6.076459176199776,
            "loss_tokens_lower_95": 5.503222917829241,
            "loss_tokens_upper_95": 6.080770961216517,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.538988471031189,
            "data_time": 0.15203842520713806,
            "batch_time": 0.1694553792476654,
            "samples_per_second": 867772.7987363649,
            "samples_per_second_per_gpu": 108471.59984204562,
            "loss_sequences_lower_95": 3.250565403699875,
            "loss_sequences_upper_95": 4.4959278941154475,
            "loss_tokens_lower_95": 2.9598751673747583,
            "loss_tokens_upper_95": 3.5393532664505476,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.149189253807068,
            "data_time": 0.005881449532887292,
            "batch_time": 0.01975164432374258,
            "samples_per_second": 2200102.7409695005,
            "samples_per_second_per_gpu": 275012.84262118756,
            "loss_sequences_lower_95": 7.0966165893554685,
            "loss_sequences_upper_95": 7.4455720703125,
            "loss_tokens_lower_95": 6.8182625552724465,
            "loss_tokens_upper_95": 7.129702454129653,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.221116723060608,
            "data_time": 0.005583401237215314,
            "batch_time": 0.019429703080464922,
            "samples_per_second": 2206528.3600692786,
            "samples_per_second_per_gpu": 275816.0450086598,
            "loss_sequences_lower_95": 7.3361654296875,
            "loss_sequences_upper_95": 7.5783074340820304,
            "loss_tokens_lower_95": 6.951352407028511,
            "loss_tokens_upper_95": 7.158081140265085,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.958070602026955,
            "data_time": 0.0038774802931973765,
            "batch_time": 0.017582206024374054,
            "samples_per_second": 2238715.2485221582,
            "samples_per_second_per_gpu": 279839.4060652698,
            "loss_sequences_lower_95": 5.9412887383977955,
            "loss_sequences_upper_95": 5.974588537519905,
            "loss_tokens_lower_95": 5.941822541275561,
            "loss_tokens_upper_95": 5.974957211961533,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.311943391867314,
            "data_time": 0.008463953196822336,
            "batch_time": 0.022315460389474368,
            "samples_per_second": 2162917.7592747216,
            "samples_per_second_per_gpu": 270364.7199093402,
            "loss_sequences_lower_95": 5.233995344452045,
            "loss_sequences_upper_95": 5.387959563787082,
            "loss_tokens_lower_95": 5.231206900366624,
            "loss_tokens_upper_95": 5.386747242448517,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.312411283493041,
            "data_time": 0.005940418394785079,
            "batch_time": 0.019595578076347472,
            "samples_per_second": 2222222.856163687,
            "samples_per_second_per_gpu": 277777.8570204609,
            "loss_sequences_lower_95": 8.2449978515625,
            "loss_sequences_upper_95": 8.381585009765626,
            "loss_tokens_lower_95": 8.243823413085938,
            "loss_tokens_upper_95": 8.380064111328124,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.205445816850707,
            "data_time": 0.002096950354332735,
            "batch_time": 0.015805543933789086,
            "samples_per_second": 2251253.538358483,
            "samples_per_second_per_gpu": 281406.6922948104,
            "loss_sequences_lower_95": 4.699517483887181,
            "loss_sequences_upper_95": 4.790557906368258,
            "loss_tokens_lower_95": 3.6086231514337967,
            "loss_tokens_upper_95": 3.6714173407307085,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.47922070079775,
            "data_time": 0.019681404318128315,
            "batch_time": 0.0341031517301287,
            "samples_per_second": 1983961.542234522,
            "samples_per_second_per_gpu": 247995.19277931526,
            "loss_sequences_lower_95": 5.351812949109433,
            "loss_sequences_upper_95": 5.606463156173478,
            "loss_tokens_lower_95": 5.351491307500583,
            "loss_tokens_upper_95": 5.607179909321799,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.4045433091182336,
            "data_time": 0.010476281866431236,
            "batch_time": 0.02443928550928831,
            "samples_per_second": 2158902.664752085,
            "samples_per_second_per_gpu": 269862.83309401065,
            "loss_sequences_lower_95": 5.312724322150736,
            "loss_sequences_upper_95": 5.493556985294117,
            "loss_tokens_lower_95": 5.3156432028377765,
            "loss_tokens_upper_95": 5.492938770967371,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.9387700795739065,
            "data_time": 0.002434232706545472,
            "batch_time": 0.01631802712478886,
            "samples_per_second": 2221937.0331534403,
            "samples_per_second_per_gpu": 277742.12914418004,
            "loss_sequences_lower_95": 5.394439893758612,
            "loss_sequences_upper_95": 5.49050046779093,
            "loss_tokens_lower_95": 4.26398516836529,
            "loss_tokens_upper_95": 4.346134929095483,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.083147376933426,
            "data_time": 0.026240691542625427,
            "batch_time": 0.040335590640703835,
            "samples_per_second": 2028939.810734024,
            "samples_per_second_per_gpu": 253617.476341753,
            "loss_sequences_lower_95": 6.038911494502315,
            "loss_sequences_upper_95": 6.126980203295511,
            "loss_tokens_lower_95": 6.039448926814649,
            "loss_tokens_upper_95": 6.126182160554109,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.483583231876385,
            "data_time": 0.003942831677540464,
            "batch_time": 0.017587195094834026,
            "samples_per_second": 2248547.452522204,
            "samples_per_second_per_gpu": 281068.4315652755,
            "loss_sequences_lower_95": 5.455286786840596,
            "loss_sequences_upper_95": 5.513182384245987,
            "loss_tokens_lower_95": 5.454537073537844,
            "loss_tokens_upper_95": 5.512666075353594,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.555539406619026,
            "data_time": 0.023786744204434482,
            "batch_time": 0.037902513417330655,
            "samples_per_second": 1946894.4356475126,
            "samples_per_second_per_gpu": 243361.80445593907,
            "loss_sequences_lower_95": 5.411927795410156,
            "loss_sequences_upper_95": 5.701399319843181,
            "loss_tokens_lower_95": 5.410245447251404,
            "loss_tokens_upper_95": 5.701884164162052,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.5684764862060545,
            "data_time": 0.07544337213039398,
            "batch_time": 0.09083562344312668,
            "samples_per_second": 1410234.873437213,
            "samples_per_second_per_gpu": 176279.35917965163,
            "loss_sequences_lower_95": 4.20301596959432,
            "loss_sequences_upper_95": 5.102804425557454,
            "loss_tokens_lower_95": 3.817717583974203,
            "loss_tokens_upper_95": 5.1216982311672625,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.060959418614705,
            "data_time": 0.07860708236694336,
            "batch_time": 0.09412477910518646,
            "samples_per_second": 1385752.1972199904,
            "samples_per_second_per_gpu": 173219.0246524988,
            "loss_sequences_lower_95": 3.7473590850830076,
            "loss_sequences_upper_95": 4.624740943908691,
            "loss_tokens_lower_95": 3.1505226478148045,
            "loss_tokens_upper_95": 4.512919102358014,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.861729311205677,
            "data_time": 0.0034345536459327235,
            "batch_time": 0.017208547715652403,
            "samples_per_second": 2236422.9362848266,
            "samples_per_second_per_gpu": 279552.8670356033,
            "loss_sequences_lower_95": 5.845370029455081,
            "loss_sequences_upper_95": 5.878447351919182,
            "loss_tokens_lower_95": 5.845326724157768,
            "loss_tokens_upper_95": 5.878072533136966,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.792704493373901,
            "data_time": 0.001672518339733587,
            "batch_time": 0.015421404921056404,
            "samples_per_second": 2241908.4895144156,
            "samples_per_second_per_gpu": 280238.56118930195,
            "loss_sequences_lower_95": 2.0651862247187647,
            "loss_sequences_upper_95": 2.1044026353229928,
            "loss_tokens_lower_95": 1.5034418597711923,
            "loss_tokens_upper_95": 1.5253720144684726,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.8613309221943535,
            "data_time": 0.041643910109996796,
            "batch_time": 0.056942883878946304,
            "samples_per_second": 1823265.826815426,
            "samples_per_second_per_gpu": 227908.22835192824,
            "loss_sequences_lower_95": 5.898146285410003,
            "loss_sequences_upper_95": 6.311698877529835,
            "loss_tokens_lower_95": 5.502708467667649,
            "loss_tokens_upper_95": 5.799032912866925,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 9.289074614241317,
            "data_time": 0.11967553411211286,
            "batch_time": 0.15784580366952078,
            "samples_per_second": 1083702.2121842043,
            "samples_per_second_per_gpu": 135462.77652302553,
            "loss_sequences_lower_95": 8.826923597181166,
            "loss_sequences_upper_95": 9.951314585917704,
            "loss_tokens_lower_95": 8.186547945752556,
            "loss_tokens_upper_95": 10.092548059534142,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.768215933950936,
            "data_time": 0.030226142633528935,
            "batch_time": 0.044421508198692686,
            "samples_per_second": 1991075.5793432554,
            "samples_per_second_per_gpu": 248884.44741790692,
            "loss_sequences_lower_95": 5.736517957361733,
            "loss_sequences_upper_95": 6.101537155523532,
            "loss_tokens_lower_95": 5.366042219696173,
            "loss_tokens_upper_95": 5.620475912005888,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.912547232174292,
            "data_time": 0.03191940841220674,
            "batch_time": 0.04619905494508289,
            "samples_per_second": 1980708.1100904774,
            "samples_per_second_per_gpu": 247588.51376130967,
            "loss_sequences_lower_95": 5.878858473242783,
            "loss_sequences_upper_95": 6.199841680759337,
            "loss_tokens_lower_95": 5.550500913458719,
            "loss_tokens_upper_95": 5.761759647205201,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.92509367407822,
            "data_time": 0.031125996794019426,
            "batch_time": 0.04566718283153716,
            "samples_per_second": 1926849.407420564,
            "samples_per_second_per_gpu": 240856.1759275705,
            "loss_sequences_lower_95": 5.89827485433439,
            "loss_sequences_upper_95": 6.321313737078411,
            "loss_tokens_lower_95": 5.4430793784143185,
            "loss_tokens_upper_95": 5.7714509618494825,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.06092758585767,
            "data_time": 0.03005727983656384,
            "batch_time": 0.04520829518636068,
            "samples_per_second": 1844185.3062759484,
            "samples_per_second_per_gpu": 230523.16328449355,
            "loss_sequences_lower_95": 6.0052502981046345,
            "loss_sequences_upper_95": 6.318717323861471,
            "loss_tokens_lower_95": 5.727925723066954,
            "loss_tokens_upper_95": 5.922706276011245,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.367445308969628,
            "data_time": 0.03121733076778459,
            "batch_time": 0.04565650739787537,
            "samples_per_second": 1981397.5794313322,
            "samples_per_second_per_gpu": 247674.69742891652,
            "loss_sequences_lower_95": 5.296075297290494,
            "loss_sequences_upper_95": 5.532485165803329,
            "loss_tokens_lower_95": 5.099034085715853,
            "loss_tokens_upper_95": 5.244606073139076,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.830076983789119,
            "data_time": 0.029432160513741628,
            "batch_time": 0.044892191886901855,
            "samples_per_second": 1860339.3952461318,
            "samples_per_second_per_gpu": 232542.42440576648,
            "loss_sequences_lower_95": 4.805591406473299,
            "loss_sequences_upper_95": 5.0637497413449175,
            "loss_tokens_lower_95": 4.557387121383363,
            "loss_tokens_upper_95": 4.6845393615317965,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.5/params.txt",
    "uuid": "15f782eb-3a60-49e6-b4eb-d58ff077ee84",
    "creation_date": "2023_12_14-05_01_09"
}