{
    "name": "rw_original-d=576_l=24_h=8-32.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 98353520640,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 32.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "19670704128",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=576_l=24_h=8-32.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.0424805084864297,
            "data_time": 0.035094138234853745,
            "batch_time": 0.3984436206519604,
            "samples_per_second": 813778.449659776,
            "samples_per_second_per_gpu": 101722.306207472,
            "loss_sequences_lower_95": 2.974859352111816,
            "loss_sequences_upper_95": 3.110132376352946,
            "loss_tokens_lower_95": 3.0294034576416013,
            "loss_tokens_upper_95": 3.0554306093851724,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.109898599033169,
            "data_time": 0.0011914175678566675,
            "batch_time": 0.030787342501127548,
            "samples_per_second": 1076990.7240562136,
            "samples_per_second_per_gpu": 134623.8405070267,
            "loss_sequences_lower_95": 3.1074541961523074,
            "loss_sequences_upper_95": 3.1123565332386263,
            "loss_tokens_lower_95": 3.099510401041667,
            "loss_tokens_upper_95": 3.1203546510416666,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.766546614316045,
            "data_time": 0.010219097137451172,
            "batch_time": 0.0400637788772583,
            "samples_per_second": 1040643.3856116855,
            "samples_per_second_per_gpu": 130080.42320146068,
            "loss_sequences_lower_95": 2.7148868669782362,
            "loss_sequences_upper_95": 2.83105576495735,
            "loss_tokens_lower_95": 2.7543982395833333,
            "loss_tokens_upper_95": 2.778737609375,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.197979326248169,
            "data_time": 0.0016407132344810587,
            "batch_time": 0.030395201260322018,
            "samples_per_second": 1109357.8885041154,
            "samples_per_second_per_gpu": 138669.73606301442,
            "loss_sequences_lower_95": 3.1600804506282216,
            "loss_sequences_upper_95": 3.237204051224227,
            "loss_tokens_lower_95": 3.1859993072916666,
            "loss_tokens_upper_95": 3.2098883750000002,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1780241579727826,
            "data_time": 0.009841237885068613,
            "batch_time": 0.03898341342272511,
            "samples_per_second": 1055678.345877965,
            "samples_per_second_per_gpu": 131959.79323474562,
            "loss_sequences_lower_95": 3.1227710047952995,
            "loss_sequences_upper_95": 3.2481119936692497,
            "loss_tokens_lower_95": 3.167096244791667,
            "loss_tokens_upper_95": 3.1888434375,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2583630585816343,
            "data_time": 0.003821430646854898,
            "batch_time": 0.033213078651739204,
            "samples_per_second": 1086928.19561938,
            "samples_per_second_per_gpu": 135866.0244524225,
            "loss_sequences_lower_95": 3.2158119559531144,
            "loss_sequences_upper_95": 3.3040249179448504,
            "loss_tokens_lower_95": 3.2463828854166663,
            "loss_tokens_upper_95": 3.2701035260416664,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.8956329535221568,
            "data_time": 0.0016176559602261368,
            "batch_time": 0.03027759173954875,
            "samples_per_second": 1116457.6444272387,
            "samples_per_second_per_gpu": 139557.20555340484,
            "loss_sequences_lower_95": 2.8645267608019767,
            "loss_sequences_upper_95": 2.9269000418526785,
            "loss_tokens_lower_95": 2.881360515625,
            "loss_tokens_upper_95": 2.9101473177083332,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6351428849784493,
            "data_time": 0.0017387299595493396,
            "batch_time": 0.030278633050667296,
            "samples_per_second": 1118846.4960384674,
            "samples_per_second_per_gpu": 139855.81200480842,
            "loss_sequences_lower_95": 3.6127516770287955,
            "loss_sequences_upper_95": 3.6596254703861253,
            "loss_tokens_lower_95": 3.62369725,
            "loss_tokens_upper_95": 3.6465656770833332,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.257793970224334,
            "data_time": 0.009790091287522089,
            "batch_time": 0.038955322333744595,
            "samples_per_second": 1050199.9124384683,
            "samples_per_second_per_gpu": 131274.98905480854,
            "loss_sequences_lower_95": 3.1805189799487104,
            "loss_sequences_upper_95": 3.348562386365441,
            "loss_tokens_lower_95": 3.2465520989583334,
            "loss_tokens_upper_95": 3.269112739583333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.286779889947341,
            "data_time": 0.00948648527264595,
            "batch_time": 0.038208785466849804,
            "samples_per_second": 1082390.3307952776,
            "samples_per_second_per_gpu": 135298.7913494097,
            "loss_sequences_lower_95": 4.189107070138803,
            "loss_sequences_upper_95": 4.403504165830348,
            "loss_tokens_lower_95": 4.273547020833334,
            "loss_tokens_upper_95": 4.300198,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3317048743405997,
            "data_time": 0.0012976411729030133,
            "batch_time": 0.029824287801874797,
            "samples_per_second": 1121169.597168399,
            "samples_per_second_per_gpu": 140146.19964604988,
            "loss_sequences_lower_95": 3.3184966718985542,
            "loss_sequences_upper_95": 3.3454796812902563,
            "loss_tokens_lower_95": 3.3206083541666667,
            "loss_tokens_upper_95": 3.3430334427083332,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1608153567220807,
            "data_time": 0.002657577060442185,
            "batch_time": 0.03187806381968832,
            "samples_per_second": 1093739.8816584577,
            "samples_per_second_per_gpu": 136717.4852073072,
            "loss_sequences_lower_95": 3.134261699836982,
            "loss_sequences_upper_95": 3.188879684327494,
            "loss_tokens_lower_95": 3.149574859375,
            "loss_tokens_upper_95": 3.1721958854166665,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6640685924893703,
            "data_time": 0.009853281051273874,
            "batch_time": 0.03937186271305612,
            "samples_per_second": 1042318.1418416512,
            "samples_per_second_per_gpu": 130289.7677302064,
            "loss_sequences_lower_95": 3.587613205977434,
            "loss_sequences_upper_95": 3.753520286204123,
            "loss_tokens_lower_95": 3.6516866041666667,
            "loss_tokens_upper_95": 3.6764806145833333,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9069568584018715,
            "data_time": 0.01022142718037761,
            "batch_time": 0.03949587183644572,
            "samples_per_second": 1053437.3672516232,
            "samples_per_second_per_gpu": 131679.6709064529,
            "loss_sequences_lower_95": 2.826026347795475,
            "loss_sequences_upper_95": 2.999054042093623,
            "loss_tokens_lower_95": 2.895618916666667,
            "loss_tokens_upper_95": 2.918177078125,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8116724870421668,
            "data_time": 0.08068881716047015,
            "batch_time": 0.11244173560823713,
            "samples_per_second": 572437.4980877931,
            "samples_per_second_per_gpu": 71554.68726097414,
            "loss_sequences_lower_95": 3.7283206332813608,
            "loss_sequences_upper_95": 3.9202480142766776,
            "loss_tokens_lower_95": 3.7904872807589443,
            "loss_tokens_upper_95": 3.833796006982977,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.180798976831464,
            "data_time": 0.013819643042304298,
            "batch_time": 0.04327958551320163,
            "samples_per_second": 1034819.4100213276,
            "samples_per_second_per_gpu": 129352.42625266594,
            "loss_sequences_lower_95": 3.1170288708745217,
            "loss_sequences_upper_95": 3.2432932684094844,
            "loss_tokens_lower_95": 3.168150739583333,
            "loss_tokens_upper_95": 3.1931413125,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.205091965859043,
            "data_time": 0.012223559121290842,
            "batch_time": 0.041441887617111206,
            "samples_per_second": 1056115.0541213183,
            "samples_per_second_per_gpu": 132014.38176516478,
            "loss_sequences_lower_95": 5.127691988580145,
            "loss_sequences_upper_95": 5.305598679192777,
            "loss_tokens_lower_95": 5.193488020833334,
            "loss_tokens_upper_95": 5.216600427083334,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.428423834628746,
            "data_time": 0.03552132844924927,
            "batch_time": 0.06585625931620598,
            "samples_per_second": 921283.6141736306,
            "samples_per_second_per_gpu": 115160.45177170383,
            "loss_sequences_lower_95": 3.2863142607642,
            "loss_sequences_upper_95": 3.679658752191262,
            "loss_tokens_lower_95": 3.414999358380427,
            "loss_tokens_upper_95": 3.4420241121385917,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.650692387228932,
            "data_time": 0.001610413261589537,
            "batch_time": 0.03067417493713744,
            "samples_per_second": 1094159.008622593,
            "samples_per_second_per_gpu": 136769.87607782413,
            "loss_sequences_lower_95": 2.636466750696571,
            "loss_sequences_upper_95": 2.664709081460974,
            "loss_tokens_lower_95": 2.636288511697052,
            "loss_tokens_upper_95": 2.665233762996724,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.7689512572519286,
            "data_time": 0.0019399824131066633,
            "batch_time": 0.030908371707436384,
            "samples_per_second": 1096573.7523667251,
            "samples_per_second_per_gpu": 137071.71904584064,
            "loss_sequences_lower_95": 2.7668458052351994,
            "loss_sequences_upper_95": 2.7916883999312265,
            "loss_tokens_lower_95": 2.7482138717056563,
            "loss_tokens_upper_95": 2.7665041319901214,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7980483181010904,
            "data_time": 0.0032637615904956355,
            "batch_time": 0.034636776813575396,
            "samples_per_second": 1073695.4603001536,
            "samples_per_second_per_gpu": 134211.9325375192,
            "loss_sequences_lower_95": 4.067482902083579,
            "loss_sequences_upper_95": 4.3543741419904345,
            "loss_tokens_lower_95": 3.231564241617897,
            "loss_tokens_upper_95": 3.4426614364613117,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6694293258668234,
            "data_time": 0.003487259308074383,
            "batch_time": 0.03248785325187318,
            "samples_per_second": 1088818.3738593133,
            "samples_per_second_per_gpu": 136102.29673241416,
            "loss_sequences_lower_95": 3.7306044270833336,
            "loss_sequences_upper_95": 3.9237981445312498,
            "loss_tokens_lower_95": 3.460483871363994,
            "loss_tokens_upper_95": 3.601565706073113,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.686773376721519,
            "data_time": 0.00437772112194769,
            "batch_time": 0.03403003147464712,
            "samples_per_second": 1066477.9567092084,
            "samples_per_second_per_gpu": 133309.74458865105,
            "loss_sequences_lower_95": 2.728195354701715,
            "loss_sequences_upper_95": 2.7825874658314182,
            "loss_tokens_lower_95": 2.6012836883026007,
            "loss_tokens_upper_95": 2.6295960226906074,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.187413199381395,
            "data_time": 0.022504732012748718,
            "batch_time": 0.05563439854553768,
            "samples_per_second": 995028.8112110541,
            "samples_per_second_per_gpu": 124378.60140138176,
            "loss_sequences_lower_95": 2.166186294555664,
            "loss_sequences_upper_95": 2.2650318076393825,
            "loss_tokens_lower_95": 2.1235298123677313,
            "loss_tokens_upper_95": 2.1684452432165173,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0105043547494073,
            "data_time": 0.01940770633518696,
            "batch_time": 0.04907452128827572,
            "samples_per_second": 986338.1057504058,
            "samples_per_second_per_gpu": 123292.26321880073,
            "loss_sequences_lower_95": 2.9919170021524235,
            "loss_sequences_upper_95": 3.165459146305006,
            "loss_tokens_lower_95": 2.9085067585794686,
            "loss_tokens_upper_95": 2.994355852234259,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.127883225282033,
            "data_time": 0.015860375685569566,
            "batch_time": 0.045622067573742986,
            "samples_per_second": 1002132.6241144211,
            "samples_per_second_per_gpu": 125266.57801430263,
            "loss_sequences_lower_95": 3.1027359161376955,
            "loss_sequences_upper_95": 3.2029200083414713,
            "loss_tokens_lower_95": 3.0036238360316556,
            "loss_tokens_upper_95": 3.1751324968373376,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.150706410697119,
            "data_time": 0.0014844198605169778,
            "batch_time": 0.03058264323804172,
            "samples_per_second": 1093464.8980975596,
            "samples_per_second_per_gpu": 136683.11226219495,
            "loss_sequences_lower_95": 5.156139786504725,
            "loss_sequences_upper_95": 5.23869186663427,
            "loss_tokens_lower_95": 5.014453992889651,
            "loss_tokens_upper_95": 5.098798103483802,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9741402621371575,
            "data_time": 0.0031054463962580533,
            "batch_time": 0.03196293435640783,
            "samples_per_second": 1097922.2283648618,
            "samples_per_second_per_gpu": 137240.27854560773,
            "loss_sequences_lower_95": 4.435804096054951,
            "loss_sequences_upper_95": 4.71734865747317,
            "loss_tokens_lower_95": 3.315016483372715,
            "loss_tokens_upper_95": 3.443653912591891,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.683370229327231,
            "data_time": 0.004908309192270846,
            "batch_time": 0.034383435909812514,
            "samples_per_second": 1067049.1790956042,
            "samples_per_second_per_gpu": 133381.14738695053,
            "loss_sequences_lower_95": 4.047314453125,
            "loss_sequences_upper_95": 4.364519701231869,
            "loss_tokens_lower_95": 3.3012347260457,
            "loss_tokens_upper_95": 3.4497317604343136,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.122831584111741,
            "data_time": 0.022590752158846174,
            "batch_time": 0.052272600787026544,
            "samples_per_second": 1001288.8231004128,
            "samples_per_second_per_gpu": 125161.1028875516,
            "loss_sequences_lower_95": 6.04567257955194,
            "loss_sequences_upper_95": 6.196667299314177,
            "loss_tokens_lower_95": 6.046968670849386,
            "loss_tokens_upper_95": 6.198730635969606,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.120525360107422,
            "data_time": 0.04863557907251211,
            "batch_time": 0.07896222518040584,
            "samples_per_second": 892005.3849733286,
            "samples_per_second_per_gpu": 111500.67312166607,
            "loss_sequences_lower_95": 2.989318534851074,
            "loss_sequences_upper_95": 3.343223815917969,
            "loss_tokens_lower_95": 2.8231925568555036,
            "loss_tokens_upper_95": 3.2619844985989213,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.23448848309512,
            "data_time": 0.0032701004746013866,
            "batch_time": 0.03215980115356133,
            "samples_per_second": 1098827.0705912479,
            "samples_per_second_per_gpu": 137353.38382390598,
            "loss_sequences_lower_95": 4.185135596752655,
            "loss_sequences_upper_95": 4.283406333962384,
            "loss_tokens_lower_95": 4.185128487437628,
            "loss_tokens_upper_95": 4.283854095865053,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7689126398991015,
            "data_time": 0.004713897020548436,
            "batch_time": 0.03369856580824487,
            "samples_per_second": 1089215.9971693996,
            "samples_per_second_per_gpu": 136151.99964617495,
            "loss_sequences_lower_95": 3.7240599686156326,
            "loss_sequences_upper_95": 3.8133717779835945,
            "loss_tokens_lower_95": 3.724300940011389,
            "loss_tokens_upper_95": 3.8148331225845875,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.197344482982768,
            "data_time": 0.0034625813461534407,
            "batch_time": 0.03259720658123107,
            "samples_per_second": 1082853.4556410718,
            "samples_per_second_per_gpu": 135356.68195513397,
            "loss_sequences_lower_95": 3.337989974826153,
            "loss_sequences_upper_95": 3.4685838435757277,
            "loss_tokens_lower_95": 3.0387906592624345,
            "loss_tokens_upper_95": 3.0958481976629577,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.102358205795288,
            "data_time": 0.010799841955304146,
            "batch_time": 0.04097809549421072,
            "samples_per_second": 1018873.7503675282,
            "samples_per_second_per_gpu": 127359.21879594102,
            "loss_sequences_lower_95": 5.248558203125,
            "loss_sequences_upper_95": 5.778378125,
            "loss_tokens_lower_95": 4.533434961648432,
            "loss_tokens_upper_95": 4.884505262191842,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.339082643389702,
            "data_time": 0.15702392160892487,
            "batch_time": 0.19305939972400665,
            "samples_per_second": 539728.475311217,
            "samples_per_second_per_gpu": 67466.05941390213,
            "loss_sequences_lower_95": 3.1547573149204253,
            "loss_sequences_upper_95": 3.536230581998825,
            "loss_tokens_lower_95": 2.8935235604472545,
            "loss_tokens_upper_95": 3.7506976160509833,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.156809644452457,
            "data_time": 0.02763903648295301,
            "batch_time": 0.057856795635629205,
            "samples_per_second": 911506.9218432668,
            "samples_per_second_per_gpu": 113938.36523040835,
            "loss_sequences_lower_95": 4.457869764306079,
            "loss_sequences_upper_95": 5.06858384581818,
            "loss_tokens_lower_95": 3.1336094526185527,
            "loss_tokens_upper_95": 3.500007982049746,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.1625359616148363,
            "data_time": 0.0028194685777028403,
            "batch_time": 0.03197919742928611,
            "samples_per_second": 1083306.1303757322,
            "samples_per_second_per_gpu": 135413.26629696653,
            "loss_sequences_lower_95": 2.141970455891287,
            "loss_sequences_upper_95": 2.183982295165056,
            "loss_tokens_lower_95": 2.1419552037683625,
            "loss_tokens_upper_95": 2.183056219325101,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.408662655678799,
            "data_time": 0.002466316504398889,
            "batch_time": 0.031922164053985395,
            "samples_per_second": 1080500.0466238114,
            "samples_per_second_per_gpu": 135062.50582797642,
            "loss_sequences_lower_95": 2.382148402440023,
            "loss_sequences_upper_95": 2.51848446331324,
            "loss_tokens_lower_95": 2.268779763261028,
            "loss_tokens_upper_95": 2.4011507585715344,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0287239451111456,
            "data_time": 0.01818079087469313,
            "batch_time": 0.04826971722973718,
            "samples_per_second": 974157.6292605025,
            "samples_per_second_per_gpu": 121769.70365756282,
            "loss_sequences_lower_95": 2.8924957443069625,
            "loss_sequences_upper_95": 3.3048094920622995,
            "loss_tokens_lower_95": 2.7724757700028184,
            "loss_tokens_upper_95": 3.0619794282727946,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4709194211764136,
            "data_time": 0.004728603735566139,
            "batch_time": 0.03412777297198773,
            "samples_per_second": 1067591.5320042588,
            "samples_per_second_per_gpu": 133448.94150053235,
            "loss_sequences_lower_95": 3.520752222893646,
            "loss_sequences_upper_95": 3.6803887635963397,
            "loss_tokens_lower_95": 3.318247346914923,
            "loss_tokens_upper_95": 3.4588019979723486,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.5994802110078856,
            "data_time": 0.03021849337078276,
            "batch_time": 0.06101838747660319,
            "samples_per_second": 951324.1125500161,
            "samples_per_second_per_gpu": 118915.51406875202,
            "loss_sequences_lower_95": 2.4452565076874526,
            "loss_sequences_upper_95": 2.8902565746772577,
            "loss_tokens_lower_95": 2.3179491500481113,
            "loss_tokens_upper_95": 2.653333769109082,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.0985273140195515,
            "data_time": 0.0019694500525760687,
            "batch_time": 0.03097171712959951,
            "samples_per_second": 1092869.453080527,
            "samples_per_second_per_gpu": 136608.68163506588,
            "loss_sequences_lower_95": 5.088053430998699,
            "loss_sequences_upper_95": 5.1089078167195945,
            "loss_tokens_lower_95": 5.088271042880451,
            "loss_tokens_upper_95": 5.108863257026405,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.1457289324223416,
            "data_time": 0.047590801932594996,
            "batch_time": 0.08032360076904296,
            "samples_per_second": 824982.9613748029,
            "samples_per_second_per_gpu": 103122.87017185036,
            "loss_sequences_lower_95": 1.099808226279842,
            "loss_sequences_upper_95": 1.2592153086245639,
            "loss_tokens_lower_95": 0.9827549492202493,
            "loss_tokens_upper_95": 1.2047929371391617,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.11612816763499,
            "data_time": 0.0013550172957270863,
            "batch_time": 0.03049708220854789,
            "samples_per_second": 1090267.4454275593,
            "samples_per_second_per_gpu": 136283.4306784449,
            "loss_sequences_lower_95": 4.436772133369366,
            "loss_sequences_upper_95": 4.476315585282364,
            "loss_tokens_lower_95": 3.621525054400387,
            "loss_tokens_upper_95": 3.6617500483558993,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.567287552595139,
            "data_time": 0.0056779640061514714,
            "batch_time": 0.035147677811365276,
            "samples_per_second": 1068179.0631812182,
            "samples_per_second_per_gpu": 133522.38289765228,
            "loss_sequences_lower_95": 4.55807939453125,
            "loss_sequences_upper_95": 4.751626000976563,
            "loss_tokens_lower_95": 4.385634637229312,
            "loss_tokens_upper_95": 4.571513575578234,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.398772737254267,
            "data_time": 0.02230729491023694,
            "batch_time": 0.052868800648188184,
            "samples_per_second": 970514.0805827695,
            "samples_per_second_per_gpu": 121314.26007284618,
            "loss_sequences_lower_95": 3.290203956935717,
            "loss_sequences_upper_95": 3.509363861083984,
            "loss_tokens_lower_95": 3.289568813158118,
            "loss_tokens_upper_95": 3.509313540251359,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.204834899757848,
            "data_time": 0.004720395947077188,
            "batch_time": 0.034326708819492756,
            "samples_per_second": 1065690.3289061617,
            "samples_per_second_per_gpu": 133211.2911132702,
            "loss_sequences_lower_95": 7.102701342033618,
            "loss_sequences_upper_95": 7.30414560029001,
            "loss_tokens_lower_95": 7.102849472508286,
            "loss_tokens_upper_95": 7.307703136097301,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.1327399696509044,
            "data_time": 0.004328416383012812,
            "batch_time": 0.0336450328852268,
            "samples_per_second": 1080228.8401900847,
            "samples_per_second_per_gpu": 135028.60502376058,
            "loss_sequences_lower_95": 1.162924894205729,
            "loss_sequences_upper_95": 1.204634940592448,
            "loss_tokens_lower_95": 1.0684234377344688,
            "loss_tokens_upper_95": 1.124289713932448,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.030826612881252,
            "data_time": 0.02354092683110918,
            "batch_time": 0.05300130375794002,
            "samples_per_second": 960451.1721142766,
            "samples_per_second_per_gpu": 120056.39651428457,
            "loss_sequences_lower_95": 5.676530412946429,
            "loss_sequences_upper_95": 6.3860375104631695,
            "loss_tokens_lower_95": 5.674815470377604,
            "loss_tokens_upper_95": 6.3893489728655135,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.106509957462549,
            "data_time": 0.15021979808807373,
            "batch_time": 0.18426299095153809,
            "samples_per_second": 545919.0631149538,
            "samples_per_second_per_gpu": 68239.88288936923,
            "loss_sequences_lower_95": 1.951019251346588,
            "loss_sequences_upper_95": 2.800540119409561,
            "loss_tokens_lower_95": 1.6461240756634585,
            "loss_tokens_upper_95": 2.108716116049855,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.3238665766716,
            "data_time": 0.005909741397887942,
            "batch_time": 0.035296542303902764,
            "samples_per_second": 1067700.514357842,
            "samples_per_second_per_gpu": 133462.56429473025,
            "loss_sequences_lower_95": 7.255011682128906,
            "loss_sequences_upper_95": 7.571344177246094,
            "loss_tokens_lower_95": 7.045850456224884,
            "loss_tokens_upper_95": 7.328964186924968,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.078486710071564,
            "data_time": 0.005716059416059464,
            "batch_time": 0.034750168758725365,
            "samples_per_second": 1079267.2472090751,
            "samples_per_second_per_gpu": 134908.4059011344,
            "loss_sequences_lower_95": 7.197471826171875,
            "loss_sequences_upper_95": 7.427724792480468,
            "loss_tokens_lower_95": 6.808186400694792,
            "loss_tokens_upper_95": 7.021640933079307,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.038523737951849,
            "data_time": 0.0038567407474071284,
            "batch_time": 0.033561119506989034,
            "samples_per_second": 1065551.136454996,
            "samples_per_second_per_gpu": 133193.8920568745,
            "loss_sequences_lower_95": 6.015568462171053,
            "loss_sequences_upper_95": 6.061094771413007,
            "loss_tokens_lower_95": 6.016179558513871,
            "loss_tokens_upper_95": 6.061051279516216,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9235989424856395,
            "data_time": 0.008316037878169034,
            "batch_time": 0.03774733413742388,
            "samples_per_second": 1051971.7705589863,
            "samples_per_second_per_gpu": 131496.4713198733,
            "loss_sequences_lower_95": 3.839379142140097,
            "loss_sequences_upper_95": 4.006832187259985,
            "loss_tokens_lower_95": 3.8366048364595335,
            "loss_tokens_upper_95": 4.0056433799263145,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.85298700094223,
            "data_time": 0.0055733006151895675,
            "batch_time": 0.034568587938944496,
            "samples_per_second": 1080313.2773446469,
            "samples_per_second_per_gpu": 135039.15966808086,
            "loss_sequences_lower_95": 6.775767541503907,
            "loss_sequences_upper_95": 6.931750891113281,
            "loss_tokens_lower_95": 6.775239306640625,
            "loss_tokens_upper_95": 6.931744201660156,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.6863998223163095,
            "data_time": 0.0018384397593247642,
            "batch_time": 0.030997995753369395,
            "samples_per_second": 1089431.9495255323,
            "samples_per_second_per_gpu": 136178.99369069154,
            "loss_sequences_lower_95": 3.1489968106965467,
            "loss_sequences_upper_95": 3.2262281589995268,
            "loss_tokens_lower_95": 2.127987411454112,
            "loss_tokens_upper_95": 2.181211392393252,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.470897378316566,
            "data_time": 0.0192191447530474,
            "batch_time": 0.0489526697567531,
            "samples_per_second": 987380.8135041759,
            "samples_per_second_per_gpu": 123422.60168802198,
            "loss_sequences_lower_95": 3.364137655229711,
            "loss_sequences_upper_95": 3.582329166469289,
            "loss_tokens_lower_95": 3.364103556391018,
            "loss_tokens_upper_95": 3.582034802792677,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4910510254841225,
            "data_time": 0.011167812161147594,
            "batch_time": 0.040566266514360905,
            "samples_per_second": 1058278.6461829294,
            "samples_per_second_per_gpu": 132284.83077286617,
            "loss_sequences_lower_95": 3.407317875880821,
            "loss_sequences_upper_95": 3.571924582088695,
            "loss_tokens_lower_95": 3.4104251697016696,
            "loss_tokens_upper_95": 3.5725390744676777,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7165866854882816,
            "data_time": 0.0022250800527432884,
            "batch_time": 0.031291297269660436,
            "samples_per_second": 1089500.2252598144,
            "samples_per_second_per_gpu": 136187.5281574768,
            "loss_sequences_lower_95": 4.2911261870928845,
            "loss_sequences_upper_95": 4.393805966095923,
            "loss_tokens_lower_95": 2.9570507690774277,
            "loss_tokens_upper_95": 3.033152422656909,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.23714031239666,
            "data_time": 0.02703063189983368,
            "batch_time": 0.057587010165055595,
            "samples_per_second": 970670.5831142023,
            "samples_per_second_per_gpu": 121333.82288927528,
            "loss_sequences_lower_95": 6.157367056387442,
            "loss_sequences_upper_95": 6.314799087766617,
            "loss_tokens_lower_95": 6.157906410176918,
            "loss_tokens_upper_95": 6.31184497005725,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.984060483387122,
            "data_time": 0.003667650350866446,
            "batch_time": 0.032746559097653345,
            "samples_per_second": 1086340.1197865708,
            "samples_per_second_per_gpu": 135792.51497332135,
            "loss_sequences_lower_95": 2.9502441256928518,
            "loss_sequences_upper_95": 3.017442959193425,
            "loss_tokens_lower_95": 2.9505959719036694,
            "loss_tokens_upper_95": 3.0178936263618117,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7293064073451516,
            "data_time": 0.02295926700938832,
            "batch_time": 0.053545110875909976,
            "samples_per_second": 925068.588959634,
            "samples_per_second_per_gpu": 115633.57361995424,
            "loss_sequences_lower_95": 3.6324273785341132,
            "loss_sequences_upper_95": 3.8273849265089313,
            "loss_tokens_lower_95": 3.6311804391805405,
            "loss_tokens_upper_95": 3.828486670336677,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.0730266233285266,
            "data_time": 0.07851452380418777,
            "batch_time": 0.1130715012550354,
            "samples_per_second": 672823.9309798451,
            "samples_per_second_per_gpu": 84102.99137248064,
            "loss_sequences_lower_95": 1.9106690216064453,
            "loss_sequences_upper_95": 2.4074081484476726,
            "loss_tokens_lower_95": 1.7187345769670275,
            "loss_tokens_upper_95": 2.3395686149597164,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.1469632267951964,
            "data_time": 0.07744468003511429,
            "batch_time": 0.110030896961689,
            "samples_per_second": 713551.3111566826,
            "samples_per_second_per_gpu": 89193.91389458532,
            "loss_sequences_lower_95": 1.985095802942912,
            "loss_sequences_upper_95": 2.5162520027160644,
            "loss_tokens_lower_95": 1.6455512829041214,
            "loss_tokens_upper_95": 2.367461099517479,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.492156326999019,
            "data_time": 0.003457446179718323,
            "batch_time": 0.03267011788397975,
            "samples_per_second": 1084517.4047363119,
            "samples_per_second_per_gpu": 135564.67559203898,
            "loss_sequences_lower_95": 4.472497698821797,
            "loss_sequences_upper_95": 4.511174535737297,
            "loss_tokens_lower_95": 4.472619229795655,
            "loss_tokens_upper_95": 4.5116992762794546,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.4880826018956319,
            "data_time": 0.001257431935885803,
            "batch_time": 0.030363088306569426,
            "samples_per_second": 1091702.7927902867,
            "samples_per_second_per_gpu": 136462.84909878584,
            "loss_sequences_lower_95": 0.5553922354632899,
            "loss_sequences_upper_95": 0.5692989399191342,
            "loss_tokens_lower_95": 0.41867181628951977,
            "loss_tokens_upper_95": 0.4262979657716519,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.06513477310421,
            "data_time": 0.040453676134347916,
            "batch_time": 0.07227016240358353,
            "samples_per_second": 934438.1212459023,
            "samples_per_second_per_gpu": 116804.76515573778,
            "loss_sequences_lower_95": 4.127637235386166,
            "loss_sequences_upper_95": 4.50703085351178,
            "loss_tokens_lower_95": 3.7339421016699674,
            "loss_tokens_upper_95": 4.035839927708947,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.281186206920727,
            "data_time": 0.11308859643482026,
            "batch_time": 0.14827680587768555,
            "samples_per_second": 525063.2648501386,
            "samples_per_second_per_gpu": 65632.90810626732,
            "loss_sequences_lower_95": 6.801956393267657,
            "loss_sequences_upper_95": 7.9836570739746096,
            "loss_tokens_lower_95": 6.083523088620033,
            "loss_tokens_upper_95": 8.314531566478587,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9334368356844274,
            "data_time": 0.02903551147097633,
            "batch_time": 0.05962809494563511,
            "samples_per_second": 954168.6292177739,
            "samples_per_second_per_gpu": 119271.07865222174,
            "loss_sequences_lower_95": 3.9263204714147055,
            "loss_sequences_upper_95": 4.243037302901105,
            "loss_tokens_lower_95": 3.574799143516085,
            "loss_tokens_upper_95": 3.816492867562027,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.091616329623432,
            "data_time": 0.03126927784511021,
            "batch_time": 0.062797140507471,
            "samples_per_second": 945016.2887888348,
            "samples_per_second_per_gpu": 118127.03609860435,
            "loss_sequences_lower_95": 4.056999094893293,
            "loss_sequences_upper_95": 4.332313044478253,
            "loss_tokens_lower_95": 3.764775246322344,
            "loss_tokens_upper_95": 3.9763785942814,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.034114775861182,
            "data_time": 0.031165216650281633,
            "batch_time": 0.06204094205583845,
            "samples_per_second": 949488.3324439417,
            "samples_per_second_per_gpu": 118686.04155549272,
            "loss_sequences_lower_95": 4.067515191799257,
            "loss_sequences_upper_95": 4.441695413356874,
            "loss_tokens_lower_95": 3.6196844029659645,
            "loss_tokens_upper_95": 3.9319422629182537,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.399870504693287,
            "data_time": 0.030974896181197392,
            "batch_time": 0.06193172080176217,
            "samples_per_second": 948816.8641136403,
            "samples_per_second_per_gpu": 118602.10801420503,
            "loss_sequences_lower_95": 4.378920503941978,
            "loss_sequences_upper_95": 4.677500682923852,
            "loss_tokens_lower_95": 4.058323731229312,
            "loss_tokens_upper_95": 4.267353388230749,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.005845379385149,
            "data_time": 0.030343653243265035,
            "batch_time": 0.06135120215239348,
            "samples_per_second": 958468.5851300926,
            "samples_per_second_per_gpu": 119808.57314126157,
            "loss_sequences_lower_95": 3.965167359535738,
            "loss_sequences_upper_95": 4.218934223814781,
            "loss_tokens_lower_95": 3.7006636968960223,
            "loss_tokens_upper_95": 3.8936742514258555,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1500286779752593,
            "data_time": 0.03160792589187622,
            "batch_time": 0.06204033749444144,
            "samples_per_second": 968902.9302275679,
            "samples_per_second_per_gpu": 121112.86627844599,
            "loss_sequences_lower_95": 3.16911054471644,
            "loss_sequences_upper_95": 3.4104582344613426,
            "loss_tokens_lower_95": 2.886764888016124,
            "loss_tokens_upper_95": 3.0085703160318715,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-32.0/params.txt",
    "uuid": "48fd39e0-d3eb-4d7e-b36e-804e1689a401",
    "creation_date": "2023_12_14-05_09_40"
}