{
    "name": "rw_original-d=576_l=24_h=8-16.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 49176760320,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 16.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "9835352064",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=576_l=24_h=8-16.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.182644279797872,
            "data_time": 0.032633863389492035,
            "batch_time": 0.3559936061501503,
            "samples_per_second": 837355.0474867417,
            "samples_per_second_per_gpu": 104669.38093584271,
            "loss_sequences_lower_95": 3.1067226854960124,
            "loss_sequences_upper_95": 3.2596156565348307,
            "loss_tokens_lower_95": 3.168892199198405,
            "loss_tokens_upper_95": 3.196300557454427,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.166769285435062,
            "data_time": 0.0012473380697745927,
            "batch_time": 0.030879198241323017,
            "samples_per_second": 1075828.2965277135,
            "samples_per_second_per_gpu": 134478.5370659642,
            "loss_sequences_lower_95": 3.1643427330762717,
            "loss_sequences_upper_95": 3.1692148676013834,
            "loss_tokens_lower_95": 3.1563828281250004,
            "loss_tokens_upper_95": 3.177148375,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.839594869224393,
            "data_time": 0.00923964786529541,
            "batch_time": 0.03878046798706054,
            "samples_per_second": 1052017.513750843,
            "samples_per_second_per_gpu": 131502.18921885538,
            "loss_sequences_lower_95": 2.7877617909956953,
            "loss_sequences_upper_95": 2.9042579868861607,
            "loss_tokens_lower_95": 2.8273226250000003,
            "loss_tokens_upper_95": 2.8522176874999996,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2532537070992067,
            "data_time": 0.0015937504603674537,
            "batch_time": 0.030057144792456376,
            "samples_per_second": 1119452.629653442,
            "samples_per_second_per_gpu": 139931.57870668024,
            "loss_sequences_lower_95": 3.215998897591817,
            "loss_sequences_upper_95": 3.291750538619523,
            "loss_tokens_lower_95": 3.2412411875,
            "loss_tokens_upper_95": 3.2651676614583334,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2361679650129953,
            "data_time": 0.008437965020715478,
            "batch_time": 0.03750007086066136,
            "samples_per_second": 1061030.9971020608,
            "samples_per_second_per_gpu": 132628.8746377576,
            "loss_sequences_lower_95": 3.180599664816303,
            "loss_sequences_upper_95": 3.30671610472887,
            "loss_tokens_lower_95": 3.2251823958333334,
            "loss_tokens_upper_95": 3.246933552083333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3375427799108155,
            "data_time": 0.0035180135265640592,
            "batch_time": 0.03219343981017237,
            "samples_per_second": 1110152.4632219803,
            "samples_per_second_per_gpu": 138769.05790274753,
            "loss_sequences_lower_95": 3.2929944005326943,
            "loss_sequences_upper_95": 3.3857946337525493,
            "loss_tokens_lower_95": 3.3252452604166667,
            "loss_tokens_upper_95": 3.3496679843750004,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.00619582662777,
            "data_time": 0.0014790073887947916,
            "batch_time": 0.03005983399722549,
            "samples_per_second": 1119201.9955024421,
            "samples_per_second_per_gpu": 139900.24943780527,
            "loss_sequences_lower_95": 2.97578102080676,
            "loss_sequences_upper_95": 3.036117057955995,
            "loss_tokens_lower_95": 2.9907491614583335,
            "loss_tokens_upper_95": 3.0222004114583334,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7014805710877425,
            "data_time": 0.0016488716822695602,
            "batch_time": 0.030221752719568038,
            "samples_per_second": 1118986.102989385,
            "samples_per_second_per_gpu": 139873.2628736731,
            "loss_sequences_lower_95": 3.678858771678665,
            "loss_sequences_upper_95": 3.7262461551047124,
            "loss_tokens_lower_95": 3.689962375,
            "loss_tokens_upper_95": 3.713008041666667,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3213377585256003,
            "data_time": 0.008760111672537667,
            "batch_time": 0.03766570488611857,
            "samples_per_second": 1063386.026788366,
            "samples_per_second_per_gpu": 132923.25334854575,
            "loss_sequences_lower_95": 3.239882988658378,
            "loss_sequences_upper_95": 3.4181362741361787,
            "loss_tokens_lower_95": 3.3099244114583333,
            "loss_tokens_upper_95": 3.3328846770833334,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.3659458179247705,
            "data_time": 0.009189033880829811,
            "batch_time": 0.03840803634375334,
            "samples_per_second": 1067373.731265587,
            "samples_per_second_per_gpu": 133421.71640819838,
            "loss_sequences_lower_95": 4.263396625744967,
            "loss_sequences_upper_95": 4.491813713948246,
            "loss_tokens_lower_95": 4.3526991875,
            "loss_tokens_upper_95": 4.3794004270833335,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3937798835683295,
            "data_time": 0.0012494284801530197,
            "batch_time": 0.02963185597851231,
            "samples_per_second": 1126761.2169822769,
            "samples_per_second_per_gpu": 140845.1521227846,
            "loss_sequences_lower_95": 3.380560798744347,
            "loss_sequences_upper_95": 3.4075039600499353,
            "loss_tokens_lower_95": 3.3825537552083333,
            "loss_tokens_upper_95": 3.4051414479166664,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2221845539844516,
            "data_time": 0.0025291456767264054,
            "batch_time": 0.031873111621624824,
            "samples_per_second": 1090505.2623178568,
            "samples_per_second_per_gpu": 136313.1577897321,
            "loss_sequences_lower_95": 3.196262477507744,
            "loss_sequences_upper_95": 3.249517362150536,
            "loss_tokens_lower_95": 3.2108987916666667,
            "loss_tokens_upper_95": 3.2335217447916667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7587618697245753,
            "data_time": 0.009198054023410963,
            "batch_time": 0.038671983560554596,
            "samples_per_second": 1039379.3994805636,
            "samples_per_second_per_gpu": 129922.42493507045,
            "loss_sequences_lower_95": 3.6765893839435693,
            "loss_sequences_upper_95": 3.858668329265974,
            "loss_tokens_lower_95": 3.745809302083333,
            "loss_tokens_upper_95": 3.7718159375,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.96758780872749,
            "data_time": 0.009201599782206622,
            "batch_time": 0.03815851458515304,
            "samples_per_second": 1061111.8554257269,
            "samples_per_second_per_gpu": 132638.98192821586,
            "loss_sequences_lower_95": 2.8853127225110584,
            "loss_sequences_upper_95": 3.0620486008902428,
            "loss_tokens_lower_95": 2.9560860208333333,
            "loss_tokens_upper_95": 2.9790229375,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8601991534233093,
            "data_time": 0.07729581424168178,
            "batch_time": 0.11287920815604073,
            "samples_per_second": 538790.3647221369,
            "samples_per_second_per_gpu": 67348.79559026711,
            "loss_sequences_lower_95": 3.784057530489835,
            "loss_sequences_upper_95": 3.953265146775679,
            "loss_tokens_lower_95": 3.838552847775546,
            "loss_tokens_upper_95": 3.881964830918746,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.288228818348476,
            "data_time": 0.013233026320284063,
            "batch_time": 0.04328467358242382,
            "samples_per_second": 1016065.2737642229,
            "samples_per_second_per_gpu": 127008.15922052786,
            "loss_sequences_lower_95": 3.2252892852872175,
            "loss_sequences_upper_95": 3.3504802470304527,
            "loss_tokens_lower_95": 3.2752277447916667,
            "loss_tokens_upper_95": 3.3011627604166667,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.284380638505035,
            "data_time": 0.011737449715534845,
            "batch_time": 0.0414709709584713,
            "samples_per_second": 1042762.8888253067,
            "samples_per_second_per_gpu": 130345.36110316333,
            "loss_sequences_lower_95": 5.201853189996805,
            "loss_sequences_upper_95": 5.390152686126629,
            "loss_tokens_lower_95": 5.272592625,
            "loss_tokens_upper_95": 5.2960580833333335,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.534325982703537,
            "data_time": 0.03519406169652939,
            "batch_time": 0.06626728922128677,
            "samples_per_second": 904840.4682212259,
            "samples_per_second_per_gpu": 113105.05852765324,
            "loss_sequences_lower_95": 3.3894264346263445,
            "loss_sequences_upper_95": 3.789931625616355,
            "loss_tokens_lower_95": 3.520535816130091,
            "loss_tokens_upper_95": 3.54797149408059,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.402943905203656,
            "data_time": 0.0015911295297205584,
            "batch_time": 0.030430560604857965,
            "samples_per_second": 1102188.2373251074,
            "samples_per_second_per_gpu": 137773.52966563843,
            "loss_sequences_lower_95": 2.3920935088150728,
            "loss_sequences_upper_95": 2.413930432474897,
            "loss_tokens_lower_95": 2.392140994705615,
            "loss_tokens_upper_95": 2.4139002634952282,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.8262361379715344,
            "data_time": 0.001960780780026867,
            "batch_time": 0.030828421520199746,
            "samples_per_second": 1099201.4433400996,
            "samples_per_second_per_gpu": 137400.18041751246,
            "loss_sequences_lower_95": 2.8231783297260877,
            "loss_sequences_upper_95": 2.848175655411335,
            "loss_tokens_lower_95": 2.806012809856325,
            "loss_tokens_upper_95": 2.824391230211614,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.0417040855917215,
            "data_time": 0.0029776934516755733,
            "batch_time": 0.032125922894275606,
            "samples_per_second": 1094733.5601526801,
            "samples_per_second_per_gpu": 136841.69501908502,
            "loss_sequences_lower_95": 4.321202581132646,
            "loss_sequences_upper_95": 4.620774856444851,
            "loss_tokens_lower_95": 3.443604574323922,
            "loss_tokens_upper_95": 3.661513757967,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.731031977813691,
            "data_time": 0.0037745868271969736,
            "batch_time": 0.03254077266505424,
            "samples_per_second": 1094400.5464196082,
            "samples_per_second_per_gpu": 136800.06830245102,
            "loss_sequences_lower_95": 3.7804806315104167,
            "loss_sequences_upper_95": 3.9722374755859375,
            "loss_tokens_lower_95": 3.52684274886989,
            "loss_tokens_upper_95": 3.6668824992629716,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.7488375695209055,
            "data_time": 0.004349769330492206,
            "batch_time": 0.033377168764715644,
            "samples_per_second": 1085542.732456629,
            "samples_per_second_per_gpu": 135692.84155707862,
            "loss_sequences_lower_95": 2.795242311884003,
            "loss_sequences_upper_95": 2.8529195118890374,
            "loss_tokens_lower_95": 2.6574924365481722,
            "loss_tokens_upper_95": 2.687254001911722,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.233674192428589,
            "data_time": 0.02592530846595764,
            "batch_time": 0.05864821374416351,
            "samples_per_second": 986746.7841173474,
            "samples_per_second_per_gpu": 123343.34801466843,
            "loss_sequences_lower_95": 2.2131992305408827,
            "loss_sequences_upper_95": 2.313374189897017,
            "loss_tokens_lower_95": 2.170776904036078,
            "loss_tokens_upper_95": 2.2159496206729314,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0930167791794756,
            "data_time": 0.018981359899044037,
            "batch_time": 0.048596905544400215,
            "samples_per_second": 985076.1985639551,
            "samples_per_second_per_gpu": 123134.5248204944,
            "loss_sequences_lower_95": 3.076866853675064,
            "loss_sequences_upper_95": 3.2545596438038107,
            "loss_tokens_lower_95": 2.9845257886552132,
            "loss_tokens_upper_95": 3.0719818863193975,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.298482370376587,
            "data_time": 0.016237280307671964,
            "batch_time": 0.046314879869803406,
            "samples_per_second": 984060.3711244792,
            "samples_per_second_per_gpu": 123007.5463905599,
            "loss_sequences_lower_95": 3.2792244669596355,
            "loss_sequences_upper_95": 3.391334197998047,
            "loss_tokens_lower_95": 3.1471200610562806,
            "loss_tokens_upper_95": 3.356279302910615,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.953902248406317,
            "data_time": 0.0015273973376553143,
            "batch_time": 0.030609744374834873,
            "samples_per_second": 1093852.5826535826,
            "samples_per_second_per_gpu": 136731.57283169782,
            "loss_sequences_lower_95": 4.955890198639954,
            "loss_sequences_upper_95": 5.039501249092687,
            "loss_tokens_lower_95": 4.823894653874408,
            "loss_tokens_upper_95": 4.90838707094522,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8823434153339678,
            "data_time": 0.002866310961294494,
            "batch_time": 0.031779300846509484,
            "samples_per_second": 1096250.0330403212,
            "samples_per_second_per_gpu": 137031.25413004015,
            "loss_sequences_lower_95": 4.281117952391756,
            "loss_sequences_upper_95": 4.541832078105272,
            "loss_tokens_lower_95": 3.299430249843892,
            "loss_tokens_upper_95": 3.424793062547897,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7041606495201385,
            "data_time": 0.004809136326248581,
            "batch_time": 0.034035842966389014,
            "samples_per_second": 1076718.7413557642,
            "samples_per_second_per_gpu": 134589.84266947053,
            "loss_sequences_lower_95": 4.018750395791115,
            "loss_sequences_upper_95": 4.310064218150064,
            "loss_tokens_lower_95": 3.344075709906422,
            "loss_tokens_upper_95": 3.4913559408024764,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.911485382411033,
            "data_time": 0.021585396357945034,
            "batch_time": 0.05110101827553341,
            "samples_per_second": 1005288.2259393951,
            "samples_per_second_per_gpu": 125661.02824242439,
            "loss_sequences_lower_95": 5.833341457419199,
            "loss_sequences_upper_95": 5.987478763110017,
            "loss_tokens_lower_95": 5.8331289770396335,
            "loss_tokens_upper_95": 5.989093198732698,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.17146027803421,
            "data_time": 0.0429953199166518,
            "batch_time": 0.07414744908993061,
            "samples_per_second": 882799.174248298,
            "samples_per_second_per_gpu": 110349.89678103726,
            "loss_sequences_lower_95": 3.036368324279785,
            "loss_sequences_upper_95": 3.379023361206055,
            "loss_tokens_lower_95": 2.8776951552716903,
            "loss_tokens_upper_95": 3.3073743516515957,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.019398585800749,
            "data_time": 0.003148001883415113,
            "batch_time": 0.032071827379472416,
            "samples_per_second": 1097546.4109625376,
            "samples_per_second_per_gpu": 137193.3013703172,
            "loss_sequences_lower_95": 3.9719742742751247,
            "loss_sequences_upper_95": 4.066676703836682,
            "loss_tokens_lower_95": 3.9714385830148093,
            "loss_tokens_upper_95": 4.0670713270454835,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1945196783610976,
            "data_time": 0.0045055985256275966,
            "batch_time": 0.033579918922063384,
            "samples_per_second": 1087632.7551175398,
            "samples_per_second_per_gpu": 135954.09438969247,
            "loss_sequences_lower_95": 3.155834795977618,
            "loss_sequences_upper_95": 3.2331491431283914,
            "loss_tokens_lower_95": 3.1543499220989455,
            "loss_tokens_upper_95": 3.2345320368086483,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2155618824559276,
            "data_time": 0.003420413699057396,
            "batch_time": 0.03222744691449159,
            "samples_per_second": 1093821.9144344842,
            "samples_per_second_per_gpu": 136727.73930431053,
            "loss_sequences_lower_95": 3.3427825695282745,
            "loss_sequences_upper_95": 3.472802322603543,
            "loss_tokens_lower_95": 3.0721786537601066,
            "loss_tokens_upper_95": 3.1305683007553378,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.18002256822586,
            "data_time": 0.009962937794625759,
            "batch_time": 0.03960035461932421,
            "samples_per_second": 1035463.5353676231,
            "samples_per_second_per_gpu": 129432.94192095289,
            "loss_sequences_lower_95": 5.346341186523437,
            "loss_sequences_upper_95": 5.8815084716796875,
            "loss_tokens_lower_95": 4.60509126916932,
            "loss_tokens_upper_95": 4.959547926210394,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.426993504166603,
            "data_time": 0.1398533135652542,
            "batch_time": 0.1750316470861435,
            "samples_per_second": 556024.6623824622,
            "samples_per_second_per_gpu": 69503.08279780777,
            "loss_sequences_lower_95": 3.22352597117424,
            "loss_sequences_upper_95": 3.6348805248737337,
            "loss_tokens_lower_95": 2.9853152658747533,
            "loss_tokens_upper_95": 3.8377790779903016,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9918573382257043,
            "data_time": 0.027050804584584337,
            "batch_time": 0.05758700725880075,
            "samples_per_second": 912249.9735783896,
            "samples_per_second_per_gpu": 114031.2466972987,
            "loss_sequences_lower_95": 4.275808014266793,
            "loss_sequences_upper_95": 4.847064507144621,
            "loss_tokens_lower_95": 3.074465078807603,
            "loss_tokens_upper_95": 3.4349088133389407,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.11748665360292,
            "data_time": 0.0028308534787760842,
            "batch_time": 0.03170814882549974,
            "samples_per_second": 1094843.6590130334,
            "samples_per_second_per_gpu": 136855.45737662917,
            "loss_sequences_lower_95": 2.095455139346808,
            "loss_sequences_upper_95": 2.1392621054926413,
            "loss_tokens_lower_95": 2.0956863956609735,
            "loss_tokens_upper_95": 2.139240677859805,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.5087345887241024,
            "data_time": 0.0025086557397927525,
            "batch_time": 0.03136299634377624,
            "samples_per_second": 1100056.5962617001,
            "samples_per_second_per_gpu": 137507.07453271252,
            "loss_sequences_lower_95": 2.4820192111618837,
            "loss_sequences_upper_95": 2.6218906111655227,
            "loss_tokens_lower_95": 2.365072777581571,
            "loss_tokens_upper_95": 2.500766884598296,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0983543339229764,
            "data_time": 0.01682287785742018,
            "batch_time": 0.04716039035055372,
            "samples_per_second": 967925.9686827413,
            "samples_per_second_per_gpu": 120990.74608534266,
            "loss_sequences_lower_95": 2.9612768319936897,
            "loss_sequences_upper_95": 3.371942099546775,
            "loss_tokens_lower_95": 2.8399247500325218,
            "loss_tokens_upper_95": 3.130119031540891,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5227080689891643,
            "data_time": 0.0045231286436319355,
            "batch_time": 0.03386110663414001,
            "samples_per_second": 1068979.0265297187,
            "samples_per_second_per_gpu": 133622.37831621483,
            "loss_sequences_lower_95": 3.5739525461535617,
            "loss_sequences_upper_95": 3.728313134727087,
            "loss_tokens_lower_95": 3.3752383541351487,
            "loss_tokens_upper_95": 3.517661286298117,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.6430491441633643,
            "data_time": 0.028437208561670212,
            "batch_time": 0.05848207360222226,
            "samples_per_second": 964643.6865080826,
            "samples_per_second_per_gpu": 120580.46081351032,
            "loss_sequences_lower_95": 2.5084068391381242,
            "loss_sequences_upper_95": 2.9357092601497,
            "loss_tokens_lower_95": 2.3838643508089694,
            "loss_tokens_upper_95": 2.7275443246664075,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.631370878167142,
            "data_time": 0.0018519386500298763,
            "batch_time": 0.030714544495534193,
            "samples_per_second": 1099234.349748659,
            "samples_per_second_per_gpu": 137404.29371858237,
            "loss_sequences_lower_95": 4.618702568638727,
            "loss_sequences_upper_95": 4.644198390459342,
            "loss_tokens_lower_95": 4.618548250665758,
            "loss_tokens_upper_95": 4.644035262130552,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.1527030132349254,
            "data_time": 0.04308303486217152,
            "batch_time": 0.07387446923689409,
            "samples_per_second": 857966.240182003,
            "samples_per_second_per_gpu": 107245.78002275038,
            "loss_sequences_lower_95": 1.1007561526252228,
            "loss_sequences_upper_95": 1.2665544713585122,
            "loss_tokens_lower_95": 0.9810656185403939,
            "loss_tokens_upper_95": 1.2135363700812036,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.247971932052692,
            "data_time": 0.0012965664979498225,
            "batch_time": 0.030267310374840307,
            "samples_per_second": 1097712.84368697,
            "samples_per_second_per_gpu": 137214.10546087124,
            "loss_sequences_lower_95": 4.583609503979953,
            "loss_sequences_upper_95": 4.624211441136007,
            "loss_tokens_lower_95": 3.728740467843327,
            "loss_tokens_upper_95": 3.770425604448743,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.704982847213745,
            "data_time": 0.005267776667125641,
            "batch_time": 0.034429282423049684,
            "samples_per_second": 1077502.3463529742,
            "samples_per_second_per_gpu": 134687.79329412177,
            "loss_sequences_lower_95": 4.702355627441406,
            "loss_sequences_upper_95": 4.862572668457031,
            "loss_tokens_lower_95": 4.541422053998695,
            "loss_tokens_upper_95": 4.700011055246399,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.0934877768806786,
            "data_time": 0.021296392052860585,
            "batch_time": 0.05089828725588524,
            "samples_per_second": 1003734.6491950647,
            "samples_per_second_per_gpu": 125466.83114938308,
            "loss_sequences_lower_95": 3.966849478016729,
            "loss_sequences_upper_95": 4.220569152832032,
            "loss_tokens_lower_95": 3.968031178350034,
            "loss_tokens_upper_95": 4.218333143151325,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.756722952380325,
            "data_time": 0.004315034452691136,
            "batch_time": 0.034082952393106665,
            "samples_per_second": 1060760.0666514477,
            "samples_per_second_per_gpu": 132595.00833143096,
            "loss_sequences_lower_95": 6.666820641719934,
            "loss_sequences_upper_95": 6.844798694957387,
            "loss_tokens_lower_95": 6.666982699307528,
            "loss_tokens_upper_95": 6.846074847597064,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.2917350494861604,
            "data_time": 0.003928974904912583,
            "batch_time": 0.03299829966210304,
            "samples_per_second": 1087851.582578593,
            "samples_per_second_per_gpu": 135981.44782232412,
            "loss_sequences_lower_95": 1.328094364420573,
            "loss_sequences_upper_95": 1.377461844889323,
            "loss_tokens_lower_95": 1.2169968456132454,
            "loss_tokens_upper_95": 1.2823404146814976,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.959752351897103,
            "data_time": 0.0216618435723441,
            "batch_time": 0.05089974616255079,
            "samples_per_second": 968308.2070689632,
            "samples_per_second_per_gpu": 121038.5258836204,
            "loss_sequences_lower_95": 5.6121331932431175,
            "loss_sequences_upper_95": 6.301204703194754,
            "loss_tokens_lower_95": 5.616738615490141,
            "loss_tokens_upper_95": 6.3063360159737725,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.048996478319168,
            "data_time": 0.14086924493312836,
            "batch_time": 0.1748168170452118,
            "samples_per_second": 565231.5111520644,
            "samples_per_second_per_gpu": 70653.93889400805,
            "loss_sequences_lower_95": 1.8838417410850525,
            "loss_sequences_upper_95": 2.666800194978714,
            "loss_tokens_lower_95": 1.5980994879339159,
            "loss_tokens_upper_95": 2.065578084729381,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.4231809797286985,
            "data_time": 0.005396147095967853,
            "batch_time": 0.03459218995911734,
            "samples_per_second": 1075231.4507079483,
            "samples_per_second_per_gpu": 134403.93133849354,
            "loss_sequences_lower_95": 7.352399426269531,
            "loss_sequences_upper_95": 7.683884191894531,
            "loss_tokens_lower_95": 7.145404951221447,
            "loss_tokens_upper_95": 7.441563061812606,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.181149156093597,
            "data_time": 0.005302119822729202,
            "batch_time": 0.034245872781390234,
            "samples_per_second": 1083002.8203427445,
            "samples_per_second_per_gpu": 135375.35254284306,
            "loss_sequences_lower_95": 7.295344946289062,
            "loss_sequences_upper_95": 7.521409716796875,
            "loss_tokens_lower_95": 6.913540977970863,
            "loss_tokens_upper_95": 7.118936999248874,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.388706222141584,
            "data_time": 0.003502852302729884,
            "batch_time": 0.03309142119111026,
            "samples_per_second": 1070252.5584875275,
            "samples_per_second_per_gpu": 133781.56981094094,
            "loss_sequences_lower_95": 5.35697392646874,
            "loss_sequences_upper_95": 5.419753783811389,
            "loss_tokens_lower_95": 5.357279892044502,
            "loss_tokens_upper_95": 5.4201065559655754,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9415258850370134,
            "data_time": 0.008364196270254084,
            "batch_time": 0.037913729416999935,
            "samples_per_second": 1044072.3727924332,
            "samples_per_second_per_gpu": 130509.04659905416,
            "loss_sequences_lower_95": 2.8702697238248245,
            "loss_sequences_upper_95": 3.0125432027649772,
            "loss_tokens_lower_95": 2.8697775423252088,
            "loss_tokens_upper_95": 3.0128725300919257,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.04308713722229,
            "data_time": 0.005510253092599293,
            "batch_time": 0.03446154603882441,
            "samples_per_second": 1081256.185155789,
            "samples_per_second_per_gpu": 135157.0231444736,
            "loss_sequences_lower_95": 6.9728390380859375,
            "loss_sequences_upper_95": 7.114466333007813,
            "loss_tokens_lower_95": 6.97358427734375,
            "loss_tokens_upper_95": 7.113818981933594,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.643225916675948,
            "data_time": 0.0019385020989776784,
            "batch_time": 0.03063610519488502,
            "samples_per_second": 1105486.0461386943,
            "samples_per_second_per_gpu": 138185.75576733678,
            "loss_sequences_lower_95": 3.08530199987435,
            "loss_sequences_upper_95": 3.1607290496393095,
            "loss_tokens_lower_95": 2.0994310126890223,
            "loss_tokens_upper_95": 2.152391863709316,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6346285859150673,
            "data_time": 0.016179893698011125,
            "batch_time": 0.04536093473434448,
            "samples_per_second": 1005686.1563645614,
            "samples_per_second_per_gpu": 125710.76954557018,
            "loss_sequences_lower_95": 3.512588916607757,
            "loss_sequences_upper_95": 3.761029946626122,
            "loss_tokens_lower_95": 3.513708450545126,
            "loss_tokens_upper_95": 3.7579537178153424,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8681060505848306,
            "data_time": 0.009674432687461376,
            "batch_time": 0.03904299344867468,
            "samples_per_second": 1065270.2238685428,
            "samples_per_second_per_gpu": 133158.77798356785,
            "loss_sequences_lower_95": 3.783174175187653,
            "loss_sequences_upper_95": 3.950731506347656,
            "loss_tokens_lower_95": 3.788537047143076,
            "loss_tokens_upper_95": 3.9495617675781247,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6916390627708466,
            "data_time": 0.0020194121019993464,
            "batch_time": 0.03101726018300382,
            "samples_per_second": 1094774.488273863,
            "samples_per_second_per_gpu": 136846.81103423287,
            "loss_sequences_lower_95": 4.243074688008894,
            "loss_sequences_upper_95": 4.341327028920832,
            "loss_tokens_lower_95": 2.9556846198262274,
            "loss_tokens_upper_95": 3.0309489221688604,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.080262252262661,
            "data_time": 0.024862011273701985,
            "batch_time": 0.05499962468942007,
            "samples_per_second": 990162.7779865237,
            "samples_per_second_per_gpu": 123770.34724831546,
            "loss_sequences_lower_95": 5.967082496925637,
            "loss_sequences_upper_95": 6.190077928008225,
            "loss_tokens_lower_95": 5.967553549468833,
            "loss_tokens_upper_95": 6.188738174034805,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.594179512455558,
            "data_time": 0.0032859180436466203,
            "batch_time": 0.0321036925100436,
            "samples_per_second": 1096084.621396364,
            "samples_per_second_per_gpu": 137010.5776745455,
            "loss_sequences_lower_95": 3.563236229572821,
            "loss_sequences_upper_95": 3.625291796576357,
            "loss_tokens_lower_95": 3.5635392849890097,
            "loss_tokens_upper_95": 3.6255998915926035,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6760389787479513,
            "data_time": 0.022290830178694293,
            "batch_time": 0.052084584669633345,
            "samples_per_second": 949281.1927143268,
            "samples_per_second_per_gpu": 118660.14908929085,
            "loss_sequences_lower_95": 3.5315604496928095,
            "loss_sequences_upper_95": 3.8253484929649577,
            "loss_tokens_lower_95": 3.529570185096518,
            "loss_tokens_upper_95": 3.8276052530529427,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.866012555360794,
            "data_time": 0.07355282455682755,
            "batch_time": 0.10679218918085098,
            "samples_per_second": 735097.6039109648,
            "samples_per_second_per_gpu": 91887.2004888706,
            "loss_sequences_lower_95": 1.659838628768921,
            "loss_sequences_upper_95": 2.1137787310282388,
            "loss_tokens_lower_95": 1.5095232857598198,
            "loss_tokens_upper_95": 2.173006508085463,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.9877901395161948,
            "data_time": 0.07234041392803192,
            "batch_time": 0.10497300326824188,
            "samples_per_second": 732168.1233165741,
            "samples_per_second_per_gpu": 91521.01541457177,
            "loss_sequences_lower_95": 1.8074564774831134,
            "loss_sequences_upper_95": 2.390341968536377,
            "loss_tokens_lower_95": 1.51540648213933,
            "loss_tokens_upper_95": 2.2634554530797377,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9358551632146597,
            "data_time": 0.0031644610393180644,
            "batch_time": 0.03220374279684569,
            "samples_per_second": 1089694.3959296807,
            "samples_per_second_per_gpu": 136211.79949121008,
            "loss_sequences_lower_95": 2.915810798566366,
            "loss_sequences_upper_95": 2.9559423180918634,
            "loss_tokens_lower_95": 2.9159355224249817,
            "loss_tokens_upper_95": 2.955989528200939,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.4750350412544481,
            "data_time": 0.0012472230427505151,
            "batch_time": 0.030206553499403876,
            "samples_per_second": 1096681.0382230757,
            "samples_per_second_per_gpu": 137085.12977788446,
            "loss_sequences_lower_95": 0.5437681590337674,
            "loss_sequences_upper_95": 0.5567089060723581,
            "loss_tokens_lower_95": 0.4044655232720055,
            "loss_tokens_upper_95": 0.41185637496353045,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.5313291718640665,
            "data_time": 0.03663427755236626,
            "batch_time": 0.06710103899240494,
            "samples_per_second": 960820.5633273929,
            "samples_per_second_per_gpu": 120102.57041592411,
            "loss_sequences_lower_95": 4.572708910844458,
            "loss_sequences_upper_95": 4.973812673974225,
            "loss_tokens_lower_95": 4.2213565415054655,
            "loss_tokens_upper_95": 4.536879171560273,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 8.005438662864066,
            "data_time": 0.1136791365487235,
            "batch_time": 0.14531261580330984,
            "samples_per_second": 575071.1274564278,
            "samples_per_second_per_gpu": 71883.89093205347,
            "loss_sequences_lower_95": 7.540441647091427,
            "loss_sequences_upper_95": 8.72084408321896,
            "loss_tokens_lower_95": 6.703823569968895,
            "loss_tokens_upper_95": 9.117638595015913,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.470596378896294,
            "data_time": 0.02846748772121611,
            "batch_time": 0.06116260233379546,
            "samples_per_second": 909249.5613098419,
            "samples_per_second_per_gpu": 113656.19516373024,
            "loss_sequences_lower_95": 4.469575835437309,
            "loss_sequences_upper_95": 4.8251188138636145,
            "loss_tokens_lower_95": 4.097052875643923,
            "loss_tokens_upper_95": 4.360948793300568,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.598213863082048,
            "data_time": 0.028498374280475434,
            "batch_time": 0.059347995689937046,
            "samples_per_second": 950071.7337358211,
            "samples_per_second_per_gpu": 118758.96671697764,
            "loss_sequences_lower_95": 4.581980775042278,
            "loss_sequences_upper_95": 4.885665186440073,
            "loss_tokens_lower_95": 4.2704909886426945,
            "loss_tokens_upper_95": 4.496756110644863,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.598918349277683,
            "data_time": 0.028017975035167876,
            "batch_time": 0.05857770215897333,
            "samples_per_second": 963623.0835931028,
            "samples_per_second_per_gpu": 120452.88544913784,
            "loss_sequences_lower_95": 4.6205261323510145,
            "loss_sequences_upper_95": 5.036055457882765,
            "loss_tokens_lower_95": 4.149342701557109,
            "loss_tokens_upper_95": 4.489733655072959,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.769903193159801,
            "data_time": 0.030371135189419703,
            "batch_time": 0.06039358036858695,
            "samples_per_second": 966775.7295583433,
            "samples_per_second_per_gpu": 120846.96619479291,
            "loss_sequences_lower_95": 4.720912561184023,
            "loss_sequences_upper_95": 5.025123112376144,
            "loss_tokens_lower_95": 4.469690112532856,
            "loss_tokens_upper_95": 4.6770780284085385,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.23258426026528,
            "data_time": 0.029324755256558643,
            "batch_time": 0.059340715408325195,
            "samples_per_second": 986892.6706691628,
            "samples_per_second_per_gpu": 123361.58383364535,
            "loss_sequences_lower_95": 4.161723024356439,
            "loss_sequences_upper_95": 4.420055318322981,
            "loss_tokens_lower_95": 3.954964604964902,
            "loss_tokens_upper_95": 4.147586156862387,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3435715785840663,
            "data_time": 0.03008558920451573,
            "batch_time": 0.061041559491838725,
            "samples_per_second": 953924.2384532152,
            "samples_per_second_per_gpu": 119240.5298066519,
            "loss_sequences_lower_95": 3.341753657271222,
            "loss_sequences_upper_95": 3.591114211663967,
            "loss_tokens_lower_95": 3.109087217214813,
            "loss_tokens_upper_95": 3.2369776597347797,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-16.0/params.txt",
    "uuid": "cb0862fd-ed34-43cd-9ff7-682f5f837919",
    "creation_date": "2023_12_14-05_08_38"
}