{
    "name": "rw_original-d=576_l=24_h=8-0.5",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 1536773760,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.5
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "307354752",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=576_l=24_h=8-0.5",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.8958494424819947,
            "data_time": 0.034805167466402054,
            "batch_time": 0.3846817575395107,
            "samples_per_second": 814153.5411088344,
            "samples_per_second_per_gpu": 101769.1926386043,
            "loss_sequences_lower_95": 3.8161380386352537,
            "loss_sequences_upper_95": 3.976451193491618,
            "loss_tokens_lower_95": 3.881629015604655,
            "loss_tokens_upper_95": 3.9099111874898274,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.704636169137121,
            "data_time": 0.0011772853532791183,
            "batch_time": 0.03041366167950063,
            "samples_per_second": 1090276.1559564255,
            "samples_per_second_per_gpu": 136284.51949455318,
            "loss_sequences_lower_95": 3.702291341295192,
            "loss_sequences_upper_95": 3.706950633242744,
            "loss_tokens_lower_95": 3.693841229166667,
            "loss_tokens_upper_95": 3.7156414479166666,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.267766917968283,
            "data_time": 0.009859028816223145,
            "batch_time": 0.03900677967071533,
            "samples_per_second": 1057970.2367400588,
            "samples_per_second_per_gpu": 132246.27959250734,
            "loss_sequences_lower_95": 3.2129490817323023,
            "loss_sequences_upper_95": 3.3373933504065687,
            "loss_tokens_lower_95": 3.2542593958333335,
            "loss_tokens_upper_95": 3.2812450052083335,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.833814726859024,
            "data_time": 0.0016791554854104394,
            "batch_time": 0.030046032251496064,
            "samples_per_second": 1124426.323507604,
            "samples_per_second_per_gpu": 140553.2904384505,
            "loss_sequences_lower_95": 3.7930510329413663,
            "loss_sequences_upper_95": 3.876250030202964,
            "loss_tokens_lower_95": 3.82076903125,
            "loss_tokens_upper_95": 3.84664459375,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.767243850498238,
            "data_time": 0.010455703355401635,
            "batch_time": 0.039241826866727426,
            "samples_per_second": 1067702.720024788,
            "samples_per_second_per_gpu": 133462.8400030985,
            "loss_sequences_lower_95": 3.708211540641707,
            "loss_sequences_upper_95": 3.8445853922857527,
            "loss_tokens_lower_95": 3.7555723229166666,
            "loss_tokens_upper_95": 3.7786100312500004,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.930011878606821,
            "data_time": 0.003969534907651984,
            "batch_time": 0.03280372334563214,
            "samples_per_second": 1104338.4347675345,
            "samples_per_second_per_gpu": 138042.3043459418,
            "loss_sequences_lower_95": 3.8809195056243624,
            "loss_sequences_upper_95": 3.9837948880172926,
            "loss_tokens_lower_95": 3.917399916666667,
            "loss_tokens_upper_95": 3.94272375,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6601755201086705,
            "data_time": 0.0016847560106249462,
            "batch_time": 0.03011518152853015,
            "samples_per_second": 1123373.8788896354,
            "samples_per_second_per_gpu": 140421.73486120443,
            "loss_sequences_lower_95": 3.628187569754464,
            "loss_sequences_upper_95": 3.69142747528699,
            "loss_tokens_lower_95": 3.6449581458333333,
            "loss_tokens_upper_95": 3.6759188645833336,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.178933898666142,
            "data_time": 0.001880196037276526,
            "batch_time": 0.031529874195368505,
            "samples_per_second": 1100751.5064996686,
            "samples_per_second_per_gpu": 137593.93831245857,
            "loss_sequences_lower_95": 4.153817520860602,
            "loss_sequences_upper_95": 4.206185782068063,
            "loss_tokens_lower_95": 4.166996052083333,
            "loss_tokens_upper_95": 4.1907046249999995,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8846654160235956,
            "data_time": 0.010504243865845695,
            "batch_time": 0.04621657681843591,
            "samples_per_second": 1043705.4972361623,
            "samples_per_second_per_gpu": 130463.18715452029,
            "loss_sequences_lower_95": 3.7954880691156156,
            "loss_sequences_upper_95": 3.9929983991917557,
            "loss_tokens_lower_95": 3.8724929583333334,
            "loss_tokens_upper_95": 3.8967404270833335,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.01735454679949,
            "data_time": 0.009740850888192654,
            "batch_time": 0.03916448447853327,
            "samples_per_second": 1061725.4959959358,
            "samples_per_second_per_gpu": 132715.68699949197,
            "loss_sequences_lower_95": 4.897844602562222,
            "loss_sequences_upper_95": 5.1668244252563,
            "loss_tokens_lower_95": 5.003737458333333,
            "loss_tokens_upper_95": 5.031086697916667,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.96281497321979,
            "data_time": 0.0013594052405447527,
            "batch_time": 0.029838941168478007,
            "samples_per_second": 1123606.9128030103,
            "samples_per_second_per_gpu": 140450.8641003763,
            "loss_sequences_lower_95": 3.9520289266993287,
            "loss_sequences_upper_95": 3.9739787915838702,
            "loss_tokens_lower_95": 3.9511398020833335,
            "loss_tokens_upper_95": 3.9746066979166668,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.828419484182578,
            "data_time": 0.0026682499147870956,
            "batch_time": 0.03210093079756737,
            "samples_per_second": 1087764.969242929,
            "samples_per_second_per_gpu": 135970.62115536613,
            "loss_sequences_lower_95": 3.8048143494084496,
            "loss_sequences_upper_95": 3.8533029755668213,
            "loss_tokens_lower_95": 3.81639978125,
            "loss_tokens_upper_95": 3.84020790625,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.252109425546673,
            "data_time": 0.01022168770137983,
            "batch_time": 0.03906218901924465,
            "samples_per_second": 1062181.127526807,
            "samples_per_second_per_gpu": 132772.64094085086,
            "loss_sequences_lower_95": 4.161766562935789,
            "loss_sequences_upper_95": 4.363362886069029,
            "loss_tokens_lower_95": 4.238756041666666,
            "loss_tokens_upper_95": 4.265176385416667,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5729236835860427,
            "data_time": 0.009769787351448697,
            "batch_time": 0.039131694580929216,
            "samples_per_second": 1053975.7942363166,
            "samples_per_second_per_gpu": 131746.97427953957,
            "loss_sequences_lower_95": 3.4850699865648074,
            "loss_sequences_upper_95": 3.675879248096598,
            "loss_tokens_lower_95": 3.5606278125,
            "loss_tokens_upper_95": 3.58531815625,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.627010442993858,
            "data_time": 0.08969249895640782,
            "batch_time": 0.1232890146119254,
            "samples_per_second": 558525.8370301572,
            "samples_per_second_per_gpu": 69815.72962876965,
            "loss_sequences_lower_95": 4.553302686864679,
            "loss_sequences_upper_95": 4.712682663310658,
            "loss_tokens_lower_95": 4.603296635367654,
            "loss_tokens_upper_95": 4.651257610321045,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.886344493304328,
            "data_time": 0.01469277793710882,
            "batch_time": 0.04379726810888811,
            "samples_per_second": 1050172.77661514,
            "samples_per_second_per_gpu": 131271.5970768925,
            "loss_sequences_lower_95": 3.818854213525533,
            "loss_sequences_upper_95": 3.952942072476312,
            "loss_tokens_lower_95": 3.8734132604166667,
            "loss_tokens_upper_95": 3.8989850729166666,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.71989382791645,
            "data_time": 0.013180570056041082,
            "batch_time": 0.04268299539883932,
            "samples_per_second": 1050703.668765878,
            "samples_per_second_per_gpu": 131337.95859573476,
            "loss_sequences_lower_95": 5.6269303696765745,
            "loss_sequences_upper_95": 5.843282686500247,
            "loss_tokens_lower_95": 5.707986197916667,
            "loss_tokens_upper_95": 5.7316674999999995,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.291858219709552,
            "data_time": 0.03851453214883804,
            "batch_time": 0.06885795667767525,
            "samples_per_second": 927737.3903518918,
            "samples_per_second_per_gpu": 115967.17379398647,
            "loss_sequences_lower_95": 4.132264922095127,
            "loss_sequences_upper_95": 4.573891899233958,
            "loss_tokens_lower_95": 4.277265167236328,
            "loss_tokens_upper_95": 4.30655870281282,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.9254346635500905,
            "data_time": 0.0016819602875611777,
            "batch_time": 0.03074304647489126,
            "samples_per_second": 1094715.9327520798,
            "samples_per_second_per_gpu": 136839.49159400997,
            "loss_sequences_lower_95": 4.907833725386341,
            "loss_sequences_upper_95": 4.943656808592081,
            "loss_tokens_lower_95": 4.9073844315268484,
            "loss_tokens_upper_95": 4.94376211488303,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3523720036320324,
            "data_time": 0.0018114512133750186,
            "batch_time": 0.030712142159604723,
            "samples_per_second": 1098627.370949204,
            "samples_per_second_per_gpu": 137328.4213686505,
            "loss_sequences_lower_95": 3.3501561702567964,
            "loss_sequences_upper_95": 3.375805007639788,
            "loss_tokens_lower_95": 3.3324974647338252,
            "loss_tokens_upper_95": 3.352099961402129,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.2708563704112255,
            "data_time": 0.003256164554376629,
            "batch_time": 0.032109715599254206,
            "samples_per_second": 1101032.5080866877,
            "samples_per_second_per_gpu": 137629.06351083596,
            "loss_sequences_lower_95": 5.488225121874263,
            "loss_sequences_upper_95": 5.78160949908848,
            "loss_tokens_lower_95": 4.777404561200709,
            "loss_tokens_upper_95": 4.989425005491274,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.395558551073075,
            "data_time": 0.003577347290008626,
            "batch_time": 0.03256755734377719,
            "samples_per_second": 1091789.1428308934,
            "samples_per_second_per_gpu": 136473.64285386167,
            "loss_sequences_lower_95": 5.529358919270834,
            "loss_sequences_upper_95": 5.734298551432292,
            "loss_tokens_lower_95": 5.056419332252359,
            "loss_tokens_upper_95": 5.200112961871069,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.548438680090626,
            "data_time": 0.004611005250864619,
            "batch_time": 0.03380145925921732,
            "samples_per_second": 1086265.4380404698,
            "samples_per_second_per_gpu": 135783.17975505872,
            "loss_sequences_lower_95": 3.594569629498555,
            "loss_sequences_upper_95": 3.6613021931565104,
            "loss_tokens_lower_95": 3.448092960255138,
            "loss_tokens_upper_95": 3.481856330209644,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.96453607190739,
            "data_time": 0.023050246494156972,
            "batch_time": 0.052582664149148126,
            "samples_per_second": 1014655.0189913285,
            "samples_per_second_per_gpu": 126831.87737391607,
            "loss_sequences_lower_95": 2.938248596191406,
            "loss_sequences_upper_95": 3.052537931962447,
            "loss_tokens_lower_95": 2.8914489616925323,
            "loss_tokens_upper_95": 2.944822154023481,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.802136369627349,
            "data_time": 0.02059406228363514,
            "batch_time": 0.04959237203001976,
            "samples_per_second": 1006131.3406596375,
            "samples_per_second_per_gpu": 125766.41758245468,
            "loss_sequences_lower_95": 3.791778346470424,
            "loss_sequences_upper_95": 3.990102676080198,
            "loss_tokens_lower_95": 3.667233531334631,
            "loss_tokens_upper_95": 3.764294322846141,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.425186006228129,
            "data_time": 0.01711731384962033,
            "batch_time": 0.04588595873270279,
            "samples_per_second": 1024229.996366396,
            "samples_per_second_per_gpu": 128028.7495457995,
            "loss_sequences_lower_95": 4.386775899251302,
            "loss_sequences_upper_95": 4.482547485351563,
            "loss_tokens_lower_95": 4.2847812421537,
            "loss_tokens_upper_95": 4.532287129896091,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.812131008032557,
            "data_time": 0.0015201893581108436,
            "batch_time": 0.030924413983435414,
            "samples_per_second": 1084139.4930391535,
            "samples_per_second_per_gpu": 135517.4366298942,
            "loss_sequences_lower_95": 6.829725061205157,
            "loss_sequences_upper_95": 6.903377432828108,
            "loss_tokens_lower_95": 6.663968767917722,
            "loss_tokens_upper_95": 6.741660942949974,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.029442322775973,
            "data_time": 0.0030487047745877465,
            "batch_time": 0.03233186370574388,
            "samples_per_second": 1083697.67830883,
            "samples_per_second_per_gpu": 135462.20978860374,
            "loss_sequences_lower_95": 5.592373646951284,
            "loss_sequences_upper_95": 5.8998204806035615,
            "loss_tokens_lower_95": 4.262290229762716,
            "loss_tokens_upper_95": 4.404623183026085,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.524305556719621,
            "data_time": 0.005048245594308183,
            "batch_time": 0.0347785220758335,
            "samples_per_second": 1059685.8533256918,
            "samples_per_second_per_gpu": 132460.73166571147,
            "loss_sequences_lower_95": 4.970681471059754,
            "loss_sequences_upper_95": 5.312999498803461,
            "loss_tokens_lower_95": 4.088042844592006,
            "loss_tokens_upper_95": 4.247671462505546,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.6449524191416565,
            "data_time": 0.023440358894211904,
            "batch_time": 0.05404377622263772,
            "samples_per_second": 989095.9946652314,
            "samples_per_second_per_gpu": 123636.99933315393,
            "loss_sequences_lower_95": 5.548232223998466,
            "loss_sequences_upper_95": 5.737970110052797,
            "loss_tokens_lower_95": 5.548636351981664,
            "loss_tokens_upper_95": 5.740942912341253,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.714282512664795,
            "data_time": 0.0496744834459745,
            "batch_time": 0.08074911741109994,
            "samples_per_second": 890459.5015906643,
            "samples_per_second_per_gpu": 111307.43769883303,
            "loss_sequences_lower_95": 3.5858440246582033,
            "loss_sequences_upper_95": 3.9305585174560544,
            "loss_tokens_lower_95": 3.4041801930326896,
            "loss_tokens_upper_95": 3.8656141141232947,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.565765265920633,
            "data_time": 0.0033118122682005836,
            "batch_time": 0.03193480081109669,
            "samples_per_second": 1107935.4157886233,
            "samples_per_second_per_gpu": 138491.9269735779,
            "loss_sequences_lower_95": 4.5278999508220314,
            "loss_sequences_upper_95": 4.602771608319473,
            "loss_tokens_lower_95": 4.528637183041997,
            "loss_tokens_upper_95": 4.60397969209794,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.104344296904493,
            "data_time": 0.004770496735565036,
            "batch_time": 0.03383290670435347,
            "samples_per_second": 1089631.946489958,
            "samples_per_second_per_gpu": 136203.99331124476,
            "loss_sequences_lower_95": 5.059658611025799,
            "loss_sequences_upper_95": 5.149686466251408,
            "loss_tokens_lower_95": 5.057521570753993,
            "loss_tokens_upper_95": 5.15111104979269,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.871698255248376,
            "data_time": 0.0036740591407594617,
            "batch_time": 0.03228374974516504,
            "samples_per_second": 1100419.3513150637,
            "samples_per_second_per_gpu": 137552.41891438296,
            "loss_sequences_lower_95": 4.014115080558437,
            "loss_sequences_upper_95": 4.1430336770606635,
            "loss_tokens_lower_95": 3.708650335818794,
            "loss_tokens_upper_95": 3.7705121001933715,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.795410000801087,
            "data_time": 0.011273275129497051,
            "batch_time": 0.04043968487530947,
            "samples_per_second": 1047172.8213908963,
            "samples_per_second_per_gpu": 130896.60267386204,
            "loss_sequences_lower_95": 5.9918416992187495,
            "loss_sequences_upper_95": 6.519772143554687,
            "loss_tokens_lower_95": 5.19421205754157,
            "loss_tokens_upper_95": 5.554539060561094,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.355723291635513,
            "data_time": 0.15165306627750397,
            "batch_time": 0.1867121309041977,
            "samples_per_second": 512789.7041699306,
            "samples_per_second_per_gpu": 64098.713021241325,
            "loss_sequences_lower_95": 4.096500825881958,
            "loss_sequences_upper_95": 4.6885733008384705,
            "loss_tokens_lower_95": 3.8819977069723195,
            "loss_tokens_upper_95": 4.704605453315823,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.588954341137546,
            "data_time": 0.02902166894141664,
            "batch_time": 0.05842961909923148,
            "samples_per_second": 947656.3601325667,
            "samples_per_second_per_gpu": 118457.04501657083,
            "loss_sequences_lower_95": 4.880376144935345,
            "loss_sequences_upper_95": 5.474181699204718,
            "loss_tokens_lower_95": 3.6245935347599,
            "loss_tokens_upper_95": 4.003638128820374,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.688992841423938,
            "data_time": 0.0031460455308357873,
            "batch_time": 0.03185600373480055,
            "samples_per_second": 1101294.4609270273,
            "samples_per_second_per_gpu": 137661.8076158784,
            "loss_sequences_lower_95": 2.662605079660878,
            "loss_sequences_upper_95": 2.7151353321933707,
            "loss_tokens_lower_95": 2.6619335756181055,
            "loss_tokens_upper_95": 2.715522693452381,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8908594282839286,
            "data_time": 0.0025901606622295505,
            "batch_time": 0.03125081110426193,
            "samples_per_second": 1107913.6130354484,
            "samples_per_second_per_gpu": 138489.20162943105,
            "loss_sequences_lower_95": 3.8605217834271297,
            "loss_sequences_upper_95": 4.046424251876941,
            "loss_tokens_lower_95": 3.6725884638169886,
            "loss_tokens_upper_95": 3.8529219885703383,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4859139648549284,
            "data_time": 0.018073444565137226,
            "batch_time": 0.047234283553229436,
            "samples_per_second": 999205.4332534774,
            "samples_per_second_per_gpu": 124900.67915668468,
            "loss_sequences_lower_95": 3.337836498218578,
            "loss_sequences_upper_95": 3.7342732970967836,
            "loss_tokens_lower_95": 3.2235204592553917,
            "loss_tokens_upper_95": 3.5259521427913243,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8261082848885346,
            "data_time": 0.00468849316239357,
            "batch_time": 0.033422503620386124,
            "samples_per_second": 1090666.47640621,
            "samples_per_second_per_gpu": 136333.30955077626,
            "loss_sequences_lower_95": 3.8604935667885507,
            "loss_sequences_upper_95": 4.006233486305002,
            "loss_tokens_lower_95": 3.6855453565594005,
            "loss_tokens_upper_95": 3.8307856198258112,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3394156418195586,
            "data_time": 0.030529158455984935,
            "batch_time": 0.060167536849067324,
            "samples_per_second": 984745.9144078665,
            "samples_per_second_per_gpu": 123093.23930098332,
            "loss_sequences_lower_95": 3.155071798185023,
            "loss_sequences_upper_95": 3.6453734979396915,
            "loss_tokens_lower_95": 3.0539485862943514,
            "loss_tokens_upper_95": 3.4450113766345076,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.667788022659997,
            "data_time": 0.0020376391473694703,
            "batch_time": 0.03092143967110683,
            "samples_per_second": 1099971.7694887135,
            "samples_per_second_per_gpu": 137496.47118608918,
            "loss_sequences_lower_95": 5.659114860081391,
            "loss_sequences_upper_95": 5.676148628163133,
            "loss_tokens_lower_95": 5.659137911566688,
            "loss_tokens_upper_95": 5.676396695354696,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.4929796486224942,
            "data_time": 0.0481303561817516,
            "batch_time": 0.07896927920254794,
            "samples_per_second": 880459.0794482139,
            "samples_per_second_per_gpu": 110057.38493102674,
            "loss_sequences_lower_95": 1.4259032314263502,
            "loss_sequences_upper_95": 1.6173977750018962,
            "loss_tokens_lower_95": 1.2789427399025237,
            "loss_tokens_upper_95": 1.5699474791543389,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.458371938811908,
            "data_time": 0.0013666515053567623,
            "batch_time": 0.030251073267713677,
            "samples_per_second": 1100082.8663513188,
            "samples_per_second_per_gpu": 137510.35829391485,
            "loss_sequences_lower_95": 5.8274890367040095,
            "loss_sequences_upper_95": 5.874265510351153,
            "loss_tokens_lower_95": 4.875808837040618,
            "loss_tokens_upper_95": 4.922204533365571,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.663676259279251,
            "data_time": 0.005703274219755142,
            "batch_time": 0.03455556148574466,
            "samples_per_second": 1088335.9076556957,
            "samples_per_second_per_gpu": 136041.98845696196,
            "loss_sequences_lower_95": 5.6569880859375,
            "loss_sequences_upper_95": 5.878630444335937,
            "loss_tokens_lower_95": 5.440899209762302,
            "loss_tokens_upper_95": 5.646165443886553,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.4805817966875825,
            "data_time": 0.02250951831623659,
            "batch_time": 0.05269119497072899,
            "samples_per_second": 989683.6302610507,
            "samples_per_second_per_gpu": 123710.45378263133,
            "loss_sequences_lower_95": 5.323888616147249,
            "loss_sequences_upper_95": 5.639399599821671,
            "loss_tokens_lower_95": 5.321548621136209,
            "loss_tokens_upper_95": 5.636571151069973,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.688778195959149,
            "data_time": 0.004829382680984865,
            "batch_time": 0.03387110060956105,
            "samples_per_second": 1085604.8750346426,
            "samples_per_second_per_gpu": 135700.60937933033,
            "loss_sequences_lower_95": 7.5713284209280305,
            "loss_sequences_upper_95": 7.8028004594282665,
            "loss_tokens_lower_95": 7.574508315577651,
            "loss_tokens_upper_95": 7.802920365767045,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.7019292908509573,
            "data_time": 0.004136619732735005,
            "batch_time": 0.03293050793891258,
            "samples_per_second": 1097771.584752609,
            "samples_per_second_per_gpu": 137221.44809407613,
            "loss_sequences_lower_95": 1.775321044921875,
            "loss_sequences_upper_95": 1.8703427693684895,
            "loss_tokens_lower_95": 1.5787053063412864,
            "loss_tokens_upper_95": 1.6605252452543517,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.158480229831877,
            "data_time": 0.0245250974382673,
            "batch_time": 0.053632029465266635,
            "samples_per_second": 974469.6801671152,
            "samples_per_second_per_gpu": 121808.7100208894,
            "loss_sequences_lower_95": 5.831370297386533,
            "loss_sequences_upper_95": 6.480529639834449,
            "loss_tokens_lower_95": 5.832243201846168,
            "loss_tokens_upper_95": 6.4886534191313245,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.617713265120983,
            "data_time": 0.16076044738292694,
            "batch_time": 0.1987541764974594,
            "samples_per_second": 522704.6456601175,
            "samples_per_second_per_gpu": 65338.08070751469,
            "loss_sequences_lower_95": 2.376654177904129,
            "loss_sequences_upper_95": 3.544614887237548,
            "loss_tokens_lower_95": 2.010506550897028,
            "loss_tokens_upper_95": 2.569520735593186,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.267461463928223,
            "data_time": 0.005703407620626782,
            "batch_time": 0.03443458250590733,
            "samples_per_second": 1088884.0241767673,
            "samples_per_second_per_gpu": 136110.50302209592,
            "loss_sequences_lower_95": 7.191912536621094,
            "loss_sequences_upper_95": 7.539429406738281,
            "loss_tokens_lower_95": 6.9875886920341586,
            "loss_tokens_upper_95": 7.293557827041032,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.201068755626679,
            "data_time": 0.00584575581172156,
            "batch_time": 0.034826584278591095,
            "samples_per_second": 1084240.5636290836,
            "samples_per_second_per_gpu": 135530.07045363545,
            "loss_sequences_lower_95": 7.3160083374023435,
            "loss_sequences_upper_95": 7.5462974975585935,
            "loss_tokens_lower_95": 6.938869669249969,
            "loss_tokens_upper_95": 7.142531106229657,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.0148116878083675,
            "data_time": 0.0037554465010014664,
            "batch_time": 0.0327189759665907,
            "samples_per_second": 1091159.2965530553,
            "samples_per_second_per_gpu": 136394.9120691319,
            "loss_sequences_lower_95": 5.995735862596379,
            "loss_sequences_upper_95": 6.033767161048022,
            "loss_tokens_lower_95": 5.995912236396874,
            "loss_tokens_upper_95": 6.034091705209731,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.331364015280377,
            "data_time": 0.008566441492731837,
            "batch_time": 0.03749719559245959,
            "samples_per_second": 1069336.230488939,
            "samples_per_second_per_gpu": 133667.02881111737,
            "loss_sequences_lower_95": 5.237288730228735,
            "loss_sequences_upper_95": 5.422594399166546,
            "loss_tokens_lower_95": 5.23515394360239,
            "loss_tokens_upper_95": 5.421384299845189,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.45019981431961,
            "data_time": 0.00576010819465395,
            "batch_time": 0.034574490217935475,
            "samples_per_second": 1089569.7779992938,
            "samples_per_second_per_gpu": 136196.22224991172,
            "loss_sequences_lower_95": 7.368574243164063,
            "loss_sequences_upper_95": 7.534465783691406,
            "loss_tokens_lower_95": 7.369139416503907,
            "loss_tokens_upper_95": 7.533554760742188,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.500706107444998,
            "data_time": 0.001903271134273767,
            "batch_time": 0.03070942963454133,
            "samples_per_second": 1102468.3792320495,
            "samples_per_second_per_gpu": 137808.54740400618,
            "loss_sequences_lower_95": 3.9800212589433537,
            "loss_sequences_upper_95": 4.0625582426679285,
            "loss_tokens_lower_95": 2.912375012663695,
            "loss_tokens_upper_95": 2.9705457369337327,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.618277933170546,
            "data_time": 0.019525877067020962,
            "batch_time": 0.04886604888098581,
            "samples_per_second": 1001875.4637569046,
            "samples_per_second_per_gpu": 125234.43296961307,
            "loss_sequences_lower_95": 5.454705639739535,
            "loss_sequences_upper_95": 5.778484936614534,
            "loss_tokens_lower_95": 5.458875992049032,
            "loss_tokens_upper_95": 5.778327531956914,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.758701601215438,
            "data_time": 0.010913773439824581,
            "batch_time": 0.03981438837945461,
            "samples_per_second": 1077155.918308848,
            "samples_per_second_per_gpu": 134644.489788606,
            "loss_sequences_lower_95": 5.648795381433824,
            "loss_sequences_upper_95": 5.866800824333639,
            "loss_tokens_lower_95": 5.64797119140625,
            "loss_tokens_upper_95": 5.8657547415939035,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.404903852773851,
            "data_time": 0.0021209636310482957,
            "batch_time": 0.030981289085432063,
            "samples_per_second": 1098137.0843507615,
            "samples_per_second_per_gpu": 137267.1355438452,
            "loss_sequences_lower_95": 4.902846062531316,
            "loss_sequences_upper_95": 4.995201681013561,
            "loss_tokens_lower_95": 3.71418923079655,
            "loss_tokens_upper_95": 3.791622220070939,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.143127305167062,
            "data_time": 0.028710993627707165,
            "batch_time": 0.058457426726818085,
            "samples_per_second": 1007920.8401052793,
            "samples_per_second_per_gpu": 125990.10501315992,
            "loss_sequences_lower_95": 6.075826267464451,
            "loss_sequences_upper_95": 6.211587540560929,
            "loss_tokens_lower_95": 6.074928307911707,
            "loss_tokens_upper_95": 6.212258297551877,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.201063959897476,
            "data_time": 0.0036163197623358834,
            "batch_time": 0.032657384581327144,
            "samples_per_second": 1088466.680427088,
            "samples_per_second_per_gpu": 136058.335053386,
            "loss_sequences_lower_95": 5.165593149727638,
            "loss_sequences_upper_95": 5.235980862958716,
            "loss_tokens_lower_95": 5.166468696244266,
            "loss_tokens_upper_95": 5.236310893300841,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.674744258806544,
            "data_time": 0.02405037663199685,
            "batch_time": 0.05416976538571444,
            "samples_per_second": 947181.8576971408,
            "samples_per_second_per_gpu": 118397.7322121426,
            "loss_sequences_lower_95": 5.504363747013426,
            "loss_sequences_upper_95": 5.847455545073575,
            "loss_tokens_lower_95": 5.505301229236196,
            "loss_tokens_upper_95": 5.8467082199541105,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.286245115598043,
            "data_time": 0.07864822447299957,
            "batch_time": 0.11068370193243027,
            "samples_per_second": 746452.8252909752,
            "samples_per_second_per_gpu": 93306.6031613719,
            "loss_sequences_lower_95": 3.888261349995931,
            "loss_sequences_upper_95": 4.820450693766276,
            "loss_tokens_lower_95": 3.493819761276245,
            "loss_tokens_upper_95": 4.724429406060112,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5260918060938518,
            "data_time": 0.08322273939847946,
            "batch_time": 0.11539311707019806,
            "samples_per_second": 746393.0987595791,
            "samples_per_second_per_gpu": 93299.13734494739,
            "loss_sequences_lower_95": 3.2361752192179365,
            "loss_sequences_upper_95": 4.106586570739745,
            "loss_tokens_lower_95": 2.690980864106939,
            "loss_tokens_upper_95": 3.852965134181334,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8809339959421285,
            "data_time": 0.0033423243865607554,
            "batch_time": 0.032303186779235514,
            "samples_per_second": 1093175.4214084016,
            "samples_per_second_per_gpu": 136646.9276760502,
            "loss_sequences_lower_95": 3.8629983345222754,
            "loss_sequences_upper_95": 3.8987998276992823,
            "loss_tokens_lower_95": 3.8631489538268595,
            "loss_tokens_upper_95": 3.8987715160162,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.9423127522806278,
            "data_time": 0.0013103760632174803,
            "batch_time": 0.030522290527559736,
            "samples_per_second": 1087016.776855147,
            "samples_per_second_per_gpu": 135877.09710689337,
            "loss_sequences_lower_95": 1.1292559038522363,
            "loss_sequences_upper_95": 1.1579318873200608,
            "loss_tokens_lower_95": 0.7506054100395197,
            "loss_tokens_upper_95": 0.7648907902181985,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.857233850974736,
            "data_time": 0.04178828373551369,
            "batch_time": 0.0722430944442749,
            "samples_per_second": 966651.1027824908,
            "samples_per_second_per_gpu": 120831.38784781135,
            "loss_sequences_lower_95": 4.880924987792969,
            "loss_sequences_upper_95": 5.240059427576741,
            "loss_tokens_lower_95": 4.533145229308677,
            "loss_tokens_upper_95": 4.789214297776959,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 8.586290849221719,
            "data_time": 0.12084947313581194,
            "batch_time": 0.15544650668189638,
            "samples_per_second": 539043.4784645111,
            "samples_per_second_per_gpu": 67380.43480806389,
            "loss_sequences_lower_95": 8.050174939954603,
            "loss_sequences_upper_95": 9.368174207532729,
            "loss_tokens_lower_95": 7.461790974934896,
            "loss_tokens_upper_95": 9.405979410807292,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.7671034379703245,
            "data_time": 0.03026749406542097,
            "batch_time": 0.06205970048904419,
            "samples_per_second": 926686.8163920877,
            "samples_per_second_per_gpu": 115835.85204901097,
            "loss_sequences_lower_95": 4.737049139999762,
            "loss_sequences_upper_95": 5.066668431351824,
            "loss_tokens_lower_95": 4.389112527268188,
            "loss_tokens_upper_95": 4.605039177486334,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.940261044153353,
            "data_time": 0.03125337759653727,
            "batch_time": 0.06135644231523786,
            "samples_per_second": 960535.5265523798,
            "samples_per_second_per_gpu": 120066.94081904748,
            "loss_sequences_lower_95": 4.912488816424114,
            "loss_sequences_upper_95": 5.215184746718989,
            "loss_tokens_lower_95": 4.586702670561158,
            "loss_tokens_upper_95": 4.7678569956095345,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.865472604588764,
            "data_time": 0.031047020639692034,
            "batch_time": 0.06230686959766206,
            "samples_per_second": 947963.8700314304,
            "samples_per_second_per_gpu": 118495.4837539288,
            "loss_sequences_lower_95": 4.8239084941584895,
            "loss_sequences_upper_95": 5.223866485967868,
            "loss_tokens_lower_95": 4.459918567378882,
            "loss_tokens_upper_95": 4.738968086791079,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.094890726775658,
            "data_time": 0.02967688583192371,
            "batch_time": 0.059498511609577,
            "samples_per_second": 981355.0380604385,
            "samples_per_second_per_gpu": 122669.37975755481,
            "loss_sequences_lower_95": 5.046800818094393,
            "loss_sequences_upper_95": 5.336749658351992,
            "loss_tokens_lower_95": 4.769982815085913,
            "loss_tokens_upper_95": 4.937514849986614,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.6281633317840765,
            "data_time": 0.03231015028776946,
            "batch_time": 0.06189611811696747,
            "samples_per_second": 1001885.9850389806,
            "samples_per_second_per_gpu": 125235.74812987258,
            "loss_sequences_lower_95": 4.557635602299471,
            "loss_sequences_upper_95": 4.789249567067401,
            "loss_tokens_lower_95": 4.364750840889148,
            "loss_tokens_upper_95": 4.503554458997513,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.141329532716332,
            "data_time": 0.03308876639320737,
            "batch_time": 0.06403009664444696,
            "samples_per_second": 952750.1527319517,
            "samples_per_second_per_gpu": 119093.76909149396,
            "loss_sequences_lower_95": 4.1352812511164965,
            "loss_sequences_upper_95": 4.39040529204578,
            "loss_tokens_lower_95": 3.8510717228940625,
            "loss_tokens_upper_95": 3.9676646558544304,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.5/params.txt",
    "uuid": "e35118d6-c015-4bed-8628-2d4483edca6f",
    "creation_date": "2023_12_14-05_02_40"
}