{
    "name": "rw_original-d=1024_l=24_h=8-16.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 131717201920,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 2,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp",
            "--fsdp-limit-all-gathers"
        ],
        "chinchilla_multiplier": 16.0,
        "seed": 124
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.28",
    "open_lm_args": [
        "--workers",
        "2",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--accum-freq",
        "2",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--logs",
        "/tmp/achal-dave-openlm-scrub_2024-01-26-08-26-07-183",
        "--train-num-samples",
        "26343440384",
        "--dataset-manifest",
        "<scrub>/openlm/scrub/datasets/refined_web_tokenized/manifest.jsonl",
        "--data-key",
        "json.gz",
        "--name",
        "rw_original-d=1024_l=24_h=8-16.0",
        "--fsdp",
        "--fsdp-amp",
        "--fsdp-limit-all-gathers",
        "--val-data",
        "/opt/ml/code/training/eval_data/open_lm_val/shard_00000000.tar",
        "/opt/ml/code/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-data-key",
        "json",
        "txt",
        "json.gz",
        "--val-tok-ci",
        "--val-seq-ci",
        "--val-num-samples",
        "245760",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/openlm/scrub/experiments/411m_16_rw_original"
    ],
    "results": [
        {
            "loss": 2.7281258245309195,
            "data_time": 0.13777969777584076,
            "batch_time": 1.592317909002304,
            "samples_per_second": 258713.1629305969,
            "samples_per_second_per_gpu": 32339.145366324614,
            "loss_sequences_lower_95": 2.648421592712402,
            "loss_sequences_upper_95": 2.8120847574869794,
            "loss_tokens_lower_95": 2.7148246574401855,
            "loss_tokens_upper_95": 2.7411943817138673,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8047058217689487,
            "data_time": 0.0025328507753970895,
            "batch_time": 0.11692569953524465,
            "samples_per_second": 1127784.2875092407,
            "samples_per_second_per_gpu": 140973.03593865508,
            "loss_sequences_lower_95": 2.802210531520959,
            "loss_sequences_upper_95": 2.8071870790391653,
            "loss_tokens_lower_95": 2.7947276614583334,
            "loss_tokens_upper_95": 2.814613630208333,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5112656826875646,
            "data_time": 0.032892193645238876,
            "batch_time": 0.16909389570355415,
            "samples_per_second": 955662.2591611041,
            "samples_per_second_per_gpu": 119457.78239513801,
            "loss_sequences_lower_95": 2.4495669991629465,
            "loss_sequences_upper_95": 2.589304373604911,
            "loss_tokens_lower_95": 2.499746640625,
            "loss_tokens_upper_95": 2.5231147083333334,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.913692337163945,
            "data_time": 0.005147971604999743,
            "batch_time": 0.118024910751142,
            "samples_per_second": 1128431.2836173223,
            "samples_per_second_per_gpu": 141053.9104521653,
            "loss_sequences_lower_95": 2.8690820413176548,
            "loss_sequences_upper_95": 2.9603123439513532,
            "loss_tokens_lower_95": 2.901864895833333,
            "loss_tokens_upper_95": 2.9257424270833337,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8876569164260584,
            "data_time": 0.03523876890540123,
            "batch_time": 0.1450694166123867,
            "samples_per_second": 1026352.5048270087,
            "samples_per_second_per_gpu": 128294.06310337609,
            "loss_sequences_lower_95": 2.8235567796011805,
            "loss_sequences_upper_95": 2.9668427570288634,
            "loss_tokens_lower_95": 2.8770100052083336,
            "loss_tokens_upper_95": 2.898701677083333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.968444506057179,
            "data_time": 0.012140017002820969,
            "batch_time": 0.12320609639088313,
            "samples_per_second": 1087449.1463807556,
            "samples_per_second_per_gpu": 135931.14329759445,
            "loss_sequences_lower_95": 2.916086792573079,
            "loss_sequences_upper_95": 3.02684268912517,
            "loss_tokens_lower_95": 2.9564942447916667,
            "loss_tokens_upper_95": 2.980163270833333,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8946934614862716,
            "data_time": 0.005196888859455402,
            "batch_time": 0.116199084963554,
            "samples_per_second": 1125893.4634170984,
            "samples_per_second_per_gpu": 140736.6829271373,
            "loss_sequences_lower_95": 2.8551082589285715,
            "loss_sequences_upper_95": 2.9334211674904336,
            "loss_tokens_lower_95": 2.8796151406250003,
            "loss_tokens_upper_95": 2.910488630208333,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.341844028003553,
            "data_time": 0.005712546016040601,
            "batch_time": 0.11754218449718074,
            "samples_per_second": 1123887.9985928282,
            "samples_per_second_per_gpu": 140485.99982410352,
            "loss_sequences_lower_95": 3.3140386636943715,
            "loss_sequences_upper_95": 3.3720082215314138,
            "loss_tokens_lower_95": 3.3304540781250003,
            "loss_tokens_upper_95": 3.3532175677083336,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9701106044335095,
            "data_time": 0.037021588534116745,
            "batch_time": 0.17721904441714287,
            "samples_per_second": 1008971.9907917151,
            "samples_per_second_per_gpu": 126121.49884896439,
            "loss_sequences_lower_95": 2.8711263144888526,
            "loss_sequences_upper_95": 3.089376930298844,
            "loss_tokens_lower_95": 2.95855378125,
            "loss_tokens_upper_95": 2.9813613645833335,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.9922786036970117,
            "data_time": 0.035627033561468124,
            "batch_time": 0.14797334745526314,
            "samples_per_second": 1037708.2818648819,
            "samples_per_second_per_gpu": 129713.53523311023,
            "loss_sequences_lower_95": 3.8599082072261766,
            "loss_sequences_upper_95": 4.149542224265841,
            "loss_tokens_lower_95": 3.9788988645833334,
            "loss_tokens_upper_95": 4.006029427083333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.999573151542828,
            "data_time": 0.0040015048813584865,
            "batch_time": 0.1168838080022839,
            "samples_per_second": 1134012.1196541458,
            "samples_per_second_per_gpu": 141751.51495676822,
            "loss_sequences_lower_95": 2.985676188898263,
            "loss_sequences_upper_95": 3.013663229537824,
            "loss_tokens_lower_95": 2.988719828125,
            "loss_tokens_upper_95": 3.0102066770833336,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.830081629584303,
            "data_time": 0.008525115094686809,
            "batch_time": 0.12076029652043392,
            "samples_per_second": 1113564.18425197,
            "samples_per_second_per_gpu": 139195.52303149624,
            "loss_sequences_lower_95": 2.801298729492594,
            "loss_sequences_upper_95": 2.8611075646377815,
            "loss_tokens_lower_95": 2.818962010416667,
            "loss_tokens_upper_95": 2.8412917604166665,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4006213131105683,
            "data_time": 0.033071570098400116,
            "batch_time": 0.1433737464249134,
            "samples_per_second": 1029310.4103726109,
            "samples_per_second_per_gpu": 128663.80129657636,
            "loss_sequences_lower_95": 3.301372350108551,
            "loss_sequences_upper_95": 3.5221453250784416,
            "loss_tokens_lower_95": 3.3879890833333333,
            "loss_tokens_upper_95": 3.413288854166667,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.624482471928334,
            "data_time": 0.033984750509262085,
            "batch_time": 0.14384689927101135,
            "samples_per_second": 1027839.5331886774,
            "samples_per_second_per_gpu": 128479.94164858467,
            "loss_sequences_lower_95": 2.529714412650363,
            "loss_sequences_upper_95": 2.734887185650299,
            "loss_tokens_lower_95": 2.613047817708333,
            "loss_tokens_upper_95": 2.6358459479166667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3479514122009277,
            "data_time": 0.11430676281452179,
            "batch_time": 0.16466306149959564,
            "samples_per_second": 529553.904106235,
            "samples_per_second_per_gpu": 66194.23801327938,
            "loss_sequences_lower_95": 3.266289086775346,
            "loss_sequences_upper_95": 3.45276798768477,
            "loss_tokens_lower_95": 3.327403163909912,
            "loss_tokens_upper_95": 3.368908396634189,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8901202803798043,
            "data_time": 0.04480598370234171,
            "batch_time": 0.148760254184405,
            "samples_per_second": 972101.592922627,
            "samples_per_second_per_gpu": 121512.69911532837,
            "loss_sequences_lower_95": 2.831832320766616,
            "loss_sequences_upper_95": 2.951608209637789,
            "loss_tokens_lower_95": 2.8777988958333336,
            "loss_tokens_upper_95": 2.902430921875,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.9687287851499695,
            "data_time": 0.045348102847735085,
            "batch_time": 0.1576875497897466,
            "samples_per_second": 1002904.3004005189,
            "samples_per_second_per_gpu": 125363.03755006487,
            "loss_sequences_lower_95": 4.875316327983282,
            "loss_sequences_upper_95": 5.098238161434284,
            "loss_tokens_lower_95": 4.9569074375,
            "loss_tokens_upper_95": 4.980475822916667,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0490679389140647,
            "data_time": 0.1242929995059967,
            "batch_time": 0.23490582406520844,
            "samples_per_second": 731305.0601748676,
            "samples_per_second_per_gpu": 91413.13252185845,
            "loss_sequences_lower_95": 2.881652394279105,
            "loss_sequences_upper_95": 3.3531836587874615,
            "loss_tokens_lower_95": 3.0357347957423477,
            "loss_tokens_upper_95": 3.0627617570220447,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.739617223057749,
            "data_time": 0.0036983384327455003,
            "batch_time": 0.11624675636941736,
            "samples_per_second": 1135351.9166161555,
            "samples_per_second_per_gpu": 141918.98957701944,
            "loss_sequences_lower_95": 1.7324990430494231,
            "loss_sequences_upper_95": 1.7466801743207878,
            "loss_tokens_lower_95": 1.732482925804061,
            "loss_tokens_upper_95": 1.7468920074319452,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.470078852375625,
            "data_time": 0.004431740392612506,
            "batch_time": 0.11672503850128077,
            "samples_per_second": 1130261.475014066,
            "samples_per_second_per_gpu": 141282.68437675826,
            "loss_sequences_lower_95": 2.476880150243975,
            "loss_sequences_upper_95": 2.501268228129357,
            "loss_tokens_lower_95": 2.4594136086047063,
            "loss_tokens_upper_95": 2.477230561957526,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.4786974827410986,
            "data_time": 0.010580146137405844,
            "batch_time": 0.12091639813254862,
            "samples_per_second": 1108220.4896500448,
            "samples_per_second_per_gpu": 138527.5612062556,
            "loss_sequences_lower_95": 2.956681513076804,
            "loss_sequences_upper_95": 3.2107024468956658,
            "loss_tokens_lower_95": 2.3103598110443473,
            "loss_tokens_upper_95": 2.492454844745875,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.688169091184934,
            "data_time": 0.009052162369092306,
            "batch_time": 0.11953995190560818,
            "samples_per_second": 1110124.3859360323,
            "samples_per_second_per_gpu": 138765.54824200404,
            "loss_sequences_lower_95": 2.7890880615234375,
            "loss_sequences_upper_95": 2.972258943684896,
            "loss_tokens_lower_95": 2.614805043730346,
            "loss_tokens_upper_95": 2.7476277638561317,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0110552634498764,
            "data_time": 0.014423688704317266,
            "batch_time": 0.12092159553007646,
            "samples_per_second": 1082573.705433717,
            "samples_per_second_per_gpu": 135321.71317921462,
            "loss_sequences_lower_95": 2.0872416533151297,
            "loss_sequences_upper_95": 2.134964352507226,
            "loss_tokens_lower_95": 1.9845267259189576,
            "loss_tokens_upper_95": 2.0115702821251977,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.8144049113447016,
            "data_time": 0.07678420841693878,
            "batch_time": 0.17849960923194885,
            "samples_per_second": 880355.6720013679,
            "samples_per_second_per_gpu": 110044.45900017099,
            "loss_sequences_lower_95": 1.8226851966164328,
            "loss_sequences_upper_95": 1.9109765763716264,
            "loss_tokens_lower_95": 1.78349322596493,
            "loss_tokens_upper_95": 1.8256891423334676,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5405523728351205,
            "data_time": 0.07964911311864853,
            "batch_time": 0.18990179151296616,
            "samples_per_second": 919638.2368627401,
            "samples_per_second_per_gpu": 114954.77960784252,
            "loss_sequences_lower_95": 2.555119890485491,
            "loss_sequences_upper_95": 2.7146794159558354,
            "loss_tokens_lower_95": 2.491433900537407,
            "loss_tokens_upper_95": 2.57264230571894,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7245719623565674,
            "data_time": 0.051654934883117676,
            "batch_time": 0.14284644027551016,
            "samples_per_second": 934118.6693254419,
            "samples_per_second_per_gpu": 116764.83366568024,
            "loss_sequences_lower_95": 2.740202356974284,
            "loss_sequences_upper_95": 2.8470218811035153,
            "loss_tokens_lower_95": 2.629328780769565,
            "loss_tokens_upper_95": 2.8017510752448045,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.9465760294773125,
            "data_time": 0.003058562676111857,
            "batch_time": 0.1156413825990269,
            "samples_per_second": 1138715.961900974,
            "samples_per_second_per_gpu": 142339.49523762174,
            "loss_sequences_lower_95": 3.99280969155984,
            "loss_sequences_upper_95": 4.072554744399267,
            "loss_tokens_lower_95": 3.8796578517696734,
            "loss_tokens_upper_95": 3.9605138112039517,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.80366544370298,
            "data_time": 0.009917725073663812,
            "batch_time": 0.12052180500406968,
            "samples_per_second": 1113832.184541542,
            "samples_per_second_per_gpu": 139229.02306769276,
            "loss_sequences_lower_95": 3.568074934731429,
            "loss_sequences_upper_95": 3.82525310066814,
            "loss_tokens_lower_95": 2.6602247191384962,
            "loss_tokens_upper_95": 2.77638775429652,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8841131640273963,
            "data_time": 0.016505208611488343,
            "batch_time": 0.12121376991271973,
            "samples_per_second": 1050220.1040570918,
            "samples_per_second_per_gpu": 131277.51300713647,
            "loss_sequences_lower_95": 3.3833205812213363,
            "loss_sequences_upper_95": 3.6780710474216085,
            "loss_tokens_lower_95": 2.777389009370462,
            "loss_tokens_upper_95": 2.916224440469708,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.493966113486791,
            "data_time": 0.0751936212182045,
            "batch_time": 0.17597178369760513,
            "samples_per_second": 883460.025230243,
            "samples_per_second_per_gpu": 110432.50315378037,
            "loss_sequences_lower_95": 5.407241528654752,
            "loss_sequences_upper_95": 5.578760016454408,
            "loss_tokens_lower_95": 5.4107300083386844,
            "loss_tokens_upper_95": 5.578278214964148,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7222619581222536,
            "data_time": 0.13395795226097107,
            "batch_time": 0.23015594482421875,
            "samples_per_second": 659177.3952114673,
            "samples_per_second_per_gpu": 82397.17440143341,
            "loss_sequences_lower_95": 2.6297690505981444,
            "loss_sequences_upper_95": 2.9781779251098635,
            "loss_tokens_lower_95": 2.507559722736611,
            "loss_tokens_upper_95": 2.90755057292249,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.436270947527129,
            "data_time": 0.011169301345944405,
            "batch_time": 0.11941099539399147,
            "samples_per_second": 1098181.7126355555,
            "samples_per_second_per_gpu": 137272.71407944444,
            "loss_sequences_lower_95": 1.418397829647318,
            "loss_sequences_upper_95": 1.4546631015555032,
            "loss_tokens_lower_95": 1.4183071015435085,
            "loss_tokens_upper_95": 1.4541660648529458,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.7215003527938881,
            "data_time": 0.016495248675346373,
            "batch_time": 0.1252080723643303,
            "samples_per_second": 1085834.9363589555,
            "samples_per_second_per_gpu": 135729.36704486943,
            "loss_sequences_lower_95": 1.7081010898149571,
            "loss_sequences_upper_95": 1.7348868655143068,
            "loss_tokens_lower_95": 1.7079715513187192,
            "loss_tokens_upper_95": 1.734880407084997,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7522110172402483,
            "data_time": 0.013028929630915323,
            "batch_time": 0.12157263159751892,
            "samples_per_second": 1100343.727992741,
            "samples_per_second_per_gpu": 137542.96599909262,
            "loss_sequences_lower_95": 2.9645505222324706,
            "loss_sequences_upper_95": 3.0963550057063554,
            "loss_tokens_lower_95": 2.7101608505923176,
            "loss_tokens_upper_95": 2.768116032511765,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.381029047489166,
            "data_time": 0.03848373889923096,
            "batch_time": 0.14965426176786423,
            "samples_per_second": 1030116.7082057515,
            "samples_per_second_per_gpu": 128764.58852571894,
            "loss_sequences_lower_95": 4.76245732421875,
            "loss_sequences_upper_95": 5.298429260253906,
            "loss_tokens_lower_95": 4.109171988426022,
            "loss_tokens_upper_95": 4.458439792756763,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.989974707365036,
            "data_time": 0.11590635776519775,
            "batch_time": 0.15833178162574768,
            "samples_per_second": 452543.1091757924,
            "samples_per_second_per_gpu": 56567.88864697405,
            "loss_sequences_lower_95": 2.7412629544734957,
            "loss_sequences_upper_95": 3.235208886861801,
            "loss_tokens_lower_95": 2.578278561296134,
            "loss_tokens_upper_95": 3.2902592713805445,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4473381289120377,
            "data_time": 0.07517188787460327,
            "batch_time": 0.1564379185438156,
            "samples_per_second": 836157.482279552,
            "samples_per_second_per_gpu": 104519.685284944,
            "loss_sequences_lower_95": 4.366921120128413,
            "loss_sequences_upper_95": 5.020131271186917,
            "loss_tokens_lower_95": 2.976842456632698,
            "loss_tokens_upper_95": 3.33290260082705,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.000723517165011,
            "data_time": 0.010221381154325273,
            "batch_time": 0.12237654791937934,
            "samples_per_second": 1117511.8986674754,
            "samples_per_second_per_gpu": 139688.98733343443,
            "loss_sequences_lower_95": 1.9800878255634966,
            "loss_sequences_upper_95": 2.0213085169560943,
            "loss_tokens_lower_95": 1.9805015367313783,
            "loss_tokens_upper_95": 2.021147468403574,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.7634955795252991,
            "data_time": 0.00634423843244227,
            "batch_time": 0.1172903819781978,
            "samples_per_second": 1120238.4621940919,
            "samples_per_second_per_gpu": 140029.80777426148,
            "loss_sequences_lower_95": 1.7781324289248983,
            "loss_sequences_upper_95": 1.8896506500688313,
            "loss_tokens_lower_95": 1.692526484318037,
            "loss_tokens_upper_95": 1.8024640758151431,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.667716567769592,
            "data_time": 0.05168850223223368,
            "batch_time": 0.1370743215084076,
            "samples_per_second": 814283.4504790459,
            "samples_per_second_per_gpu": 101785.43130988073,
            "loss_sequences_lower_95": 2.6172193813673306,
            "loss_sequences_upper_95": 3.0438043168176225,
            "loss_tokens_lower_95": 2.4896183716159084,
            "loss_tokens_upper_95": 2.7643567109085914,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1841700325448774,
            "data_time": 0.016018329560756682,
            "batch_time": 0.12789500504732132,
            "samples_per_second": 1098698.6554869986,
            "samples_per_second_per_gpu": 137337.33193587483,
            "loss_sequences_lower_95": 3.300106288846685,
            "loss_sequences_upper_95": 3.4574227324789115,
            "loss_tokens_lower_95": 3.098552688083081,
            "loss_tokens_upper_95": 3.2381871723451416,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.171804964542389,
            "data_time": 0.079841747879982,
            "batch_time": 0.1570914462208748,
            "samples_per_second": 800034.9538823797,
            "samples_per_second_per_gpu": 100004.36923529746,
            "loss_sequences_lower_95": 2.130582641973728,
            "loss_sequences_upper_95": 2.531168895814477,
            "loss_tokens_lower_95": 1.9667553750764681,
            "loss_tokens_upper_95": 2.2721985161985114,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.663133729908747,
            "data_time": 0.004456443945314918,
            "batch_time": 0.11630374334882443,
            "samples_per_second": 1127046.9400005732,
            "samples_per_second_per_gpu": 140880.86750007165,
            "loss_sequences_lower_95": 5.652120609668809,
            "loss_sequences_upper_95": 5.673998227379851,
            "loss_tokens_lower_95": 5.652095008845519,
            "loss_tokens_upper_95": 5.674071747943338,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.2281179520690326,
            "data_time": 0.13068310916423798,
            "batch_time": 0.2288530319929123,
            "samples_per_second": 669606.763883653,
            "samples_per_second_per_gpu": 83700.84548545662,
            "loss_sequences_lower_95": 1.2221941272031913,
            "loss_sequences_upper_95": 1.3903084023484906,
            "loss_tokens_lower_95": 1.0726108609592904,
            "loss_tokens_upper_95": 1.335550417163111,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.387931122722991,
            "data_time": 0.002649508441571009,
            "batch_time": 0.11518221306162933,
            "samples_per_second": 1137082.829894083,
            "samples_per_second_per_gpu": 142135.3537367604,
            "loss_sequences_lower_95": 5.202201575193265,
            "loss_sequences_upper_95": 5.247183149485718,
            "loss_tokens_lower_95": 4.21210328820116,
            "loss_tokens_upper_95": 4.259705585106382,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.5497452044487,
            "data_time": 0.02056126482784748,
            "batch_time": 0.13151629455387592,
            "samples_per_second": 1080950.5791064738,
            "samples_per_second_per_gpu": 135118.82238830923,
            "loss_sequences_lower_95": 4.606934875488282,
            "loss_sequences_upper_95": 4.762473901367188,
            "loss_tokens_lower_95": 4.440893035440231,
            "loss_tokens_upper_95": 4.596991773990356,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.891676728621773,
            "data_time": 0.07681891322135925,
            "batch_time": 0.1816512867808342,
            "samples_per_second": 896972.2503380307,
            "samples_per_second_per_gpu": 112121.53129225384,
            "loss_sequences_lower_95": 1.8406726870329484,
            "loss_sequences_upper_95": 1.9438997185748557,
            "loss_tokens_lower_95": 1.8400842583697774,
            "loss_tokens_upper_95": 1.943407745361328,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.804051727237123,
            "data_time": 0.01553297449241985,
            "batch_time": 0.1221093860539523,
            "samples_per_second": 1082415.872856873,
            "samples_per_second_per_gpu": 135301.98410710914,
            "loss_sequences_lower_95": 6.691027462121212,
            "loss_sequences_upper_95": 6.912686841560133,
            "loss_tokens_lower_95": 6.6960515987511835,
            "loss_tokens_upper_95": 6.91318115234375,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.3701764566103618,
            "data_time": 0.014458018044630686,
            "batch_time": 0.12543809289733568,
            "samples_per_second": 1096882.7136418102,
            "samples_per_second_per_gpu": 137110.33920522628,
            "loss_sequences_lower_95": 1.4515160481770832,
            "loss_sequences_upper_95": 1.5054651896158855,
            "loss_tokens_lower_95": 1.3187989023734494,
            "loss_tokens_upper_95": 1.397581145739546,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.478820528302874,
            "data_time": 0.07844607532024384,
            "batch_time": 0.1752444952726364,
            "samples_per_second": 879263.6821028529,
            "samples_per_second_per_gpu": 109907.9602628566,
            "loss_sequences_lower_95": 5.1248060389927454,
            "loss_sequences_upper_95": 5.8349569847470235,
            "loss_tokens_lower_95": 5.129527791341146,
            "loss_tokens_upper_95": 5.834596063523065,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.5416227765381336,
            "data_time": 0.11463122069835663,
            "batch_time": 0.15551114082336426,
            "samples_per_second": 478996.1300271136,
            "samples_per_second_per_gpu": 59874.5162533892,
            "loss_sequences_lower_95": 1.4127792924642564,
            "loss_sequences_upper_95": 2.0463644385337827,
            "loss_tokens_lower_95": 1.2135145678962629,
            "loss_tokens_upper_95": 1.5707499458863563,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.026847322940826,
            "data_time": 0.021642539650201797,
            "batch_time": 0.1326016392558813,
            "samples_per_second": 1080144.5906477626,
            "samples_per_second_per_gpu": 135018.07383097033,
            "loss_sequences_lower_95": 7.044396142578125,
            "loss_sequences_upper_95": 7.367313732910156,
            "loss_tokens_lower_95": 6.857653901540689,
            "loss_tokens_upper_95": 7.140484257680309,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.857953425884247,
            "data_time": 0.01975850574672222,
            "batch_time": 0.13074499368667603,
            "samples_per_second": 1081382.7616141501,
            "samples_per_second_per_gpu": 135172.84520176877,
            "loss_sequences_lower_95": 7.1055027221679685,
            "loss_sequences_upper_95": 7.331749707031251,
            "loss_tokens_lower_95": 6.7177360400953,
            "loss_tokens_upper_95": 6.929175367054488,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.6534448654668426,
            "data_time": 0.008952443798383078,
            "batch_time": 0.11887542717158794,
            "samples_per_second": 1109004.7216608126,
            "samples_per_second_per_gpu": 138625.59020760158,
            "loss_sequences_lower_95": 5.635529720499497,
            "loss_sequences_upper_95": 5.671307150021999,
            "loss_tokens_lower_95": 5.635574718325721,
            "loss_tokens_upper_95": 5.670865552584437,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.984177804396083,
            "data_time": 0.027683868513002502,
            "batch_time": 0.13033742695064335,
            "samples_per_second": 999926.7010968367,
            "samples_per_second_per_gpu": 124990.83763710459,
            "loss_sequences_lower_95": 1.946124023812524,
            "loss_sequences_upper_95": 2.0230606946344567,
            "loss_tokens_lower_95": 1.9450516239289313,
            "loss_tokens_upper_95": 2.022487672023509,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.840035524845123,
            "data_time": 0.020434455946087837,
            "batch_time": 0.13153067231178284,
            "samples_per_second": 1079374.3550039306,
            "samples_per_second_per_gpu": 134921.79437549133,
            "loss_sequences_lower_95": 6.744207629394531,
            "loss_sequences_upper_95": 6.9369740234375,
            "loss_tokens_lower_95": 6.741831359863282,
            "loss_tokens_upper_95": 6.938274267578125,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0615503660100973,
            "data_time": 0.0051864403558064655,
            "batch_time": 0.1176748035183872,
            "samples_per_second": 1128001.1739649805,
            "samples_per_second_per_gpu": 141000.14674562257,
            "loss_sequences_lower_95": 2.80143930714877,
            "loss_sequences_upper_95": 2.868554207071902,
            "loss_tokens_lower_95": 1.9091460828858828,
            "loss_tokens_upper_95": 1.9582314933428955,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.1093527541231754,
            "data_time": 0.056531667709350586,
            "batch_time": 0.14713880690661343,
            "samples_per_second": 826863.391987468,
            "samples_per_second_per_gpu": 103357.9239984335,
            "loss_sequences_lower_95": 2.0502967094307514,
            "loss_sequences_upper_95": 2.167824138812165,
            "loss_tokens_lower_95": 2.049399059921948,
            "loss_tokens_upper_95": 2.1695309539339434,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.9736238054200714,
            "data_time": 0.0371025986969471,
            "batch_time": 0.14988188445568085,
            "samples_per_second": 1037071.9268164742,
            "samples_per_second_per_gpu": 129633.99085205927,
            "loss_sequences_lower_95": 1.9276160954494101,
            "loss_sequences_upper_95": 2.020326873180913,
            "loss_tokens_lower_95": 1.928842914057713,
            "loss_tokens_upper_95": 2.0200000538545497,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9959270576927186,
            "data_time": 0.006292301037954906,
            "batch_time": 0.11834060696382372,
            "samples_per_second": 1122893.542260632,
            "samples_per_second_per_gpu": 140361.692782579,
            "loss_sequences_lower_95": 4.163482104060973,
            "loss_sequences_upper_95": 4.271803803786171,
            "loss_tokens_lower_95": 2.7671031369148,
            "loss_tokens_upper_95": 2.843729170580984,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.8946052732921785,
            "data_time": 0.07626921683549881,
            "batch_time": 0.16464755684137344,
            "samples_per_second": 851002.3436769303,
            "samples_per_second_per_gpu": 106375.29295961629,
            "loss_sequences_lower_95": 5.808799767872643,
            "loss_sequences_upper_95": 5.977666364397321,
            "loss_tokens_lower_95": 5.8069166556867975,
            "loss_tokens_upper_95": 5.975972396608382,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.556956232870026,
            "data_time": 0.009321771562099457,
            "batch_time": 0.12072616070508957,
            "samples_per_second": 1109962.9874379751,
            "samples_per_second_per_gpu": 138745.3734297469,
            "loss_sequences_lower_95": 3.5278088117593653,
            "loss_sequences_upper_95": 3.5866302546229933,
            "loss_tokens_lower_95": 3.5274077342555428,
            "loss_tokens_upper_95": 3.5865542798523506,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.2488086003701664,
            "data_time": 0.0727955773472786,
            "batch_time": 0.1678125485777855,
            "samples_per_second": 880860.6235238975,
            "samples_per_second_per_gpu": 110107.57794048719,
            "loss_sequences_lower_95": 2.1788875320582712,
            "loss_sequences_upper_95": 2.319809789565003,
            "loss_tokens_lower_95": 2.178602259367415,
            "loss_tokens_upper_95": 2.320122739180778,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.2063744833072028,
            "data_time": 0.1246170699596405,
            "batch_time": 0.1877087950706482,
            "samples_per_second": 585661.0467734542,
            "samples_per_second_per_gpu": 73207.63084668177,
            "loss_sequences_lower_95": 1.110150162378947,
            "loss_sequences_upper_95": 1.4352688789367676,
            "loss_tokens_lower_95": 0.9917389657762316,
            "loss_tokens_upper_95": 1.3442318121592205,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.076286244392395,
            "data_time": 0.12103290855884552,
            "batch_time": 0.18454518914222717,
            "samples_per_second": 587222.6751440342,
            "samples_per_second_per_gpu": 73402.83439300428,
            "loss_sequences_lower_95": 1.0564408667882283,
            "loss_sequences_upper_95": 1.4417091051737467,
            "loss_tokens_lower_95": 0.8503713779235154,
            "loss_tokens_upper_95": 1.2626591264531852,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.971585535476415,
            "data_time": 0.008291509416368272,
            "batch_time": 0.11959261640354439,
            "samples_per_second": 1114030.6288725699,
            "samples_per_second_per_gpu": 139253.82860907124,
            "loss_sequences_lower_95": 3.95152393369155,
            "loss_sequences_upper_95": 3.9912836547312223,
            "loss_tokens_lower_95": 3.951970419792434,
            "loss_tokens_upper_95": 3.99200347765556,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.3687112790263707,
            "data_time": 0.0024761347267246874,
            "batch_time": 0.11541312737329262,
            "samples_per_second": 1138055.8582956267,
            "samples_per_second_per_gpu": 142256.98228695334,
            "loss_sequences_lower_95": 0.5062829856139515,
            "loss_sequences_upper_95": 0.5245579807521754,
            "loss_tokens_lower_95": 0.3511642412561446,
            "loss_tokens_upper_95": 0.35989864706187896,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.9005533147045948,
            "data_time": 0.1316731572151184,
            "batch_time": 0.245010107755661,
            "samples_per_second": 737915.9043823219,
            "samples_per_second_per_gpu": 92239.48804779023,
            "loss_sequences_lower_95": 4.049498454416831,
            "loss_sequences_upper_95": 4.481645983598363,
            "loss_tokens_lower_95": 3.7159447756425017,
            "loss_tokens_upper_95": 4.0662081548605995,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.875636577606201,
            "data_time": 0.10918492078781128,
            "batch_time": 0.153469517827034,
            "samples_per_second": 497464.29105360137,
            "samples_per_second_per_gpu": 62183.03638170017,
            "loss_sequences_lower_95": 7.306237896068676,
            "loss_sequences_upper_95": 8.58355013873126,
            "loss_tokens_lower_95": 6.210795753384814,
            "loss_tokens_upper_95": 9.302109348626784,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8867371823729537,
            "data_time": 0.07525494694709778,
            "batch_time": 0.15252844244241714,
            "samples_per_second": 801450.7176626201,
            "samples_per_second_per_gpu": 100181.33970782752,
            "loss_sequences_lower_95": 3.97221423823659,
            "loss_sequences_upper_95": 4.32376625247118,
            "loss_tokens_lower_95": 3.6388047663477714,
            "loss_tokens_upper_95": 3.929157700470458,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.006940111881349,
            "data_time": 0.07535624504089355,
            "batch_time": 0.1532101035118103,
            "samples_per_second": 793737.2521079857,
            "samples_per_second_per_gpu": 99217.15651349821,
            "loss_sequences_lower_95": 4.040297447762838,
            "loss_sequences_upper_95": 4.3414176568752385,
            "loss_tokens_lower_95": 3.80654063242128,
            "loss_tokens_upper_95": 4.056184697321743,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8877083179427356,
            "data_time": 0.07342584431171417,
            "batch_time": 0.15120205283164978,
            "samples_per_second": 799978.5588073053,
            "samples_per_second_per_gpu": 99997.31985091316,
            "loss_sequences_lower_95": 4.138818610586772,
            "loss_sequences_upper_95": 4.566129777489639,
            "loss_tokens_lower_95": 3.610197089854981,
            "loss_tokens_upper_95": 3.9763513794296292,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.1891969006236005,
            "data_time": 0.07647689431905746,
            "batch_time": 0.15398593246936798,
            "samples_per_second": 798992.6933867178,
            "samples_per_second_per_gpu": 99874.08667333973,
            "loss_sequences_lower_95": 4.169942772097704,
            "loss_sequences_upper_95": 4.480821358285299,
            "loss_tokens_lower_95": 3.9910693967825153,
            "loss_tokens_upper_95": 4.22798197618526,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7808984940096457,
            "data_time": 0.0757351666688919,
            "batch_time": 0.15216504037380219,
            "samples_per_second": 786831.803481314,
            "samples_per_second_per_gpu": 98353.97543516425,
            "loss_sequences_lower_95": 3.799448759659477,
            "loss_sequences_upper_95": 4.060167149135045,
            "loss_tokens_lower_95": 3.5960129515209127,
            "loss_tokens_upper_95": 3.7997469049338255,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8088030277228935,
            "data_time": 0.07435350865125656,
            "batch_time": 0.15187221765518188,
            "samples_per_second": 801725.6536192056,
            "samples_per_second_per_gpu": 100215.7067024007,
            "loss_sequences_lower_95": 2.8502000483070935,
            "loss_sequences_upper_95": 3.070039791014136,
            "loss_tokens_lower_95": 2.6762745465076856,
            "loss_tokens_upper_95": 2.8136486235119045,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-16.0/params.txt",
    "uuid": "f2134fff-2fae-4804-a939-7bb77acaf54b",
    "creation_date": "2024_01_26-16_40_32"
}