{
    "name": "rw_original-d=1024_l=24_h=8-32.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 263434403840,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 2,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp",
            "--fsdp-limit-all-gathers"
        ],
        "chinchilla_multiplier": 32.0,
        "seed": 124
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.28",
    "open_lm_args": [
        "--workers",
        "2",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--accum-freq",
        "2",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--logs",
        "/tmp/achal-dave-openlm-scrub_2024-01-26-08-25-53-415",
        "--train-num-samples",
        "52686880768",
        "--dataset-manifest",
        "<scrub>/openlm/scrub/datasets/refined_web_tokenized/manifest.jsonl",
        "--data-key",
        "json.gz",
        "--name",
        "rw_original-d=1024_l=24_h=8-32.0",
        "--fsdp",
        "--fsdp-amp",
        "--fsdp-limit-all-gathers",
        "--val-data",
        "/opt/ml/code/training/eval_data/open_lm_val/shard_00000000.tar",
        "/opt/ml/code/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-data-key",
        "json",
        "txt",
        "json.gz",
        "--val-tok-ci",
        "--val-seq-ci",
        "--val-num-samples",
        "245760",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/openlm/scrub/experiments/411m_32x_rw_original"
    ],
    "results": [
        {
            "loss": 2.602446804443995,
            "data_time": 0.13071337342262268,
            "batch_time": 1.5636923015117645,
            "samples_per_second": 263278.9526242779,
            "samples_per_second_per_gpu": 32909.86907803474,
            "loss_sequences_lower_95": 2.5456183052062986,
            "loss_sequences_upper_95": 2.661706167856852,
            "loss_tokens_lower_95": 2.5901292165120444,
            "loss_tokens_upper_95": 2.6146076202392576,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.774146048023745,
            "data_time": 0.0024687862835804167,
            "batch_time": 0.11602511801552362,
            "samples_per_second": 1136470.4390095696,
            "samples_per_second_per_gpu": 142058.8048761962,
            "loss_sequences_lower_95": 2.7716734810850716,
            "loss_sequences_upper_95": 2.7766163471408487,
            "loss_tokens_lower_95": 2.764287625,
            "loss_tokens_upper_95": 2.7842695625,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5288935437494393,
            "data_time": 0.028667215257883072,
            "batch_time": 0.15978244692087173,
            "samples_per_second": 965110.6888160758,
            "samples_per_second_per_gpu": 120638.83610200947,
            "loss_sequences_lower_95": 2.453840556242028,
            "loss_sequences_upper_95": 2.6254931578344225,
            "loss_tokens_lower_95": 2.516948359375,
            "loss_tokens_upper_95": 2.541068635416667,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9401580468403923,
            "data_time": 0.005014869335450624,
            "batch_time": 0.11762688740303642,
            "samples_per_second": 1131584.8142808226,
            "samples_per_second_per_gpu": 141448.10178510283,
            "loss_sequences_lower_95": 2.884319441647874,
            "loss_sequences_upper_95": 2.997957524565077,
            "loss_tokens_lower_95": 2.926952609375,
            "loss_tokens_upper_95": 2.9531758697916666,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8765916566984715,
            "data_time": 0.03191768750548363,
            "batch_time": 0.14181415736675262,
            "samples_per_second": 1030009.3823707604,
            "samples_per_second_per_gpu": 128751.17279634505,
            "loss_sequences_lower_95": 2.7987942324636426,
            "loss_sequences_upper_95": 2.9760198402793248,
            "loss_tokens_lower_95": 2.8655016406250002,
            "loss_tokens_upper_95": 2.8878225104166666,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.918729534123237,
            "data_time": 0.011167238156000773,
            "batch_time": 0.1228040096660455,
            "samples_per_second": 1087545.2636642319,
            "samples_per_second_per_gpu": 135943.15795802898,
            "loss_sequences_lower_95": 2.8587930374612296,
            "loss_sequences_upper_95": 2.9884252946284415,
            "loss_tokens_lower_95": 2.9069813958333333,
            "loss_tokens_upper_95": 2.9302728489583334,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.625701740834178,
            "data_time": 0.0047730238009721804,
            "batch_time": 0.11575682002764481,
            "samples_per_second": 1126804.5103210749,
            "samples_per_second_per_gpu": 140850.56379013436,
            "loss_sequences_lower_95": 2.588324313416773,
            "loss_sequences_upper_95": 2.6631206453284437,
            "loss_tokens_lower_95": 2.611706609375,
            "loss_tokens_upper_95": 2.64001265625,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3402722856012312,
            "data_time": 0.005718811954322614,
            "batch_time": 0.11677752592061695,
            "samples_per_second": 1125351.2361120223,
            "samples_per_second_per_gpu": 140668.9045140028,
            "loss_sequences_lower_95": 3.3051308593749997,
            "loss_sequences_upper_95": 3.3780506483147907,
            "loss_tokens_lower_95": 3.3282555468750004,
            "loss_tokens_upper_95": 3.3522442552083334,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9668079648560624,
            "data_time": 0.029012363404035568,
            "batch_time": 0.14140524715185165,
            "samples_per_second": 1031720.2449930774,
            "samples_per_second_per_gpu": 128965.03062413467,
            "loss_sequences_lower_95": 2.8470220178123413,
            "loss_sequences_upper_95": 3.1132791658727137,
            "loss_tokens_lower_95": 2.954711203125,
            "loss_tokens_upper_95": 2.9786631249999997,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.017251852943963,
            "data_time": 0.029595736414194107,
            "batch_time": 0.14197936281561852,
            "samples_per_second": 1040045.8021107764,
            "samples_per_second_per_gpu": 130005.72526384705,
            "loss_sequences_lower_95": 3.8528879399356164,
            "loss_sequences_upper_95": 4.211523328939445,
            "loss_tokens_lower_95": 4.002675927083334,
            "loss_tokens_upper_95": 4.031926968750001,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9859409983785117,
            "data_time": 0.003826192540380806,
            "batch_time": 0.11668477627690493,
            "samples_per_second": 1134209.967028234,
            "samples_per_second_per_gpu": 141776.24587852924,
            "loss_sequences_lower_95": 2.9691982458009285,
            "loss_sequences_upper_95": 3.002895179258171,
            "loss_tokens_lower_95": 2.9746802760416666,
            "loss_tokens_upper_95": 2.9971191770833334,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8231071541081962,
            "data_time": 0.008183808703171579,
            "batch_time": 0.12024329132155369,
            "samples_per_second": 1113794.6779250586,
            "samples_per_second_per_gpu": 139224.33474063233,
            "loss_sequences_lower_95": 2.787110315566756,
            "loss_sequences_upper_95": 2.8616573640377707,
            "loss_tokens_lower_95": 2.811428385416667,
            "loss_tokens_upper_95": 2.8348686406250003,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4230100354123065,
            "data_time": 0.03125341981649399,
            "batch_time": 0.14144403859972954,
            "samples_per_second": 1031943.8808260473,
            "samples_per_second_per_gpu": 128992.98510325591,
            "loss_sequences_lower_95": 3.298986457375919,
            "loss_sequences_upper_95": 3.5769904455838777,
            "loss_tokens_lower_95": 3.4099245885416667,
            "loss_tokens_upper_95": 3.4361678437500003,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6124302437737614,
            "data_time": 0.030214007943868637,
            "batch_time": 0.1398683786392212,
            "samples_per_second": 1032966.0804874445,
            "samples_per_second_per_gpu": 129120.76006093057,
            "loss_sequences_lower_95": 2.5040378795864626,
            "loss_sequences_upper_95": 2.7406759848662645,
            "loss_tokens_lower_95": 2.6005759270833333,
            "loss_tokens_upper_95": 2.6240623385416666,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.290894394571131,
            "data_time": 0.09605401754379272,
            "batch_time": 0.1456051617860794,
            "samples_per_second": 546325.2181283822,
            "samples_per_second_per_gpu": 68290.65226604778,
            "loss_sequences_lower_95": 3.1962659402327103,
            "loss_sequences_upper_95": 3.4220424998890264,
            "loss_tokens_lower_95": 3.2702134999361903,
            "loss_tokens_upper_95": 3.311903138594194,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.814699452402988,
            "data_time": 0.039660945534706116,
            "batch_time": 0.14356233179569244,
            "samples_per_second": 978622.1556294687,
            "samples_per_second_per_gpu": 122327.76945368359,
            "loss_sequences_lower_95": 2.743190981139247,
            "loss_sequences_upper_95": 2.892192535845253,
            "loss_tokens_lower_95": 2.802648442708333,
            "loss_tokens_upper_95": 2.8265735572916664,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.994216986885172,
            "data_time": 0.04018634557723999,
            "batch_time": 0.1525286982456843,
            "samples_per_second": 1008287.2116750451,
            "samples_per_second_per_gpu": 126035.90145938064,
            "loss_sequences_lower_95": 4.878883792605438,
            "loss_sequences_upper_95": 5.1563299496129815,
            "loss_tokens_lower_95": 4.981899916666667,
            "loss_tokens_upper_95": 5.00680709375,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0072739065670575,
            "data_time": 0.11268676817417145,
            "batch_time": 0.2231111228466034,
            "samples_per_second": 743180.0850223186,
            "samples_per_second_per_gpu": 92897.51062778983,
            "loss_sequences_lower_95": 2.812887304337298,
            "loss_sequences_upper_95": 3.365501641445473,
            "loss_tokens_lower_95": 2.9937875153588465,
            "loss_tokens_upper_95": 3.0211637903432376,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.8107482672603599,
            "data_time": 0.0035660722038962623,
            "batch_time": 0.11598963046615773,
            "samples_per_second": 1136822.3199954415,
            "samples_per_second_per_gpu": 142102.78999943018,
            "loss_sequences_lower_95": 1.8045896907491812,
            "loss_sequences_upper_95": 1.8167790504045898,
            "loss_tokens_lower_95": 1.8045965966497828,
            "loss_tokens_upper_95": 1.8169566600143765,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.440743834272938,
            "data_time": 0.004062787075585957,
            "batch_time": 0.11607871855361553,
            "samples_per_second": 1134214.6345740494,
            "samples_per_second_per_gpu": 141776.82932175617,
            "loss_sequences_lower_95": 2.4509540885312937,
            "loss_sequences_upper_95": 2.475115136543704,
            "loss_tokens_lower_95": 2.429952699360795,
            "loss_tokens_upper_95": 2.4475090741491297,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.2786416068187454,
            "data_time": 0.00960852819330552,
            "batch_time": 0.11971607366028954,
            "samples_per_second": 1111750.1221808514,
            "samples_per_second_per_gpu": 138968.76527260643,
            "loss_sequences_lower_95": 2.7212518993840784,
            "loss_sequences_upper_95": 2.9629817539191663,
            "loss_tokens_lower_95": 2.120079836369356,
            "loss_tokens_upper_95": 2.2969967038397963,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6122292730410894,
            "data_time": 0.009056669970353445,
            "batch_time": 0.11943015456199646,
            "samples_per_second": 1114600.5096128816,
            "samples_per_second_per_gpu": 139325.0637016102,
            "loss_sequences_lower_95": 2.7338489176432295,
            "loss_sequences_upper_95": 2.916185107421875,
            "loss_tokens_lower_95": 2.5350088566234277,
            "loss_tokens_upper_95": 2.6653453039013364,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0153685277659754,
            "data_time": 0.015004256909543818,
            "batch_time": 0.12143024666742845,
            "samples_per_second": 1083372.9509350953,
            "samples_per_second_per_gpu": 135421.6188668869,
            "loss_sequences_lower_95": 2.1005832628737444,
            "loss_sequences_upper_95": 2.1501703015053426,
            "loss_tokens_lower_95": 1.9869547092791116,
            "loss_tokens_upper_95": 2.014669151602781,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.7791800954125143,
            "data_time": 0.06490514427423477,
            "batch_time": 0.16638382524251938,
            "samples_per_second": 892780.5150192876,
            "samples_per_second_per_gpu": 111597.56437741095,
            "loss_sequences_lower_95": 1.7884925599531694,
            "loss_sequences_upper_95": 1.8784785322709516,
            "loss_tokens_lower_95": 1.7472052066537338,
            "loss_tokens_upper_95": 1.7886714341051093,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.507699091093881,
            "data_time": 0.06745041161775589,
            "batch_time": 0.17678917944431305,
            "samples_per_second": 931725.5719957035,
            "samples_per_second_per_gpu": 116465.69649946294,
            "loss_sequences_lower_95": 2.5123036411830357,
            "loss_sequences_upper_95": 2.6691605423907845,
            "loss_tokens_lower_95": 2.4558833097071764,
            "loss_tokens_upper_95": 2.5376883172181786,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.41623818953832,
            "data_time": 0.04776657621065775,
            "batch_time": 0.13891239960988364,
            "samples_per_second": 936835.7098196135,
            "samples_per_second_per_gpu": 117104.46372745169,
            "loss_sequences_lower_95": 2.4098714803059895,
            "loss_sequences_upper_95": 2.4896921691894534,
            "loss_tokens_lower_95": 2.341372163864532,
            "loss_tokens_upper_95": 2.4846149609616424,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5704159581058756,
            "data_time": 0.0029141737979912906,
            "batch_time": 0.11555908920255097,
            "samples_per_second": 1138040.271731084,
            "samples_per_second_per_gpu": 142255.0339663855,
            "loss_sequences_lower_95": 3.608181787774962,
            "loss_sequences_upper_95": 3.6888126045716256,
            "loss_tokens_lower_95": 3.504859230165082,
            "loss_tokens_upper_95": 3.5861499089929043,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.781221855469424,
            "data_time": 0.009001956958519785,
            "batch_time": 0.11977662538227282,
            "samples_per_second": 1113903.2068437072,
            "samples_per_second_per_gpu": 139237.9008554634,
            "loss_sequences_lower_95": 3.594132661658907,
            "loss_sequences_upper_95": 3.862995237613768,
            "loss_tokens_lower_95": 2.632296633742053,
            "loss_tokens_upper_95": 2.7505317376160168,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8330640760806642,
            "data_time": 0.014710921049118041,
            "batch_time": 0.1193879097700119,
            "samples_per_second": 1051311.7205387496,
            "samples_per_second_per_gpu": 131413.9650673437,
            "loss_sequences_lower_95": 3.351785247073645,
            "loss_sequences_upper_95": 3.650682302221097,
            "loss_tokens_lower_95": 2.731161269575367,
            "loss_tokens_upper_95": 2.868601081308991,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.769532312541248,
            "data_time": 0.07306608557701111,
            "batch_time": 0.17400698363780975,
            "samples_per_second": 885124.1695968558,
            "samples_per_second_per_gpu": 110640.52119960697,
            "loss_sequences_lower_95": 5.671841360989227,
            "loss_sequences_upper_95": 5.862046779127426,
            "loss_tokens_lower_95": 5.678342775667095,
            "loss_tokens_upper_95": 5.861313326064854,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.692678966522217,
            "data_time": 0.12966661155223846,
            "batch_time": 0.22603608667850494,
            "samples_per_second": 665670.6191007076,
            "samples_per_second_per_gpu": 83208.82738758845,
            "loss_sequences_lower_95": 2.602817611694336,
            "loss_sequences_upper_95": 2.9411743240356447,
            "loss_tokens_lower_95": 2.4835438900642193,
            "loss_tokens_upper_95": 2.8740432766554393,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.6905588837094536,
            "data_time": 0.010212593711912632,
            "batch_time": 0.11843048315495253,
            "samples_per_second": 1099550.211667717,
            "samples_per_second_per_gpu": 137443.77645846462,
            "loss_sequences_lower_95": 1.6633163155406379,
            "loss_sequences_upper_95": 1.717666438560525,
            "loss_tokens_lower_95": 1.663056072754406,
            "loss_tokens_upper_95": 1.7190278848825726,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.8787513825856301,
            "data_time": 0.015088823437690736,
            "batch_time": 0.12363515198230743,
            "samples_per_second": 1086781.1322378018,
            "samples_per_second_per_gpu": 135847.64152972522,
            "loss_sequences_lower_95": 1.85976842931799,
            "loss_sequences_upper_95": 1.8977427189707463,
            "loss_tokens_lower_95": 1.8596355316387054,
            "loss_tokens_upper_95": 1.898096023047195,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7031122715870106,
            "data_time": 0.01185297171274821,
            "batch_time": 0.12050967911879222,
            "samples_per_second": 1101328.6979468043,
            "samples_per_second_per_gpu": 137666.08724335054,
            "loss_sequences_lower_95": 2.945669685172827,
            "loss_sequences_upper_95": 3.0773773160152,
            "loss_tokens_lower_95": 2.653499202694373,
            "loss_tokens_upper_95": 2.708814593098388,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.359777695178986,
            "data_time": 0.035057432949543,
            "batch_time": 0.14612025022506714,
            "samples_per_second": 1034109.1236407801,
            "samples_per_second_per_gpu": 129263.64045509751,
            "loss_sequences_lower_95": 4.784789392089844,
            "loss_sequences_upper_95": 5.338210034179688,
            "loss_tokens_lower_95": 4.089307303084671,
            "loss_tokens_upper_95": 4.440584024445731,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8492187708616257,
            "data_time": 0.10169649124145508,
            "batch_time": 0.143833190202713,
            "samples_per_second": 471399.01551669365,
            "samples_per_second_per_gpu": 58924.876939586706,
            "loss_sequences_lower_95": 2.6179912388324738,
            "loss_sequences_upper_95": 3.0654741704463953,
            "loss_tokens_lower_95": 2.457236283400963,
            "loss_tokens_upper_95": 3.143500168022068,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4250629455193704,
            "data_time": 0.0639626756310463,
            "batch_time": 0.14525816589593887,
            "samples_per_second": 845651.3630050502,
            "samples_per_second_per_gpu": 105706.42037563128,
            "loss_sequences_lower_95": 4.4093651607118804,
            "loss_sequences_upper_95": 5.098252044326958,
            "loss_tokens_lower_95": 2.9103079089451804,
            "loss_tokens_upper_95": 3.2681271524010005,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.110026260086833,
            "data_time": 0.00970441848039627,
            "batch_time": 0.12192611975802316,
            "samples_per_second": 1116979.3541128046,
            "samples_per_second_per_gpu": 139622.41926410058,
            "loss_sequences_lower_95": 2.086857107019987,
            "loss_sequences_upper_95": 2.133148094700538,
            "loss_tokens_lower_95": 2.087295845535032,
            "loss_tokens_upper_95": 2.1338871347169888,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.70465864286379,
            "data_time": 0.005759635349599327,
            "batch_time": 0.11674621483174766,
            "samples_per_second": 1121387.457353012,
            "samples_per_second_per_gpu": 140173.4321691265,
            "loss_sequences_lower_95": 1.718418863253202,
            "loss_sequences_upper_95": 1.8254840693149015,
            "loss_tokens_lower_95": 1.636105020698995,
            "loss_tokens_upper_95": 1.740908227156341,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6303363628003185,
            "data_time": 0.048625558614730835,
            "batch_time": 0.1336804380019506,
            "samples_per_second": 820357.2129395789,
            "samples_per_second_per_gpu": 102544.65161744737,
            "loss_sequences_lower_95": 2.5831473046606717,
            "loss_sequences_upper_95": 3.003650645720653,
            "loss_tokens_lower_95": 2.4543519647335366,
            "loss_tokens_upper_95": 2.727354692287957,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1714239272905695,
            "data_time": 0.015581481158733368,
            "batch_time": 0.12746954709291458,
            "samples_per_second": 1098968.8935775992,
            "samples_per_second_per_gpu": 137371.1116971999,
            "loss_sequences_lower_95": 3.285428571813955,
            "loss_sequences_upper_95": 3.4458871939282014,
            "loss_tokens_lower_95": 3.0853346380431583,
            "loss_tokens_upper_95": 3.224223067295974,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0770439280242456,
            "data_time": 0.06637120991945267,
            "batch_time": 0.1436896175146103,
            "samples_per_second": 809956.8142010181,
            "samples_per_second_per_gpu": 101244.60177512726,
            "loss_sequences_lower_95": 2.036392039787479,
            "loss_sequences_upper_95": 2.4260071079905443,
            "loss_tokens_lower_95": 1.8857724113526586,
            "loss_tokens_upper_95": 2.1767024726307604,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.624047784334088,
            "data_time": 0.004214179119104062,
            "batch_time": 0.11617892139876135,
            "samples_per_second": 1125989.995899797,
            "samples_per_second_per_gpu": 140748.74948747462,
            "loss_sequences_lower_95": 5.612262091090092,
            "loss_sequences_upper_95": 5.635702277174184,
            "loss_tokens_lower_95": 5.6123669948833514,
            "loss_tokens_upper_95": 5.636039024211093,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.164709426824329,
            "data_time": 0.12536664307117462,
            "batch_time": 0.22325202822685242,
            "samples_per_second": 675489.057325522,
            "samples_per_second_per_gpu": 84436.13216569024,
            "loss_sequences_lower_95": 1.1596580505371092,
            "loss_sequences_upper_95": 1.3115415776817543,
            "loss_tokens_lower_95": 1.0212805538118923,
            "loss_tokens_upper_95": 1.2676822970809076,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.035289651842737,
            "data_time": 0.0026007465594588314,
            "batch_time": 0.11541047129144637,
            "samples_per_second": 1134444.7521558814,
            "samples_per_second_per_gpu": 141805.59401948517,
            "loss_sequences_lower_95": 4.802284101890068,
            "loss_sequences_upper_95": 4.8455239349941035,
            "loss_tokens_lower_95": 3.871010239361702,
            "loss_tokens_upper_95": 3.9149205935686653,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.311931970119477,
            "data_time": 0.01905001513659954,
            "batch_time": 0.13011656515300274,
            "samples_per_second": 1080926.1228329395,
            "samples_per_second_per_gpu": 135115.76535411744,
            "loss_sequences_lower_95": 4.398810778808594,
            "loss_sequences_upper_95": 4.582749597167968,
            "loss_tokens_lower_95": 4.179523842701687,
            "loss_tokens_upper_95": 4.351550311850469,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.8632050306900687,
            "data_time": 0.07120392471551895,
            "batch_time": 0.17609041929244995,
            "samples_per_second": 900231.1200809046,
            "samples_per_second_per_gpu": 112528.89001011307,
            "loss_sequences_lower_95": 1.8254440208103346,
            "loss_sequences_upper_95": 1.9018125417958136,
            "loss_tokens_lower_95": 1.8253433028511379,
            "loss_tokens_upper_95": 1.9020630115011465,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.646468631426493,
            "data_time": 0.013858258724212646,
            "batch_time": 0.12057636407288638,
            "samples_per_second": 1082469.5588485862,
            "samples_per_second_per_gpu": 135308.69485607327,
            "loss_sequences_lower_95": 5.571316445090554,
            "loss_sequences_upper_95": 5.720951528838187,
            "loss_tokens_lower_95": 5.57327255711411,
            "loss_tokens_upper_95": 5.721199350068064,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.1613447334766387,
            "data_time": 0.013123445212841034,
            "batch_time": 0.12434792518615723,
            "samples_per_second": 1095961.8486881678,
            "samples_per_second_per_gpu": 136995.23108602097,
            "loss_sequences_lower_95": 1.229910636393229,
            "loss_sequences_upper_95": 1.2754389851888022,
            "loss_tokens_lower_95": 1.1176529869760403,
            "loss_tokens_upper_95": 1.1844727246367297,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.806374910899571,
            "data_time": 0.06503494828939438,
            "batch_time": 0.16191993653774261,
            "samples_per_second": 890790.7268496922,
            "samples_per_second_per_gpu": 111348.84085621152,
            "loss_sequences_lower_95": 5.459505557105655,
            "loss_sequences_upper_95": 6.156543637230283,
            "loss_tokens_lower_95": 5.464325198218936,
            "loss_tokens_upper_95": 6.1495807756696435,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.6012435033917427,
            "data_time": 0.0931951254606247,
            "batch_time": 0.13429497182369232,
            "samples_per_second": 485235.84121953987,
            "samples_per_second_per_gpu": 60654.480152442484,
            "loss_sequences_lower_95": 1.4632756918668748,
            "loss_sequences_upper_95": 2.100305473804474,
            "loss_tokens_lower_95": 1.2505300431399002,
            "loss_tokens_upper_95": 1.622926453265947,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.837240266799927,
            "data_time": 0.02025192603468895,
            "batch_time": 0.1311708353459835,
            "samples_per_second": 1081332.023990977,
            "samples_per_second_per_gpu": 135166.50299887214,
            "loss_sequences_lower_95": 6.861310522460937,
            "loss_sequences_upper_95": 7.1745307250976555,
            "loss_tokens_lower_95": 6.665404591826618,
            "loss_tokens_upper_95": 6.9457688694677975,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.751003879070282,
            "data_time": 0.018388492986559868,
            "batch_time": 0.1293894499540329,
            "samples_per_second": 1082105.5902200171,
            "samples_per_second_per_gpu": 135263.19877750214,
            "loss_sequences_lower_95": 6.984688562011718,
            "loss_sequences_upper_95": 7.196201306152344,
            "loss_tokens_lower_95": 6.614536416441068,
            "loss_tokens_upper_95": 6.817275926096176,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.081651677071869,
            "data_time": 0.008934968461592993,
            "batch_time": 0.11902719115217526,
            "samples_per_second": 1107474.3648752316,
            "samples_per_second_per_gpu": 138434.29560940395,
            "loss_sequences_lower_95": 6.0622144954219745,
            "loss_sequences_upper_95": 6.100807079963544,
            "loss_tokens_lower_95": 6.061698321739859,
            "loss_tokens_upper_95": 6.100707328507375,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.9656123025442964,
            "data_time": 0.02648113062093546,
            "batch_time": 0.12945874968727866,
            "samples_per_second": 997707.0491637657,
            "samples_per_second_per_gpu": 124713.38114547072,
            "loss_sequences_lower_95": 1.9426422025384624,
            "loss_sequences_upper_95": 1.9892250775924658,
            "loss_tokens_lower_95": 1.9424771169729864,
            "loss_tokens_upper_95": 1.9899339760869694,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.733939109325409,
            "data_time": 0.019058706238865852,
            "batch_time": 0.13013211078941822,
            "samples_per_second": 1080919.079137545,
            "samples_per_second_per_gpu": 135114.88489219313,
            "loss_sequences_lower_95": 7.651598803710937,
            "loss_sequences_upper_95": 7.818073510742188,
            "loss_tokens_lower_95": 7.649560046386719,
            "loss_tokens_upper_95": 7.821976147460938,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0925715265571787,
            "data_time": 0.00419790414442499,
            "batch_time": 0.11685146104140454,
            "samples_per_second": 1128832.9213744712,
            "samples_per_second_per_gpu": 141104.1151718089,
            "loss_sequences_lower_95": 2.8596562121200924,
            "loss_sequences_upper_95": 2.9293161937529564,
            "loss_tokens_lower_95": 1.936557175416169,
            "loss_tokens_upper_95": 1.986208207307352,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0151404213549484,
            "data_time": 0.048284406011754814,
            "batch_time": 0.13901910998604514,
            "samples_per_second": 833837.9927385624,
            "samples_per_second_per_gpu": 104229.7490923203,
            "loss_sequences_lower_95": 1.9661778236503031,
            "loss_sequences_upper_95": 2.0658904573810632,
            "loss_tokens_lower_95": 1.964352109538975,
            "loss_tokens_upper_95": 2.065777730230075,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.9757445627567816,
            "data_time": 0.03536397963762283,
            "batch_time": 0.14812560006976128,
            "samples_per_second": 1039116.5768262645,
            "samples_per_second_per_gpu": 129889.57210328306,
            "loss_sequences_lower_95": 1.9419387697706034,
            "loss_sequences_upper_95": 2.0100094724168964,
            "loss_tokens_lower_95": 1.9419706426882277,
            "loss_tokens_upper_95": 2.008943254059436,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9147383864027776,
            "data_time": 0.00485373300219339,
            "batch_time": 0.11698384393775274,
            "samples_per_second": 1125589.510066803,
            "samples_per_second_per_gpu": 140698.68875835038,
            "loss_sequences_lower_95": 4.048697109286922,
            "loss_sequences_upper_95": 4.154779174300075,
            "loss_tokens_lower_95": 2.69381617583037,
            "loss_tokens_upper_95": 2.7674203684651904,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.09934572441868,
            "data_time": 0.06984079629182816,
            "batch_time": 0.15804269909858704,
            "samples_per_second": 860133.4988923874,
            "samples_per_second_per_gpu": 107516.68736154842,
            "loss_sequences_lower_95": 6.020830717540922,
            "loss_sequences_upper_95": 6.172506084139385,
            "loss_tokens_lower_95": 6.0202087079406414,
            "loss_tokens_upper_95": 6.171934791847511,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.124016705273853,
            "data_time": 0.008660100400447845,
            "batch_time": 0.12003009537091622,
            "samples_per_second": 1111288.5457305848,
            "samples_per_second_per_gpu": 138911.0682163231,
            "loss_sequences_lower_95": 3.100556252389144,
            "loss_sequences_upper_95": 3.1474395695957567,
            "loss_tokens_lower_95": 3.1012423622061354,
            "loss_tokens_upper_95": 3.1473537396072246,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.170008412263926,
            "data_time": 0.07071249186992645,
            "batch_time": 0.1659957841038704,
            "samples_per_second": 877433.7706216039,
            "samples_per_second_per_gpu": 109679.22132770049,
            "loss_sequences_lower_95": 2.1110745735538816,
            "loss_sequences_upper_95": 2.2313593744074254,
            "loss_tokens_lower_95": 2.1083296803594793,
            "loss_tokens_upper_95": 2.233673058667229,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.1591516584157944,
            "data_time": 0.12143886089324951,
            "batch_time": 0.18424421548843384,
            "samples_per_second": 587770.1548024121,
            "samples_per_second_per_gpu": 73471.26935030152,
            "loss_sequences_lower_95": 1.0623311805725097,
            "loss_sequences_upper_95": 1.440846767425537,
            "loss_tokens_lower_95": 0.9398105065027873,
            "loss_tokens_upper_95": 1.3255728430218165,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.0590948730707168,
            "data_time": 0.11631114780902863,
            "batch_time": 0.1790863424539566,
            "samples_per_second": 592648.1904909385,
            "samples_per_second_per_gpu": 74081.02381136731,
            "loss_sequences_lower_95": 1.022097838719686,
            "loss_sequences_upper_95": 1.3728442668914793,
            "loss_tokens_lower_95": 0.8446209661076578,
            "loss_tokens_upper_95": 1.2646841777844375,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.455288022115761,
            "data_time": 0.008144627014795939,
            "batch_time": 0.11941941358425,
            "samples_per_second": 1113783.1076066345,
            "samples_per_second_per_gpu": 139222.8884508293,
            "loss_sequences_lower_95": 5.418822256995582,
            "loss_sequences_upper_95": 5.491749341287739,
            "loss_tokens_lower_95": 5.419300930826583,
            "loss_tokens_upper_95": 5.492215962697902,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.32374953435062503,
            "data_time": 0.00243273295902319,
            "batch_time": 0.11570412592240965,
            "samples_per_second": 1134954.5954990222,
            "samples_per_second_per_gpu": 141869.32443737777,
            "loss_sequences_lower_95": 0.42848343159507923,
            "loss_sequences_upper_95": 0.44057127253379735,
            "loss_tokens_lower_95": 0.30939542503072287,
            "loss_tokens_upper_95": 0.3156456294813364,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9977672419210117,
            "data_time": 0.12411618232727051,
            "batch_time": 0.2376232147216797,
            "samples_per_second": 744874.1119098037,
            "samples_per_second_per_gpu": 93109.26398872546,
            "loss_sequences_lower_95": 3.1432748208834433,
            "loss_sequences_upper_95": 3.477323895552027,
            "loss_tokens_lower_95": 2.843191696987841,
            "loss_tokens_upper_95": 3.0847567133574367,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.2411333934680835,
            "data_time": 0.10820148885250092,
            "batch_time": 0.15150713920593262,
            "samples_per_second": 491647.93487003003,
            "samples_per_second_per_gpu": 61455.991858753754,
            "loss_sequences_lower_95": 5.65991784173089,
            "loss_sequences_upper_95": 6.891311408377982,
            "loss_tokens_lower_95": 4.936079821174527,
            "loss_tokens_upper_95": 7.442543745629581,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.906506616894792,
            "data_time": 0.06690564006567001,
            "batch_time": 0.1442742869257927,
            "samples_per_second": 807375.0235666416,
            "samples_per_second_per_gpu": 100921.8779458302,
            "loss_sequences_lower_95": 3.0252794033143577,
            "loss_sequences_upper_95": 3.3100955963134764,
            "loss_tokens_lower_95": 2.7078110832041107,
            "loss_tokens_upper_95": 2.904967187828532,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0759710669517517,
            "data_time": 0.06601912528276443,
            "batch_time": 0.14345334470272064,
            "samples_per_second": 809701.9377198968,
            "samples_per_second_per_gpu": 101212.7422149871,
            "loss_sequences_lower_95": 3.1872683641387196,
            "loss_sequences_upper_95": 3.451089970658465,
            "loss_tokens_lower_95": 2.905076057807186,
            "loss_tokens_upper_95": 3.0771940932570803,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8855884009745063,
            "data_time": 0.06958767771720886,
            "batch_time": 0.14722474664449692,
            "samples_per_second": 803449.1556870338,
            "samples_per_second_per_gpu": 100431.14446087922,
            "loss_sequences_lower_95": 3.010101597483565,
            "loss_sequences_upper_95": 3.3295354052287776,
            "loss_tokens_lower_95": 2.700948884337712,
            "loss_tokens_upper_95": 2.9481085208587907,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2722952191422627,
            "data_time": 0.06552258133888245,
            "batch_time": 0.14292708784341812,
            "samples_per_second": 809590.3921053325,
            "samples_per_second_per_gpu": 101198.79901316656,
            "loss_sequences_lower_95": 3.343103743762505,
            "loss_sequences_upper_95": 3.6070824506806165,
            "loss_tokens_lower_95": 3.101210026726173,
            "loss_tokens_upper_95": 3.2688107024100708,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2493285259104665,
            "data_time": 0.06878218054771423,
            "batch_time": 0.1451365351676941,
            "samples_per_second": 795940.6993627178,
            "samples_per_second_per_gpu": 99492.58742033973,
            "loss_sequences_lower_95": 3.3272260772515527,
            "loss_sequences_upper_95": 3.580210127741654,
            "loss_tokens_lower_95": 3.078268756626572,
            "loss_tokens_upper_95": 3.248409999725797,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.505792860577746,
            "data_time": 0.0701521709561348,
            "batch_time": 0.1478128582239151,
            "samples_per_second": 806546.3160019481,
            "samples_per_second_per_gpu": 100818.28950024351,
            "loss_sequences_lower_95": 2.6291464038011503,
            "loss_sequences_upper_95": 2.8560790317814524,
            "loss_tokens_lower_95": 2.3540627884361816,
            "loss_tokens_upper_95": 2.4664622125056512,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-32.0/params.txt",
    "uuid": "3768017b-2341-4211-8766-1a4f66f01bb6",
    "creation_date": "2024_01_26-16_40_31"
}