{
    "name": "rw_original-d=576_l=24_h=8-4.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 12294190080,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 4.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "1229419008",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=576_l=24_h=8-4.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.2482611497243243,
            "data_time": 0.03803577646613121,
            "batch_time": 0.4093921259045601,
            "samples_per_second": 831406.6485893445,
            "samples_per_second_per_gpu": 103925.83107366806,
            "loss_sequences_lower_95": 3.1716663805643717,
            "loss_sequences_upper_95": 3.3260693613688153,
            "loss_tokens_lower_95": 3.2342263666788735,
            "loss_tokens_upper_95": 3.2621822039286297,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2113737009737338,
            "data_time": 0.00124944122197238,
            "batch_time": 0.030759977800179753,
            "samples_per_second": 1080537.2260054345,
            "samples_per_second_per_gpu": 135067.15325067932,
            "loss_sequences_lower_95": 3.2089461714255463,
            "loss_sequences_upper_95": 3.2138150614271934,
            "loss_tokens_lower_95": 3.200915265625,
            "loss_tokens_upper_95": 3.22189171875,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.8778789909518494,
            "data_time": 0.010260583877563477,
            "batch_time": 0.039754854202270505,
            "samples_per_second": 1049597.308008664,
            "samples_per_second_per_gpu": 131199.663501083,
            "loss_sequences_lower_95": 2.8186504130460777,
            "loss_sequences_upper_95": 2.9526775437958386,
            "loss_tokens_lower_95": 2.8653231927083334,
            "loss_tokens_upper_95": 2.8906572968749997,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3353471402532047,
            "data_time": 0.0016524575062488254,
            "batch_time": 0.030229126152239348,
            "samples_per_second": 1116129.4487817278,
            "samples_per_second_per_gpu": 139516.18109771598,
            "loss_sequences_lower_95": 3.291269767839884,
            "loss_sequences_upper_95": 3.3815835313305413,
            "loss_tokens_lower_95": 3.322764552083333,
            "loss_tokens_upper_95": 3.348084864583333,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.289697437810801,
            "data_time": 0.009429644778430224,
            "batch_time": 0.03851078420996191,
            "samples_per_second": 1059466.295678415,
            "samples_per_second_per_gpu": 132433.2869598019,
            "loss_sequences_lower_95": 3.2263603443526443,
            "loss_sequences_upper_95": 3.372363007772721,
            "loss_tokens_lower_95": 3.2783139583333334,
            "loss_tokens_upper_95": 3.300690052083333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4241399210703607,
            "data_time": 0.0036061441768770633,
            "batch_time": 0.03241711498602577,
            "samples_per_second": 1107978.1524803115,
            "samples_per_second_per_gpu": 138497.26906003893,
            "loss_sequences_lower_95": 3.3734932739340797,
            "loss_sequences_upper_95": 3.4792376572059824,
            "loss_tokens_lower_95": 3.41131540625,
            "loss_tokens_upper_95": 3.4367543802083333,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1826921067432483,
            "data_time": 0.0015626211345293004,
            "batch_time": 0.03007982294477802,
            "samples_per_second": 1121203.8970874494,
            "samples_per_second_per_gpu": 140150.48713593118,
            "loss_sequences_lower_95": 3.149757573341837,
            "loss_sequences_upper_95": 3.215566441127232,
            "loss_tokens_lower_95": 3.165919234375,
            "loss_tokens_upper_95": 3.1999914635416666,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7645656684555933,
            "data_time": 0.0018038814509578052,
            "batch_time": 0.031212469365795203,
            "samples_per_second": 1093950.8230888494,
            "samples_per_second_per_gpu": 136743.85288610618,
            "loss_sequences_lower_95": 3.737333861665576,
            "loss_sequences_upper_95": 3.794290606593586,
            "loss_tokens_lower_95": 3.752656239583333,
            "loss_tokens_upper_95": 3.7762576458333332,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3909757369902076,
            "data_time": 0.00944198975487361,
            "batch_time": 0.038841035630967885,
            "samples_per_second": 1043988.8455517462,
            "samples_per_second_per_gpu": 130498.60569396827,
            "loss_sequences_lower_95": 3.2948026021321617,
            "loss_sequences_upper_95": 3.508104426686357,
            "loss_tokens_lower_95": 3.3791138125,
            "loss_tokens_upper_95": 3.4029432343750003,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.466653379055822,
            "data_time": 0.009867137297987938,
            "batch_time": 0.03971044905483723,
            "samples_per_second": 1051301.566511996,
            "samples_per_second_per_gpu": 131412.6958139995,
            "loss_sequences_lower_95": 4.339611454537735,
            "loss_sequences_upper_95": 4.623969700025476,
            "loss_tokens_lower_95": 4.452754104166667,
            "loss_tokens_upper_95": 4.480546708333333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4511127882303074,
            "data_time": 0.0013040515552582431,
            "batch_time": 0.029948040373392645,
            "samples_per_second": 1117967.570463956,
            "samples_per_second_per_gpu": 139745.9463079945,
            "loss_sequences_lower_95": 3.4371000670759386,
            "loss_sequences_upper_95": 3.465823136327429,
            "loss_tokens_lower_95": 3.4396842708333333,
            "loss_tokens_upper_95": 3.4627190625,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2875677424140495,
            "data_time": 0.0026136345907015962,
            "batch_time": 0.031739755435152714,
            "samples_per_second": 1096856.7222051509,
            "samples_per_second_per_gpu": 137107.09027564386,
            "loss_sequences_lower_95": 3.258097339202741,
            "loss_sequences_upper_95": 3.3186350271533995,
            "loss_tokens_lower_95": 3.2759728229166662,
            "loss_tokens_upper_95": 3.2993056875,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8368354772215683,
            "data_time": 0.009943194068938847,
            "batch_time": 0.03910130945590174,
            "samples_per_second": 1052106.1516845904,
            "samples_per_second_per_gpu": 131513.2689605738,
            "loss_sequences_lower_95": 3.7407603420543865,
            "loss_sequences_upper_95": 3.953906280950891,
            "loss_tokens_lower_95": 3.82347853125,
            "loss_tokens_upper_95": 3.85004046875,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.032159986418279,
            "data_time": 0.010342254106741977,
            "batch_time": 0.04006402995956847,
            "samples_per_second": 1038264.1669509491,
            "samples_per_second_per_gpu": 129783.02086886864,
            "loss_sequences_lower_95": 2.9391996488551744,
            "loss_sequences_upper_95": 3.1424982758500666,
            "loss_tokens_lower_95": 3.0203497083333333,
            "loss_tokens_upper_95": 3.0440014114583334,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8757914142175154,
            "data_time": 0.0833367109298706,
            "batch_time": 0.1153697030884879,
            "samples_per_second": 564897.9533637975,
            "samples_per_second_per_gpu": 70612.24417047469,
            "loss_sequences_lower_95": 3.7927138848738235,
            "loss_sequences_upper_95": 3.9866258881308814,
            "loss_tokens_lower_95": 3.8543783968145195,
            "loss_tokens_upper_95": 3.8974652377041905,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3870540068378947,
            "data_time": 0.014348974282091314,
            "batch_time": 0.04389790242368525,
            "samples_per_second": 1035704.3699957876,
            "samples_per_second_per_gpu": 129463.04624947345,
            "loss_sequences_lower_95": 3.3195021203933583,
            "loss_sequences_upper_95": 3.454506245159894,
            "loss_tokens_lower_95": 3.373448901041667,
            "loss_tokens_upper_95": 3.4005552447916667,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.251389469509075,
            "data_time": 0.012830369174480438,
            "batch_time": 0.0426156185567379,
            "samples_per_second": 1043588.8057179925,
            "samples_per_second_per_gpu": 130448.60071474906,
            "loss_sequences_lower_95": 5.157456974240909,
            "loss_sequences_upper_95": 5.376867925397324,
            "loss_tokens_lower_95": 5.239762427083334,
            "loss_tokens_upper_95": 5.263008520833334,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.619897578583389,
            "data_time": 0.03804705664515495,
            "batch_time": 0.06834093108773232,
            "samples_per_second": 919633.6094981468,
            "samples_per_second_per_gpu": 114954.20118726835,
            "loss_sequences_lower_95": 3.4535607322317654,
            "loss_sequences_upper_95": 3.91814799699627,
            "loss_tokens_lower_95": 3.6056384477459016,
            "loss_tokens_upper_95": 3.634694377711562,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5832255274341787,
            "data_time": 0.0016725743210668717,
            "batch_time": 0.030776738441615006,
            "samples_per_second": 1094893.632764206,
            "samples_per_second_per_gpu": 136861.70409552575,
            "loss_sequences_lower_95": 3.5649306614197585,
            "loss_sequences_upper_95": 3.6018822091204066,
            "loss_tokens_lower_95": 3.5649378455036675,
            "loss_tokens_upper_95": 3.60196334823654,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.863133142892404,
            "data_time": 0.001845623894481902,
            "batch_time": 0.030840810601878318,
            "samples_per_second": 1095457.490716789,
            "samples_per_second_per_gpu": 136932.18633959861,
            "loss_sequences_lower_95": 2.8583595432387097,
            "loss_sequences_upper_95": 2.8834026580549192,
            "loss_tokens_lower_95": 2.8429924953115826,
            "loss_tokens_upper_95": 2.861675619884387,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.145761908557711,
            "data_time": 0.0032242659012401698,
            "batch_time": 0.03292564375011793,
            "samples_per_second": 1082032.1174218901,
            "samples_per_second_per_gpu": 135254.01467773627,
            "loss_sequences_lower_95": 4.40627003175558,
            "loss_sequences_upper_95": 4.695653431946003,
            "loss_tokens_lower_95": 3.586612461281868,
            "loss_tokens_upper_95": 3.7992164929935073,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.051341151461005,
            "data_time": 0.0038704995779280968,
            "batch_time": 0.03302337356070255,
            "samples_per_second": 1087663.086935847,
            "samples_per_second_per_gpu": 135957.88586698088,
            "loss_sequences_lower_95": 4.142866463216145,
            "loss_sequences_upper_95": 4.342171687825521,
            "loss_tokens_lower_95": 3.808249226120283,
            "loss_tokens_upper_95": 3.9487515047661166,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.801657565440799,
            "data_time": 0.004556699036473062,
            "batch_time": 0.033569875705655706,
            "samples_per_second": 1087074.8428339856,
            "samples_per_second_per_gpu": 135884.3553542482,
            "loss_sequences_lower_95": 2.8443746918164208,
            "loss_sequences_upper_95": 2.90078954689425,
            "loss_tokens_lower_95": 2.7114785811009745,
            "loss_tokens_upper_95": 2.74128373619648,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.3092556747523223,
            "data_time": 0.02233427975858961,
            "batch_time": 0.05198841222694942,
            "samples_per_second": 1010391.6411560004,
            "samples_per_second_per_gpu": 126298.95514450005,
            "loss_sequences_lower_95": 2.287333984375,
            "loss_sequences_upper_95": 2.3908331021395597,
            "loss_tokens_lower_95": 2.242901533211982,
            "loss_tokens_upper_95": 2.289681937638567,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.143507194519043,
            "data_time": 0.02007010579109192,
            "batch_time": 0.05068158730864525,
            "samples_per_second": 962371.5196945826,
            "samples_per_second_per_gpu": 120296.43996182282,
            "loss_sequences_lower_95": 3.1337451483278858,
            "loss_sequences_upper_95": 3.313421325683594,
            "loss_tokens_lower_95": 3.036103310659276,
            "loss_tokens_upper_95": 3.1240303997228334,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.436892555554708,
            "data_time": 0.017073958348005246,
            "batch_time": 0.048260587912339434,
            "samples_per_second": 954558.9910873318,
            "samples_per_second_per_gpu": 119319.87388591647,
            "loss_sequences_lower_95": 3.404922673543294,
            "loss_sequences_upper_95": 3.5152167053222656,
            "loss_tokens_lower_95": 3.3083779164091474,
            "loss_tokens_upper_95": 3.4959215614498027,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.4377963339500095,
            "data_time": 0.001448585555117239,
            "batch_time": 0.03046887470692493,
            "samples_per_second": 1097227.9411885156,
            "samples_per_second_per_gpu": 137153.49264856445,
            "loss_sequences_lower_95": 5.443415649528197,
            "loss_sequences_upper_95": 5.529575133559495,
            "loss_tokens_lower_95": 5.297116693269507,
            "loss_tokens_upper_95": 5.384347544824168,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.196023412425109,
            "data_time": 0.002865698713584234,
            "batch_time": 0.03161702900124876,
            "samples_per_second": 1103015.5236722627,
            "samples_per_second_per_gpu": 137876.94045903283,
            "loss_sequences_lower_95": 4.685963866365478,
            "loss_sequences_upper_95": 4.972992498224431,
            "loss_tokens_lower_95": 3.502629717169406,
            "loss_tokens_upper_95": 3.6337996785149156,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.848399381283607,
            "data_time": 0.004975838032928673,
            "batch_time": 0.03440109861863626,
            "samples_per_second": 1070088.009532326,
            "samples_per_second_per_gpu": 133761.00119154074,
            "loss_sequences_lower_95": 4.232918624096763,
            "loss_sequences_upper_95": 4.560961278713603,
            "loss_tokens_lower_95": 3.4505253474533113,
            "loss_tokens_upper_95": 3.6009620038722168,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.954055749118056,
            "data_time": 0.02306370437145233,
            "batch_time": 0.05369048884936741,
            "samples_per_second": 987638.8220379987,
            "samples_per_second_per_gpu": 123454.85275474984,
            "loss_sequences_lower_95": 5.869054785602169,
            "loss_sequences_upper_95": 6.036945081736944,
            "loss_tokens_lower_95": 5.868795637017516,
            "loss_tokens_upper_95": 6.036401757366581,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.204073784351349,
            "data_time": 0.04839777946472168,
            "batch_time": 0.08130324803865872,
            "samples_per_second": 839521.7839589219,
            "samples_per_second_per_gpu": 104940.22299486524,
            "loss_sequences_lower_95": 3.0673353424072265,
            "loss_sequences_upper_95": 3.398798736572265,
            "loss_tokens_lower_95": 2.9164745078317167,
            "loss_tokens_upper_95": 3.3488734700811995,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.499259631733811,
            "data_time": 0.0033793905273780743,
            "batch_time": 0.032312339068921794,
            "samples_per_second": 1097447.7750096237,
            "samples_per_second_per_gpu": 137180.97187620297,
            "loss_sequences_lower_95": 4.447040263461969,
            "loss_sequences_upper_95": 4.55302042185101,
            "loss_tokens_lower_95": 4.445492244474475,
            "loss_tokens_upper_95": 4.552335778571201,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.379939228271872,
            "data_time": 0.004694178287201176,
            "batch_time": 0.03393496969201242,
            "samples_per_second": 1083434.7246184545,
            "samples_per_second_per_gpu": 135429.3405773068,
            "loss_sequences_lower_95": 4.325442072487971,
            "loss_sequences_upper_95": 4.435640992110719,
            "loss_tokens_lower_95": 4.322690401694308,
            "loss_tokens_upper_95": 4.437856073402949,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2850306418048416,
            "data_time": 0.003481331448318119,
            "batch_time": 0.03276834091408979,
            "samples_per_second": 1077534.7255907943,
            "samples_per_second_per_gpu": 134691.8406988493,
            "loss_sequences_lower_95": 3.4178765663254893,
            "loss_sequences_upper_95": 3.548125118218257,
            "loss_tokens_lower_95": 3.1313478787015945,
            "loss_tokens_upper_95": 3.189859258054019,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.2284485132694245,
            "data_time": 0.010438998229801655,
            "batch_time": 0.039468890987336636,
            "samples_per_second": 1055231.4641554074,
            "samples_per_second_per_gpu": 131903.93301942592,
            "loss_sequences_lower_95": 5.416851794433594,
            "loss_sequences_upper_95": 5.963366076660156,
            "loss_tokens_lower_95": 4.654059682122766,
            "loss_tokens_upper_95": 5.015585519019378,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7731441259384155,
            "data_time": 0.14993472397327423,
            "batch_time": 0.18355092406272888,
            "samples_per_second": 553108.6815406721,
            "samples_per_second_per_gpu": 69138.58519258401,
            "loss_sequences_lower_95": 3.5244523167610167,
            "loss_sequences_upper_95": 4.042741894721985,
            "loss_tokens_lower_95": 3.2593480428059896,
            "loss_tokens_upper_95": 4.154985861942685,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.045334089761493,
            "data_time": 0.027446046788641747,
            "batch_time": 0.05718461757010602,
            "samples_per_second": 928836.4847100592,
            "samples_per_second_per_gpu": 116104.5605887574,
            "loss_sequences_lower_95": 4.313399479306978,
            "loss_sequences_upper_95": 4.855549323421784,
            "loss_tokens_lower_95": 3.1559550907861826,
            "loss_tokens_upper_95": 3.508294574518214,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.03038517821559,
            "data_time": 0.002969451662566927,
            "batch_time": 0.03184994972414441,
            "samples_per_second": 1093402.6777440635,
            "samples_per_second_per_gpu": 136675.33471800794,
            "loss_sequences_lower_95": 2.0014815250928355,
            "loss_sequences_upper_95": 2.059280820807394,
            "loss_tokens_lower_95": 2.0002265526874456,
            "loss_tokens_upper_95": 2.06018771416967,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.6131341248482434,
            "data_time": 0.0023936826589730158,
            "batch_time": 0.03145835785427249,
            "samples_per_second": 1095386.5858428404,
            "samples_per_second_per_gpu": 136923.32323035505,
            "loss_sequences_lower_95": 2.58701191773906,
            "loss_sequences_upper_95": 2.727007747496907,
            "loss_tokens_lower_95": 2.463536591900121,
            "loss_tokens_upper_95": 2.603379914676278,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1202111816231586,
            "data_time": 0.01801733672618866,
            "batch_time": 0.04766626490486993,
            "samples_per_second": 989721.1904565716,
            "samples_per_second_per_gpu": 123715.14880707145,
            "loss_sequences_lower_95": 2.97587640224359,
            "loss_sequences_upper_95": 3.3837816958025697,
            "loss_tokens_lower_95": 2.858260772451,
            "loss_tokens_upper_95": 3.1515897315040324,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5067318011760333,
            "data_time": 0.004461320489645005,
            "batch_time": 0.033388490229845046,
            "samples_per_second": 1080923.5016014948,
            "samples_per_second_per_gpu": 135115.43770018686,
            "loss_sequences_lower_95": 3.5504280072267167,
            "loss_sequences_upper_95": 3.7040153425136886,
            "loss_tokens_lower_95": 3.3611236690252566,
            "loss_tokens_upper_95": 3.5024953560341023,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.710723826797997,
            "data_time": 0.032093627112252374,
            "batch_time": 0.06293223869232904,
            "samples_per_second": 952052.65048527,
            "samples_per_second_per_gpu": 119006.58131065874,
            "loss_sequences_lower_95": 2.5401878915181975,
            "loss_sequences_upper_95": 2.989195605022151,
            "loss_tokens_lower_95": 2.425110709608865,
            "loss_tokens_upper_95": 2.773473628232374,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.687348542344119,
            "data_time": 0.002163909979929602,
            "batch_time": 0.03119214644110112,
            "samples_per_second": 1092052.8548352006,
            "samples_per_second_per_gpu": 136506.60685440007,
            "loss_sequences_lower_95": 5.675724988747749,
            "loss_sequences_upper_95": 5.698748529002676,
            "loss_tokens_lower_95": 5.67586159810087,
            "loss_tokens_upper_95": 5.698732168308662,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.1221056169676549,
            "data_time": 0.048798019235784355,
            "batch_time": 0.07916312217712403,
            "samples_per_second": 890341.3368716832,
            "samples_per_second_per_gpu": 111292.6671089604,
            "loss_sequences_lower_95": 1.0774052573639212,
            "loss_sequences_upper_95": 1.2226312137344508,
            "loss_tokens_lower_95": 0.955110232490844,
            "loss_tokens_upper_95": 1.19060083582555,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9640301132383335,
            "data_time": 0.001289629876263883,
            "batch_time": 0.03031803562625156,
            "samples_per_second": 1094913.130184009,
            "samples_per_second_per_gpu": 136864.14127300112,
            "loss_sequences_lower_95": 4.2593389163227195,
            "loss_sequences_upper_95": 4.298426188662867,
            "loss_tokens_lower_95": 3.5020600761605416,
            "loss_tokens_upper_95": 3.539953421179884,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.475855773925781,
            "data_time": 0.0056780363832201275,
            "batch_time": 0.03472704026434156,
            "samples_per_second": 1084140.1101860167,
            "samples_per_second_per_gpu": 135517.51377325208,
            "loss_sequences_lower_95": 5.457909350585937,
            "loss_sequences_upper_95": 5.648063854980469,
            "loss_tokens_lower_95": 5.291122901485765,
            "loss_tokens_upper_95": 5.474710798152915,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.372010746209518,
            "data_time": 0.022123413570856645,
            "batch_time": 0.051733970642089844,
            "samples_per_second": 1000137.7888751174,
            "samples_per_second_per_gpu": 125017.22360938968,
            "loss_sequences_lower_95": 4.233347844662873,
            "loss_sequences_upper_95": 4.5114189745032265,
            "loss_tokens_lower_95": 4.233709763236668,
            "loss_tokens_upper_95": 4.5076045691448705,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.043776915290139,
            "data_time": 0.0045896025307207225,
            "batch_time": 0.033659462110105766,
            "samples_per_second": 1084292.677297878,
            "samples_per_second_per_gpu": 135536.58466223476,
            "loss_sequences_lower_95": 6.955796046401516,
            "loss_sequences_upper_95": 7.128419392903646,
            "loss_tokens_lower_95": 6.957699991861979,
            "loss_tokens_upper_95": 7.13009589917732,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.4743422153393428,
            "data_time": 0.00402656356070904,
            "batch_time": 0.03293324721620438,
            "samples_per_second": 1093027.8763820855,
            "samples_per_second_per_gpu": 136628.4845477607,
            "loss_sequences_lower_95": 1.4998267985026044,
            "loss_sequences_upper_95": 1.54181455078125,
            "loss_tokens_lower_95": 1.4000586797218888,
            "loss_tokens_upper_95": 1.4776504039115645,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.663877692676726,
            "data_time": 0.02368115314415523,
            "batch_time": 0.054184168577194214,
            "samples_per_second": 931615.0783838899,
            "samples_per_second_per_gpu": 116451.88479798623,
            "loss_sequences_lower_95": 5.354655122302828,
            "loss_sequences_upper_95": 5.972177371070498,
            "loss_tokens_lower_95": 5.358640863327753,
            "loss_tokens_upper_95": 5.978650803338914,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.0948017351329327,
            "data_time": 0.15422973036766052,
            "batch_time": 0.1883450150489807,
            "samples_per_second": 549724.8524249577,
            "samples_per_second_per_gpu": 68715.60655311971,
            "loss_sequences_lower_95": 1.9395441591739655,
            "loss_sequences_upper_95": 2.768014395236969,
            "loss_tokens_lower_95": 1.6550203177855187,
            "loss_tokens_upper_95": 2.106806886024082,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.496465452432632,
            "data_time": 0.005523275288324508,
            "batch_time": 0.03486406093551999,
            "samples_per_second": 1069316.5206119453,
            "samples_per_second_per_gpu": 133664.56507649316,
            "loss_sequences_lower_95": 7.428450463867188,
            "loss_sequences_upper_95": 7.722723596191406,
            "loss_tokens_lower_95": 7.247084378470019,
            "loss_tokens_upper_95": 7.509949329460131,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.067196171283722,
            "data_time": 0.00580831699901157,
            "batch_time": 0.034882642920055086,
            "samples_per_second": 1078150.0909016468,
            "samples_per_second_per_gpu": 134768.76136270585,
            "loss_sequences_lower_95": 7.190219702148437,
            "loss_sequences_upper_95": 7.428727416992187,
            "loss_tokens_lower_95": 6.79304023173432,
            "loss_tokens_upper_95": 7.001007150373999,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.035454205766525,
            "data_time": 0.003895890513391399,
            "batch_time": 0.03322439193725586,
            "samples_per_second": 1076545.8669289022,
            "samples_per_second_per_gpu": 134568.23336611278,
            "loss_sequences_lower_95": 6.012150166045089,
            "loss_sequences_upper_95": 6.058122590512907,
            "loss_tokens_lower_95": 6.0125593859998325,
            "loss_tokens_upper_95": 6.058468921545005,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9310631795962285,
            "data_time": 0.008312200131373102,
            "batch_time": 0.038298026312514014,
            "samples_per_second": 1036709.8085994668,
            "samples_per_second_per_gpu": 129588.72607493335,
            "loss_sequences_lower_95": 2.8457977060531876,
            "loss_sequences_upper_95": 3.019114523329493,
            "loss_tokens_lower_95": 2.843147247361331,
            "loss_tokens_upper_95": 3.0188343991515456,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.686764496803284,
            "data_time": 0.005880765498630584,
            "batch_time": 0.035419884655210704,
            "samples_per_second": 1064240.6014979153,
            "samples_per_second_per_gpu": 133030.07518723942,
            "loss_sequences_lower_95": 6.6190376953125,
            "loss_sequences_upper_95": 6.756976416015625,
            "loss_tokens_lower_95": 6.617363928222656,
            "loss_tokens_upper_95": 6.756609521484375,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.723534501839496,
            "data_time": 0.0017961673339959127,
            "batch_time": 0.030578678722417648,
            "samples_per_second": 1102909.9212368184,
            "samples_per_second_per_gpu": 137863.7401546023,
            "loss_sequences_lower_95": 3.160437032506504,
            "loss_sequences_upper_95": 3.234210268596263,
            "loss_tokens_lower_95": 2.1758609646269673,
            "loss_tokens_upper_95": 2.229299620189129,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.538681899433706,
            "data_time": 0.0192623427936009,
            "batch_time": 0.049093403135027205,
            "samples_per_second": 989088.5615542346,
            "samples_per_second_per_gpu": 123636.07019427932,
            "loss_sequences_lower_95": 4.387691657222918,
            "loss_sequences_upper_95": 4.691532431075822,
            "loss_tokens_lower_95": 4.3862015453737175,
            "loss_tokens_upper_95": 4.69091954017753,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.462179414898741,
            "data_time": 0.010337804444134235,
            "batch_time": 0.04031258635222912,
            "samples_per_second": 1051873.1297258614,
            "samples_per_second_per_gpu": 131484.14121573267,
            "loss_sequences_lower_95": 4.360731356751685,
            "loss_sequences_upper_95": 4.558649615119485,
            "loss_tokens_lower_95": 4.365529665479473,
            "loss_tokens_upper_95": 4.557845291436887,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8049730407658577,
            "data_time": 0.001932890244083543,
            "batch_time": 0.030787442379756162,
            "samples_per_second": 1099466.4206330029,
            "samples_per_second_per_gpu": 137433.30257912536,
            "loss_sequences_lower_95": 4.3645793331336895,
            "loss_sequences_upper_95": 4.4652740344716895,
            "loss_tokens_lower_95": 3.050909964209774,
            "loss_tokens_upper_95": 3.1277811033563077,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.5218263202243385,
            "data_time": 0.02637447665135066,
            "batch_time": 0.05747575809558233,
            "samples_per_second": 967946.6286295435,
            "samples_per_second_per_gpu": 120993.32857869293,
            "loss_sequences_lower_95": 6.437055282996445,
            "loss_sequences_upper_95": 6.60297544772032,
            "loss_tokens_lower_95": 6.436823074905961,
            "loss_tokens_upper_95": 6.60508860431651,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.392000396171476,
            "data_time": 0.003381148653880435,
            "batch_time": 0.03254042425726214,
            "samples_per_second": 1085477.1359692223,
            "samples_per_second_per_gpu": 135684.6419961528,
            "loss_sequences_lower_95": 4.355003897290711,
            "loss_sequences_upper_95": 4.428256365574828,
            "loss_tokens_lower_95": 4.355049320885894,
            "loss_tokens_upper_95": 4.429050315366973,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.555287255824191,
            "data_time": 0.02353365421295166,
            "batch_time": 0.05610833818262274,
            "samples_per_second": 879892.4845331176,
            "samples_per_second_per_gpu": 109986.5605666397,
            "loss_sequences_lower_95": 4.384028588452385,
            "loss_sequences_upper_95": 4.730421758855431,
            "loss_tokens_lower_95": 4.381994599277533,
            "loss_tokens_upper_95": 4.730337079983313,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.094141986469428,
            "data_time": 0.08145029097795486,
            "batch_time": 0.11399834603071213,
            "samples_per_second": 740576.1793566195,
            "samples_per_second_per_gpu": 92572.02241957744,
            "loss_sequences_lower_95": 1.94637819925944,
            "loss_sequences_upper_95": 2.4598440043131506,
            "loss_tokens_lower_95": 1.745119349161784,
            "loss_tokens_upper_95": 2.3555384423997667,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.9970383544762929,
            "data_time": 0.0779527947306633,
            "batch_time": 0.10909710824489594,
            "samples_per_second": 762653.9602434132,
            "samples_per_second_per_gpu": 95331.74503042665,
            "loss_sequences_lower_95": 1.8268061574300132,
            "loss_sequences_upper_95": 2.446159000396728,
            "loss_tokens_lower_95": 1.4873097837640998,
            "loss_tokens_upper_95": 2.2281323250759852,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.43904793673306,
            "data_time": 0.0032331265724007,
            "batch_time": 0.032093941428648715,
            "samples_per_second": 1096969.744974133,
            "samples_per_second_per_gpu": 137121.21812176664,
            "loss_sequences_lower_95": 6.4201290960972015,
            "loss_sequences_upper_95": 6.458053030651693,
            "loss_tokens_lower_95": 6.420122997974963,
            "loss_tokens_upper_95": 6.457200631098122,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.633610011488096,
            "data_time": 0.0012283351769593403,
            "batch_time": 0.0302727865686954,
            "samples_per_second": 1094042.3396143606,
            "samples_per_second_per_gpu": 136755.29245179507,
            "loss_sequences_lower_95": 0.7275737009842158,
            "loss_sequences_upper_95": 0.7450186230061376,
            "loss_tokens_lower_95": 0.534137335831683,
            "loss_tokens_upper_95": 0.5436902023397991,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.334779936497606,
            "data_time": 0.03783818334341049,
            "batch_time": 0.06855666637420654,
            "samples_per_second": 958583.3417640495,
            "samples_per_second_per_gpu": 119822.91772050619,
            "loss_sequences_lower_95": 4.348649465005229,
            "loss_sequences_upper_95": 4.718865402101532,
            "loss_tokens_lower_95": 4.0743440887143825,
            "loss_tokens_upper_95": 4.389357783618122,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.5067806372771395,
            "data_time": 0.1193778741927374,
            "batch_time": 0.15306922367640904,
            "samples_per_second": 542290.312460371,
            "samples_per_second_per_gpu": 67786.28905754637,
            "loss_sequences_lower_95": 6.096783674085462,
            "loss_sequences_upper_95": 7.175519376187712,
            "loss_tokens_lower_95": 5.494535092954282,
            "loss_tokens_upper_95": 7.339365848494164,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.2715148286121645,
            "data_time": 0.03069780837921869,
            "batch_time": 0.06107435056141445,
            "samples_per_second": 967321.0859187763,
            "samples_per_second_per_gpu": 120915.13573984704,
            "loss_sequences_lower_95": 4.241969001583937,
            "loss_sequences_upper_95": 4.557206474862447,
            "loss_tokens_lower_95": 3.9447059142399077,
            "loss_tokens_upper_95": 4.205228009685134,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.388926523487743,
            "data_time": 0.029965341091156006,
            "batch_time": 0.06011699211029779,
            "samples_per_second": 967623.0788160355,
            "samples_per_second_per_gpu": 120952.88485200444,
            "loss_sequences_lower_95": 4.335357424107994,
            "loss_sequences_upper_95": 4.611681645090987,
            "loss_tokens_lower_95": 4.117631310836168,
            "loss_tokens_upper_95": 4.338187567799343,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.33147446047969,
            "data_time": 0.030057294028145925,
            "batch_time": 0.06036576486769177,
            "samples_per_second": 972150.1378019912,
            "samples_per_second_per_gpu": 121518.7672252489,
            "loss_sequences_lower_95": 4.3289646706930025,
            "loss_sequences_upper_95": 4.695892975970013,
            "loss_tokens_lower_95": 3.9347846550527423,
            "loss_tokens_upper_95": 4.264945617991661,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.5216353782793375,
            "data_time": 0.03170370204108102,
            "batch_time": 0.06279417446681432,
            "samples_per_second": 951421.3076578301,
            "samples_per_second_per_gpu": 118927.66345722876,
            "loss_sequences_lower_95": 4.440505023118926,
            "loss_sequences_upper_95": 4.715445495233303,
            "loss_tokens_lower_95": 4.276025124428057,
            "loss_tokens_upper_95": 4.4784410137996495,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8615715518501235,
            "data_time": 0.03093272962687928,
            "batch_time": 0.062770245987692,
            "samples_per_second": 945898.9624600105,
            "samples_per_second_per_gpu": 118237.37030750132,
            "loss_sequences_lower_95": 3.762191241720448,
            "loss_sequences_upper_95": 3.98991564638126,
            "loss_tokens_lower_95": 3.650188760191211,
            "loss_tokens_upper_95": 3.802961677848055,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2946335527955033,
            "data_time": 0.030950469630105153,
            "batch_time": 0.06226450488680885,
            "samples_per_second": 938288.4648462012,
            "samples_per_second_per_gpu": 117286.05810577516,
            "loss_sequences_lower_95": 3.293554575850324,
            "loss_sequences_upper_95": 3.5276017817055307,
            "loss_tokens_lower_95": 3.078433974014843,
            "loss_tokens_upper_95": 3.1997589085574893,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-4.0/params.txt",
    "uuid": "fff6723e-b3cf-425b-a488-fdbacacc0773",
    "creation_date": "2023_12_14-05_03_42"
}