{
    "name": "rw_original-d=576_l=24_h=8-0.25",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 768386880,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.25
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "153677376",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=576_l=24_h=8-0.25",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 4.4090244193871815,
            "data_time": 0.03535334765911102,
            "batch_time": 0.4246186427772045,
            "samples_per_second": 840821.2448260211,
            "samples_per_second_per_gpu": 105102.65560325264,
            "loss_sequences_lower_95": 4.3201158396403,
            "loss_sequences_upper_95": 4.499895248413086,
            "loss_tokens_lower_95": 4.394657351175944,
            "loss_tokens_upper_95": 4.423733444213867,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.0991146681983315,
            "data_time": 0.0012345027999872546,
            "batch_time": 0.03034091854595822,
            "samples_per_second": 1094633.0198937347,
            "samples_per_second_per_gpu": 136829.12748671684,
            "loss_sequences_lower_95": 4.09689562524265,
            "loss_sequences_upper_95": 4.101333949517053,
            "loss_tokens_lower_95": 4.087898479166666,
            "loss_tokens_upper_95": 4.110296708333333,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6096796619648837,
            "data_time": 0.009510037422180175,
            "batch_time": 0.03851947975158691,
            "samples_per_second": 1065897.8118188141,
            "samples_per_second_per_gpu": 133237.22647735177,
            "loss_sequences_lower_95": 3.567362845284598,
            "loss_sequences_upper_95": 3.6618320013552297,
            "loss_tokens_lower_95": 3.59644909375,
            "loss_tokens_upper_95": 3.6231312916666667,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.194414522524961,
            "data_time": 0.0016062225362187938,
            "batch_time": 0.029803834561454624,
            "samples_per_second": 1130241.1102845862,
            "samples_per_second_per_gpu": 141280.13878557328,
            "loss_sequences_lower_95": 4.1638331487596645,
            "loss_sequences_upper_95": 4.2261793351320875,
            "loss_tokens_lower_95": 4.181853302083334,
            "loss_tokens_upper_95": 4.2067455520833335,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.141563837979816,
            "data_time": 0.009531121804894679,
            "batch_time": 0.03865687685658732,
            "samples_per_second": 1062516.2297340806,
            "samples_per_second_per_gpu": 132814.52871676008,
            "loss_sequences_lower_95": 4.093883972789508,
            "loss_sequences_upper_95": 4.202204164710889,
            "loss_tokens_lower_95": 4.129952458333333,
            "loss_tokens_upper_95": 4.153180531249999,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.4097924371871065,
            "data_time": 0.0038181608137877092,
            "batch_time": 0.032311944210011025,
            "samples_per_second": 1116161.3173532535,
            "samples_per_second_per_gpu": 139520.1646691567,
            "loss_sequences_lower_95": 4.367067587286497,
            "loss_sequences_upper_95": 4.456580818015487,
            "loss_tokens_lower_95": 4.397023562499999,
            "loss_tokens_upper_95": 4.422306552083334,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.457445732428103,
            "data_time": 0.0015766044036404732,
            "batch_time": 0.029706503788759813,
            "samples_per_second": 1135575.6468436902,
            "samples_per_second_per_gpu": 141946.95585546127,
            "loss_sequences_lower_95": 4.426766960299745,
            "loss_sequences_upper_95": 4.4873438297193875,
            "loss_tokens_lower_95": 4.441439895833334,
            "loss_tokens_upper_95": 4.473660770833333,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.484663311224334,
            "data_time": 0.0016852352520453286,
            "batch_time": 0.03008868304419747,
            "samples_per_second": 1125355.355281081,
            "samples_per_second_per_gpu": 140669.41941013513,
            "loss_sequences_lower_95": 4.463794850294502,
            "loss_sequences_upper_95": 4.507585139888744,
            "loss_tokens_lower_95": 4.472949385416666,
            "loss_tokens_upper_95": 4.496434645833333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.2528215869655455,
            "data_time": 0.010789122846391466,
            "batch_time": 0.048403766420152455,
            "samples_per_second": 1063349.7128428603,
            "samples_per_second_per_gpu": 132918.71410535753,
            "loss_sequences_lower_95": 4.178514979912983,
            "loss_sequences_upper_95": 4.342132878497364,
            "loss_tokens_lower_95": 4.240901979166667,
            "loss_tokens_upper_95": 4.26475015625,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.334915028259217,
            "data_time": 0.008905116468667984,
            "batch_time": 0.03799436427652836,
            "samples_per_second": 1080106.1833145318,
            "samples_per_second_per_gpu": 135013.27291431648,
            "loss_sequences_lower_95": 5.238625411082634,
            "loss_sequences_upper_95": 5.454394905180799,
            "loss_tokens_lower_95": 5.3218794375,
            "loss_tokens_upper_95": 5.347959895833333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.390713900560481,
            "data_time": 0.0013103203500385446,
            "batch_time": 0.02960629050259265,
            "samples_per_second": 1129811.3139412901,
            "samples_per_second_per_gpu": 141226.41424266127,
            "loss_sequences_lower_95": 4.382270833690216,
            "loss_sequences_upper_95": 4.399321056866264,
            "loss_tokens_lower_95": 4.378775927083333,
            "loss_tokens_upper_95": 4.402625583333333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.2395231843938035,
            "data_time": 0.002520907828452486,
            "batch_time": 0.030917702666130985,
            "samples_per_second": 1123643.2979549312,
            "samples_per_second_per_gpu": 140455.4122443664,
            "loss_sequences_lower_95": 4.224305322814713,
            "loss_sequences_upper_95": 4.255355312565078,
            "loss_tokens_lower_95": 4.227679072916667,
            "loss_tokens_upper_95": 4.251445604166667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.56891300035055,
            "data_time": 0.009688837254942642,
            "batch_time": 0.03823698179523936,
            "samples_per_second": 1069443.061869505,
            "samples_per_second_per_gpu": 133680.38273368814,
            "loss_sequences_lower_95": 4.494768804275481,
            "loss_sequences_upper_95": 4.6609271747828975,
            "loss_tokens_lower_95": 4.5558345625,
            "loss_tokens_upper_95": 4.581883052083334,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.053422739452354,
            "data_time": 0.009594762467768088,
            "batch_time": 0.038654842224729015,
            "samples_per_second": 1062791.396237984,
            "samples_per_second_per_gpu": 132848.924529748,
            "loss_sequences_lower_95": 3.981732587950293,
            "loss_sequences_upper_95": 4.13775964802977,
            "loss_tokens_lower_95": 4.04116996875,
            "loss_tokens_upper_95": 4.065649739583333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.10470991784876,
            "data_time": 0.07771186317716326,
            "batch_time": 0.1128123232296535,
            "samples_per_second": 513213.3089674312,
            "samples_per_second_per_gpu": 64151.6636209289,
            "loss_sequences_lower_95": 5.0361001448197795,
            "loss_sequences_upper_95": 5.172934826937589,
            "loss_tokens_lower_95": 5.078873270208185,
            "loss_tokens_upper_95": 5.131042489138516,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.420326567947344,
            "data_time": 0.013858476823026484,
            "batch_time": 0.04334185069257563,
            "samples_per_second": 1033571.1096717372,
            "samples_per_second_per_gpu": 129196.38870896715,
            "loss_sequences_lower_95": 4.353785051618304,
            "loss_sequences_upper_95": 4.485747214448001,
            "loss_tokens_lower_95": 4.406997864583333,
            "loss_tokens_upper_95": 4.433176729166666,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.966444103573117,
            "data_time": 0.012262719372908274,
            "batch_time": 0.04106106236577034,
            "samples_per_second": 1071786.1816595548,
            "samples_per_second_per_gpu": 133973.27270744435,
            "loss_sequences_lower_95": 5.882657609199786,
            "loss_sequences_upper_95": 6.074210118115105,
            "loss_tokens_lower_95": 5.954716677083334,
            "loss_tokens_upper_95": 5.977994291666667,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.789867158795967,
            "data_time": 0.03567345067858696,
            "batch_time": 0.06596514955163002,
            "samples_per_second": 938130.8341246587,
            "samples_per_second_per_gpu": 117266.35426558234,
            "loss_sequences_lower_95": 4.6463080734503075,
            "loss_sequences_upper_95": 5.040050456562981,
            "loss_tokens_lower_95": 4.77530686425381,
            "loss_tokens_upper_95": 4.804822640340836,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.886677637074412,
            "data_time": 0.00173043169437617,
            "batch_time": 0.030535779027841086,
            "samples_per_second": 1107096.4555111746,
            "samples_per_second_per_gpu": 138387.05693889683,
            "loss_sequences_lower_95": 4.872406191025139,
            "loss_sequences_upper_95": 4.901275039279483,
            "loss_tokens_lower_95": 4.872193825888406,
            "loss_tokens_upper_95": 4.9009673268142,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.774914722949759,
            "data_time": 0.0018745329539487316,
            "batch_time": 0.030383077586532397,
            "samples_per_second": 1112995.841065776,
            "samples_per_second_per_gpu": 139124.480133222,
            "loss_sequences_lower_95": 3.7788250578040974,
            "loss_sequences_upper_95": 3.8042969527982473,
            "loss_tokens_lower_95": 3.7526657969623174,
            "loss_tokens_upper_95": 3.7731735457318223,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.25595201221794,
            "data_time": 0.0031904763033882163,
            "batch_time": 0.032906580151082436,
            "samples_per_second": 1070725.9315036114,
            "samples_per_second_per_gpu": 133840.74143795142,
            "loss_sequences_lower_95": 6.469648672760687,
            "loss_sequences_upper_95": 6.772408280622638,
            "loss_tokens_lower_95": 5.75144969621158,
            "loss_tokens_upper_95": 5.968618500286663,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.128710811416308,
            "data_time": 0.003927982868032253,
            "batch_time": 0.03262746825497201,
            "samples_per_second": 1100873.6224017527,
            "samples_per_second_per_gpu": 137609.2028002191,
            "loss_sequences_lower_95": 6.2970900390625,
            "loss_sequences_upper_95": 6.5002422037760414,
            "loss_tokens_lower_95": 5.748676493710692,
            "loss_tokens_upper_95": 5.886017172759434,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.150657417378342,
            "data_time": 0.004499419242548187,
            "batch_time": 0.03349069304775509,
            "samples_per_second": 1090622.402689934,
            "samples_per_second_per_gpu": 136327.80033624175,
            "loss_sequences_lower_95": 4.189889894243982,
            "loss_sequences_upper_95": 4.258362741882463,
            "loss_tokens_lower_95": 4.053292918461427,
            "loss_tokens_upper_95": 4.087506043451344,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5928620977835224,
            "data_time": 0.02277869837624686,
            "batch_time": 0.05265805125236511,
            "samples_per_second": 1014168.8610413248,
            "samples_per_second_per_gpu": 126771.1076301656,
            "loss_sequences_lower_95": 3.5654874836314807,
            "loss_sequences_upper_95": 3.6804823580655186,
            "loss_tokens_lower_95": 3.522353363083241,
            "loss_tokens_upper_95": 3.5787121775168798,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.260368865850021,
            "data_time": 0.01961006596684456,
            "batch_time": 0.04932123422622681,
            "samples_per_second": 985338.4841524356,
            "samples_per_second_per_gpu": 123167.31051905444,
            "loss_sequences_lower_95": 4.244226398078763,
            "loss_sequences_upper_95": 4.4470379887794955,
            "loss_tokens_lower_95": 4.123379772395481,
            "loss_tokens_upper_95": 4.2235045634626776,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.8758112208048505,
            "data_time": 0.016428223023047812,
            "batch_time": 0.04522723112350855,
            "samples_per_second": 1022757.3802087185,
            "samples_per_second_per_gpu": 127844.67252608981,
            "loss_sequences_lower_95": 4.812679443359375,
            "loss_sequences_upper_95": 4.930713216145834,
            "loss_tokens_lower_95": 4.749453422254036,
            "loss_tokens_upper_95": 4.991017654475529,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.9283468514573805,
            "data_time": 0.0016234402534072647,
            "batch_time": 0.030620746684818213,
            "samples_per_second": 1096967.9480295244,
            "samples_per_second_per_gpu": 137120.99350369055,
            "loss_sequences_lower_95": 7.945801064785198,
            "loss_sequences_upper_95": 8.021378919129226,
            "loss_tokens_lower_95": 7.7773379640689955,
            "loss_tokens_upper_95": 7.8560300413003485,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.7403116929611375,
            "data_time": 0.0029562187674861628,
            "batch_time": 0.03154720396003467,
            "samples_per_second": 1107681.155211502,
            "samples_per_second_per_gpu": 138460.14440143775,
            "loss_sequences_lower_95": 6.366411767824732,
            "loss_sequences_upper_95": 6.685951238047795,
            "loss_tokens_lower_95": 4.890659148643278,
            "loss_tokens_upper_95": 5.040066980124602,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.11307400566726,
            "data_time": 0.0049919026928979,
            "batch_time": 0.03368168988743344,
            "samples_per_second": 1092907.8274589113,
            "samples_per_second_per_gpu": 136613.47843236392,
            "loss_sequences_lower_95": 5.612562410009598,
            "loss_sequences_upper_95": 5.965698190109722,
            "loss_tokens_lower_95": 4.633757865440464,
            "loss_tokens_upper_95": 4.796470698713295,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.738585580973865,
            "data_time": 0.02224160943712507,
            "batch_time": 0.05354271616254534,
            "samples_per_second": 965828.4882799612,
            "samples_per_second_per_gpu": 120728.56103499515,
            "loss_sequences_lower_95": 5.642973766588185,
            "loss_sequences_upper_95": 5.833007840369934,
            "loss_tokens_lower_95": 5.643271542901862,
            "loss_tokens_upper_95": 5.832221852481093,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.167910106182099,
            "data_time": 0.05001505063130306,
            "batch_time": 0.0805254211792579,
            "samples_per_second": 898266.9788867834,
            "samples_per_second_per_gpu": 112283.37236084792,
            "loss_sequences_lower_95": 4.031435340881348,
            "loss_sequences_upper_95": 4.42591325378418,
            "loss_tokens_lower_95": 3.835431574239714,
            "loss_tokens_upper_95": 4.328166665502012,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.159108664142683,
            "data_time": 0.0032470172655850588,
            "batch_time": 0.03179887321097719,
            "samples_per_second": 1112149.5629843422,
            "samples_per_second_per_gpu": 139018.69537304278,
            "loss_sequences_lower_95": 5.12582992819217,
            "loss_sequences_upper_95": 5.192536281246001,
            "loss_tokens_lower_95": 5.124727022294012,
            "loss_tokens_upper_95": 5.1935463412391245,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.431754660157274,
            "data_time": 0.0049538305105237355,
            "batch_time": 0.03364884017147792,
            "samples_per_second": 1098156.2597338113,
            "samples_per_second_per_gpu": 137269.5324667264,
            "loss_sequences_lower_95": 5.392474460211276,
            "loss_sequences_upper_95": 5.469362091139435,
            "loss_tokens_lower_95": 5.391545436149928,
            "loss_tokens_upper_95": 5.470905705732366,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.319036783409326,
            "data_time": 0.0033918158281956586,
            "batch_time": 0.03205338229885884,
            "samples_per_second": 1099157.2498337606,
            "samples_per_second_per_gpu": 137394.65622922007,
            "loss_sequences_lower_95": 4.435371759225806,
            "loss_sequences_upper_95": 4.555143689642274,
            "loss_tokens_lower_95": 4.179509040041679,
            "loss_tokens_upper_95": 4.242017810990262,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.254469316005707,
            "data_time": 0.010100370272994041,
            "batch_time": 0.03869764320552349,
            "samples_per_second": 1065085.5926309244,
            "samples_per_second_per_gpu": 133135.69907886555,
            "loss_sequences_lower_95": 6.435280981445313,
            "loss_sequences_upper_95": 6.974049047851563,
            "loss_tokens_lower_95": 5.605280530665743,
            "loss_tokens_upper_95": 5.961931334924098,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.770489916205406,
            "data_time": 0.14604823291301727,
            "batch_time": 0.18231208622455597,
            "samples_per_second": 513968.3116454111,
            "samples_per_second_per_gpu": 64246.038955676384,
            "loss_sequences_lower_95": 4.466546797752381,
            "loss_sequences_upper_95": 5.153723609447479,
            "loss_tokens_lower_95": 4.2611737744561555,
            "loss_tokens_upper_95": 5.1013326447585525,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.948004507470405,
            "data_time": 0.026544598822898054,
            "batch_time": 0.055368086124988315,
            "samples_per_second": 952054.3749620196,
            "samples_per_second_per_gpu": 119006.79687025245,
            "loss_sequences_lower_95": 5.141951129080235,
            "loss_sequences_upper_95": 5.66702116166038,
            "loss_tokens_lower_95": 4.152904172465122,
            "loss_tokens_upper_95": 4.554144087558131,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9624795139799374,
            "data_time": 0.0028516637782255807,
            "batch_time": 0.03136984817683697,
            "samples_per_second": 1107039.6027897107,
            "samples_per_second_per_gpu": 138379.95034871384,
            "loss_sequences_lower_95": 3.945173716347204,
            "loss_sequences_upper_95": 3.9803479169226463,
            "loss_tokens_lower_95": 3.94424580065326,
            "loss_tokens_upper_95": 3.9800328805905965,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.153227289590571,
            "data_time": 0.0027028772319339254,
            "batch_time": 0.03139997000466681,
            "samples_per_second": 1106530.7263968866,
            "samples_per_second_per_gpu": 138316.34079961083,
            "loss_sequences_lower_95": 5.13034600217713,
            "loss_sequences_upper_95": 5.340596535088783,
            "loss_tokens_lower_95": 4.883365826080022,
            "loss_tokens_upper_95": 5.089593112012395,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.781918930046724,
            "data_time": 0.01757334503862593,
            "batch_time": 0.04670485191875034,
            "samples_per_second": 1000614.0055091311,
            "samples_per_second_per_gpu": 125076.75068864139,
            "loss_sequences_lower_95": 3.631793224069225,
            "loss_sequences_upper_95": 4.018555481180603,
            "loss_tokens_lower_95": 3.526398382619175,
            "loss_tokens_upper_95": 3.8362936833299606,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.122813566331329,
            "data_time": 0.004750361666083336,
            "batch_time": 0.033334898948669436,
            "samples_per_second": 1096181.9901217476,
            "samples_per_second_per_gpu": 137022.74876521845,
            "loss_sequences_lower_95": 4.154470615642882,
            "loss_sequences_upper_95": 4.297497674208884,
            "loss_tokens_lower_95": 3.981333226960414,
            "loss_tokens_upper_95": 4.130990075800554,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8590739805523944,
            "data_time": 0.028628340789249966,
            "batch_time": 0.058377680324372794,
            "samples_per_second": 985917.9882061293,
            "samples_per_second_per_gpu": 123239.74852576616,
            "loss_sequences_lower_95": 3.665249363969012,
            "loss_sequences_upper_95": 4.121464650223895,
            "loss_tokens_lower_95": 3.6355246713460954,
            "loss_tokens_upper_95": 4.027515604289865,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.123389188350022,
            "data_time": 0.0020247974540536706,
            "batch_time": 0.030922198790909527,
            "samples_per_second": 1097954.6418908911,
            "samples_per_second_per_gpu": 137244.3302363614,
            "loss_sequences_lower_95": 5.111048811324765,
            "loss_sequences_upper_95": 5.1355381037144925,
            "loss_tokens_lower_95": 5.110992305883052,
            "loss_tokens_upper_95": 5.135375864235347,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.471486098558,
            "data_time": 0.043993473052978516,
            "batch_time": 0.07454877333207564,
            "samples_per_second": 870834.6662464422,
            "samples_per_second_per_gpu": 108854.33328080528,
            "loss_sequences_lower_95": 2.334532535886302,
            "loss_sequences_upper_95": 2.6625282509813033,
            "loss_tokens_lower_95": 2.2085602150236694,
            "loss_tokens_upper_95": 2.5672274071216092,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.233905269277421,
            "data_time": 0.0013170418925177421,
            "batch_time": 0.02997109109078349,
            "samples_per_second": 1108347.414324553,
            "samples_per_second_per_gpu": 138543.42679056912,
            "loss_sequences_lower_95": 6.694931108326782,
            "loss_sequences_upper_95": 6.7510716596239515,
            "loss_tokens_lower_95": 5.519668725822051,
            "loss_tokens_upper_95": 5.576516501450676,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.96855456495285,
            "data_time": 0.0054948703637198795,
            "batch_time": 0.034398185828375435,
            "samples_per_second": 1084306.2293566864,
            "samples_per_second_per_gpu": 135538.2786695858,
            "loss_sequences_lower_95": 5.925399853515625,
            "loss_sequences_upper_95": 6.18958994140625,
            "loss_tokens_lower_95": 5.743876299817527,
            "loss_tokens_upper_95": 5.981651935149603,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.391880636629851,
            "data_time": 0.021145046767541916,
            "batch_time": 0.05064264798568467,
            "samples_per_second": 1003089.9011193912,
            "samples_per_second_per_gpu": 125386.2376399239,
            "loss_sequences_lower_95": 5.277283696713655,
            "loss_sequences_upper_95": 5.5073991062330165,
            "loss_tokens_lower_95": 5.276828188688858,
            "loss_tokens_upper_95": 5.504480643894361,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.871915157274766,
            "data_time": 0.004493493990725781,
            "batch_time": 0.03329418940716479,
            "samples_per_second": 1092857.9346378876,
            "samples_per_second_per_gpu": 136607.24182973595,
            "loss_sequences_lower_95": 7.7816153786399145,
            "loss_sequences_upper_95": 7.95999228737571,
            "loss_tokens_lower_95": 7.781609811493844,
            "loss_tokens_upper_95": 7.961128262606533,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.601763445774714,
            "data_time": 0.0042371984491957,
            "batch_time": 0.03292253993927164,
            "samples_per_second": 1101233.879546701,
            "samples_per_second_per_gpu": 137654.23494333762,
            "loss_sequences_lower_95": 1.6632437825520834,
            "loss_sequences_upper_95": 1.748306591796875,
            "loss_tokens_lower_95": 1.4896810638317826,
            "loss_tokens_upper_95": 1.5634680981767708,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.607052036694118,
            "data_time": 0.022001111081668308,
            "batch_time": 0.05207354894706181,
            "samples_per_second": 952826.5544158195,
            "samples_per_second_per_gpu": 119103.31930197743,
            "loss_sequences_lower_95": 5.321983264741443,
            "loss_sequences_upper_95": 5.893578200567336,
            "loss_tokens_lower_95": 5.319347839355468,
            "loss_tokens_upper_95": 5.901211286272321,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5939747765660286,
            "data_time": 0.14548951387405396,
            "batch_time": 0.18100011348724365,
            "samples_per_second": 493448.74411786743,
            "samples_per_second_per_gpu": 61681.09301473343,
            "loss_sequences_lower_95": 3.2998551428318024,
            "loss_sequences_upper_95": 4.594005668163299,
            "loss_tokens_lower_95": 3.020249136698615,
            "loss_tokens_upper_95": 3.6076599372785116,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.340474623680115,
            "data_time": 0.005687148325026981,
            "batch_time": 0.0350144108136495,
            "samples_per_second": 1070320.7652525436,
            "samples_per_second_per_gpu": 133790.09565656795,
            "loss_sequences_lower_95": 7.2918567382812505,
            "loss_sequences_upper_95": 7.653126538085938,
            "loss_tokens_lower_95": 7.010610946422906,
            "loss_tokens_upper_95": 7.330115902766762,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.402411816120147,
            "data_time": 0.005464206139246623,
            "batch_time": 0.03447910954081823,
            "samples_per_second": 1081867.0221517244,
            "samples_per_second_per_gpu": 135233.37776896555,
            "loss_sequences_lower_95": 7.476062084960938,
            "loss_sequences_upper_95": 7.698524414062501,
            "loss_tokens_lower_95": 7.166655036265335,
            "loss_tokens_upper_95": 7.374060602622683,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.94686776129863,
            "data_time": 0.0034035386050425246,
            "batch_time": 0.032029927773619175,
            "samples_per_second": 1102648.6412490413,
            "samples_per_second_per_gpu": 137831.08015613016,
            "loss_sequences_lower_95": 4.931888391313799,
            "loss_sequences_upper_95": 4.961486108455729,
            "loss_tokens_lower_95": 4.931919009150813,
            "loss_tokens_upper_95": 4.9616680149100105,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.044504563379947,
            "data_time": 0.008190167994657672,
            "batch_time": 0.037423388835526664,
            "samples_per_second": 1061989.5516992714,
            "samples_per_second_per_gpu": 132748.69396240893,
            "loss_sequences_lower_95": 4.9767110095046085,
            "loss_sequences_upper_95": 5.10970963391657,
            "loss_tokens_lower_95": 4.976324978548627,
            "loss_tokens_upper_95": 5.110859363749279,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.685884911537171,
            "data_time": 0.005558075412871346,
            "batch_time": 0.03473762576542203,
            "samples_per_second": 1074753.969391871,
            "samples_per_second_per_gpu": 134344.24617398388,
            "loss_sequences_lower_95": 7.584123120117187,
            "loss_sequences_upper_95": 7.7878326171875,
            "loss_tokens_lower_95": 7.583030908203125,
            "loss_tokens_upper_95": 7.786575866699218,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.412469999736057,
            "data_time": 0.0019596956177334703,
            "batch_time": 0.03079731518469596,
            "samples_per_second": 1099140.5517753677,
            "samples_per_second_per_gpu": 137392.56897192096,
            "loss_sequences_lower_95": 4.890464028870033,
            "loss_sequences_upper_95": 4.9823903700035475,
            "loss_tokens_lower_95": 3.8595173166030374,
            "loss_tokens_upper_95": 3.922634921671714,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.498945173932545,
            "data_time": 0.01805875641959054,
            "batch_time": 0.049035317557198664,
            "samples_per_second": 971926.6978808513,
            "samples_per_second_per_gpu": 121490.83723510642,
            "loss_sequences_lower_95": 5.366569860657649,
            "loss_sequences_upper_95": 5.630771466155551,
            "loss_tokens_lower_95": 5.367507365212512,
            "loss_tokens_upper_95": 5.630316720080019,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.336351926653993,
            "data_time": 0.010017450898885727,
            "batch_time": 0.03950367122888565,
            "samples_per_second": 1061554.970163893,
            "samples_per_second_per_gpu": 132694.37127048662,
            "loss_sequences_lower_95": 5.247151297774969,
            "loss_sequences_upper_95": 5.423450221641391,
            "loss_tokens_lower_95": 5.2506680477366725,
            "loss_tokens_upper_95": 5.422816580978094,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.960268281477668,
            "data_time": 0.002275963311315805,
            "batch_time": 0.031213192022809626,
            "samples_per_second": 1094013.126617939,
            "samples_per_second_per_gpu": 136751.64082724237,
            "loss_sequences_lower_95": 5.3443348487606475,
            "loss_sequences_upper_95": 5.436466076839064,
            "loss_tokens_lower_95": 4.381456396700975,
            "loss_tokens_upper_95": 4.460948899255783,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.070591510288299,
            "data_time": 0.02622700234254201,
            "batch_time": 0.05663679540157318,
            "samples_per_second": 997689.023746864,
            "samples_per_second_per_gpu": 124711.127968358,
            "loss_sequences_lower_95": 4.997128586542039,
            "loss_sequences_upper_95": 5.139686810528791,
            "loss_tokens_lower_95": 4.99771145613736,
            "loss_tokens_upper_95": 5.140125391722987,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.857469016987978,
            "data_time": 0.0034592027920360823,
            "batch_time": 0.032362801542503344,
            "samples_per_second": 1091506.7484390785,
            "samples_per_second_per_gpu": 136438.3435548848,
            "loss_sequences_lower_95": 5.827081631068425,
            "loss_sequences_upper_95": 5.887673675817087,
            "loss_tokens_lower_95": 5.827086842388188,
            "loss_tokens_upper_95": 5.887762746081804,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.621619418986793,
            "data_time": 0.0240045352415605,
            "batch_time": 0.053755454583601515,
            "samples_per_second": 945030.1523169144,
            "samples_per_second_per_gpu": 118128.7690396143,
            "loss_sequences_lower_95": 5.4894999680009855,
            "loss_sequences_upper_95": 5.753026499331576,
            "loss_tokens_lower_95": 5.490395999649196,
            "loss_tokens_upper_95": 5.754229025238926,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.169794344902039,
            "data_time": 0.07478134334087372,
            "batch_time": 0.10660537332296371,
            "samples_per_second": 735487.5907610868,
            "samples_per_second_per_gpu": 91935.94884513585,
            "loss_sequences_lower_95": 4.793066787719726,
            "loss_sequences_upper_95": 5.734502817789713,
            "loss_tokens_lower_95": 4.2780696127149795,
            "loss_tokens_upper_95": 5.624977895948621,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.329444527626038,
            "data_time": 0.07347191125154495,
            "batch_time": 0.10527989268302917,
            "samples_per_second": 753080.2799640993,
            "samples_per_second_per_gpu": 94135.03499551241,
            "loss_sequences_lower_95": 3.997842451731364,
            "loss_sequences_upper_95": 4.9828694152832025,
            "loss_tokens_lower_95": 3.3409459489115165,
            "loss_tokens_upper_95": 4.727906250685788,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.327361247044424,
            "data_time": 0.003518567778490234,
            "batch_time": 0.032422548185172255,
            "samples_per_second": 1093729.2577872076,
            "samples_per_second_per_gpu": 136716.15722340095,
            "loss_sequences_lower_95": 5.313285823591679,
            "loss_sequences_upper_95": 5.341379311832658,
            "loss_tokens_lower_95": 5.312893544619845,
            "loss_tokens_upper_95": 5.341391723812592,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.9707693065576073,
            "data_time": 0.0013251006766273365,
            "batch_time": 0.03025999743515866,
            "samples_per_second": 1096775.368101197,
            "samples_per_second_per_gpu": 137096.92101264963,
            "loss_sequences_lower_95": 2.261250917593432,
            "loss_sequences_upper_95": 2.2998519428939,
            "loss_tokens_lower_95": 1.6801848592716697,
            "loss_tokens_upper_95": 1.702819357384093,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.69777717740517,
            "data_time": 0.038148246705532074,
            "batch_time": 0.07032875344157219,
            "samples_per_second": 931421.537246071,
            "samples_per_second_per_gpu": 116427.69215575888,
            "loss_sequences_lower_95": 5.748633893831508,
            "loss_sequences_upper_95": 6.136026217242864,
            "loss_tokens_lower_95": 5.325828746775977,
            "loss_tokens_upper_95": 5.597964655202192,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 9.140600900392275,
            "data_time": 0.1091681889125279,
            "batch_time": 0.14281806491670154,
            "samples_per_second": 572131.0011054942,
            "samples_per_second_per_gpu": 71516.37513818678,
            "loss_sequences_lower_95": 8.640264232738598,
            "loss_sequences_upper_95": 9.859012644999735,
            "loss_tokens_lower_95": 8.011763414924527,
            "loss_tokens_upper_95": 9.977228272991416,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.572757252832738,
            "data_time": 0.02779821270988101,
            "batch_time": 0.05854551565079462,
            "samples_per_second": 953829.4750031558,
            "samples_per_second_per_gpu": 119228.68437539447,
            "loss_sequences_lower_95": 5.550315298685214,
            "loss_sequences_upper_95": 5.905603250643102,
            "loss_tokens_lower_95": 5.160857255966149,
            "loss_tokens_upper_95": 5.3900737883725816,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.740463226306729,
            "data_time": 0.029639828772771926,
            "batch_time": 0.059696867352440244,
            "samples_per_second": 981114.7968178714,
            "samples_per_second_per_gpu": 122639.34960223392,
            "loss_sequences_lower_95": 5.72070424149676,
            "loss_sequences_upper_95": 6.040221981885956,
            "loss_tokens_lower_95": 5.355049825645743,
            "loss_tokens_upper_95": 5.5495843396343965,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.703785695680758,
            "data_time": 0.030984606061662947,
            "batch_time": 0.06160105410076323,
            "samples_per_second": 962771.3495010624,
            "samples_per_second_per_gpu": 120346.4186876328,
            "loss_sequences_lower_95": 5.675257324590915,
            "loss_sequences_upper_95": 6.081152418183117,
            "loss_tokens_lower_95": 5.252554028222038,
            "loss_tokens_upper_95": 5.548213056286839,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.9077380125115555,
            "data_time": 0.030422755650111606,
            "batch_time": 0.0600884755452474,
            "samples_per_second": 981216.6674990067,
            "samples_per_second_per_gpu": 122652.08343737584,
            "loss_sequences_lower_95": 5.874071111911681,
            "loss_sequences_upper_95": 6.182599658500857,
            "loss_tokens_lower_95": 5.542677986881815,
            "loss_tokens_upper_95": 5.722499041691004,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.369611523906637,
            "data_time": 0.03030654236122414,
            "batch_time": 0.06129624519819095,
            "samples_per_second": 979619.5287301417,
            "samples_per_second_per_gpu": 122452.44109126771,
            "loss_sequences_lower_95": 5.310332171517129,
            "loss_sequences_upper_95": 5.552485571440703,
            "loss_tokens_lower_95": 5.097983294183241,
            "loss_tokens_upper_95": 5.242637078641415,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.933093789147168,
            "data_time": 0.028965989748636883,
            "batch_time": 0.060276616187322704,
            "samples_per_second": 951127.0347840963,
            "samples_per_second_per_gpu": 118890.87934801204,
            "loss_sequences_lower_95": 4.927200559290444,
            "loss_sequences_upper_95": 5.191633215183165,
            "loss_tokens_lower_95": 4.626221227019289,
            "loss_tokens_upper_95": 4.753397024751356,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-0.25/params.txt",
    "uuid": "7fe8f8e2-3f1b-4d30-9634-8cfbde8eb4ec",
    "creation_date": "2023_12_14-05_01_29"
}