{
    "name": "rw_original-d=1024_l=24_h=8-0.25",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 2058081280,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.25
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "411616256",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=1024_l=24_h=8-0.25",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.788215074936549,
            "data_time": 0.040374528616666794,
            "batch_time": 0.42729973420500755,
            "samples_per_second": 691617.9621878543,
            "samples_per_second_per_gpu": 86452.24527348179,
            "loss_sequences_lower_95": 3.7072762298583988,
            "loss_sequences_upper_95": 3.870634943644206,
            "loss_tokens_lower_95": 3.773872667948405,
            "loss_tokens_upper_95": 3.802388051350911,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5922196839427647,
            "data_time": 0.0010343667347537818,
            "batch_time": 0.03675972189989757,
            "samples_per_second": 896993.6915465225,
            "samples_per_second_per_gpu": 112124.21144331532,
            "loss_sequences_lower_95": 3.5898513971487898,
            "loss_sequences_upper_95": 3.594591940052236,
            "loss_tokens_lower_95": 3.581438791666667,
            "loss_tokens_upper_95": 3.6031185937499997,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1607548596907633,
            "data_time": 0.008706835746765137,
            "batch_time": 0.04421595001220703,
            "samples_per_second": 867877.1384036148,
            "samples_per_second_per_gpu": 108484.64230045184,
            "loss_sequences_lower_95": 3.1192211727220185,
            "loss_sequences_upper_95": 3.211728858169244,
            "loss_tokens_lower_95": 3.1479661770833336,
            "loss_tokens_upper_95": 3.173830578125,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6557522242339617,
            "data_time": 0.0015442769386266406,
            "batch_time": 0.0368389749017201,
            "samples_per_second": 906309.7883860735,
            "samples_per_second_per_gpu": 113288.7235482592,
            "loss_sequences_lower_95": 3.6250542042525775,
            "loss_sequences_upper_95": 3.6874010450225514,
            "loss_tokens_lower_95": 3.6436701875,
            "loss_tokens_upper_95": 3.6676716666666667,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6400826399777912,
            "data_time": 0.008802531724907012,
            "batch_time": 0.044412446686946064,
            "samples_per_second": 863951.6581765981,
            "samples_per_second_per_gpu": 107993.95727207477,
            "loss_sequences_lower_95": 3.5918349318494625,
            "loss_sequences_upper_95": 3.699611699556867,
            "loss_tokens_lower_95": 3.6287657812500003,
            "loss_tokens_upper_95": 3.6511592708333334,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.836694358239605,
            "data_time": 0.0034395475750384126,
            "batch_time": 0.03911316330018251,
            "samples_per_second": 894964.7924077217,
            "samples_per_second_per_gpu": 111870.59905096522,
            "loss_sequences_lower_95": 3.791823064932444,
            "loss_sequences_upper_95": 3.885559165015827,
            "loss_tokens_lower_95": 3.824039947916667,
            "loss_tokens_upper_95": 3.8491859791666667,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.702218905152107,
            "data_time": 0.0016241985666421265,
            "batch_time": 0.037014892789706905,
            "samples_per_second": 906501.6764394961,
            "samples_per_second_per_gpu": 113312.70955493701,
            "loss_sequences_lower_95": 3.669541015625,
            "loss_sequences_upper_95": 3.7340103934151783,
            "loss_tokens_lower_95": 3.6863922083333334,
            "loss_tokens_upper_95": 3.7183958645833335,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.075858128982064,
            "data_time": 0.0016651084761099056,
            "batch_time": 0.03758509884016228,
            "samples_per_second": 906585.110797573,
            "samples_per_second_per_gpu": 113323.13884969662,
            "loss_sequences_lower_95": 4.054801824280105,
            "loss_sequences_upper_95": 4.099371155104712,
            "loss_tokens_lower_95": 4.064158291666667,
            "loss_tokens_upper_95": 4.087424958333333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7526430531245905,
            "data_time": 0.010905227963886564,
            "batch_time": 0.046785330015515526,
            "samples_per_second": 859704.5198860965,
            "samples_per_second_per_gpu": 107463.06498576206,
            "loss_sequences_lower_95": 3.6766928820106073,
            "loss_sequences_upper_95": 3.845317586263021,
            "loss_tokens_lower_95": 3.7408488020833333,
            "loss_tokens_upper_95": 3.764434770833333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.857936916615181,
            "data_time": 0.008560290560126305,
            "batch_time": 0.04402237758040428,
            "samples_per_second": 874158.8004630237,
            "samples_per_second_per_gpu": 109269.85005787796,
            "loss_sequences_lower_95": 4.762808806528687,
            "loss_sequences_upper_95": 4.976625428934813,
            "loss_tokens_lower_95": 4.844834885416667,
            "loss_tokens_upper_95": 4.871135572916666,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8620668247952694,
            "data_time": 0.0012938031499501483,
            "batch_time": 0.03671305517393715,
            "samples_per_second": 906698.6106568449,
            "samples_per_second_per_gpu": 113337.32633210561,
            "loss_sequences_lower_95": 3.8523455567142317,
            "loss_sequences_upper_95": 3.8721561180429456,
            "loss_tokens_lower_95": 3.850519020833333,
            "loss_tokens_upper_95": 3.87365771875,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6774800637025526,
            "data_time": 0.002425184456335317,
            "batch_time": 0.03781361722827057,
            "samples_per_second": 904235.377012845,
            "samples_per_second_per_gpu": 113029.42212660563,
            "loss_sequences_lower_95": 3.659221113110423,
            "loss_sequences_upper_95": 3.696474908322704,
            "loss_tokens_lower_95": 3.6660628229166665,
            "loss_tokens_upper_95": 3.68891265625,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.1183579543541216,
            "data_time": 0.008659385409750957,
            "batch_time": 0.04400028730098438,
            "samples_per_second": 863308.9867380263,
            "samples_per_second_per_gpu": 107913.62334225328,
            "loss_sequences_lower_95": 4.043113742542074,
            "loss_sequences_upper_95": 4.2132070173840015,
            "loss_tokens_lower_95": 4.105322916666666,
            "loss_tokens_upper_95": 4.1311281875,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4375293963795523,
            "data_time": 0.009038869128284227,
            "batch_time": 0.04458575799645656,
            "samples_per_second": 868581.1032431909,
            "samples_per_second_per_gpu": 108572.63790539886,
            "loss_sequences_lower_95": 3.3596083631340696,
            "loss_sequences_upper_95": 3.5281103441040287,
            "loss_tokens_lower_95": 3.425425963541667,
            "loss_tokens_upper_95": 3.4495979010416664,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.372347409074957,
            "data_time": 0.07565549441746303,
            "batch_time": 0.11184838839939662,
            "samples_per_second": 507777.6316847544,
            "samples_per_second_per_gpu": 63472.2039605943,
            "loss_sequences_lower_95": 4.302885506369851,
            "loss_sequences_upper_95": 4.447834803841331,
            "loss_tokens_lower_95": 4.348889957774769,
            "loss_tokens_upper_95": 4.396362408724698,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8151260047890356,
            "data_time": 0.013054660775444725,
            "batch_time": 0.04844296926801855,
            "samples_per_second": 853319.9958584261,
            "samples_per_second_per_gpu": 106664.99948230326,
            "loss_sequences_lower_95": 3.7488984844774964,
            "loss_sequences_upper_95": 3.879154776970777,
            "loss_tokens_lower_95": 3.8020895104166668,
            "loss_tokens_upper_95": 3.827970458333333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.625830949768227,
            "data_time": 0.012280814349651337,
            "batch_time": 0.048156688610712685,
            "samples_per_second": 861823.9281177666,
            "samples_per_second_per_gpu": 107727.99101472083,
            "loss_sequences_lower_95": 5.539275205103892,
            "loss_sequences_upper_95": 5.737723398334433,
            "loss_tokens_lower_95": 5.61400740625,
            "loss_tokens_upper_95": 5.637527645833333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.0691339617869895,
            "data_time": 0.03467515483498573,
            "batch_time": 0.0711536854505539,
            "samples_per_second": 760307.0877313076,
            "samples_per_second_per_gpu": 95038.38596641345,
            "loss_sequences_lower_95": 3.9162081609006787,
            "loss_sequences_upper_95": 4.33992654769147,
            "loss_tokens_lower_95": 4.055021736270091,
            "loss_tokens_upper_95": 4.083783890771084,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.869853019035197,
            "data_time": 0.0016720385920757043,
            "batch_time": 0.03718717172232738,
            "samples_per_second": 899654.8228434107,
            "samples_per_second_per_gpu": 112456.85285542633,
            "loss_sequences_lower_95": 4.851654077944737,
            "loss_sequences_upper_95": 4.888250329925402,
            "loss_tokens_lower_95": 4.851259433195947,
            "loss_tokens_upper_95": 4.888381757628899,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.232482126570631,
            "data_time": 0.0023086757701673327,
            "batch_time": 0.03782655986820816,
            "samples_per_second": 897574.9286480717,
            "samples_per_second_per_gpu": 112196.86608100896,
            "loss_sequences_lower_95": 3.2302550333521087,
            "loss_sequences_upper_95": 3.256211178674567,
            "loss_tokens_lower_95": 3.2119707283018997,
            "loss_tokens_upper_95": 3.231335793321495,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.974557574917586,
            "data_time": 0.0030045149800465086,
            "batch_time": 0.03847108574884327,
            "samples_per_second": 897183.3465584866,
            "samples_per_second_per_gpu": 112147.91831981082,
            "loss_sequences_lower_95": 5.203626036069616,
            "loss_sequences_upper_95": 5.481183665712092,
            "loss_tokens_lower_95": 4.488946159459137,
            "loss_tokens_upper_95": 4.698395896622587,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.981778171350559,
            "data_time": 0.0035349877273782773,
            "batch_time": 0.03908136487007141,
            "samples_per_second": 890656.3905936449,
            "samples_per_second_per_gpu": 111332.04882420562,
            "loss_sequences_lower_95": 5.049367732747395,
            "loss_sequences_upper_95": 5.236873185221354,
            "loss_tokens_lower_95": 4.730345875098271,
            "loss_tokens_upper_95": 4.870203874312107,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.391552945173898,
            "data_time": 0.004190796640663665,
            "batch_time": 0.03960859541799508,
            "samples_per_second": 891600.3669611529,
            "samples_per_second_per_gpu": 111450.04587014411,
            "loss_sequences_lower_95": 3.439712239830127,
            "loss_sequences_upper_95": 3.5053419690135756,
            "loss_tokens_lower_95": 3.2889074813768433,
            "loss_tokens_upper_95": 3.322346861841787,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8506935054605655,
            "data_time": 0.021109300000326976,
            "batch_time": 0.05720911281449454,
            "samples_per_second": 827911.1487945763,
            "samples_per_second_per_gpu": 103488.89359932204,
            "loss_sequences_lower_95": 2.823557232943448,
            "loss_sequences_upper_95": 2.938821855024858,
            "loss_tokens_lower_95": 2.775461821651718,
            "loss_tokens_upper_95": 2.8272251177819205,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.62642748501836,
            "data_time": 0.01841151900589466,
            "batch_time": 0.05386894755065441,
            "samples_per_second": 820803.6159673788,
            "samples_per_second_per_gpu": 102600.45199592235,
            "loss_sequences_lower_95": 3.6145006452287944,
            "loss_sequences_upper_95": 3.8114902044802297,
            "loss_tokens_lower_95": 3.4965329518766923,
            "loss_tokens_upper_95": 3.592518532868568,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.454851545492808,
            "data_time": 0.016067423881628573,
            "batch_time": 0.051575254171322554,
            "samples_per_second": 831161.9689574701,
            "samples_per_second_per_gpu": 103895.24611968377,
            "loss_sequences_lower_95": 4.419571095784505,
            "loss_sequences_upper_95": 4.515536661783854,
            "loss_tokens_lower_95": 4.317159736171199,
            "loss_tokens_upper_95": 4.543974351352461,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.386893940035867,
            "data_time": 0.0013804535686987355,
            "batch_time": 0.03687535124422938,
            "samples_per_second": 900058.266170136,
            "samples_per_second_per_gpu": 112507.283271267,
            "loss_sequences_lower_95": 6.393838434147557,
            "loss_sequences_upper_95": 6.4839101752804975,
            "loss_tokens_lower_95": 6.237096935248339,
            "loss_tokens_upper_95": 6.329650899320321,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.610359631654389,
            "data_time": 0.0027137908759533157,
            "batch_time": 0.038264515215918525,
            "samples_per_second": 894372.4937980821,
            "samples_per_second_per_gpu": 111796.56172476026,
            "loss_sequences_lower_95": 5.064567766285906,
            "loss_sequences_upper_95": 5.338721525147307,
            "loss_tokens_lower_95": 3.9654906893945845,
            "loss_tokens_upper_95": 4.097822658112653,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.239627566150431,
            "data_time": 0.004876696177431055,
            "batch_time": 0.04024426075252327,
            "samples_per_second": 886220.4638527032,
            "samples_per_second_per_gpu": 110777.5579815879,
            "loss_sequences_lower_95": 4.599643381789276,
            "loss_sequences_upper_95": 4.909392435640198,
            "loss_tokens_lower_95": 3.8617688701244353,
            "loss_tokens_upper_95": 4.015519457826618,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.689909334052099,
            "data_time": 0.02134262876851218,
            "batch_time": 0.05682620193277087,
            "samples_per_second": 831648.5117689818,
            "samples_per_second_per_gpu": 103956.06397112273,
            "loss_sequences_lower_95": 5.609155956250891,
            "loss_sequences_upper_95": 5.76968217962953,
            "loss_tokens_lower_95": 5.607170989850885,
            "loss_tokens_upper_95": 5.769026191049515,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7268685364723204,
            "data_time": 0.04493540066939134,
            "batch_time": 0.08167095826222347,
            "samples_per_second": 756272.0157725122,
            "samples_per_second_per_gpu": 94534.00197156402,
            "loss_sequences_lower_95": 3.585481010437012,
            "loss_sequences_upper_95": 3.968307228088379,
            "loss_tokens_lower_95": 3.4053072852610686,
            "loss_tokens_upper_95": 3.874413210846657,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.034042726031589,
            "data_time": 0.00315942647266973,
            "batch_time": 0.03871612924252064,
            "samples_per_second": 895148.1385748052,
            "samples_per_second_per_gpu": 111893.51732185065,
            "loss_sequences_lower_95": 4.997366892132325,
            "loss_sequences_upper_95": 5.070756838436381,
            "loss_tokens_lower_95": 4.996468469625608,
            "loss_tokens_upper_95": 5.071472155474348,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.247845471247018,
            "data_time": 0.004470682455314898,
            "batch_time": 0.04004574638789863,
            "samples_per_second": 889516.2018623442,
            "samples_per_second_per_gpu": 111189.52523279302,
            "loss_sequences_lower_95": 5.198796402714859,
            "loss_sequences_upper_95": 5.296111785613739,
            "loss_tokens_lower_95": 5.198134433705723,
            "loss_tokens_upper_95": 5.297034961097461,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.740307674963145,
            "data_time": 0.0032944787398270347,
            "batch_time": 0.038775974679456676,
            "samples_per_second": 890014.4030243536,
            "samples_per_second_per_gpu": 111251.8003780442,
            "loss_sequences_lower_95": 3.880470822140234,
            "loss_sequences_upper_95": 4.011659076346147,
            "loss_tokens_lower_95": 3.5819791034350046,
            "loss_tokens_upper_95": 3.642229369790232,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.68474985742569,
            "data_time": 0.009582909755408764,
            "batch_time": 0.04474806785583496,
            "samples_per_second": 865550.452622061,
            "samples_per_second_per_gpu": 108193.80657775763,
            "loss_sequences_lower_95": 5.863128515624999,
            "loss_sequences_upper_95": 6.389622204589844,
            "loss_tokens_lower_95": 5.086061509218853,
            "loss_tokens_upper_95": 5.441459618397378,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.257894665002823,
            "data_time": 0.13714158535003662,
            "batch_time": 0.17714668810367584,
            "samples_per_second": 486105.5866270389,
            "samples_per_second_per_gpu": 60763.19832837986,
            "loss_sequences_lower_95": 3.9983229756355287,
            "loss_sequences_upper_95": 4.60119765996933,
            "loss_tokens_lower_95": 3.794836916868714,
            "loss_tokens_upper_95": 4.572461358432112,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.336644789953342,
            "data_time": 0.025427407406746073,
            "batch_time": 0.06049279202806189,
            "samples_per_second": 783447.384945542,
            "samples_per_second_per_gpu": 97930.92311819275,
            "loss_sequences_lower_95": 4.556713130556304,
            "loss_sequences_upper_95": 5.04630048028354,
            "loss_tokens_lower_95": 3.5033762281346883,
            "loss_tokens_upper_95": 3.879697009648922,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.4435662543112247,
            "data_time": 0.0028370430486069787,
            "batch_time": 0.03829830429620213,
            "samples_per_second": 892865.8686981723,
            "samples_per_second_per_gpu": 111608.23358727153,
            "loss_sequences_lower_95": 2.417424275876133,
            "loss_sequences_upper_95": 2.4698196537789427,
            "loss_tokens_lower_95": 2.4170873372822466,
            "loss_tokens_upper_95": 2.469377758181111,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3618937918441287,
            "data_time": 0.0026646365386643147,
            "batch_time": 0.03806369296330373,
            "samples_per_second": 900103.5791112289,
            "samples_per_second_per_gpu": 112512.94738890362,
            "loss_sequences_lower_95": 3.3328593382344023,
            "loss_sequences_upper_95": 3.5052069183000194,
            "loss_tokens_lower_95": 3.1735438431347975,
            "loss_tokens_upper_95": 3.343360674472521,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4389841980113216,
            "data_time": 0.016680184337827895,
            "batch_time": 0.051986427770720586,
            "samples_per_second": 820721.0422651473,
            "samples_per_second_per_gpu": 102590.13028314341,
            "loss_sequences_lower_95": 3.2994064554626688,
            "loss_sequences_upper_95": 3.707585730919471,
            "loss_tokens_lower_95": 3.15401767727184,
            "loss_tokens_upper_95": 3.456853434997615,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.789835285651185,
            "data_time": 0.0043379485607147215,
            "batch_time": 0.039674143120646474,
            "samples_per_second": 888159.2499327281,
            "samples_per_second_per_gpu": 111019.90624159102,
            "loss_sequences_lower_95": 3.837090944020694,
            "loss_sequences_upper_95": 3.9887915270292766,
            "loss_tokens_lower_95": 3.632073700674483,
            "loss_tokens_upper_95": 3.7769395082813286,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1984290422462838,
            "data_time": 0.027067175933292935,
            "batch_time": 0.06360572008859544,
            "samples_per_second": 805282.6004500155,
            "samples_per_second_per_gpu": 100660.32505625194,
            "loss_sequences_lower_95": 3.026978767208937,
            "loss_sequences_upper_95": 3.5125386493961983,
            "loss_tokens_lower_95": 2.9072672160959163,
            "loss_tokens_upper_95": 3.2966528539362,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.752376614200709,
            "data_time": 0.0018344441748027367,
            "batch_time": 0.037244495419659615,
            "samples_per_second": 899149.2951112575,
            "samples_per_second_per_gpu": 112393.66188890718,
            "loss_sequences_lower_95": 5.742790901930387,
            "loss_sequences_upper_95": 5.762209287560638,
            "loss_tokens_lower_95": 5.7425628582747805,
            "loss_tokens_upper_95": 5.762086567704166,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.4911613580092644,
            "data_time": 0.04038845842534845,
            "batch_time": 0.07620120482011275,
            "samples_per_second": 746435.5171206231,
            "samples_per_second_per_gpu": 93304.4396400779,
            "loss_sequences_lower_95": 1.4294674586323857,
            "loss_sequences_upper_95": 1.6101137735311268,
            "loss_tokens_lower_95": 1.2829387059597301,
            "loss_tokens_upper_95": 1.5652618876743218,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.927630951810058,
            "data_time": 0.0012073518594372503,
            "batch_time": 0.03665985498024652,
            "samples_per_second": 900735.4949667415,
            "samples_per_second_per_gpu": 112591.93687084269,
            "loss_sequences_lower_95": 5.260029071671908,
            "loss_sequences_upper_95": 5.301599013610456,
            "loss_tokens_lower_95": 4.406546832688588,
            "loss_tokens_upper_95": 4.450029146518375,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.573457617759704,
            "data_time": 0.005282696750428941,
            "batch_time": 0.040981785172507876,
            "samples_per_second": 881151.3086162065,
            "samples_per_second_per_gpu": 110143.91357702581,
            "loss_sequences_lower_95": 5.5642072265625,
            "loss_sequences_upper_95": 5.7615805419921875,
            "loss_tokens_lower_95": 5.385373559702484,
            "loss_tokens_upper_95": 5.571238648310615,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.540470276708188,
            "data_time": 0.020060759479716674,
            "batch_time": 0.05628921621936863,
            "samples_per_second": 820690.2179520526,
            "samples_per_second_per_gpu": 102586.27724400657,
            "loss_sequences_lower_95": 5.379060602602752,
            "loss_sequences_upper_95": 5.699938566788384,
            "loss_tokens_lower_95": 5.37952380636464,
            "loss_tokens_upper_95": 5.698500220257302,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.6139978770053744,
            "data_time": 0.004390042230307338,
            "batch_time": 0.039901781872094394,
            "samples_per_second": 889555.0648309479,
            "samples_per_second_per_gpu": 111194.38310386849,
            "loss_sequences_lower_95": 6.514699337121212,
            "loss_sequences_upper_95": 6.713694938890862,
            "loss_tokens_lower_95": 6.516339869643702,
            "loss_tokens_upper_95": 6.70958564527107,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.6555208944479625,
            "data_time": 0.0038013125353671133,
            "batch_time": 0.03934922941187595,
            "samples_per_second": 892753.6959628937,
            "samples_per_second_per_gpu": 111594.21199536171,
            "loss_sequences_lower_95": 1.7116277587890625,
            "loss_sequences_upper_95": 1.7861982259114584,
            "loss_tokens_lower_95": 1.5530421152836134,
            "loss_tokens_upper_95": 1.6306398418742498,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.052125621977306,
            "data_time": 0.02188830077648163,
            "batch_time": 0.057010505880628316,
            "samples_per_second": 802661.1399607351,
            "samples_per_second_per_gpu": 100332.64249509189,
            "loss_sequences_lower_95": 5.726098211379279,
            "loss_sequences_upper_95": 6.3765572248186375,
            "loss_tokens_lower_95": 5.723867826915923,
            "loss_tokens_upper_95": 6.388556053524925,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5057452209293842,
            "data_time": 0.15302914381027222,
            "batch_time": 0.19192519783973694,
            "samples_per_second": 481535.78849569865,
            "samples_per_second_per_gpu": 60191.97356196233,
            "loss_sequences_lower_95": 2.3040939211845397,
            "loss_sequences_upper_95": 3.3619415581226346,
            "loss_tokens_lower_95": 1.9488595046210535,
            "loss_tokens_upper_95": 2.490601309550177,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.155914320468902,
            "data_time": 0.005324242606995598,
            "batch_time": 0.04079124568000672,
            "samples_per_second": 884255.2594358468,
            "samples_per_second_per_gpu": 110531.90742948085,
            "loss_sequences_lower_95": 7.099712194824218,
            "loss_sequences_upper_95": 7.434570544433594,
            "loss_tokens_lower_95": 6.855597182014065,
            "loss_tokens_upper_95": 7.151434956145569,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.851170119762421,
            "data_time": 0.005151176736468361,
            "batch_time": 0.04056621449334281,
            "samples_per_second": 886469.6120355673,
            "samples_per_second_per_gpu": 110808.70150444591,
            "loss_sequences_lower_95": 6.950259130859376,
            "loss_sequences_upper_95": 7.155248986816407,
            "loss_tokens_lower_95": 6.617463940050701,
            "loss_tokens_upper_95": 6.804977559112105,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.2155736745143155,
            "data_time": 0.0035803308455043016,
            "batch_time": 0.03907406704880322,
            "samples_per_second": 891221.5426602142,
            "samples_per_second_per_gpu": 111402.69283252678,
            "loss_sequences_lower_95": 6.190591051112555,
            "loss_sequences_upper_95": 6.240980530296682,
            "loss_tokens_lower_95": 6.190558084353,
            "loss_tokens_upper_95": 6.240503903630992,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.286768122569024,
            "data_time": 0.0074974799084159185,
            "batch_time": 0.042812546214308266,
            "samples_per_second": 873866.6469389802,
            "samples_per_second_per_gpu": 109233.33086737253,
            "loss_sequences_lower_95": 4.199574169621856,
            "loss_sequences_upper_95": 4.37301020702825,
            "loss_tokens_lower_95": 4.196936353926651,
            "loss_tokens_upper_95": 4.371725684193788,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.255667246341705,
            "data_time": 0.005672364480911739,
            "batch_time": 0.04115141762627496,
            "samples_per_second": 885531.2600113011,
            "samples_per_second_per_gpu": 110691.40750141264,
            "loss_sequences_lower_95": 6.213622021484375,
            "loss_sequences_upper_95": 6.299893627929688,
            "loss_tokens_lower_95": 6.212721179199219,
            "loss_tokens_upper_95": 6.299540771484375,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1851779497109614,
            "data_time": 0.0018123339164459863,
            "batch_time": 0.03727229781772779,
            "samples_per_second": 898634.052523801,
            "samples_per_second_per_gpu": 112329.25656547512,
            "loss_sequences_lower_95": 3.651929796520222,
            "loss_sequences_upper_95": 3.731611244973983,
            "loss_tokens_lower_95": 2.597219819082455,
            "loss_tokens_upper_95": 2.654065329336049,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.538804423453203,
            "data_time": 0.01726670605795724,
            "batch_time": 0.053088482788630896,
            "samples_per_second": 818848.165430991,
            "samples_per_second_per_gpu": 102356.02067887387,
            "loss_sequences_lower_95": 5.390745214206069,
            "loss_sequences_upper_95": 5.690183679381413,
            "loss_tokens_lower_95": 5.388748624431553,
            "loss_tokens_upper_95": 5.686507427158641,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.754699132021735,
            "data_time": 0.01027968805283308,
            "batch_time": 0.04583097621798515,
            "samples_per_second": 874859.5398255767,
            "samples_per_second_per_gpu": 109357.44247819709,
            "loss_sequences_lower_95": 5.647074991861979,
            "loss_sequences_upper_95": 5.860879624310662,
            "loss_tokens_lower_95": 5.649652039770987,
            "loss_tokens_upper_95": 5.85781920189951,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.008288094353201,
            "data_time": 0.0019777078692929154,
            "batch_time": 0.037402338253077524,
            "samples_per_second": 898221.8727603758,
            "samples_per_second_per_gpu": 112277.73409504698,
            "loss_sequences_lower_95": 4.489573179921396,
            "loss_sequences_upper_95": 4.582865517936552,
            "loss_tokens_lower_95": 3.311617280870789,
            "loss_tokens_upper_95": 3.3894949413654363,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.341173636219489,
            "data_time": 0.023918646077315014,
            "batch_time": 0.06002772847811381,
            "samples_per_second": 827300.5565327496,
            "samples_per_second_per_gpu": 103412.5695665937,
            "loss_sequences_lower_95": 6.262330910011574,
            "loss_sequences_upper_95": 6.417409067305307,
            "loss_tokens_lower_95": 6.262570319604621,
            "loss_tokens_upper_95": 6.417495340014261,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.485741835048803,
            "data_time": 0.0033343090096963923,
            "batch_time": 0.038952538964221475,
            "samples_per_second": 889948.6664409958,
            "samples_per_second_per_gpu": 111243.58330512447,
            "loss_sequences_lower_95": 4.435191355480695,
            "loss_sequences_upper_95": 4.535313739368311,
            "loss_tokens_lower_95": 4.43605237301701,
            "loss_tokens_upper_95": 4.534461554197726,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.958770964909526,
            "data_time": 0.02163573611866344,
            "batch_time": 0.05665105472911488,
            "samples_per_second": 805088.0810010153,
            "samples_per_second_per_gpu": 100636.01012512691,
            "loss_sequences_lower_95": 5.779443685290883,
            "loss_sequences_upper_95": 6.13414579224818,
            "loss_tokens_lower_95": 5.780870930199485,
            "loss_tokens_upper_95": 6.138793589767901,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9981120487054187,
            "data_time": 0.06733211874961853,
            "batch_time": 0.10316265374422073,
            "samples_per_second": 653846.2278550989,
            "samples_per_second_per_gpu": 81730.77848188736,
            "loss_sequences_lower_95": 2.7199730428059894,
            "loss_sequences_upper_95": 3.415038859049479,
            "loss_tokens_lower_95": 2.4139919492933486,
            "loss_tokens_upper_95": 3.2323527336120605,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9165452420711517,
            "data_time": 0.07186148315668106,
            "batch_time": 0.1110251396894455,
            "samples_per_second": 637046.786710838,
            "samples_per_second_per_gpu": 79630.84833885475,
            "loss_sequences_lower_95": 2.7105559794108074,
            "loss_sequences_upper_95": 3.4972184626261393,
            "loss_tokens_lower_95": 2.223631282334917,
            "loss_tokens_upper_95": 3.228793540697419,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.179140692431376,
            "data_time": 0.002974518191610946,
            "batch_time": 0.03858764835355421,
            "samples_per_second": 892972.9078122607,
            "samples_per_second_per_gpu": 111621.61347653258,
            "loss_sequences_lower_95": 4.150768492843336,
            "loss_sequences_upper_95": 4.207008353276877,
            "loss_tokens_lower_95": 4.150510545149116,
            "loss_tokens_upper_95": 4.2071527378267675,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.0436881089157024,
            "data_time": 0.0011758753699729427,
            "batch_time": 0.036674414262077744,
            "samples_per_second": 899681.3357925974,
            "samples_per_second_per_gpu": 112460.16697407467,
            "loss_sequences_lower_95": 1.2282687830173356,
            "loss_sequences_upper_95": 1.2554039775204302,
            "loss_tokens_lower_95": 0.8394540118736076,
            "loss_tokens_upper_95": 0.8537565589967111,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.985335012120525,
            "data_time": 0.03661220148205757,
            "batch_time": 0.09406283870339394,
            "samples_per_second": 770886.7131547172,
            "samples_per_second_per_gpu": 96360.83914433965,
            "loss_sequences_lower_95": 5.020827898641271,
            "loss_sequences_upper_95": 5.382669259619525,
            "loss_tokens_lower_95": 4.657417077345861,
            "loss_tokens_upper_95": 4.937992047408499,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 8.651317416010675,
            "data_time": 0.10748378435770671,
            "batch_time": 0.144332454318092,
            "samples_per_second": 507368.77565613406,
            "samples_per_second_per_gpu": 63421.09695701676,
            "loss_sequences_lower_95": 8.109862580170503,
            "loss_sequences_upper_95": 9.436988067626952,
            "loss_tokens_lower_95": 7.478238254123264,
            "loss_tokens_upper_95": 9.49398526792173,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.895555468594155,
            "data_time": 0.027137534958975657,
            "batch_time": 0.06314655996504284,
            "samples_per_second": 809547.2497484569,
            "samples_per_second_per_gpu": 101193.40621855711,
            "loss_sequences_lower_95": 4.887097558742616,
            "loss_sequences_upper_95": 5.230151869611042,
            "loss_tokens_lower_95": 4.513472316218986,
            "loss_tokens_upper_95": 4.747257715582948,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.0641834445116,
            "data_time": 0.02679442224048433,
            "batch_time": 0.06252634241467431,
            "samples_per_second": 807354.5126728199,
            "samples_per_second_per_gpu": 100919.31408410249,
            "loss_sequences_lower_95": 5.037759529672019,
            "loss_sequences_upper_95": 5.348190884473848,
            "loss_tokens_lower_95": 4.7116306910219015,
            "loss_tokens_upper_95": 4.908477861223906,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.989780487083808,
            "data_time": 0.02855340639750163,
            "batch_time": 0.06476721877143496,
            "samples_per_second": 802061.3414597698,
            "samples_per_second_per_gpu": 100257.66768247122,
            "loss_sequences_lower_95": 4.984742215784585,
            "loss_sequences_upper_95": 5.38706335672518,
            "loss_tokens_lower_95": 4.564835768749551,
            "loss_tokens_upper_95": 4.872280787269983,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.218705691942355,
            "data_time": 0.02958838144938151,
            "batch_time": 0.06521289689200264,
            "samples_per_second": 808811.0349214716,
            "samples_per_second_per_gpu": 101101.37936518395,
            "loss_sequences_lower_95": 5.166694287555974,
            "loss_sequences_upper_95": 5.468670217002311,
            "loss_tokens_lower_95": 4.900647428993866,
            "loss_tokens_upper_95": 5.084247425114997,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.560127438965791,
            "data_time": 0.027087873882717557,
            "batch_time": 0.06302690505981445,
            "samples_per_second": 822450.2584593637,
            "samples_per_second_per_gpu": 102806.28230742046,
            "loss_sequences_lower_95": 4.47765349394046,
            "loss_sequences_upper_95": 4.702121265482458,
            "loss_tokens_lower_95": 4.319239726528225,
            "loss_tokens_upper_95": 4.466147173881251,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.140966297649756,
            "data_time": 0.028172878991989864,
            "batch_time": 0.06463785398574103,
            "samples_per_second": 802769.6314388984,
            "samples_per_second_per_gpu": 100346.2039298623,
            "loss_sequences_lower_95": 4.127040751387433,
            "loss_sequences_upper_95": 4.368790175275105,
            "loss_tokens_lower_95": 3.881833718307339,
            "loss_tokens_upper_95": 4.013000113019891,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.25/params.txt",
    "uuid": "5daff007-e11f-450f-bf08-bfa41e1c481d",
    "creation_date": "2023_12_14-05_12_15"
}