{
    "name": "rpj-d=96_l=8_h=4-8.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 1691089920,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 8.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "338217984",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=96_l=8_h=4-8.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 4.445159268379212,
            "data_time": 0.14019976556301117,
            "batch_time": 1.4546796381473541,
            "samples_per_second": 377152.17874814593,
            "samples_per_second_per_gpu": 47144.02234351824,
            "loss_sequences_lower_95": 4.36840638478597,
            "loss_sequences_upper_95": 4.519689521789551,
            "loss_tokens_lower_95": 4.43129264831543,
            "loss_tokens_upper_95": 4.459007822672526,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.670495020574533,
            "data_time": 0.01869014296053854,
            "batch_time": 0.06385990443322012,
            "samples_per_second": 4681161.337734301,
            "samples_per_second_per_gpu": 585145.1672167876,
            "loss_sequences_lower_95": 4.668125875010294,
            "loss_sequences_upper_95": 4.672824864116048,
            "loss_tokens_lower_95": 4.658821229166667,
            "loss_tokens_upper_95": 4.6820298750000005,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.8478247842010185,
            "data_time": 0.09385997802019119,
            "batch_time": 0.1390230506658554,
            "samples_per_second": 4129100.3539837794,
            "samples_per_second_per_gpu": 516137.54424797243,
            "loss_sequences_lower_95": 3.8180152986487563,
            "loss_sequences_upper_95": 3.8772186964385362,
            "loss_tokens_lower_95": 3.835098020833333,
            "loss_tokens_upper_95": 3.8607774166666666,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.545572786331177,
            "data_time": 0.013114016307027717,
            "batch_time": 0.05721856732117502,
            "samples_per_second": 5366966.749255171,
            "samples_per_second_per_gpu": 670870.8436568964,
            "loss_sequences_lower_95": 4.533798244201031,
            "loss_sequences_upper_95": 4.557433513208763,
            "loss_tokens_lower_95": 4.534054895833334,
            "loss_tokens_upper_95": 4.5569471041666665,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.656661704698551,
            "data_time": 0.09475155174732208,
            "batch_time": 0.13989875465631485,
            "samples_per_second": 4026401.1861697035,
            "samples_per_second_per_gpu": 503300.14827121294,
            "loss_sequences_lower_95": 4.624310762673434,
            "loss_sequences_upper_95": 4.689000184472856,
            "loss_tokens_lower_95": 4.644999645833334,
            "loss_tokens_upper_95": 4.668402864583333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.4989770915863225,
            "data_time": 0.036106725533803306,
            "batch_time": 0.07902259627978007,
            "samples_per_second": 4944494.951096344,
            "samples_per_second_per_gpu": 618061.868887043,
            "loss_sequences_lower_95": 4.4593342711696975,
            "loss_sequences_upper_95": 4.5373940038000296,
            "loss_tokens_lower_95": 4.486676177083334,
            "loss_tokens_upper_95": 4.51077334375,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.238471786781233,
            "data_time": 0.013747289776802063,
            "batch_time": 0.05686221718788147,
            "samples_per_second": 5137901.762142341,
            "samples_per_second_per_gpu": 642237.7202677926,
            "loss_sequences_lower_95": 3.2100555046237247,
            "loss_sequences_upper_95": 3.266304792131696,
            "loss_tokens_lower_95": 3.2258621250000004,
            "loss_tokens_upper_95": 3.251116822916667,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.757355879938415,
            "data_time": 0.014489369957070602,
            "batch_time": 0.057854185763158296,
            "samples_per_second": 5270322.15413037,
            "samples_per_second_per_gpu": 658790.2692662963,
            "loss_sequences_lower_95": 4.7494245950589,
            "loss_sequences_upper_95": 4.765262035749346,
            "loss_tokens_lower_95": 4.746099739583333,
            "loss_tokens_upper_95": 4.768877833333333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.7852048088864585,
            "data_time": 0.09975937753915787,
            "batch_time": 0.1443730816245079,
            "samples_per_second": 4032179.330604711,
            "samples_per_second_per_gpu": 504022.4163255889,
            "loss_sequences_lower_95": 4.745336802412823,
            "loss_sequences_upper_95": 4.825129067025533,
            "loss_tokens_lower_95": 4.773114125,
            "loss_tokens_upper_95": 4.79735675,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.3282883685568105,
            "data_time": 0.09674441069364548,
            "batch_time": 0.14213349670171738,
            "samples_per_second": 4108382.4279577998,
            "samples_per_second_per_gpu": 513547.80349472497,
            "loss_sequences_lower_95": 5.297580655667151,
            "loss_sequences_upper_95": 5.355690599524456,
            "loss_tokens_lower_95": 5.316189520833333,
            "loss_tokens_upper_95": 5.340038041666666,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.706388255578452,
            "data_time": 0.010213551850154483,
            "batch_time": 0.0537215445576043,
            "samples_per_second": 5379976.880673297,
            "samples_per_second_per_gpu": 672497.1100841621,
            "loss_sequences_lower_95": 4.699281329227764,
            "loss_sequences_upper_95": 4.713688122045018,
            "loss_tokens_lower_95": 4.694324510416667,
            "loss_tokens_upper_95": 4.7185043645833336,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.631966899504815,
            "data_time": 0.023050116002559663,
            "batch_time": 0.06554491519927978,
            "samples_per_second": 5047093.316060088,
            "samples_per_second_per_gpu": 630886.664507511,
            "loss_sequences_lower_95": 4.621721652029754,
            "loss_sequences_upper_95": 4.642221665451895,
            "loss_tokens_lower_95": 4.620006625,
            "loss_tokens_upper_95": 4.643827135416666,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.838514958627326,
            "data_time": 0.09617181867361069,
            "batch_time": 0.14105667173862457,
            "samples_per_second": 4095649.802068587,
            "samples_per_second_per_gpu": 511956.22525857337,
            "loss_sequences_lower_95": 4.804215734145221,
            "loss_sequences_upper_95": 4.87291970398073,
            "loss_tokens_lower_95": 4.82711071875,
            "loss_tokens_upper_95": 4.8500537604166665,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.512774415025886,
            "data_time": 0.09991379827260971,
            "batch_time": 0.1451021507382393,
            "samples_per_second": 4097649.461992511,
            "samples_per_second_per_gpu": 512206.1827490639,
            "loss_sequences_lower_95": 4.452654059695615,
            "loss_sequences_upper_95": 4.568835051433617,
            "loss_tokens_lower_95": 4.5004814375,
            "loss_tokens_upper_95": 4.525337135416667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.556906656785444,
            "data_time": 0.15077140927314758,
            "batch_time": 0.1714005321264267,
            "samples_per_second": 1134050.0177580283,
            "samples_per_second_per_gpu": 141756.25221975354,
            "loss_sequences_lower_95": 5.481036767092618,
            "loss_sequences_upper_95": 5.626763213764537,
            "loss_tokens_lower_95": 5.5336521408774635,
            "loss_tokens_upper_95": 5.580287265777588,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.165515410309283,
            "data_time": 0.09510068595409393,
            "batch_time": 0.12989867478609085,
            "samples_per_second": 3369927.9462917,
            "samples_per_second_per_gpu": 421240.9932864625,
            "loss_sequences_lower_95": 4.0560170254276375,
            "loss_sequences_upper_95": 4.274970140818604,
            "loss_tokens_lower_95": 4.15349509375,
            "loss_tokens_upper_95": 4.1775514375,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.423572854819587,
            "data_time": 0.09888915717601776,
            "batch_time": 0.13586802035570145,
            "samples_per_second": 3605555.322484169,
            "samples_per_second_per_gpu": 450694.41531052114,
            "loss_sequences_lower_95": 6.373291563169937,
            "loss_sequences_upper_95": 6.471260622371784,
            "loss_tokens_lower_95": 6.412235895833334,
            "loss_tokens_upper_95": 6.434791145833334,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.310002115906262,
            "data_time": 0.17192703485488892,
            "batch_time": 0.20057253539562225,
            "samples_per_second": 2291484.0340075106,
            "samples_per_second_per_gpu": 286435.50425093883,
            "loss_sequences_lower_95": 5.254513037009317,
            "loss_sequences_upper_95": 5.361637815881948,
            "loss_tokens_lower_95": 5.296567998166944,
            "loss_tokens_upper_95": 5.323398152335746,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.561865938675911,
            "data_time": 0.027850759842179038,
            "batch_time": 0.07250042882832614,
            "samples_per_second": 4469081.85831658,
            "samples_per_second_per_gpu": 558635.2322895725,
            "loss_sequences_lower_95": 4.547179179537281,
            "loss_sequences_upper_95": 4.576035711225698,
            "loss_tokens_lower_95": 4.547360944510219,
            "loss_tokens_upper_95": 4.57615547609404,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.511560108718158,
            "data_time": 0.028590507805347443,
            "batch_time": 0.07256350256502628,
            "samples_per_second": 4476562.42730325,
            "samples_per_second_per_gpu": 559570.3034129062,
            "loss_sequences_lower_95": 4.496699380181363,
            "loss_sequences_upper_95": 4.52278022150717,
            "loss_tokens_lower_95": 4.4996420680730775,
            "loss_tokens_upper_95": 4.521685681520765,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.609115028471445,
            "data_time": 0.05133769247266981,
            "batch_time": 0.09387741817368402,
            "samples_per_second": 4251507.527370497,
            "samples_per_second_per_gpu": 531438.4409213121,
            "loss_sequences_lower_95": 7.079691340284748,
            "loss_sequences_upper_95": 7.351542664294993,
            "loss_tokens_lower_95": 6.47725896101382,
            "loss_tokens_upper_95": 6.677868794860912,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.515680414040883,
            "data_time": 0.048783499747514725,
            "batch_time": 0.09283290058374405,
            "samples_per_second": 4469464.194166797,
            "samples_per_second_per_gpu": 558683.0242708497,
            "loss_sequences_lower_95": 6.930725992838542,
            "loss_sequences_upper_95": 7.120273209635417,
            "loss_tokens_lower_95": 6.414958456171384,
            "loss_tokens_upper_95": 6.54718352004717,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.574436597701182,
            "data_time": 0.06765554348627727,
            "batch_time": 0.10758738468090694,
            "samples_per_second": 3999449.6029439545,
            "samples_per_second_per_gpu": 499931.2003679943,
            "loss_sequences_lower_95": 4.690331605530942,
            "loss_sequences_upper_95": 4.762446248341546,
            "loss_tokens_lower_95": 4.547728809227084,
            "loss_tokens_upper_95": 4.58358775066144,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.3260452508926392,
            "data_time": 0.3414749205112457,
            "batch_time": 0.38455356657505035,
            "samples_per_second": 2404813.296410495,
            "samples_per_second_per_gpu": 300601.66205131187,
            "loss_sequences_lower_95": 3.336760919744318,
            "loss_sequences_upper_95": 3.4667935250022195,
            "loss_tokens_lower_95": 3.2952029061271793,
            "loss_tokens_upper_95": 3.3467466759674496,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.61828738621303,
            "data_time": 0.3534863591194153,
            "batch_time": 0.3999411016702652,
            "samples_per_second": 2114504.4951337986,
            "samples_per_second_per_gpu": 264313.0618917248,
            "loss_sequences_lower_95": 4.676589156170281,
            "loss_sequences_upper_95": 4.898109840860172,
            "loss_tokens_lower_95": 4.562973722071767,
            "loss_tokens_upper_95": 4.6700068498222755,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.4234339841206864,
            "data_time": 0.18253836035728455,
            "batch_time": 0.21350537985563278,
            "samples_per_second": 2561297.8015527283,
            "samples_per_second_per_gpu": 320162.22519409104,
            "loss_sequences_lower_95": 4.4048447265625,
            "loss_sequences_upper_95": 4.502686401367187,
            "loss_tokens_lower_95": 4.318785806285731,
            "loss_tokens_upper_95": 4.527201739524586,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.931272250426507,
            "data_time": 0.024965666234493256,
            "batch_time": 0.06929644420742989,
            "samples_per_second": 4490163.899803242,
            "samples_per_second_per_gpu": 561270.4874754052,
            "loss_sequences_lower_95": 8.005732754908715,
            "loss_sequences_upper_95": 8.073339387210275,
            "loss_tokens_lower_95": 7.878357277133404,
            "loss_tokens_upper_95": 7.950865873907019,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.6549832283045705,
            "data_time": 0.04884360581636429,
            "batch_time": 0.09129175692796707,
            "samples_per_second": 4402421.349822761,
            "samples_per_second_per_gpu": 550302.6687278451,
            "loss_sequences_lower_95": 7.03961559770886,
            "loss_sequences_upper_95": 7.376947596899989,
            "loss_tokens_lower_95": 5.4977299245891516,
            "loss_tokens_upper_95": 5.654198575691133,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.275065653966962,
            "data_time": 0.08336080312728882,
            "batch_time": 0.12574026882648467,
            "samples_per_second": 4280213.635199478,
            "samples_per_second_per_gpu": 535026.7043999347,
            "loss_sequences_lower_95": 6.112037153536956,
            "loss_sequences_upper_95": 6.479291167275491,
            "loss_tokens_lower_95": 5.164180001046204,
            "loss_tokens_upper_95": 5.336410182720233,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.629405785913336,
            "data_time": 0.35362885892391205,
            "batch_time": 0.3953838348388672,
            "samples_per_second": 2261275.3042701003,
            "samples_per_second_per_gpu": 282659.41303376254,
            "loss_sequences_lower_95": 5.5486698655777325,
            "loss_sequences_upper_95": 5.713992490724886,
            "loss_tokens_lower_95": 5.547573838604095,
            "loss_tokens_upper_95": 5.708970746602097,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.711658711433411,
            "data_time": 0.30918705463409424,
            "batch_time": 0.33525677025318146,
            "samples_per_second": 1324545.2180261482,
            "samples_per_second_per_gpu": 165568.15225326852,
            "loss_sequences_lower_95": 4.623018463134765,
            "loss_sequences_upper_95": 5.050434906005859,
            "loss_tokens_lower_95": 4.4526192707751004,
            "loss_tokens_upper_95": 4.951650962761349,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.626574728603617,
            "data_time": 0.05652717128396034,
            "batch_time": 0.09990493953227997,
            "samples_per_second": 4397741.12438365,
            "samples_per_second_per_gpu": 549717.6405479562,
            "loss_sequences_lower_95": 4.591165420391665,
            "loss_sequences_upper_95": 4.66163295093798,
            "loss_tokens_lower_95": 4.590922891544588,
            "loss_tokens_upper_95": 4.661975984758828,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.87810350382162,
            "data_time": 0.07695984244346618,
            "batch_time": 0.12033064961433411,
            "samples_per_second": 4392268.263215773,
            "samples_per_second_per_gpu": 549033.5329019716,
            "loss_sequences_lower_95": 4.832282768830621,
            "loss_sequences_upper_95": 4.9229538276291205,
            "loss_tokens_lower_95": 4.831730403165951,
            "loss_tokens_upper_95": 4.922838805602478,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.114255247230239,
            "data_time": 0.05219529569149017,
            "batch_time": 0.09380799159407616,
            "samples_per_second": 4321395.452543739,
            "samples_per_second_per_gpu": 540174.4315679674,
            "loss_sequences_lower_95": 5.371248868292641,
            "loss_sequences_upper_95": 5.487569522961438,
            "loss_tokens_lower_95": 5.074236555905655,
            "loss_tokens_upper_95": 5.1353363233435125,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.457592357635498,
            "data_time": 0.19098792970180511,
            "batch_time": 0.23653993755578995,
            "samples_per_second": 3763900.4765000013,
            "samples_per_second_per_gpu": 470487.55956250016,
            "loss_sequences_lower_95": 7.097134423828125,
            "loss_sequences_upper_95": 7.634661840820312,
            "loss_tokens_lower_95": 6.226033275510837,
            "loss_tokens_upper_95": 6.578172454732999,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.054615497589111,
            "data_time": 0.15543216466903687,
            "batch_time": 0.1721211075782776,
            "samples_per_second": 921528.2891955702,
            "samples_per_second_per_gpu": 115191.03614944627,
            "loss_sequences_lower_95": 4.768249368667602,
            "loss_sequences_upper_95": 5.478119540214538,
            "loss_tokens_lower_95": 4.514325828113775,
            "loss_tokens_upper_95": 5.428093991334411,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.320836971546042,
            "data_time": 0.36191926896572113,
            "batch_time": 0.3971780091524124,
            "samples_per_second": 2031820.3069921404,
            "samples_per_second_per_gpu": 253977.53837401755,
            "loss_sequences_lower_95": 6.579135991239,
            "loss_sequences_upper_95": 7.25829383587015,
            "loss_tokens_lower_95": 4.999928134027973,
            "loss_tokens_upper_95": 5.4516964057655715,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.27309176910387,
            "data_time": 0.05083190235826704,
            "batch_time": 0.09554572900136311,
            "samples_per_second": 4589637.841622334,
            "samples_per_second_per_gpu": 573704.7302027917,
            "loss_sequences_lower_95": 5.254048702694954,
            "loss_sequences_upper_95": 5.291996475159731,
            "loss_tokens_lower_95": 5.253644286789401,
            "loss_tokens_upper_95": 5.291374615177206,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.749215827606706,
            "data_time": 0.034609874799138025,
            "batch_time": 0.07779415803296226,
            "samples_per_second": 4391137.097215678,
            "samples_per_second_per_gpu": 548892.1371519597,
            "loss_sequences_lower_95": 5.875249229059528,
            "loss_sequences_upper_95": 6.095552433427737,
            "loss_tokens_lower_95": 5.609380651815417,
            "loss_tokens_upper_95": 5.82809833411183,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.195028413346399,
            "data_time": 0.18096552044153214,
            "batch_time": 0.2106391042470932,
            "samples_per_second": 1810563.8272733232,
            "samples_per_second_per_gpu": 226320.4784091654,
            "loss_sequences_lower_95": 4.117550648001087,
            "loss_sequences_upper_95": 4.4954948006095465,
            "loss_tokens_lower_95": 3.9968394742642808,
            "loss_tokens_upper_95": 4.325272168239767,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.605520042548898,
            "data_time": 0.0895712673664093,
            "batch_time": 0.13465953171253203,
            "samples_per_second": 4076446.151012161,
            "samples_per_second_per_gpu": 509555.7688765201,
            "loss_sequences_lower_95": 4.654514838431704,
            "loss_sequences_upper_95": 4.794148255598856,
            "loss_tokens_lower_95": 4.520983445550198,
            "loss_tokens_upper_95": 4.677319668352875,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.336062820946298,
            "data_time": 0.3217222988605499,
            "batch_time": 0.35558851063251495,
            "samples_per_second": 2385279.5234801695,
            "samples_per_second_per_gpu": 298159.9404350212,
            "loss_sequences_lower_95": 4.167648371254526,
            "loss_sequences_upper_95": 4.650322862950767,
            "loss_tokens_lower_95": 4.142194370188876,
            "loss_tokens_upper_95": 4.548929495134712,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.1696072310346395,
            "data_time": 0.027760738489396145,
            "batch_time": 0.07173060249759113,
            "samples_per_second": 4469693.0218017595,
            "samples_per_second_per_gpu": 558711.6277252199,
            "loss_sequences_lower_95": 4.160135396219869,
            "loss_sequences_upper_95": 4.179167503422559,
            "loss_tokens_lower_95": 4.160007304195214,
            "loss_tokens_upper_95": 4.179059288420184,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.0308154231136286,
            "data_time": 0.30094580352306366,
            "batch_time": 0.3266536593437195,
            "samples_per_second": 1636203.4839005615,
            "samples_per_second_per_gpu": 204525.43548757018,
            "loss_sequences_lower_95": 2.9304771052980887,
            "loss_sequences_upper_95": 3.278129859109527,
            "loss_tokens_lower_95": 2.79973184513338,
            "loss_tokens_upper_95": 3.178029105460754,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.622742115451604,
            "data_time": 0.02686644752820333,
            "batch_time": 0.07114635010560354,
            "samples_per_second": 4422443.852433327,
            "samples_per_second_per_gpu": 552805.4815541658,
            "loss_sequences_lower_95": 6.504299894768737,
            "loss_sequences_upper_95": 6.555768392950734,
            "loss_tokens_lower_95": 5.519558401837524,
            "loss_tokens_upper_95": 5.569811158123791,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.748936869621277,
            "data_time": 0.09167341887950897,
            "batch_time": 0.13622644543647766,
            "samples_per_second": 4397092.483273555,
            "samples_per_second_per_gpu": 549636.5604091943,
            "loss_sequences_lower_95": 6.778755541992187,
            "loss_sequences_upper_95": 7.03082373046875,
            "loss_tokens_lower_95": 6.617182949505752,
            "loss_tokens_upper_95": 6.859216412443204,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.5835671341937525,
            "data_time": 0.35238149762153625,
            "batch_time": 0.3963261544704437,
            "samples_per_second": 2688008.765724292,
            "samples_per_second_per_gpu": 336001.0957155365,
            "loss_sequences_lower_95": 4.469260452933933,
            "loss_sequences_upper_95": 4.695508475925611,
            "loss_tokens_lower_95": 4.47408643639606,
            "loss_tokens_upper_95": 4.69606201171875,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.307916782841538,
            "data_time": 0.06356788178284963,
            "batch_time": 0.10341860602299373,
            "samples_per_second": 4093222.992240475,
            "samples_per_second_per_gpu": 511652.8740300594,
            "loss_sequences_lower_95": 8.204651729699338,
            "loss_sequences_upper_95": 8.412027680368135,
            "loss_tokens_lower_95": 8.20338974461411,
            "loss_tokens_upper_95": 8.412929724491004,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 1.800042495171229,
            "data_time": 0.07025550305843353,
            "batch_time": 0.11409037560224533,
            "samples_per_second": 4570201.535211654,
            "samples_per_second_per_gpu": 571275.1919014568,
            "loss_sequences_lower_95": 1.9555026814778644,
            "loss_sequences_upper_95": 2.0593015014648435,
            "loss_tokens_lower_95": 1.7511938642644558,
            "loss_tokens_upper_95": 1.8284032284788916,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.787809133529663,
            "data_time": 0.3458457738161087,
            "batch_time": 0.38597366213798523,
            "samples_per_second": 2415262.633919169,
            "samples_per_second_per_gpu": 301907.82923989615,
            "loss_sequences_lower_95": 6.433709731329055,
            "loss_sequences_upper_95": 7.148261457170759,
            "loss_tokens_lower_95": 6.424781988234748,
            "loss_tokens_upper_95": 7.143454749697731,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.481035739183426,
            "data_time": 0.15397140383720398,
            "batch_time": 0.17370645701885223,
            "samples_per_second": 731590.6557552826,
            "samples_per_second_per_gpu": 91448.83196941033,
            "loss_sequences_lower_95": 4.109452271461487,
            "loss_sequences_upper_95": 5.502249598503112,
            "loss_tokens_lower_95": 3.8284944766329736,
            "loss_tokens_upper_95": 4.4689296572970365,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.580600023269653,
            "data_time": 0.10337567329406738,
            "batch_time": 0.1482130102813244,
            "samples_per_second": 4255171.835138127,
            "samples_per_second_per_gpu": 531896.4793922659,
            "loss_sequences_lower_95": 7.636914526367187,
            "loss_sequences_upper_95": 7.9747575195312495,
            "loss_tokens_lower_95": 7.425758178504388,
            "loss_tokens_upper_95": 7.722556053200349,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.680329894542694,
            "data_time": 0.091777253895998,
            "batch_time": 0.1363588348031044,
            "samples_per_second": 4412448.418008598,
            "samples_per_second_per_gpu": 551556.0522510747,
            "loss_sequences_lower_95": 7.884811206054687,
            "loss_sequences_upper_95": 8.12620770263672,
            "loss_tokens_lower_95": 7.574673668217169,
            "loss_tokens_upper_95": 7.763380324392839,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.40792410061178,
            "data_time": 0.04091000805298487,
            "batch_time": 0.08467189843455951,
            "samples_per_second": 4543307.5537751885,
            "samples_per_second_per_gpu": 567913.4442218986,
            "loss_sequences_lower_95": 4.384811302131348,
            "loss_sequences_upper_95": 4.431332637877661,
            "loss_tokens_lower_95": 4.384582507254651,
            "loss_tokens_upper_95": 4.430816979812689,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.192606742114698,
            "data_time": 0.12241260210673015,
            "batch_time": 0.1624727100133896,
            "samples_per_second": 4013419.078948188,
            "samples_per_second_per_gpu": 501677.3848685235,
            "loss_sequences_lower_95": 5.1233264835199455,
            "loss_sequences_upper_95": 5.261480591007825,
            "loss_tokens_lower_95": 5.124012768067156,
            "loss_tokens_upper_95": 5.259822600521433,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.286874236106872,
            "data_time": 0.09458057209849358,
            "batch_time": 0.13900763541460037,
            "samples_per_second": 4351601.267824745,
            "samples_per_second_per_gpu": 543950.1584780931,
            "loss_sequences_lower_95": 9.209067431640625,
            "loss_sequences_upper_95": 9.367943725585937,
            "loss_tokens_lower_95": 9.210438208007812,
            "loss_tokens_upper_95": 9.363736987304687,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.683520158507854,
            "data_time": 0.028284069682870592,
            "batch_time": 0.07245502514498574,
            "samples_per_second": 4466467.061327902,
            "samples_per_second_per_gpu": 558308.3826659877,
            "loss_sequences_lower_95": 5.649555779550024,
            "loss_sequences_upper_95": 5.748735845849101,
            "loss_tokens_lower_95": 4.575602925180225,
            "loss_tokens_upper_95": 4.6426406769878,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.433521505612046,
            "data_time": 0.20681042330605642,
            "batch_time": 0.23849023239953177,
            "samples_per_second": 1987056.4057375994,
            "samples_per_second_per_gpu": 248382.05071719992,
            "loss_sequences_lower_95": 4.2988943071507695,
            "loss_sequences_upper_95": 4.565667975126807,
            "loss_tokens_lower_95": 4.299332405204203,
            "loss_tokens_upper_95": 4.56587362716447,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.690720049540202,
            "data_time": 0.1939816251397133,
            "batch_time": 0.23992303013801575,
            "samples_per_second": 3709102.385693047,
            "samples_per_second_per_gpu": 463637.7982116309,
            "loss_sequences_lower_95": 4.597070575788909,
            "loss_sequences_upper_95": 4.782299768784467,
            "loss_tokens_lower_95": 4.59824296539905,
            "loss_tokens_upper_95": 4.782106742110907,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.466534378414209,
            "data_time": 0.02893662080168724,
            "batch_time": 0.0727459853515029,
            "samples_per_second": 4448922.793336566,
            "samples_per_second_per_gpu": 556115.3491670707,
            "loss_sequences_lower_95": 6.308491677627458,
            "loss_sequences_upper_95": 6.412278814649098,
            "loss_tokens_lower_95": 5.347641428103347,
            "loss_tokens_upper_95": 5.432471997355831,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.658709056793698,
            "data_time": 0.3437574803829193,
            "batch_time": 0.38156014680862427,
            "samples_per_second": 2201832.972448278,
            "samples_per_second_per_gpu": 275229.1215560348,
            "loss_sequences_lower_95": 4.590179443359375,
            "loss_sequences_upper_95": 4.728336104135665,
            "loss_tokens_lower_95": 4.590062079858528,
            "loss_tokens_upper_95": 4.726428303390584,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.174475703545666,
            "data_time": 0.05090145881359394,
            "batch_time": 0.09491300926758693,
            "samples_per_second": 4333389.5883177705,
            "samples_per_second_per_gpu": 541673.6985397213,
            "loss_sequences_lower_95": 7.14714391305906,
            "loss_sequences_upper_95": 7.201195975487385,
            "loss_tokens_lower_95": 7.147257143539756,
            "loss_tokens_upper_95": 7.201852467388188,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.664121322261477,
            "data_time": 0.3415206968784332,
            "batch_time": 0.3810780495405197,
            "samples_per_second": 2199107.594156339,
            "samples_per_second_per_gpu": 274888.44926954235,
            "loss_sequences_lower_95": 4.504312296747004,
            "loss_sequences_upper_95": 4.820181311449958,
            "loss_tokens_lower_95": 4.508416836933025,
            "loss_tokens_upper_95": 4.820529019253925,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.828005584081014,
            "data_time": 0.2855139821767807,
            "batch_time": 0.3060850352048874,
            "samples_per_second": 1114302.020621944,
            "samples_per_second_per_gpu": 139287.752577743,
            "loss_sequences_lower_95": 5.664850438435872,
            "loss_sequences_upper_95": 6.456497344970703,
            "loss_tokens_lower_95": 5.047763697306315,
            "loss_tokens_upper_95": 6.460702715979682,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.026487096150716,
            "data_time": 0.31590530276298523,
            "batch_time": 0.33579884469509125,
            "samples_per_second": 1316405.4489557622,
            "samples_per_second_per_gpu": 164550.68111947028,
            "loss_sequences_lower_95": 4.979831759134928,
            "loss_sequences_upper_95": 5.931131642659505,
            "loss_tokens_lower_95": 4.111143990848841,
            "loss_tokens_upper_95": 5.634699660740541,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.252458399630787,
            "data_time": 0.050906475101198466,
            "batch_time": 0.09386011851685387,
            "samples_per_second": 4131544.9508556486,
            "samples_per_second_per_gpu": 516443.1188569561,
            "loss_sequences_lower_95": 7.223070203194035,
            "loss_sequences_upper_95": 7.2822390320093895,
            "loss_tokens_lower_95": 7.222991934370397,
            "loss_tokens_upper_95": 7.282603092783505,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.4195437306444094,
            "data_time": 0.023302792136325585,
            "batch_time": 0.06803648896092411,
            "samples_per_second": 4459215.821113115,
            "samples_per_second_per_gpu": 557401.9776391394,
            "loss_sequences_lower_95": 3.0343528281645353,
            "loss_sequences_upper_95": 3.0707832508078026,
            "loss_tokens_lower_95": 2.3587963053011283,
            "loss_tokens_upper_95": 2.3842093010662375,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.4825115560546633,
            "data_time": 0.32127171754837036,
            "batch_time": 0.35088013112545013,
            "samples_per_second": 1743497.7703754494,
            "samples_per_second_per_gpu": 217937.22129693118,
            "loss_sequences_lower_95": 3.5088547293595442,
            "loss_sequences_upper_95": 3.878112300362174,
            "loss_tokens_lower_95": 3.318380898475881,
            "loss_tokens_upper_95": 3.5068576408879113,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.539794651237694,
            "data_time": 0.22172018885612488,
            "batch_time": 0.23980611562728882,
            "samples_per_second": 998647.9463106833,
            "samples_per_second_per_gpu": 124830.99328883541,
            "loss_sequences_lower_95": 5.204947579873575,
            "loss_sequences_upper_95": 5.968978912765915,
            "loss_tokens_lower_95": 5.025190621835215,
            "loss_tokens_upper_95": 5.937705786434221,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.3315230244543494,
            "data_time": 0.388868048787117,
            "batch_time": 0.42322438955307007,
            "samples_per_second": 1843167.1822278625,
            "samples_per_second_per_gpu": 230395.89777848282,
            "loss_sequences_lower_95": 3.3927805923834082,
            "loss_sequences_upper_95": 3.717884882484994,
            "loss_tokens_lower_95": 3.209236901414003,
            "loss_tokens_upper_95": 3.36841102029016,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.370005098784842,
            "data_time": 0.33326663076877594,
            "batch_time": 0.3679455816745758,
            "samples_per_second": 2301682.9240760305,
            "samples_per_second_per_gpu": 287710.3655095038,
            "loss_sequences_lower_95": 3.5157649807813693,
            "loss_sequences_upper_95": 3.805180070458389,
            "loss_tokens_lower_95": 3.2584112987605547,
            "loss_tokens_upper_95": 3.388694058670253,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.469689939080215,
            "data_time": 0.36558103561401367,
            "batch_time": 0.40118569135665894,
            "samples_per_second": 1814407.958125307,
            "samples_per_second_per_gpu": 226800.99476566337,
            "loss_sequences_lower_95": 3.26912075135766,
            "loss_sequences_upper_95": 3.634917756987781,
            "loss_tokens_lower_95": 3.3367260784304915,
            "loss_tokens_upper_95": 3.5510225055819618,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.465818653746349,
            "data_time": 0.3544255197048187,
            "batch_time": 0.3887007087469101,
            "samples_per_second": 2313504.073241665,
            "samples_per_second_per_gpu": 289188.0091552081,
            "loss_sequences_lower_95": 3.6333966883217417,
            "loss_sequences_upper_95": 3.9024964123237424,
            "loss_tokens_lower_95": 3.3618725833120373,
            "loss_tokens_upper_95": 3.4799527652538456,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.0479746012954236,
            "data_time": 0.34299466013908386,
            "batch_time": 0.3781997114419937,
            "samples_per_second": 2386661.8417520854,
            "samples_per_second_per_gpu": 298332.7302190107,
            "loss_sequences_lower_95": 3.0568520350485855,
            "loss_sequences_upper_95": 3.2129905463745874,
            "loss_tokens_lower_95": 2.971177267658672,
            "loss_tokens_upper_95": 3.0629781528773035,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.557660746865156,
            "data_time": 0.3361397832632065,
            "batch_time": 0.38220611214637756,
            "samples_per_second": 2381427.7684788923,
            "samples_per_second_per_gpu": 297678.47105986153,
            "loss_sequences_lower_95": 2.6789838837414255,
            "loss_sequences_upper_95": 2.8648296263159776,
            "loss_tokens_lower_95": 2.4823575066399184,
            "loss_tokens_upper_95": 2.5649039213193188,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-8.0/params.txt",
    "uuid": "e177b02d-5894-46a8-bd42-b6a149e8ad3a",
    "creation_date": "2023_12_14-06_03_37"
}