{
    "name": "rw_original-d=512_l=8_h=4-0.25",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 394570240,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.25
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "78914048",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=512_l=8_h=4-0.25",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 5.541667787233989,
            "data_time": 0.03346044570207596,
            "batch_time": 0.33265628293156624,
            "samples_per_second": 1717321.1070188065,
            "samples_per_second_per_gpu": 214665.1383773508,
            "loss_sequences_lower_95": 5.398494288126627,
            "loss_sequences_upper_95": 5.6858268610636395,
            "loss_tokens_lower_95": 5.52607521057129,
            "loss_tokens_upper_95": 5.55730032602946,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.826044197538269,
            "data_time": 0.0014665178687976977,
            "batch_time": 0.01524274518001086,
            "samples_per_second": 2258212.446617768,
            "samples_per_second_per_gpu": 282276.555827221,
            "loss_sequences_lower_95": 4.823904132328027,
            "loss_sequences_upper_95": 4.828150230076236,
            "loss_tokens_lower_95": 4.814570541666667,
            "loss_tokens_upper_95": 4.8377116875,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.765783374163569,
            "data_time": 0.009418938636779785,
            "batch_time": 0.023239035606384278,
            "samples_per_second": 2178994.127380595,
            "samples_per_second_per_gpu": 272374.2659225744,
            "loss_sequences_lower_95": 4.718881698919803,
            "loss_sequences_upper_95": 4.825159015266262,
            "loss_tokens_lower_95": 4.752902854166667,
            "loss_tokens_upper_95": 4.7793100937499995,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.9691613117689934,
            "data_time": 0.0015602498070189828,
            "batch_time": 0.015106388710831342,
            "samples_per_second": 2307105.977534069,
            "samples_per_second_per_gpu": 288388.24719175865,
            "loss_sequences_lower_95": 4.9358276618878865,
            "loss_sequences_upper_95": 5.004501610824742,
            "loss_tokens_lower_95": 4.956478729166666,
            "loss_tokens_upper_95": 4.981800541666666,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.862834603140651,
            "data_time": 0.009606682446848348,
            "batch_time": 0.023734570499435365,
            "samples_per_second": 2137413.366735328,
            "samples_per_second_per_gpu": 267176.670841916,
            "loss_sequences_lower_95": 4.810756868257542,
            "loss_sequences_upper_95": 4.928897225249809,
            "loss_tokens_lower_95": 4.850683052083333,
            "loss_tokens_upper_95": 4.8748400625,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.397671956905778,
            "data_time": 0.0034209907702777696,
            "batch_time": 0.017019510917041614,
            "samples_per_second": 2294914.5858282507,
            "samples_per_second_per_gpu": 286864.32322853134,
            "loss_sequences_lower_95": 5.346704865020607,
            "loss_sequences_upper_95": 5.4521314422586675,
            "loss_tokens_lower_95": 5.383906166666667,
            "loss_tokens_upper_95": 5.410886802083333,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.561128049869927,
            "data_time": 0.001525049789111338,
            "batch_time": 0.014892921167719033,
            "samples_per_second": 2341567.212193031,
            "samples_per_second_per_gpu": 292695.90152412886,
            "loss_sequences_lower_95": 6.52963502471301,
            "loss_sequences_upper_95": 6.591875039859694,
            "loss_tokens_lower_95": 6.543475947916667,
            "loss_tokens_upper_95": 6.579097927083334,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.069739409341862,
            "data_time": 0.0015598769369408556,
            "batch_time": 0.015135943166684127,
            "samples_per_second": 2306925.4839037233,
            "samples_per_second_per_gpu": 288365.6854879654,
            "loss_sequences_lower_95": 5.047882086469241,
            "loss_sequences_upper_95": 5.093243261207461,
            "loss_tokens_lower_95": 5.0576575,
            "loss_tokens_upper_95": 5.0816759375,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.980760926153602,
            "data_time": 0.011567006035456582,
            "batch_time": 0.025442408190833196,
            "samples_per_second": 2190051.050393549,
            "samples_per_second_per_gpu": 273756.3812991936,
            "loss_sequences_lower_95": 4.902355845381574,
            "loss_sequences_upper_95": 5.072268006859756,
            "loss_tokens_lower_95": 4.968473697916667,
            "loss_tokens_upper_95": 4.992978052083333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.106441641984721,
            "data_time": 0.008880458772182465,
            "batch_time": 0.02261137869209051,
            "samples_per_second": 2208046.051782345,
            "samples_per_second_per_gpu": 276005.7564727931,
            "loss_sequences_lower_95": 6.008897490369472,
            "loss_sequences_upper_95": 6.22778326343642,
            "loss_tokens_lower_95": 6.093116645833334,
            "loss_tokens_upper_95": 6.119718010416666,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.360632761547103,
            "data_time": 0.00119707962688468,
            "batch_time": 0.014703935974426353,
            "samples_per_second": 2323571.5334296334,
            "samples_per_second_per_gpu": 290446.44167870417,
            "loss_sequences_lower_95": 5.350074784586131,
            "loss_sequences_upper_95": 5.371711365222866,
            "loss_tokens_lower_95": 5.3482395625,
            "loss_tokens_upper_95": 5.373152958333333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.143896491961497,
            "data_time": 0.002581730571813528,
            "batch_time": 0.015924215118255744,
            "samples_per_second": 2341087.2254917747,
            "samples_per_second_per_gpu": 292635.90318647184,
            "loss_sequences_lower_95": 5.120853991907668,
            "loss_sequences_upper_95": 5.168391466120887,
            "loss_tokens_lower_95": 5.1315245625,
            "loss_tokens_upper_95": 5.156373208333333,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.225077999532827,
            "data_time": 0.009747538170795668,
            "batch_time": 0.02351815333008295,
            "samples_per_second": 2172684.86704671,
            "samples_per_second_per_gpu": 271585.60838083876,
            "loss_sequences_lower_95": 5.13949955133589,
            "loss_sequences_upper_95": 5.326419432603322,
            "loss_tokens_lower_95": 5.211672468750001,
            "loss_tokens_upper_95": 5.238404041666667,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.152821795275158,
            "data_time": 0.009190028407184251,
            "batch_time": 0.022774092229713955,
            "samples_per_second": 2236297.869984634,
            "samples_per_second_per_gpu": 279537.2337480793,
            "loss_sequences_lower_95": 5.0850655798513875,
            "loss_sequences_upper_95": 5.233844093586908,
            "loss_tokens_lower_95": 5.140362885416667,
            "loss_tokens_upper_95": 5.165086395833334,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.050402793017301,
            "data_time": 0.08108502626419067,
            "batch_time": 0.09703599555151803,
            "samples_per_second": 1070965.1594121763,
            "samples_per_second_per_gpu": 133870.64492652204,
            "loss_sequences_lower_95": 5.975505707480691,
            "loss_sequences_upper_95": 6.141452373157848,
            "loss_tokens_lower_95": 6.0255889025601475,
            "loss_tokens_upper_95": 6.07505989074707,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.691210951123919,
            "data_time": 0.013565392656759783,
            "batch_time": 0.02726783671162345,
            "samples_per_second": 2164429.4740132657,
            "samples_per_second_per_gpu": 270553.6842516582,
            "loss_sequences_lower_95": 5.591705099834297,
            "loss_sequences_upper_95": 5.791751427831177,
            "loss_tokens_lower_95": 5.676862145833334,
            "loss_tokens_upper_95": 5.7051121770833335,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.5479248519937725,
            "data_time": 0.012192334979772568,
            "batch_time": 0.026208293934663136,
            "samples_per_second": 2154367.083180367,
            "samples_per_second_per_gpu": 269295.8853975459,
            "loss_sequences_lower_95": 6.46373223377721,
            "loss_sequences_upper_95": 6.652900184490435,
            "loss_tokens_lower_95": 6.536060875,
            "loss_tokens_upper_95": 6.559483520833334,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.860185447286387,
            "data_time": 0.036320436745882034,
            "batch_time": 0.05046626552939415,
            "samples_per_second": 1913330.3889958116,
            "samples_per_second_per_gpu": 239166.29862447645,
            "loss_sequences_lower_95": 5.730571246538006,
            "loss_sequences_upper_95": 6.084614237800974,
            "loss_tokens_lower_95": 5.846037304987673,
            "loss_tokens_upper_95": 5.87506706362865,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.7318883353599945,
            "data_time": 0.0018002660032828469,
            "batch_time": 0.015270823973458015,
            "samples_per_second": 2298682.2146568573,
            "samples_per_second_per_gpu": 287335.27683210716,
            "loss_sequences_lower_95": 5.713990589615083,
            "loss_sequences_upper_95": 5.750213783871599,
            "loss_tokens_lower_95": 5.713745704849736,
            "loss_tokens_upper_95": 5.749822115684197,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.4289306323145095,
            "data_time": 0.0019087165024629824,
            "batch_time": 0.015423257829277379,
            "samples_per_second": 2290354.238787068,
            "samples_per_second_per_gpu": 286294.2798483835,
            "loss_sequences_lower_95": 4.428235950025519,
            "loss_sequences_upper_95": 4.453880576578371,
            "loss_tokens_lower_95": 4.408720811164961,
            "loss_tokens_upper_95": 4.429738028219809,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.002763860832141,
            "data_time": 0.003115134684124946,
            "batch_time": 0.01667795936304033,
            "samples_per_second": 2273949.3851288417,
            "samples_per_second_per_gpu": 284243.6731411052,
            "loss_sequences_lower_95": 7.173301002671824,
            "loss_sequences_upper_95": 7.471368036283656,
            "loss_tokens_lower_95": 6.540282923907702,
            "loss_tokens_upper_95": 6.747285042236269,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.726554883559545,
            "data_time": 0.00393484263344014,
            "batch_time": 0.017715722956555956,
            "samples_per_second": 2227648.6718705343,
            "samples_per_second_per_gpu": 278456.0839838168,
            "loss_sequences_lower_95": 6.8549792805989584,
            "loss_sequences_upper_95": 7.04700869140625,
            "loss_tokens_lower_95": 6.3816149764150945,
            "loss_tokens_upper_95": 6.510402577142295,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.195290535123535,
            "data_time": 0.0045106367827540605,
            "batch_time": 0.018368894338248183,
            "samples_per_second": 2226370.687614865,
            "samples_per_second_per_gpu": 278296.33595185814,
            "loss_sequences_lower_95": 5.227025599042243,
            "loss_sequences_upper_95": 5.295228633345574,
            "loss_tokens_lower_95": 5.103088222542187,
            "loss_tokens_upper_95": 5.139989851539058,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.97483258897608,
            "data_time": 0.022058855210031782,
            "batch_time": 0.03624171870095389,
            "samples_per_second": 2074290.8856608267,
            "samples_per_second_per_gpu": 259286.36070760334,
            "loss_sequences_lower_95": 4.938950236927379,
            "loss_sequences_upper_95": 5.051515059037642,
            "loss_tokens_lower_95": 4.91655983888441,
            "loss_tokens_upper_95": 4.977702591202258,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.026579599964375,
            "data_time": 0.01977761834859848,
            "batch_time": 0.03410499915480614,
            "samples_per_second": 1981693.261188699,
            "samples_per_second_per_gpu": 247711.65764858737,
            "loss_sequences_lower_95": 4.99185566804847,
            "loss_sequences_upper_95": 5.190408723792251,
            "loss_tokens_lower_95": 4.921751253596818,
            "loss_tokens_upper_95": 5.025283051053656,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.022444632848104,
            "data_time": 0.016655590289678328,
            "batch_time": 0.030476620564093955,
            "samples_per_second": 2051292.386447189,
            "samples_per_second_per_gpu": 256411.5483058986,
            "loss_sequences_lower_95": 4.9671240844726565,
            "loss_sequences_upper_95": 5.108799326578776,
            "loss_tokens_lower_95": 4.874026198793843,
            "loss_tokens_upper_95": 5.112927140470489,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 9.211423028126095,
            "data_time": 0.0017544661271221326,
            "batch_time": 0.015319539622849408,
            "samples_per_second": 2284393.804356921,
            "samples_per_second_per_gpu": 285549.2255446151,
            "loss_sequences_lower_95": 9.227312301621476,
            "loss_sequences_upper_95": 9.301899974164657,
            "loss_tokens_lower_95": 9.065075336554543,
            "loss_tokens_upper_95": 9.14251517033781,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.201303516975557,
            "data_time": 0.002780631284585735,
            "batch_time": 0.01672467589378357,
            "samples_per_second": 2228904.3013446713,
            "samples_per_second_per_gpu": 278613.0376680839,
            "loss_sequences_lower_95": 6.757998790805187,
            "loss_sequences_upper_95": 7.065378681015888,
            "loss_tokens_lower_95": 5.4605546249680685,
            "loss_tokens_upper_95": 5.608342613338017,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.656172767434104,
            "data_time": 0.004509739779137276,
            "batch_time": 0.01803834253066295,
            "samples_per_second": 2258083.1769265532,
            "samples_per_second_per_gpu": 282260.39711581916,
            "loss_sequences_lower_95": 6.030526837553994,
            "loss_sequences_upper_95": 6.363125214560446,
            "loss_tokens_lower_95": 5.248638728912048,
            "loss_tokens_upper_95": 5.413117433925258,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.268991718553517,
            "data_time": 0.02246891600745065,
            "batch_time": 0.03646083389009748,
            "samples_per_second": 2070208.3741057576,
            "samples_per_second_per_gpu": 258776.0467632197,
            "loss_sequences_lower_95": 5.188365612291309,
            "loss_sequences_upper_95": 5.347585766936001,
            "loss_tokens_lower_95": 5.189036441611373,
            "loss_tokens_upper_95": 5.347366458422517,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.825432472229004,
            "data_time": 0.04679367175469032,
            "batch_time": 0.06098050796068632,
            "samples_per_second": 1809699.063800833,
            "samples_per_second_per_gpu": 226212.38297510412,
            "loss_sequences_lower_95": 4.6662637252807615,
            "loss_sequences_upper_95": 5.111702720642089,
            "loss_tokens_lower_95": 4.468341402930736,
            "loss_tokens_upper_95": 4.97368927274919,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.126769027788051,
            "data_time": 0.0031447081478095493,
            "batch_time": 0.016878043703262548,
            "samples_per_second": 2257986.133767079,
            "samples_per_second_per_gpu": 282248.2667208849,
            "loss_sequences_lower_95": 6.081625694189003,
            "loss_sequences_upper_95": 6.171882009359807,
            "loss_tokens_lower_95": 6.081605353301721,
            "loss_tokens_upper_95": 6.17136574064819,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.361044313163664,
            "data_time": 0.004525854770355473,
            "batch_time": 0.01832087432774206,
            "samples_per_second": 2230284.7015863443,
            "samples_per_second_per_gpu": 278785.58769829303,
            "loss_sequences_lower_95": 6.312777382540566,
            "loss_sequences_upper_95": 6.408387940052979,
            "loss_tokens_lower_95": 6.31060131176098,
            "loss_tokens_upper_95": 6.410311972128379,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.967315247191179,
            "data_time": 0.003380771847059608,
            "batch_time": 0.016930287651572836,
            "samples_per_second": 2264545.109800834,
            "samples_per_second_per_gpu": 283068.1387251043,
            "loss_sequences_lower_95": 5.079842086974633,
            "loss_sequences_upper_95": 5.195282347703007,
            "loss_tokens_lower_95": 4.849711927629603,
            "loss_tokens_upper_95": 4.912626939191219,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.848533989906311,
            "data_time": 0.010123735293745995,
            "batch_time": 0.0242385221645236,
            "samples_per_second": 2123567.7414941164,
            "samples_per_second_per_gpu": 265445.96768676455,
            "loss_sequences_lower_95": 7.024790710449219,
            "loss_sequences_upper_95": 7.556054565429688,
            "loss_tokens_lower_95": 6.206484995450033,
            "loss_tokens_upper_95": 6.560946128133272,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.436506748199463,
            "data_time": 0.13370871543884277,
            "batch_time": 0.15078209340572357,
            "samples_per_second": 819016.2976889554,
            "samples_per_second_per_gpu": 102377.03721111943,
            "loss_sequences_lower_95": 5.058041727542877,
            "loss_sequences_upper_95": 5.958885192871094,
            "loss_tokens_lower_95": 4.8591471266472475,
            "loss_tokens_upper_95": 5.788771714835331,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.925702743146611,
            "data_time": 0.027113896735171054,
            "batch_time": 0.041128135742025174,
            "samples_per_second": 1911233.7941239458,
            "samples_per_second_per_gpu": 238904.22426549322,
            "loss_sequences_lower_95": 6.1586801112383265,
            "loss_sequences_upper_95": 6.721316984330101,
            "loss_tokens_lower_95": 5.017672134278995,
            "loss_tokens_upper_95": 5.438544974171188,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.350755036476139,
            "data_time": 0.0028938508282105127,
            "batch_time": 0.01639825767940945,
            "samples_per_second": 2270087.5966070797,
            "samples_per_second_per_gpu": 283760.94957588497,
            "loss_sequences_lower_95": 4.3175322513072025,
            "loss_sequences_upper_95": 4.383340927397335,
            "loss_tokens_lower_95": 4.318069552666967,
            "loss_tokens_upper_95": 4.382832839719173,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.778815607686071,
            "data_time": 0.0025088338367135017,
            "batch_time": 0.015994744842997066,
            "samples_per_second": 2296129.4517873875,
            "samples_per_second_per_gpu": 287016.18147342344,
            "loss_sequences_lower_95": 6.752359271146663,
            "loss_sequences_upper_95": 6.964873996337086,
            "loss_tokens_lower_95": 6.508098259347658,
            "loss_tokens_upper_95": 6.716837995294842,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.272063134790777,
            "data_time": 0.018694771660698786,
            "batch_time": 0.032966918415493436,
            "samples_per_second": 1977859.184221634,
            "samples_per_second_per_gpu": 247232.39802770424,
            "loss_sequences_lower_95": 4.1137449830443,
            "loss_sequences_upper_95": 4.510240246000744,
            "loss_tokens_lower_95": 4.024052165240518,
            "loss_tokens_upper_95": 4.3531571831998725,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.598622821312583,
            "data_time": 0.004780759662389755,
            "batch_time": 0.018350225314497946,
            "samples_per_second": 2247439.642068303,
            "samples_per_second_per_gpu": 280929.9552585379,
            "loss_sequences_lower_95": 4.627829786389478,
            "loss_sequences_upper_95": 4.76739314078532,
            "loss_tokens_lower_95": 4.462353105235649,
            "loss_tokens_upper_95": 4.613893715067658,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.556274216349532,
            "data_time": 0.030867551054273332,
            "batch_time": 0.046075494516463505,
            "samples_per_second": 1863056.398589013,
            "samples_per_second_per_gpu": 232882.0498236266,
            "loss_sequences_lower_95": 5.283833666545589,
            "loss_sequences_upper_95": 5.784441580423494,
            "loss_tokens_lower_95": 5.367759159455292,
            "loss_tokens_upper_95": 5.749441722477824,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.894802314421014,
            "data_time": 0.0021103645294988055,
            "batch_time": 0.01567193721705706,
            "samples_per_second": 2276558.3159084204,
            "samples_per_second_per_gpu": 284569.78948855255,
            "loss_sequences_lower_95": 5.874157888218269,
            "loss_sequences_upper_95": 5.915447151930386,
            "loss_tokens_lower_95": 5.873941624262352,
            "loss_tokens_upper_95": 5.915834553629476,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.625055551528931,
            "data_time": 0.04511306502602317,
            "batch_time": 0.05967814705588601,
            "samples_per_second": 1753982.3944103864,
            "samples_per_second_per_gpu": 219247.7993012983,
            "loss_sequences_lower_95": 4.452176925511036,
            "loss_sequences_upper_95": 4.8498898182100465,
            "loss_tokens_lower_95": 4.339362628042637,
            "loss_tokens_upper_95": 4.7578202715183595,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.492718495900276,
            "data_time": 0.0014648829576655385,
            "batch_time": 0.015125904354045898,
            "samples_per_second": 2269575.592955759,
            "samples_per_second_per_gpu": 283696.94911946985,
            "loss_sequences_lower_95": 6.904388716915618,
            "loss_sequences_upper_95": 6.958394322425314,
            "loss_tokens_lower_95": 5.850646433752418,
            "loss_tokens_upper_95": 5.90475709622824,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.499371351718903,
            "data_time": 0.0055035991328103205,
            "batch_time": 0.019257754560500856,
            "samples_per_second": 2221529.324131944,
            "samples_per_second_per_gpu": 277691.165516493,
            "loss_sequences_lower_95": 6.415743676757812,
            "loss_sequences_upper_95": 6.6830408203125,
            "loss_tokens_lower_95": 6.307149251302083,
            "loss_tokens_upper_95": 6.568847863948908,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.96473389915798,
            "data_time": 0.021381267046524308,
            "batch_time": 0.036533446635230116,
            "samples_per_second": 1889916.4181590725,
            "samples_per_second_per_gpu": 236239.55226988407,
            "loss_sequences_lower_95": 5.8456132839037025,
            "loss_sequences_upper_95": 6.084621980086617,
            "loss_tokens_lower_95": 5.846011564835258,
            "loss_tokens_upper_95": 6.083148312775985,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 11.283948826428615,
            "data_time": 0.004338409886302718,
            "batch_time": 0.01828773122235953,
            "samples_per_second": 2205284.442934308,
            "samples_per_second_per_gpu": 275660.5553667885,
            "loss_sequences_lower_95": 11.121886356238162,
            "loss_sequences_upper_95": 11.443715968276516,
            "loss_tokens_lower_95": 11.124613721442945,
            "loss_tokens_upper_95": 11.444526607629024,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4617034664154054,
            "data_time": 0.004042856870813572,
            "batch_time": 0.017748086376393093,
            "samples_per_second": 2254784.055709063,
            "samples_per_second_per_gpu": 281848.0069636329,
            "loss_sequences_lower_95": 3.5048901204427083,
            "loss_sequences_upper_95": 3.579744669596354,
            "loss_tokens_lower_95": 3.359817731780212,
            "loss_tokens_upper_95": 3.4511858571553624,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.619641518592834,
            "data_time": 0.023994124361446927,
            "batch_time": 0.040621629783085415,
            "samples_per_second": 1950696.9275054778,
            "samples_per_second_per_gpu": 243837.11593818472,
            "loss_sequences_lower_95": 6.282697986421131,
            "loss_sequences_upper_95": 6.961874593098958,
            "loss_tokens_lower_95": 6.275970023018973,
            "loss_tokens_upper_95": 6.9576073492141,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.1146987825632095,
            "data_time": 0.13699442148208618,
            "batch_time": 0.15392059087753296,
            "samples_per_second": 870057.7345023275,
            "samples_per_second_per_gpu": 108757.21681279094,
            "loss_sequences_lower_95": 4.892796742916107,
            "loss_sequences_upper_95": 6.125590026378632,
            "loss_tokens_lower_95": 4.663310672720683,
            "loss_tokens_upper_95": 5.246424472454897,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.818041841506957,
            "data_time": 0.0058007580893380305,
            "batch_time": 0.01960288721417624,
            "samples_per_second": 2215309.6450769333,
            "samples_per_second_per_gpu": 276913.70563461666,
            "loss_sequences_lower_95": 8.779192919921874,
            "loss_sequences_upper_95": 9.101705834960937,
            "loss_tokens_lower_95": 8.509170041673276,
            "loss_tokens_upper_95": 8.80065236357868,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.157787337303162,
            "data_time": 0.0055134594440460205,
            "batch_time": 0.019261248054958525,
            "samples_per_second": 2225369.8348741746,
            "samples_per_second_per_gpu": 278171.2293592718,
            "loss_sequences_lower_95": 8.313329296875,
            "loss_sequences_upper_95": 8.588679687499999,
            "loss_tokens_lower_95": 7.832488625516399,
            "loss_tokens_upper_95": 8.069068030757073,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.247182305611263,
            "data_time": 0.0036569931834039083,
            "batch_time": 0.017308909598002863,
            "samples_per_second": 2252831.1075654416,
            "samples_per_second_per_gpu": 281603.8884456802,
            "loss_sequences_lower_95": 6.212644160005657,
            "loss_sequences_upper_95": 6.280507609526903,
            "loss_tokens_lower_95": 6.213253242986297,
            "loss_tokens_upper_95": 6.281247610155464,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.645767671904439,
            "data_time": 0.008037506633654823,
            "batch_time": 0.021909930554761627,
            "samples_per_second": 2184367.421532175,
            "samples_per_second_per_gpu": 273045.9276915219,
            "loss_sequences_lower_95": 5.558538771481374,
            "loss_sequences_upper_95": 5.731659601214478,
            "loss_tokens_lower_95": 5.55738052860383,
            "loss_tokens_upper_95": 5.730162355390744,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 9.76409884262085,
            "data_time": 0.005776376478255741,
            "batch_time": 0.01966407989698743,
            "samples_per_second": 2211300.612872373,
            "samples_per_second_per_gpu": 276412.5766090466,
            "loss_sequences_lower_95": 9.638233325195312,
            "loss_sequences_upper_95": 9.889656420898438,
            "loss_tokens_lower_95": 9.636114379882812,
            "loss_tokens_upper_95": 9.888528833007813,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.997973584474655,
            "data_time": 0.0020697842022872376,
            "batch_time": 0.01586688628494176,
            "samples_per_second": 2245047.3151815822,
            "samples_per_second_per_gpu": 280630.9143976978,
            "loss_sequences_lower_95": 6.3203723687322615,
            "loss_sequences_upper_95": 6.400601976407285,
            "loss_tokens_lower_95": 5.575324457194045,
            "loss_tokens_upper_95": 5.637167194764962,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.3553660200602975,
            "data_time": 0.01716773680278233,
            "batch_time": 0.03296065160206386,
            "samples_per_second": 2100947.409359478,
            "samples_per_second_per_gpu": 262618.42616993474,
            "loss_sequences_lower_95": 6.212461864414499,
            "loss_sequences_upper_95": 6.496465085157707,
            "loss_tokens_lower_95": 6.212312590186276,
            "loss_tokens_upper_95": 6.496201461108763,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.334943320704442,
            "data_time": 0.010268022306263447,
            "batch_time": 0.024137171916663647,
            "samples_per_second": 2178944.078664677,
            "samples_per_second_per_gpu": 272368.00983308465,
            "loss_sequences_lower_95": 6.24033722522212,
            "loss_sequences_upper_95": 6.427313495710784,
            "loss_tokens_lower_95": 6.2405050120634185,
            "loss_tokens_upper_95": 6.426666427313113,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.189779490676721,
            "data_time": 0.002102554306503414,
            "batch_time": 0.01568915415299061,
            "samples_per_second": 2279940.6736766095,
            "samples_per_second_per_gpu": 284992.5842095762,
            "loss_sequences_lower_95": 6.438920621359451,
            "loss_sequences_upper_95": 6.522019092818333,
            "loss_tokens_lower_95": 5.786137430917073,
            "loss_tokens_upper_95": 5.860938029864905,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.890367697155665,
            "data_time": 0.02615255117416382,
            "batch_time": 0.041175805032253265,
            "samples_per_second": 1942428.9535029952,
            "samples_per_second_per_gpu": 242803.6191878744,
            "loss_sequences_lower_95": 4.797492238201161,
            "loss_sequences_upper_95": 4.985002798252005,
            "loss_tokens_lower_95": 4.797027329540757,
            "loss_tokens_upper_95": 4.989667684565146,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.537892521557822,
            "data_time": 0.0034157753864050783,
            "batch_time": 0.017194918369344447,
            "samples_per_second": 2235522.7113401787,
            "samples_per_second_per_gpu": 279440.33891752234,
            "loss_sequences_lower_95": 8.51828410204033,
            "loss_sequences_upper_95": 8.55774502162175,
            "loss_tokens_lower_95": 8.518700932960627,
            "loss_tokens_upper_95": 8.557315289325306,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.076620071837046,
            "data_time": 0.021074290709062055,
            "batch_time": 0.0350678877397017,
            "samples_per_second": 1960052.226545831,
            "samples_per_second_per_gpu": 245006.52831822887,
            "loss_sequences_lower_95": 5.927039788996131,
            "loss_sequences_upper_95": 6.224489912014564,
            "loss_tokens_lower_95": 5.929084437101791,
            "loss_tokens_upper_95": 6.223638738243325,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.434583274523417,
            "data_time": 0.07675521820783615,
            "batch_time": 0.09231150895357132,
            "samples_per_second": 1424742.1032125861,
            "samples_per_second_per_gpu": 178092.76290157327,
            "loss_sequences_lower_95": 5.027767499287924,
            "loss_sequences_upper_95": 5.985905952453614,
            "loss_tokens_lower_95": 4.615115822686089,
            "loss_tokens_upper_95": 6.00051352183024,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.7969890356063845,
            "data_time": 0.0743827074766159,
            "batch_time": 0.08933480083942413,
            "samples_per_second": 1422402.3390863733,
            "samples_per_second_per_gpu": 177800.29238579667,
            "loss_sequences_lower_95": 4.414948476155599,
            "loss_sequences_upper_95": 5.522837816874186,
            "loss_tokens_lower_95": 3.849380716045251,
            "loss_tokens_upper_95": 5.250990981198429,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.392198244075185,
            "data_time": 0.003688231856069402,
            "batch_time": 0.01736296591441585,
            "samples_per_second": 2255238.25885004,
            "samples_per_second_per_gpu": 281904.782356255,
            "loss_sequences_lower_95": 8.37296337099595,
            "loss_sequences_upper_95": 8.411774007041606,
            "loss_tokens_lower_95": 8.372800548255707,
            "loss_tokens_upper_95": 8.411722690767672,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.861512589375574,
            "data_time": 0.0015484083288999303,
            "batch_time": 0.015224564209156537,
            "samples_per_second": 2263176.2886165655,
            "samples_per_second_per_gpu": 282897.0360770707,
            "loss_sequences_lower_95": 5.235349679229638,
            "loss_sequences_upper_95": 5.2721857930358,
            "loss_tokens_lower_95": 4.398732251476465,
            "loss_tokens_upper_95": 4.433093691427662,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.115215312777542,
            "data_time": 0.036669231951236725,
            "batch_time": 0.0518740750849247,
            "samples_per_second": 1890343.5436759046,
            "samples_per_second_per_gpu": 236292.94295948808,
            "loss_sequences_lower_95": 7.111499359851747,
            "loss_sequences_upper_95": 7.512907037209338,
            "loss_tokens_lower_95": 6.840222045028862,
            "loss_tokens_upper_95": 7.160543576612779,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 10.764961526200578,
            "data_time": 0.11364122799464635,
            "batch_time": 0.12971921194167363,
            "samples_per_second": 978056.4800600891,
            "samples_per_second_per_gpu": 122257.06000751114,
            "loss_sequences_lower_95": 10.302785512563345,
            "loss_sequences_upper_95": 11.4288967648068,
            "loss_tokens_lower_95": 9.615329846040702,
            "loss_tokens_upper_95": 11.665295843430506,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.09429281223111,
            "data_time": 0.028641050770169214,
            "batch_time": 0.044473670777820405,
            "samples_per_second": 1828722.5854724057,
            "samples_per_second_per_gpu": 228590.3231840507,
            "loss_sequences_lower_95": 7.035867421219988,
            "loss_sequences_upper_95": 7.350683221584413,
            "loss_tokens_lower_95": 6.775933376918629,
            "loss_tokens_upper_95": 7.046500078847771,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.141299369858532,
            "data_time": 0.03126615285873413,
            "batch_time": 0.04564105045227777,
            "samples_per_second": 1949307.6290007734,
            "samples_per_second_per_gpu": 243663.45362509668,
            "loss_sequences_lower_95": 7.083690941043017,
            "loss_sequences_upper_95": 7.3668769836425785,
            "loss_tokens_lower_95": 6.868788171488361,
            "loss_tokens_upper_95": 7.091951026885188,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.233241854644403,
            "data_time": 0.02975042661031087,
            "batch_time": 0.04441518726803008,
            "samples_per_second": 1929688.2165627703,
            "samples_per_second_per_gpu": 241211.0270703463,
            "loss_sequences_lower_95": 7.2187891890363,
            "loss_sequences_upper_95": 7.627893531613233,
            "loss_tokens_lower_95": 6.815021331067603,
            "loss_tokens_upper_95": 7.161541604145701,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.218629331123538,
            "data_time": 0.029161529881613597,
            "batch_time": 0.043360380899338496,
            "samples_per_second": 1972171.0359709447,
            "samples_per_second_per_gpu": 246521.37949636808,
            "loss_sequences_lower_95": 7.146819249595084,
            "loss_sequences_upper_95": 7.417512214474561,
            "loss_tokens_lower_95": 6.970720674984181,
            "loss_tokens_upper_95": 7.172652523242796,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.531678350815862,
            "data_time": 0.030750569002127942,
            "batch_time": 0.04552858258471077,
            "samples_per_second": 1945802.0964832997,
            "samples_per_second_per_gpu": 243225.26206041247,
            "loss_sequences_lower_95": 6.451285889430076,
            "loss_sequences_upper_95": 6.626388246524408,
            "loss_tokens_lower_95": 6.390935980458468,
            "loss_tokens_upper_95": 6.546183402950424,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.122937068706605,
            "data_time": 0.030394060271126882,
            "batch_time": 0.0450413567679269,
            "samples_per_second": 1894731.9515639555,
            "samples_per_second_per_gpu": 236841.49394549444,
            "loss_sequences_lower_95": 6.103968904076553,
            "loss_sequences_upper_95": 6.329311054508861,
            "loss_tokens_lower_95": 5.883617036806812,
            "loss_tokens_upper_95": 6.024653276164105,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-0.25/params.txt",
    "uuid": "3779b2c9-e9f4-4c5c-88c8-63b04ee073de",
    "creation_date": "2023_12_14-05_01_09"
}