{
    "name": "rw_original-d=512_l=8_h=4-8.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 12626247680,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 8.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "2525249536",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=512_l=8_h=4-8.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.6824473261833193,
            "data_time": 0.03484451398253441,
            "batch_time": 0.3308454863727093,
            "samples_per_second": 1716800.934680181,
            "samples_per_second_per_gpu": 214600.11683502264,
            "loss_sequences_lower_95": 3.6012432734171553,
            "loss_sequences_upper_95": 3.7644697634379067,
            "loss_tokens_lower_95": 3.6681785583496094,
            "loss_tokens_upper_95": 3.6967527516682943,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.529766943299376,
            "data_time": 0.0015749446286051885,
            "batch_time": 0.015569826155537042,
            "samples_per_second": 2219669.349047901,
            "samples_per_second_per_gpu": 277458.66863098764,
            "loss_sequences_lower_95": 3.527370110604242,
            "loss_sequences_upper_95": 3.532122850342357,
            "loss_tokens_lower_95": 3.5189374062499996,
            "loss_tokens_upper_95": 3.5404671562499996,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.099422949674178,
            "data_time": 0.009958113670349121,
            "batch_time": 0.023863619804382324,
            "samples_per_second": 2186143.632399253,
            "samples_per_second_per_gpu": 273267.95404990664,
            "loss_sequences_lower_95": 3.0406597090740592,
            "loss_sequences_upper_95": 3.1738194742008132,
            "loss_tokens_lower_95": 3.086380119791667,
            "loss_tokens_upper_95": 3.1125842239583332,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6565827181413,
            "data_time": 0.0016576252681644338,
            "batch_time": 0.01502576411554688,
            "samples_per_second": 2327359.354542883,
            "samples_per_second_per_gpu": 290919.9193178604,
            "loss_sequences_lower_95": 3.614307737999356,
            "loss_sequences_upper_95": 3.7007327239046393,
            "loss_tokens_lower_95": 3.6437091458333333,
            "loss_tokens_upper_95": 3.6691660208333334,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6008881332918,
            "data_time": 0.010011732815746292,
            "batch_time": 0.023705382746054358,
            "samples_per_second": 2190398.742596215,
            "samples_per_second_per_gpu": 273799.8428245269,
            "loss_sequences_lower_95": 3.5387247928533925,
            "loss_sequences_upper_95": 3.6818399650744653,
            "loss_tokens_lower_95": 3.5892770729166665,
            "loss_tokens_upper_95": 3.6122185729166665,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.758184067980924,
            "data_time": 0.003976975446162017,
            "batch_time": 0.017498532391112785,
            "samples_per_second": 2307126.959431874,
            "samples_per_second_per_gpu": 288390.86992898426,
            "loss_sequences_lower_95": 3.706823182770543,
            "loss_sequences_upper_95": 3.814948534625149,
            "loss_tokens_lower_95": 3.7454100833333333,
            "loss_tokens_upper_95": 3.7708246145833333,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5490552335855914,
            "data_time": 0.0016086053887160335,
            "batch_time": 0.015004550263309945,
            "samples_per_second": 2332344.4937250577,
            "samples_per_second_per_gpu": 291543.0617156322,
            "loss_sequences_lower_95": 3.5152268016581636,
            "loss_sequences_upper_95": 3.582550422512755,
            "loss_tokens_lower_95": 3.5325998229166666,
            "loss_tokens_upper_95": 3.56600153125,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.066665707633133,
            "data_time": 0.0016649601998961355,
            "batch_time": 0.015149713908713615,
            "samples_per_second": 2316225.9562546965,
            "samples_per_second_per_gpu": 289528.24453183706,
            "loss_sequences_lower_95": 4.039979558655104,
            "loss_sequences_upper_95": 4.095505071989528,
            "loss_tokens_lower_95": 4.054655791666667,
            "loss_tokens_upper_95": 4.078543874999999,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7138043075073055,
            "data_time": 0.009809572545308915,
            "batch_time": 0.023538836411067417,
            "samples_per_second": 2182484.4919828973,
            "samples_per_second_per_gpu": 272810.56149786216,
            "loss_sequences_lower_95": 3.6191940431672385,
            "loss_sequences_upper_95": 3.8286669351221096,
            "loss_tokens_lower_95": 3.7017151354166664,
            "loss_tokens_upper_95": 3.7259869375,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.82903021503343,
            "data_time": 0.01045981328934431,
            "batch_time": 0.024459250271320343,
            "samples_per_second": 2176833.224833041,
            "samples_per_second_per_gpu": 272104.1531041301,
            "loss_sequences_lower_95": 4.703527795844399,
            "loss_sequences_upper_95": 4.986185271372437,
            "loss_tokens_lower_95": 4.815290947916666,
            "loss_tokens_upper_95": 4.84272334375,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.77083134072863,
            "data_time": 0.0014020896800807745,
            "batch_time": 0.014819321223369002,
            "samples_per_second": 2332456.914105621,
            "samples_per_second_per_gpu": 291557.11426320265,
            "loss_sequences_lower_95": 3.7568676486805366,
            "loss_sequences_upper_95": 3.7853458746959365,
            "loss_tokens_lower_95": 3.7590126041666667,
            "loss_tokens_upper_95": 3.782638104166667,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6387925432007395,
            "data_time": 0.0026402145897121256,
            "batch_time": 0.017870958401301224,
            "samples_per_second": 2328998.2072853656,
            "samples_per_second_per_gpu": 291124.7759106707,
            "loss_sequences_lower_95": 3.6099799522009057,
            "loss_sequences_upper_95": 3.66918700256729,
            "loss_tokens_lower_95": 3.6269720520833335,
            "loss_tokens_upper_95": 3.6507992812500003,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.125121158228443,
            "data_time": 0.01049924650682291,
            "batch_time": 0.02410850506055025,
            "samples_per_second": 2200205.409113574,
            "samples_per_second_per_gpu": 275025.67613919673,
            "loss_sequences_lower_95": 4.029476637772565,
            "loss_sequences_upper_95": 4.24319528575843,
            "loss_tokens_lower_95": 4.111698697916667,
            "loss_tokens_upper_95": 4.138357249999999,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.375196570534327,
            "data_time": 0.010621464109990701,
            "batch_time": 0.024227043547003394,
            "samples_per_second": 2212210.9682375016,
            "samples_per_second_per_gpu": 276526.3710296877,
            "loss_sequences_lower_95": 3.283939344479936,
            "loss_sequences_upper_95": 3.4825105500075577,
            "loss_tokens_lower_95": 3.3632150625,
            "loss_tokens_upper_95": 3.387481578125,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.431085483594374,
            "data_time": 0.0836767383984157,
            "batch_time": 0.10137037719999041,
            "samples_per_second": 1085484.4177931338,
            "samples_per_second_per_gpu": 135685.55222414172,
            "loss_sequences_lower_95": 4.3452449971979314,
            "loss_sequences_upper_95": 4.5411957307295365,
            "loss_tokens_lower_95": 4.407239324396307,
            "loss_tokens_upper_95": 4.455508050051603,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.709682397175113,
            "data_time": 0.014010674574158409,
            "batch_time": 0.028298109769821167,
            "samples_per_second": 2073123.6082967566,
            "samples_per_second_per_gpu": 259140.45103709458,
            "loss_sequences_lower_95": 3.641384469524417,
            "loss_sequences_upper_95": 3.7775434755375366,
            "loss_tokens_lower_95": 3.6961953750000003,
            "loss_tokens_upper_95": 3.7228612812499997,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.556486646858557,
            "data_time": 0.013367041945457458,
            "batch_time": 0.027367718517780304,
            "samples_per_second": 2165066.7963942247,
            "samples_per_second_per_gpu": 270633.3495492781,
            "loss_sequences_lower_95": 5.461473900787434,
            "loss_sequences_upper_95": 5.6838492924431065,
            "loss_tokens_lower_95": 5.544631166666667,
            "loss_tokens_upper_95": 5.568297677083333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.07184190437442,
            "data_time": 0.036785922944545746,
            "batch_time": 0.05187295004725456,
            "samples_per_second": 1841250.2081596283,
            "samples_per_second_per_gpu": 230156.27601995354,
            "loss_sequences_lower_95": 3.905670322355677,
            "loss_sequences_upper_95": 4.367448062584048,
            "loss_tokens_lower_95": 4.057400149986392,
            "loss_tokens_upper_95": 4.086486109749216,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.095667736802986,
            "data_time": 0.0019384385988218094,
            "batch_time": 0.015591448151712265,
            "samples_per_second": 2265320.3144386495,
            "samples_per_second_per_gpu": 283165.0393048312,
            "loss_sequences_lower_95": 4.0780532356613195,
            "loss_sequences_upper_95": 4.113300764559002,
            "loss_tokens_lower_95": 4.077737094241828,
            "loss_tokens_upper_95": 4.113463682612876,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.1868924598631168,
            "data_time": 0.002201349919389008,
            "batch_time": 0.015904066716409794,
            "samples_per_second": 2253154.2921710787,
            "samples_per_second_per_gpu": 281644.28652138484,
            "loss_sequences_lower_95": 3.183580349251892,
            "loss_sequences_upper_95": 3.2090439052464026,
            "loss_tokens_lower_95": 3.1669547213482723,
            "loss_tokens_upper_95": 3.1863118381688724,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.961540404233833,
            "data_time": 0.003126917190983203,
            "batch_time": 0.01697472914786523,
            "samples_per_second": 2231942.0421223575,
            "samples_per_second_per_gpu": 278992.7552652947,
            "loss_sequences_lower_95": 5.222712020621753,
            "loss_sequences_upper_95": 5.528817104651335,
            "loss_tokens_lower_95": 4.395265428617167,
            "loss_tokens_upper_95": 4.616383144768585,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.013634593447049,
            "data_time": 0.004049394200456903,
            "batch_time": 0.01779260090056886,
            "samples_per_second": 2230451.438253887,
            "samples_per_second_per_gpu": 278806.4297817359,
            "loss_sequences_lower_95": 5.146265315755208,
            "loss_sequences_upper_95": 5.346291178385417,
            "loss_tokens_lower_95": 4.68011882124607,
            "loss_tokens_upper_95": 4.820891804245282,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.308238128354099,
            "data_time": 0.004592746631052699,
            "batch_time": 0.018252819012372922,
            "samples_per_second": 2236754.186898144,
            "samples_per_second_per_gpu": 279594.273362268,
            "loss_sequences_lower_95": 3.357009584601853,
            "loss_sequences_upper_95": 3.4212304058176173,
            "loss_tokens_lower_95": 3.209286509756235,
            "loss_tokens_upper_95": 3.2407556998757787,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.680432158166712,
            "data_time": 0.02371617087296077,
            "batch_time": 0.038366345422608514,
            "samples_per_second": 1987034.1911556101,
            "samples_per_second_per_gpu": 248379.27389445127,
            "loss_sequences_lower_95": 2.6558799466219813,
            "loss_sequences_upper_95": 2.7691744995117187,
            "loss_tokens_lower_95": 2.6090980628968055,
            "loss_tokens_upper_95": 2.6587171375843996,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5588227680751254,
            "data_time": 0.020552974194288254,
            "batch_time": 0.03487612120807171,
            "samples_per_second": 1985287.7323743622,
            "samples_per_second_per_gpu": 248160.96654679527,
            "loss_sequences_lower_95": 3.544291101572465,
            "loss_sequences_upper_95": 3.7334365844726562,
            "loss_tokens_lower_95": 3.4327163512398444,
            "loss_tokens_upper_95": 3.5259177505606805,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7932381121317547,
            "data_time": 0.017119780564919498,
            "batch_time": 0.03115006746389927,
            "samples_per_second": 2048753.7608841485,
            "samples_per_second_per_gpu": 256094.22011051857,
            "loss_sequences_lower_95": 3.766126963297526,
            "loss_sequences_upper_95": 3.8634801228841145,
            "loss_tokens_lower_95": 3.6611502856053,
            "loss_tokens_upper_95": 3.8640228384652158,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.065969960469543,
            "data_time": 0.001818116707731896,
            "batch_time": 0.015497756912393629,
            "samples_per_second": 2259642.69840313,
            "samples_per_second_per_gpu": 282455.3373003912,
            "loss_sequences_lower_95": 7.085289655713302,
            "loss_sequences_upper_95": 7.164922217164509,
            "loss_tokens_lower_95": 6.906662536880644,
            "loss_tokens_upper_95": 6.990406702422476,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.8173293616835915,
            "data_time": 0.002802463985929553,
            "batch_time": 0.016532228497050754,
            "samples_per_second": 2244770.3284970312,
            "samples_per_second_per_gpu": 280596.2910621289,
            "loss_sequences_lower_95": 5.413571752683081,
            "loss_sequences_upper_95": 5.740379852397674,
            "loss_tokens_lower_95": 4.0278052001731375,
            "loss_tokens_upper_95": 4.168687889383231,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.348588485766596,
            "data_time": 0.004982416291494627,
            "batch_time": 0.018789373539589548,
            "samples_per_second": 2208355.187390529,
            "samples_per_second_per_gpu": 276044.39842381614,
            "loss_sequences_lower_95": 4.824666118947312,
            "loss_sequences_upper_95": 5.19116102615721,
            "loss_tokens_lower_95": 3.888640362818651,
            "loss_tokens_upper_95": 4.047071242109349,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.573763788562932,
            "data_time": 0.024498102920395986,
            "batch_time": 0.03910969410623823,
            "samples_per_second": 1967828.251077669,
            "samples_per_second_per_gpu": 245978.53138470862,
            "loss_sequences_lower_95": 5.469448504164882,
            "loss_sequences_upper_95": 5.677981915756992,
            "loss_tokens_lower_95": 5.471983205020155,
            "loss_tokens_upper_95": 5.672391276599065,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5395541405677795,
            "data_time": 0.05053678842691275,
            "batch_time": 0.06593352556228638,
            "samples_per_second": 1728775.6190092512,
            "samples_per_second_per_gpu": 216096.9523761564,
            "loss_sequences_lower_95": 3.398001289367676,
            "loss_sequences_upper_95": 3.7533366165161133,
            "loss_tokens_lower_95": 3.243088341782899,
            "loss_tokens_upper_95": 3.6984634535896967,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.885511056915642,
            "data_time": 0.003379822021125528,
            "batch_time": 0.017055318155659246,
            "samples_per_second": 2254384.8268487905,
            "samples_per_second_per_gpu": 281798.1033560988,
            "loss_sequences_lower_95": 4.838573081759372,
            "loss_sequences_upper_95": 4.932667252570848,
            "loss_tokens_lower_95": 4.8381467227682,
            "loss_tokens_upper_95": 4.932388614900525,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.01994158653428,
            "data_time": 0.004942780023297906,
            "batch_time": 0.018861390249180755,
            "samples_per_second": 2212792.3262831895,
            "samples_per_second_per_gpu": 276599.0407853987,
            "loss_sequences_lower_95": 4.964242596200604,
            "loss_sequences_upper_95": 5.074434477535064,
            "loss_tokens_lower_95": 4.96335669165259,
            "loss_tokens_upper_95": 5.076594922114942,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.682550589459765,
            "data_time": 0.003644841793546409,
            "batch_time": 0.017616782538576722,
            "samples_per_second": 2195770.094233997,
            "samples_per_second_per_gpu": 274471.26177924965,
            "loss_sequences_lower_95": 3.831242488491652,
            "loss_sequences_upper_95": 3.962371354627142,
            "loss_tokens_lower_95": 3.514626930586498,
            "loss_tokens_upper_95": 3.5747785066584896,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.592865469455719,
            "data_time": 0.010559843853116035,
            "batch_time": 0.02449140138924122,
            "samples_per_second": 2123382.615508303,
            "samples_per_second_per_gpu": 265422.8269385379,
            "loss_sequences_lower_95": 5.786667553710937,
            "loss_sequences_upper_95": 6.3337087768554685,
            "loss_tokens_lower_95": 4.997662438342778,
            "loss_tokens_upper_95": 5.357815779983247,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.051924645900726,
            "data_time": 0.1551232486963272,
            "batch_time": 0.1722358912229538,
            "samples_per_second": 997979.5724121977,
            "samples_per_second_per_gpu": 124747.44655152471,
            "loss_sequences_lower_95": 3.82324765920639,
            "loss_sequences_upper_95": 4.33671487569809,
            "loss_tokens_lower_95": 3.6167757143919497,
            "loss_tokens_upper_95": 4.378855475063982,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.2887189511595105,
            "data_time": 0.029340733873083235,
            "batch_time": 0.043613279119450996,
            "samples_per_second": 1856828.067687308,
            "samples_per_second_per_gpu": 232103.5084609135,
            "loss_sequences_lower_95": 4.5369239193269575,
            "loss_sequences_upper_95": 5.09584451434256,
            "loss_tokens_lower_95": 3.41420001833119,
            "loss_tokens_upper_95": 3.7706651276907763,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.5034294860651527,
            "data_time": 0.0030475337472226885,
            "batch_time": 0.016635049962335162,
            "samples_per_second": 2253522.8337322604,
            "samples_per_second_per_gpu": 281690.35421653255,
            "loss_sequences_lower_95": 2.4759681791653017,
            "loss_sequences_upper_95": 2.5303981529359163,
            "loss_tokens_lower_95": 2.4752298910891763,
            "loss_tokens_upper_95": 2.531059476453282,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.2889165099887436,
            "data_time": 0.0027716602056027384,
            "batch_time": 0.016376031106601727,
            "samples_per_second": 2268398.2787900246,
            "samples_per_second_per_gpu": 283549.7848487531,
            "loss_sequences_lower_95": 3.261697098095163,
            "loss_sequences_upper_95": 3.4293331194358867,
            "loss_tokens_lower_95": 3.105794962105245,
            "loss_tokens_upper_95": 3.268838746852784,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.3775459348937096,
            "data_time": 0.01863445672723982,
            "batch_time": 0.03309714959727393,
            "samples_per_second": 1956315.8330958937,
            "samples_per_second_per_gpu": 244539.47913698672,
            "loss_sequences_lower_95": 3.23411128564632,
            "loss_sequences_upper_95": 3.6568673270089285,
            "loss_tokens_lower_95": 3.115054566897693,
            "loss_tokens_upper_95": 3.4142557008303944,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7308448788304918,
            "data_time": 0.00486181378364563,
            "batch_time": 0.018585507199168204,
            "samples_per_second": 2219450.2901072,
            "samples_per_second_per_gpu": 277431.2862634,
            "loss_sequences_lower_95": 3.7614942639478097,
            "loss_sequences_upper_95": 3.9095752165086077,
            "loss_tokens_lower_95": 3.5898020216107747,
            "loss_tokens_upper_95": 3.733929768592936,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.0068313523036676,
            "data_time": 0.03197615487234933,
            "batch_time": 0.04628514959698632,
            "samples_per_second": 1963813.2863700886,
            "samples_per_second_per_gpu": 245476.66079626107,
            "loss_sequences_lower_95": 2.84296676356618,
            "loss_sequences_upper_95": 3.3351333432081267,
            "loss_tokens_lower_95": 2.7349194574900593,
            "loss_tokens_upper_95": 3.0947441810495895,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.460164221126238,
            "data_time": 0.002278339800908953,
            "batch_time": 0.01601250448847979,
            "samples_per_second": 2245184.2727739974,
            "samples_per_second_per_gpu": 280648.0340967497,
            "loss_sequences_lower_95": 5.452567857321465,
            "loss_sequences_upper_95": 5.46755410652443,
            "loss_tokens_lower_95": 5.452762896329266,
            "loss_tokens_upper_95": 5.46763353647292,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.3372680353886872,
            "data_time": 0.04609579606489702,
            "batch_time": 0.06187861615961248,
            "samples_per_second": 1625910.932933518,
            "samples_per_second_per_gpu": 203238.86661668975,
            "loss_sequences_lower_95": 1.2861309570016213,
            "loss_sequences_upper_95": 1.4566131517725083,
            "loss_tokens_lower_95": 1.1406427295571584,
            "loss_tokens_upper_95": 1.4129205397232039,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.813685796014668,
            "data_time": 0.0016887386681547908,
            "batch_time": 0.015425729381868246,
            "samples_per_second": 2244921.849334411,
            "samples_per_second_per_gpu": 280615.23116680136,
            "loss_sequences_lower_95": 5.167890461215933,
            "loss_sequences_upper_95": 5.2101010547693924,
            "loss_tokens_lower_95": 4.264953469535783,
            "loss_tokens_upper_95": 4.308040667311412,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.372680925369263,
            "data_time": 0.006029520242933243,
            "batch_time": 0.019784144465885466,
            "samples_per_second": 2211677.0809575208,
            "samples_per_second_per_gpu": 276459.6351196901,
            "loss_sequences_lower_95": 5.363340698242188,
            "loss_sequences_upper_95": 5.530679455566406,
            "loss_tokens_lower_95": 5.215517742395954,
            "loss_tokens_upper_95": 5.382787311787026,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.8415729553803155,
            "data_time": 0.022798718032190354,
            "batch_time": 0.037276843846854514,
            "samples_per_second": 1965638.3329089647,
            "samples_per_second_per_gpu": 245704.7916136206,
            "loss_sequences_lower_95": 4.68140869140625,
            "loss_sequences_upper_95": 5.006259314495584,
            "loss_tokens_lower_95": 4.682929899796196,
            "loss_tokens_upper_95": 5.000692032523777,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.9458320108327,
            "data_time": 0.004709520971918681,
            "batch_time": 0.018477108464183576,
            "samples_per_second": 2222508.003721523,
            "samples_per_second_per_gpu": 277813.5004651904,
            "loss_sequences_lower_95": 5.871088081128669,
            "loss_sequences_upper_95": 6.018202033765388,
            "loss_tokens_lower_95": 5.871194911147609,
            "loss_tokens_upper_95": 6.020807587594697,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.4186988501946132,
            "data_time": 0.004382792939531042,
            "batch_time": 0.01825125991029942,
            "samples_per_second": 2218997.9332696334,
            "samples_per_second_per_gpu": 277374.7416587042,
            "loss_sequences_lower_95": 1.4577220865885416,
            "loss_sequences_upper_95": 1.5105982177734376,
            "loss_tokens_lower_95": 1.3386653020583232,
            "loss_tokens_upper_95": 1.4078000614308224,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.211650797298977,
            "data_time": 0.024073909435953413,
            "batch_time": 0.03875720926693508,
            "samples_per_second": 1862801.3262063998,
            "samples_per_second_per_gpu": 232850.16577579998,
            "loss_sequences_lower_95": 5.88899171375093,
            "loss_sequences_upper_95": 6.53438739594959,
            "loss_tokens_lower_95": 5.88331790015811,
            "loss_tokens_upper_95": 6.534579758417038,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.3412178494036198,
            "data_time": 0.15685445070266724,
            "batch_time": 0.17384785413742065,
            "samples_per_second": 917507.9391330003,
            "samples_per_second_per_gpu": 114688.49239162504,
            "loss_sequences_lower_95": 2.1545684933662415,
            "loss_sequences_upper_95": 3.1016173899173736,
            "loss_tokens_lower_95": 1.7991243185456267,
            "loss_tokens_upper_95": 2.299525115022954,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.427787060022355,
            "data_time": 0.00600504165604001,
            "batch_time": 0.020070170126264057,
            "samples_per_second": 2167010.197984772,
            "samples_per_second_per_gpu": 270876.2747480965,
            "loss_sequences_lower_95": 7.355676086425781,
            "loss_sequences_upper_95": 7.711840844726563,
            "loss_tokens_lower_95": 7.143913503992175,
            "loss_tokens_upper_95": 7.455141122369396,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.397865511894226,
            "data_time": 0.006012289770065792,
            "batch_time": 0.01984331248298524,
            "samples_per_second": 2217651.1113609504,
            "samples_per_second_per_gpu": 277206.3889201188,
            "loss_sequences_lower_95": 7.521987487792969,
            "loss_sequences_upper_95": 7.750337219238281,
            "loss_tokens_lower_95": 7.122653633770969,
            "loss_tokens_upper_95": 7.337804533949361,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.91129104858511,
            "data_time": 0.004211569948738634,
            "batch_time": 0.017873120228183707,
            "samples_per_second": 2248261.2321970444,
            "samples_per_second_per_gpu": 281032.65402463055,
            "loss_sequences_lower_95": 5.889965108269779,
            "loss_sequences_upper_95": 5.932541710971547,
            "loss_tokens_lower_95": 5.890686432099606,
            "loss_tokens_upper_95": 5.9320841212181525,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.510001589808779,
            "data_time": 0.008748691967967053,
            "batch_time": 0.02248264295456993,
            "samples_per_second": 2190697.10139173,
            "samples_per_second_per_gpu": 273837.13767396624,
            "loss_sequences_lower_95": 4.402066316619264,
            "loss_sequences_upper_95": 4.613451023365495,
            "loss_tokens_lower_95": 4.402257213211645,
            "loss_tokens_upper_95": 4.611033346759193,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.594444887638092,
            "data_time": 0.005935509999593099,
            "batch_time": 0.019864352449538217,
            "samples_per_second": 2192986.7656724644,
            "samples_per_second_per_gpu": 274123.34570905805,
            "loss_sequences_lower_95": 6.4965141479492186,
            "loss_sequences_upper_95": 6.691786364746093,
            "loss_tokens_lower_95": 6.496986145019531,
            "loss_tokens_upper_95": 6.690095349121094,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.1481874473160354,
            "data_time": 0.0021141952287497276,
            "batch_time": 0.015742295856511886,
            "samples_per_second": 2266735.3502878295,
            "samples_per_second_per_gpu": 283341.9187859787,
            "loss_sequences_lower_95": 3.6151318497960028,
            "loss_sequences_upper_95": 3.694781654668283,
            "loss_tokens_lower_95": 2.5659328985798,
            "loss_tokens_upper_95": 2.622743771128375,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.117236829515713,
            "data_time": 0.020542115824563164,
            "batch_time": 0.03497664758137294,
            "samples_per_second": 1979673.3444830708,
            "samples_per_second_per_gpu": 247459.16806038385,
            "loss_sequences_lower_95": 4.959207130546001,
            "loss_sequences_upper_95": 5.279067947615439,
            "loss_tokens_lower_95": 4.958083104375583,
            "loss_tokens_upper_95": 5.277628497223356,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.875328199536193,
            "data_time": 0.01175172533839941,
            "batch_time": 0.026069439947605133,
            "samples_per_second": 2104535.904239881,
            "samples_per_second_per_gpu": 263066.9880299851,
            "loss_sequences_lower_95": 4.752139736998315,
            "loss_sequences_upper_95": 4.9964117311963845,
            "loss_tokens_lower_95": 4.753481349571079,
            "loss_tokens_upper_95": 4.993282710056679,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.025805065450581,
            "data_time": 0.0023303534514190374,
            "batch_time": 0.015941047704228762,
            "samples_per_second": 2267086.5617888,
            "samples_per_second_per_gpu": 283385.8202236,
            "loss_sequences_lower_95": 4.524689097633252,
            "loss_sequences_upper_95": 4.617952308812477,
            "loss_tokens_lower_95": 3.3256431843906955,
            "loss_tokens_upper_95": 3.4012530302544266,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.2527940462506,
            "data_time": 0.030256283779939015,
            "batch_time": 0.044694689412911735,
            "samples_per_second": 1992832.797922141,
            "samples_per_second_per_gpu": 249104.09974026761,
            "loss_sequences_lower_95": 6.1651374453590035,
            "loss_sequences_upper_95": 6.335394319403108,
            "loss_tokens_lower_95": 6.166327840815145,
            "loss_tokens_upper_95": 6.335806952582465,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.400040472100634,
            "data_time": 0.0038541760636773303,
            "batch_time": 0.017600566476255984,
            "samples_per_second": 2237849.7399514667,
            "samples_per_second_per_gpu": 279731.21749393333,
            "loss_sequences_lower_95": 4.3648631617928135,
            "loss_sequences_upper_95": 4.433680468451357,
            "loss_tokens_lower_95": 4.366807954654052,
            "loss_tokens_upper_95": 4.433484394411793,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.322064343008023,
            "data_time": 0.02518618106842041,
            "batch_time": 0.03929308977994052,
            "samples_per_second": 1945259.1167728957,
            "samples_per_second_per_gpu": 243157.38959661196,
            "loss_sequences_lower_95": 5.146595690088365,
            "loss_sequences_upper_95": 5.5019931830248785,
            "loss_tokens_lower_95": 5.143674491215679,
            "loss_tokens_upper_95": 5.500113596499545,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5363547623157503,
            "data_time": 0.08576637506484985,
            "batch_time": 0.1020033210515976,
            "samples_per_second": 1348512.1193003363,
            "samples_per_second_per_gpu": 168564.01491254204,
            "loss_sequences_lower_95": 3.258635075887044,
            "loss_sequences_upper_95": 4.018945109049479,
            "loss_tokens_lower_95": 2.891804091135661,
            "loss_tokens_upper_95": 3.819487826029459,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.1441068251927695,
            "data_time": 0.0823449119925499,
            "batch_time": 0.09846881031990051,
            "samples_per_second": 1371925.7980109705,
            "samples_per_second_per_gpu": 171490.7247513713,
            "loss_sequences_lower_95": 2.9905878829956056,
            "loss_sequences_upper_95": 3.722697912851969,
            "loss_tokens_lower_95": 2.425214908899886,
            "loss_tokens_upper_95": 3.442077490988742,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.250048418775398,
            "data_time": 0.0035849112914266693,
            "batch_time": 0.01748329909708024,
            "samples_per_second": 2214441.620158953,
            "samples_per_second_per_gpu": 276805.20251986914,
            "loss_sequences_lower_95": 4.232173132018594,
            "loss_sequences_upper_95": 4.268417630764452,
            "loss_tokens_lower_95": 4.232149832589286,
            "loss_tokens_upper_95": 4.2683044056056705,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 0.6601287658000368,
            "data_time": 0.0015289116104661245,
            "batch_time": 0.015131486331151403,
            "samples_per_second": 2272341.887061624,
            "samples_per_second_per_gpu": 284042.735882703,
            "loss_sequences_lower_95": 0.7703272339192753,
            "loss_sequences_upper_95": 0.7911159826076644,
            "loss_tokens_lower_95": 0.54162734123581,
            "loss_tokens_upper_95": 0.552188951875641,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.168189330363837,
            "data_time": 0.04966992139816284,
            "batch_time": 0.06538516655564308,
            "samples_per_second": 1833999.915880109,
            "samples_per_second_per_gpu": 229249.98948501362,
            "loss_sequences_lower_95": 5.193938500111497,
            "loss_sequences_upper_95": 5.619138954943559,
            "loss_tokens_lower_95": 4.811451016910771,
            "loss_tokens_upper_95": 5.129802511763848,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.283208048021471,
            "data_time": 0.11864752996535528,
            "batch_time": 0.1343064648764474,
            "samples_per_second": 1055434.749586596,
            "samples_per_second_per_gpu": 131929.3436983245,
            "loss_sequences_lower_95": 7.820026150265256,
            "loss_sequences_upper_95": 8.973199751570418,
            "loss_tokens_lower_95": 7.18146893536603,
            "loss_tokens_upper_95": 9.097295238353588,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.0352618054645815,
            "data_time": 0.03094082786923363,
            "batch_time": 0.04524383658454532,
            "samples_per_second": 1938852.2456950857,
            "samples_per_second_per_gpu": 242356.53071188572,
            "loss_sequences_lower_95": 5.019008087530369,
            "loss_sequences_upper_95": 5.378793874601039,
            "loss_tokens_lower_95": 4.642286872831686,
            "loss_tokens_upper_95": 4.908343350833158,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.176473147985412,
            "data_time": 0.03315298046384539,
            "batch_time": 0.047597325983501616,
            "samples_per_second": 1901901.0187884774,
            "samples_per_second_per_gpu": 237737.62734855968,
            "loss_sequences_lower_95": 5.138904320321432,
            "loss_sequences_upper_95": 5.451219372633027,
            "loss_tokens_lower_95": 4.8374485435051895,
            "loss_tokens_upper_95": 5.062322954383869,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.151945753795345,
            "data_time": 0.03210176740373884,
            "batch_time": 0.046756409463428315,
            "samples_per_second": 1906120.9027829734,
            "samples_per_second_per_gpu": 238265.11284787167,
            "loss_sequences_lower_95": 5.158254642021365,
            "loss_sequences_upper_95": 5.5860550205882005,
            "loss_tokens_lower_95": 4.669697199641047,
            "loss_tokens_upper_95": 5.013535554961544,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.342899760095085,
            "data_time": 0.035380689870743526,
            "batch_time": 0.049955305599031,
            "samples_per_second": 1920112.8616721684,
            "samples_per_second_per_gpu": 240014.10770902105,
            "loss_sequences_lower_95": 5.288862386563929,
            "loss_sequences_upper_95": 5.593520820431594,
            "loss_tokens_lower_95": 5.034273284246616,
            "loss_tokens_upper_95": 5.240810910043686,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.456821148439964,
            "data_time": 0.03460788138118791,
            "batch_time": 0.05104468192583249,
            "samples_per_second": 1881502.3570503884,
            "samples_per_second_per_gpu": 235187.79463129854,
            "loss_sequences_lower_95": 4.368455808651373,
            "loss_sequences_upper_95": 4.601871580514849,
            "loss_tokens_lower_95": 4.212089723786194,
            "loss_tokens_upper_95": 4.367524689693624,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9516093469247586,
            "data_time": 0.03221895581200009,
            "batch_time": 0.046404577436901275,
            "samples_per_second": 1979604.8324131987,
            "samples_per_second_per_gpu": 247450.60405164983,
            "loss_sequences_lower_95": 3.946280521299781,
            "loss_sequences_upper_95": 4.201303817004692,
            "loss_tokens_lower_95": 3.6936986407003465,
            "loss_tokens_upper_95": 3.820578750141275,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-8.0/params.txt",
    "uuid": "1fd4e001-e58f-4c77-a416-9c4dd295082c",
    "creation_date": "2023_12_14-05_03_22"
}