{
    "name": "rw_original-d=96_l=8_h=4-2.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 422772480,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 2.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "84554496",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=96_l=8_h=4-2.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 5.678665041923523,
            "data_time": 0.13088496029376984,
            "batch_time": 1.2682172060012817,
            "samples_per_second": 380740.26605077996,
            "samples_per_second_per_gpu": 47592.533256347495,
            "loss_sequences_lower_95": 5.532958653767904,
            "loss_sequences_upper_95": 5.825956624348958,
            "loss_tokens_lower_95": 5.663391977945963,
            "loss_tokens_upper_95": 5.693806864420573,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.983436294883238,
            "data_time": 0.01963875445684873,
            "batch_time": 0.0648069746132564,
            "samples_per_second": 4643800.741747359,
            "samples_per_second_per_gpu": 580475.0927184199,
            "loss_sequences_lower_95": 4.981232380307415,
            "loss_sequences_upper_95": 4.985596935331592,
            "loss_tokens_lower_95": 4.97186290625,
            "loss_tokens_upper_95": 4.99492803125,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.77062229915541,
            "data_time": 0.09093958139419556,
            "batch_time": 0.1359737515449524,
            "samples_per_second": 4040457.5710733607,
            "samples_per_second_per_gpu": 505057.1963841701,
            "loss_sequences_lower_95": 4.714751761300223,
            "loss_sequences_upper_95": 4.842000645228795,
            "loss_tokens_lower_95": 4.756777572916667,
            "loss_tokens_upper_95": 4.78435,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.172185630306755,
            "data_time": 0.014764722240598578,
            "batch_time": 0.0589951543431533,
            "samples_per_second": 5205057.621239623,
            "samples_per_second_per_gpu": 650632.2026549529,
            "loss_sequences_lower_95": 5.1307210454252585,
            "loss_sequences_upper_95": 5.215215377335696,
            "loss_tokens_lower_95": 5.159012875,
            "loss_tokens_upper_95": 5.185331489583334,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.029958316119285,
            "data_time": 0.09134594351053238,
            "batch_time": 0.1367151439189911,
            "samples_per_second": 4071592.211337413,
            "samples_per_second_per_gpu": 508949.02641717665,
            "loss_sequences_lower_95": 4.967981106200675,
            "loss_sequences_upper_95": 5.1066735914428465,
            "loss_tokens_lower_95": 5.017837875,
            "loss_tokens_upper_95": 5.042090583333333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.548877335826529,
            "data_time": 0.03456078966458639,
            "batch_time": 0.07771602769692738,
            "samples_per_second": 4982614.842470825,
            "samples_per_second_per_gpu": 622826.8553088531,
            "loss_sequences_lower_95": 5.493112130751826,
            "loss_sequences_upper_95": 5.609488954420887,
            "loss_tokens_lower_95": 5.535595458333334,
            "loss_tokens_upper_95": 5.562314458333334,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.641811758644727,
            "data_time": 0.0140511155128479,
            "batch_time": 0.056649109721183775,
            "samples_per_second": 5141097.325337432,
            "samples_per_second_per_gpu": 642637.165667179,
            "loss_sequences_lower_95": 6.606903659119897,
            "loss_sequences_upper_95": 6.675926668128189,
            "loss_tokens_lower_95": 6.624579864583334,
            "loss_tokens_upper_95": 6.659405125,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.20298454714071,
            "data_time": 0.013100421742389077,
            "batch_time": 0.05654341531427283,
            "samples_per_second": 5326746.470127891,
            "samples_per_second_per_gpu": 665843.3087659864,
            "loss_sequences_lower_95": 5.176029082133508,
            "loss_sequences_upper_95": 5.2319978832624345,
            "loss_tokens_lower_95": 5.19079278125,
            "loss_tokens_upper_95": 5.215211802083333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.1571728979668965,
            "data_time": 0.09191793948411942,
            "batch_time": 0.13658836483955383,
            "samples_per_second": 4064424.237862568,
            "samples_per_second_per_gpu": 508053.029732821,
            "loss_sequences_lower_95": 5.065528361002604,
            "loss_sequences_upper_95": 5.268808802162728,
            "loss_tokens_lower_95": 5.14472909375,
            "loss_tokens_upper_95": 5.169784375000001,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.275287030713831,
            "data_time": 0.09086181223392487,
            "batch_time": 0.13607542216777802,
            "samples_per_second": 4128388.2713777227,
            "samples_per_second_per_gpu": 516048.53392221534,
            "loss_sequences_lower_95": 6.153887481086339,
            "loss_sequences_upper_95": 6.421223419551322,
            "loss_tokens_lower_95": 6.261925197916666,
            "loss_tokens_upper_95": 6.288347427083333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.51800585969337,
            "data_time": 0.009986212541317117,
            "batch_time": 0.05318611808891954,
            "samples_per_second": 5412051.654876599,
            "samples_per_second_per_gpu": 676506.4568595749,
            "loss_sequences_lower_95": 5.505574088131252,
            "loss_sequences_upper_95": 5.530860967585138,
            "loss_tokens_lower_95": 5.505430125,
            "loss_tokens_upper_95": 5.530762916666666,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.332601441983529,
            "data_time": 0.02309212237596512,
            "batch_time": 0.06578699052333832,
            "samples_per_second": 5132947.768360364,
            "samples_per_second_per_gpu": 641618.4710450455,
            "loss_sequences_lower_95": 5.30516718904558,
            "loss_sequences_upper_95": 5.361293176589832,
            "loss_tokens_lower_95": 5.319652510416667,
            "loss_tokens_upper_95": 5.345282510416666,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.396203760684866,
            "data_time": 0.10566706210374832,
            "batch_time": 0.22440273314714432,
            "samples_per_second": 4021919.7151090056,
            "samples_per_second_per_gpu": 502739.9643886257,
            "loss_sequences_lower_95": 5.294208419830755,
            "loss_sequences_upper_95": 5.519034364402415,
            "loss_tokens_lower_95": 5.382673864583333,
            "loss_tokens_upper_95": 5.4099191979166665,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.323171462158079,
            "data_time": 0.0936187133193016,
            "batch_time": 0.13873428106307983,
            "samples_per_second": 4084374.170478707,
            "samples_per_second_per_gpu": 510546.7713098384,
            "loss_sequences_lower_95": 5.24437352819501,
            "loss_sequences_upper_95": 5.421768654631014,
            "loss_tokens_lower_95": 5.310496874999999,
            "loss_tokens_upper_95": 5.336150583333333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.179875709793785,
            "data_time": 0.13422420620918274,
            "batch_time": 0.1593361496925354,
            "samples_per_second": 1006482.7090998217,
            "samples_per_second_per_gpu": 125810.33863747772,
            "loss_sequences_lower_95": 6.087669944763183,
            "loss_sequences_upper_95": 6.289040461453525,
            "loss_tokens_lower_95": 6.153923537514427,
            "loss_tokens_upper_95": 6.2052227193659,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.886237742254408,
            "data_time": 0.09256332367658615,
            "batch_time": 0.12746965140104294,
            "samples_per_second": 3210375.140924548,
            "samples_per_second_per_gpu": 401296.8926155685,
            "loss_sequences_lower_95": 5.782703188348442,
            "loss_sequences_upper_95": 5.991416250512481,
            "loss_tokens_lower_95": 5.872160666666667,
            "loss_tokens_upper_95": 5.900155635416667,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.555244436993762,
            "data_time": 0.08968466520309448,
            "batch_time": 0.12602977454662323,
            "samples_per_second": 3773334.6760998936,
            "samples_per_second_per_gpu": 471666.8345124867,
            "loss_sequences_lower_95": 6.4586804070384645,
            "loss_sequences_upper_95": 6.683195507683666,
            "loss_tokens_lower_95": 6.5437334375,
            "loss_tokens_upper_95": 6.566982125,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.098309833495343,
            "data_time": 0.15098223090171814,
            "batch_time": 0.1809656322002411,
            "samples_per_second": 2225721.362492329,
            "samples_per_second_per_gpu": 278215.1703115411,
            "loss_sequences_lower_95": 5.940362974073066,
            "loss_sequences_upper_95": 6.379776964031282,
            "loss_tokens_lower_95": 6.083593562391938,
            "loss_tokens_upper_95": 6.1127178254674694,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.250859612838577,
            "data_time": 0.027295005321502686,
            "batch_time": 0.07174736261367798,
            "samples_per_second": 4513210.39245466,
            "samples_per_second_per_gpu": 564151.2990568324,
            "loss_sequences_lower_95": 5.235952800081897,
            "loss_sequences_upper_95": 5.265733352398163,
            "loss_tokens_lower_95": 5.235765081986184,
            "loss_tokens_upper_95": 5.265628101743876,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.62676276531003,
            "data_time": 0.027711598575115202,
            "batch_time": 0.07193302363157272,
            "samples_per_second": 4451390.730066732,
            "samples_per_second_per_gpu": 556423.8412583414,
            "loss_sequences_lower_95": 4.641967369859092,
            "loss_sequences_upper_95": 4.667261768151888,
            "loss_tokens_lower_95": 4.614972056257578,
            "loss_tokens_upper_95": 4.636345026327269,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.076250756074079,
            "data_time": 0.05475848416487376,
            "batch_time": 0.09639707373248206,
            "samples_per_second": 4333477.120894847,
            "samples_per_second_per_gpu": 541684.6401118559,
            "loss_sequences_lower_95": 7.46167759091211,
            "loss_sequences_upper_95": 7.6966685472145135,
            "loss_tokens_lower_95": 6.952298295750685,
            "loss_tokens_upper_95": 7.139713762713694,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.7957261150678,
            "data_time": 0.04066399236520132,
            "batch_time": 0.08415950462222099,
            "samples_per_second": 4594632.118874305,
            "samples_per_second_per_gpu": 574329.0148592881,
            "loss_sequences_lower_95": 7.104609928385417,
            "loss_sequences_upper_95": 7.270659114583333,
            "loss_tokens_lower_95": 6.700985099744496,
            "loss_tokens_upper_95": 6.826092374213836,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.057832925766141,
            "data_time": 0.06663207213083903,
            "batch_time": 0.1069088230530421,
            "samples_per_second": 4005618.591615292,
            "samples_per_second_per_gpu": 500702.3239519115,
            "loss_sequences_lower_95": 5.143286771391087,
            "loss_sequences_upper_95": 5.206878907360571,
            "loss_tokens_lower_95": 5.034697161959959,
            "loss_tokens_upper_95": 5.068125287363276,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.416640214486556,
            "data_time": 0.33111950755119324,
            "batch_time": 0.3737293481826782,
            "samples_per_second": 2676600.7500838884,
            "samples_per_second_per_gpu": 334575.09376048605,
            "loss_sequences_lower_95": 5.386801785555752,
            "loss_sequences_upper_95": 5.526667619185014,
            "loss_tokens_lower_95": 5.376765972992039,
            "loss_tokens_upper_95": 5.441513576413382,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.085532030767324,
            "data_time": 0.38132908940315247,
            "batch_time": 0.42673490941524506,
            "samples_per_second": 2815362.9713123734,
            "samples_per_second_per_gpu": 351920.3714140467,
            "loss_sequences_lower_95": 5.107853156887755,
            "loss_sequences_upper_95": 5.302643233318718,
            "loss_tokens_lower_95": 5.03605463460562,
            "loss_tokens_upper_95": 5.139348341232227,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.501772624651591,
            "data_time": 0.18051929026842117,
            "batch_time": 0.2120993435382843,
            "samples_per_second": 2413164.545757102,
            "samples_per_second_per_gpu": 301645.56821963773,
            "loss_sequences_lower_95": 4.48214028930664,
            "loss_sequences_upper_95": 4.586110097249349,
            "loss_tokens_lower_95": 4.3917936987576285,
            "loss_tokens_upper_95": 4.60578063285837,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.833069763340342,
            "data_time": 0.025213426910340786,
            "batch_time": 0.06937449090182782,
            "samples_per_second": 4479900.633101613,
            "samples_per_second_per_gpu": 559987.5791377017,
            "loss_sequences_lower_95": 8.9035118189004,
            "loss_sequences_upper_95": 8.974917188499582,
            "loss_tokens_lower_95": 8.779691718130346,
            "loss_tokens_upper_95": 8.85484377613001,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.935497175764154,
            "data_time": 0.04354950785636902,
            "batch_time": 0.08587641716003418,
            "samples_per_second": 4406836.022694397,
            "samples_per_second_per_gpu": 550854.5028367996,
            "loss_sequences_lower_95": 6.990304575704966,
            "loss_sequences_upper_95": 7.256253308639783,
            "loss_tokens_lower_95": 5.795946611478912,
            "loss_tokens_upper_95": 5.939519282452742,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.64418019288229,
            "data_time": 0.08083988726139069,
            "batch_time": 0.12294687330722809,
            "samples_per_second": 4243351.239712331,
            "samples_per_second_per_gpu": 530418.9049640414,
            "loss_sequences_lower_95": 6.300409487571326,
            "loss_sequences_upper_95": 6.5887756451811805,
            "loss_tokens_lower_95": 5.545209975117982,
            "loss_tokens_upper_95": 5.709222544066634,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.6146258959487145,
            "data_time": 0.3473235070705414,
            "batch_time": 0.38880710303783417,
            "samples_per_second": 2729835.7866990278,
            "samples_per_second_per_gpu": 341229.47333737847,
            "loss_sequences_lower_95": 5.55787155639091,
            "loss_sequences_upper_95": 5.672763340222781,
            "loss_tokens_lower_95": 5.557149279172018,
            "loss_tokens_upper_95": 5.673460116451734,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.9963306093215945,
            "data_time": 0.2988753318786621,
            "batch_time": 0.3244565725326538,
            "samples_per_second": 1913971.17320766,
            "samples_per_second_per_gpu": 239246.3966509575,
            "loss_sequences_lower_95": 4.92043115234375,
            "loss_sequences_upper_95": 5.382298080444335,
            "loss_tokens_lower_95": 4.739424954927885,
            "loss_tokens_upper_95": 5.2341166433153505,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.192608623690102,
            "data_time": 0.052338918671011925,
            "batch_time": 0.09526262991130352,
            "samples_per_second": 4449554.38745141,
            "samples_per_second_per_gpu": 556194.2984314263,
            "loss_sequences_lower_95": 5.155458229713089,
            "loss_sequences_upper_95": 5.229881850431006,
            "loss_tokens_lower_95": 5.154237813959346,
            "loss_tokens_upper_95": 5.230001072019736,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.604974599474283,
            "data_time": 0.0741089940071106,
            "batch_time": 0.11823965609073639,
            "samples_per_second": 4322623.480296925,
            "samples_per_second_per_gpu": 540327.9350371156,
            "loss_sequences_lower_95": 5.566612729784116,
            "loss_sequences_upper_95": 5.642371525245061,
            "loss_tokens_lower_95": 5.566191162309327,
            "loss_tokens_upper_95": 5.642984926065981,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.1509031484124455,
            "data_time": 0.04949352890253067,
            "batch_time": 0.09086329862475395,
            "samples_per_second": 4263489.742050083,
            "samples_per_second_per_gpu": 532936.2177562604,
            "loss_sequences_lower_95": 5.354292797134283,
            "loss_sequences_upper_95": 5.469956556783019,
            "loss_tokens_lower_95": 5.119143568205712,
            "loss_tokens_upper_95": 5.182223115722546,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.645474397659302,
            "data_time": 0.1786155253648758,
            "batch_time": 0.2241721674799919,
            "samples_per_second": 3837598.00351688,
            "samples_per_second_per_gpu": 479699.75043961,
            "loss_sequences_lower_95": 7.291573120117188,
            "loss_sequences_upper_95": 7.7931772827148444,
            "loss_tokens_lower_95": 6.417909419465586,
            "loss_tokens_upper_95": 6.757590915318911,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.353560835123062,
            "data_time": 0.14876700937747955,
            "batch_time": 0.16551846265792847,
            "samples_per_second": 919835.136002961,
            "samples_per_second_per_gpu": 114979.39200037012,
            "loss_sequences_lower_95": 5.027154695987702,
            "loss_sequences_upper_95": 5.820507109165192,
            "loss_tokens_lower_95": 4.804400213833513,
            "loss_tokens_upper_95": 5.719313742100507,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.417687744929872,
            "data_time": 0.341839462518692,
            "batch_time": 0.3782757818698883,
            "samples_per_second": 2341373.831289121,
            "samples_per_second_per_gpu": 292671.7289111401,
            "loss_sequences_lower_95": 6.176733959680316,
            "loss_sequences_upper_95": 6.6792961032911276,
            "loss_tokens_lower_95": 5.15805018617443,
            "loss_tokens_upper_95": 5.556778260309505,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.021990491501261,
            "data_time": 0.0514170014195972,
            "batch_time": 0.09658516777886285,
            "samples_per_second": 4481034.939269971,
            "samples_per_second_per_gpu": 560129.3674087464,
            "loss_sequences_lower_95": 5.0045296885239186,
            "loss_sequences_upper_95": 5.039764182981105,
            "loss_tokens_lower_95": 5.004082822653861,
            "loss_tokens_upper_95": 5.0394262897963085,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.007323763843835,
            "data_time": 0.03792972153141385,
            "batch_time": 0.08093051115671794,
            "samples_per_second": 4308820.804184127,
            "samples_per_second_per_gpu": 538602.6005230158,
            "loss_sequences_lower_95": 7.1102399391131375,
            "loss_sequences_upper_95": 7.31045270579638,
            "loss_tokens_lower_95": 6.885572828848432,
            "loss_tokens_upper_95": 7.083034620796117,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.4501520329779325,
            "data_time": 0.1892808899283409,
            "batch_time": 0.21893808245658875,
            "samples_per_second": 1856911.5368520566,
            "samples_per_second_per_gpu": 232113.94210650708,
            "loss_sequences_lower_95": 4.357564737508585,
            "loss_sequences_upper_95": 4.720839135026757,
            "loss_tokens_lower_95": 4.239317932975832,
            "loss_tokens_upper_95": 4.570197397067964,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.731859995985746,
            "data_time": 0.07754090428352356,
            "batch_time": 0.1224688708782196,
            "samples_per_second": 4372795.974692083,
            "samples_per_second_per_gpu": 546599.4968365104,
            "loss_sequences_lower_95": 4.78966937347388,
            "loss_sequences_upper_95": 4.9228072433652335,
            "loss_tokens_lower_95": 4.6509980586942135,
            "loss_tokens_upper_95": 4.802990860136682,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.731672452717293,
            "data_time": 0.3468639552593231,
            "batch_time": 0.38124945759773254,
            "samples_per_second": 2027160.3449960884,
            "samples_per_second_per_gpu": 253395.04312451105,
            "loss_sequences_lower_95": 5.485393179916755,
            "loss_sequences_upper_95": 5.979330025649652,
            "loss_tokens_lower_95": 5.559146780290961,
            "loss_tokens_upper_95": 5.943879059982922,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.501771389353058,
            "data_time": 0.02789727572737069,
            "batch_time": 0.07190782635189523,
            "samples_per_second": 4456037.9604891995,
            "samples_per_second_per_gpu": 557004.7450611499,
            "loss_sequences_lower_95": 4.4918362090386825,
            "loss_sequences_upper_95": 4.511676749412382,
            "loss_tokens_lower_95": 4.492043623568464,
            "loss_tokens_upper_95": 4.511590394250725,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.904334563653446,
            "data_time": 0.3133256882429123,
            "batch_time": 0.33990588784217834,
            "samples_per_second": 1579707.7203475197,
            "samples_per_second_per_gpu": 197463.46504343997,
            "loss_sequences_lower_95": 4.760519661023779,
            "loss_sequences_upper_95": 5.118977222627806,
            "loss_tokens_lower_95": 4.65697828622777,
            "loss_tokens_upper_95": 5.059858870725915,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.022822316450143,
            "data_time": 0.022917744914690653,
            "batch_time": 0.06772027254104614,
            "samples_per_second": 4460347.082923928,
            "samples_per_second_per_gpu": 557543.385365491,
            "loss_sequences_lower_95": 6.759090875589623,
            "loss_sequences_upper_95": 6.803575406184486,
            "loss_tokens_lower_95": 5.932611025145067,
            "loss_tokens_upper_95": 5.978593882978723,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.967150787830353,
            "data_time": 0.09830102697014809,
            "batch_time": 0.1430012248456478,
            "samples_per_second": 4330070.56892507,
            "samples_per_second_per_gpu": 541258.8211156337,
            "loss_sequences_lower_95": 4.983040014648437,
            "loss_sequences_upper_95": 5.173223083496094,
            "loss_tokens_lower_95": 4.870263860692188,
            "loss_tokens_upper_95": 5.047027120568083,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.935623419803122,
            "data_time": 0.33652758598327637,
            "batch_time": 0.381387397646904,
            "samples_per_second": 2182226.9136422016,
            "samples_per_second_per_gpu": 272778.3642052752,
            "loss_sequences_lower_95": 5.824051991338314,
            "loss_sequences_upper_95": 6.047398429538893,
            "loss_tokens_lower_95": 5.824470957880435,
            "loss_tokens_upper_95": 6.045054546853771,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.182970988389217,
            "data_time": 0.06318023552497228,
            "batch_time": 0.10306107501188914,
            "samples_per_second": 4052403.0867966455,
            "samples_per_second_per_gpu": 506550.3858495807,
            "loss_sequences_lower_95": 9.087500517874053,
            "loss_sequences_upper_95": 9.27827958540483,
            "loss_tokens_lower_95": 9.087148548473012,
            "loss_tokens_upper_95": 9.27854608709162,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.6835043598810833,
            "data_time": 0.06992802768945694,
            "batch_time": 0.11398572723070781,
            "samples_per_second": 4485856.312728661,
            "samples_per_second_per_gpu": 560732.0390910826,
            "loss_sequences_lower_95": 3.7812395833333334,
            "loss_sequences_upper_95": 3.8656797688802085,
            "loss_tokens_lower_95": 3.6277974861819726,
            "loss_tokens_upper_95": 3.7226721157212883,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.523143048513504,
            "data_time": 0.3267532289028168,
            "batch_time": 0.3672632724046707,
            "samples_per_second": 2617188.449224885,
            "samples_per_second_per_gpu": 327148.55615311064,
            "loss_sequences_lower_95": 6.183021472749256,
            "loss_sequences_upper_95": 6.873971368698847,
            "loss_tokens_lower_95": 6.177743937174479,
            "loss_tokens_upper_95": 6.870167992001488,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.741573467850685,
            "data_time": 0.155367910861969,
            "batch_time": 0.1726565957069397,
            "samples_per_second": 917401.1795344672,
            "samples_per_second_per_gpu": 114675.1474418084,
            "loss_sequences_lower_95": 5.493575119972229,
            "loss_sequences_upper_95": 6.841134881973266,
            "loss_tokens_lower_95": 5.281030298606637,
            "loss_tokens_upper_95": 5.859334049814755,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.667899078845978,
            "data_time": 0.09871315583586693,
            "batch_time": 0.14321831613779068,
            "samples_per_second": 4265107.795717222,
            "samples_per_second_per_gpu": 533138.4744646527,
            "loss_sequences_lower_95": 7.7782806884765625,
            "loss_sequences_upper_95": 8.090437109375,
            "loss_tokens_lower_95": 7.516640848072653,
            "loss_tokens_upper_95": 7.793453003549334,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.850937022209168,
            "data_time": 0.097467090934515,
            "batch_time": 0.14175599440932274,
            "samples_per_second": 4340821.281255012,
            "samples_per_second_per_gpu": 542602.6601568765,
            "loss_sequences_lower_95": 8.1102634765625,
            "loss_sequences_upper_95": 8.346533276367188,
            "loss_tokens_lower_95": 7.730354834008826,
            "loss_tokens_upper_95": 7.938343758313251,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.7547666541370335,
            "data_time": 0.03938845172524452,
            "batch_time": 0.0830175591011842,
            "samples_per_second": 4564054.296107368,
            "samples_per_second_per_gpu": 570506.787013421,
            "loss_sequences_lower_95": 4.722477674596148,
            "loss_sequences_upper_95": 4.785902102473915,
            "loss_tokens_lower_95": 4.723272518883046,
            "loss_tokens_upper_95": 4.787075336084165,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.835929733633812,
            "data_time": 0.11611390610535939,
            "batch_time": 0.15681752065817514,
            "samples_per_second": 4033544.312912392,
            "samples_per_second_per_gpu": 504193.039114049,
            "loss_sequences_lower_95": 4.731504444409442,
            "loss_sequences_upper_95": 4.939372419834869,
            "loss_tokens_lower_95": 4.730216508856567,
            "loss_tokens_upper_95": 4.937505409721222,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.490666626930237,
            "data_time": 0.09320224076509476,
            "batch_time": 0.13729561865329742,
            "samples_per_second": 4327488.424287736,
            "samples_per_second_per_gpu": 540936.053035967,
            "loss_sequences_lower_95": 8.434368969726563,
            "loss_sequences_upper_95": 8.549373535156251,
            "loss_tokens_lower_95": 8.433378295898438,
            "loss_tokens_upper_95": 8.546932690429689,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.086583374577416,
            "data_time": 0.02757312073594048,
            "batch_time": 0.07153453465018954,
            "samples_per_second": 4494206.394897095,
            "samples_per_second_per_gpu": 561775.7993621369,
            "loss_sequences_lower_95": 6.838569894897114,
            "loss_sequences_upper_95": 6.922372815899952,
            "loss_tokens_lower_95": 5.993559711321077,
            "loss_tokens_upper_95": 6.054756017254618,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.393738751980796,
            "data_time": 0.19641562019075667,
            "batch_time": 0.22886799914496286,
            "samples_per_second": 2056553.1355097487,
            "samples_per_second_per_gpu": 257069.1419387186,
            "loss_sequences_lower_95": 6.288126760454321,
            "loss_sequences_upper_95": 6.4989603412685115,
            "loss_tokens_lower_95": 6.28641497483894,
            "loss_tokens_upper_95": 6.496928952345208,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.299757020613726,
            "data_time": 0.18881240487098694,
            "batch_time": 0.23408792167901993,
            "samples_per_second": 3900272.992090858,
            "samples_per_second_per_gpu": 487534.12401135726,
            "loss_sequences_lower_95": 6.219968788296569,
            "loss_sequences_upper_95": 6.377508772307751,
            "loss_tokens_lower_95": 6.221951042624081,
            "loss_tokens_upper_95": 6.378382879518995,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.145503427485663,
            "data_time": 0.03395356237888336,
            "batch_time": 0.07770891953259706,
            "samples_per_second": 4331804.021041364,
            "samples_per_second_per_gpu": 541475.5026301706,
            "loss_sequences_lower_95": 6.584211040746743,
            "loss_sequences_upper_95": 6.662648704669297,
            "loss_tokens_lower_95": 6.064909639416908,
            "loss_tokens_upper_95": 6.137520807938007,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.609372623383053,
            "data_time": 0.341385155916214,
            "batch_time": 0.3790465146303177,
            "samples_per_second": 1925644.0436937322,
            "samples_per_second_per_gpu": 240705.50546171653,
            "loss_sequences_lower_95": 4.5359219747876365,
            "loss_sequences_upper_95": 4.686599537682912,
            "loss_tokens_lower_95": 4.5357245551215275,
            "loss_tokens_upper_95": 4.685348623770255,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.776927603421226,
            "data_time": 0.04316674516751216,
            "batch_time": 0.08742493964158572,
            "samples_per_second": 4439673.690182819,
            "samples_per_second_per_gpu": 554959.2112728524,
            "loss_sequences_lower_95": 7.735320682817279,
            "loss_sequences_upper_95": 7.819583870890673,
            "loss_tokens_lower_95": 7.734906494887233,
            "loss_tokens_upper_95": 7.818453895498854,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.135273407963873,
            "data_time": 0.31480012834072113,
            "batch_time": 0.3551666885614395,
            "samples_per_second": 2469241.224464793,
            "samples_per_second_per_gpu": 308655.15305809915,
            "loss_sequences_lower_95": 6.019148032179157,
            "loss_sequences_upper_95": 6.251497479781364,
            "loss_tokens_lower_95": 6.0207509457486355,
            "loss_tokens_upper_95": 6.250647728188524,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.127859354019165,
            "data_time": 0.29235342144966125,
            "batch_time": 0.31250597536563873,
            "samples_per_second": 1227451.9640822574,
            "samples_per_second_per_gpu": 153431.49551028217,
            "loss_sequences_lower_95": 5.840936495463054,
            "loss_sequences_upper_95": 6.696319859822591,
            "loss_tokens_lower_95": 5.408169163597955,
            "loss_tokens_upper_95": 6.724528545803493,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.90226883093516,
            "data_time": 0.283571258187294,
            "batch_time": 0.3032772094011307,
            "samples_per_second": 1118928.9481189263,
            "samples_per_second_per_gpu": 139866.11851486578,
            "loss_sequences_lower_95": 5.732475204467773,
            "loss_sequences_upper_95": 6.766478780110678,
            "loss_tokens_lower_95": 5.112534049387729,
            "loss_tokens_upper_95": 6.5148323487699695,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.83781475437934,
            "data_time": 0.04014745567526136,
            "batch_time": 0.08330331210579191,
            "samples_per_second": 4346498.716759081,
            "samples_per_second_per_gpu": 543312.3395948851,
            "loss_sequences_lower_95": 7.80486404351528,
            "loss_sequences_upper_95": 7.869178004763438,
            "loss_tokens_lower_95": 7.805456970844072,
            "loss_tokens_upper_95": 7.869545733040317,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.027710181747376,
            "data_time": 0.023452332576810012,
            "batch_time": 0.06770613926206613,
            "samples_per_second": 4487152.586032738,
            "samples_per_second_per_gpu": 560894.0732540922,
            "loss_sequences_lower_95": 5.726838980116939,
            "loss_sequences_upper_95": 5.7578034709874855,
            "loss_tokens_lower_95": 4.9530456842133175,
            "loss_tokens_upper_95": 4.983768102167839,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.1220541450921,
            "data_time": 0.3605658710002899,
            "batch_time": 0.38952016830444336,
            "samples_per_second": 1674317.1060120799,
            "samples_per_second_per_gpu": 209289.63825150998,
            "loss_sequences_lower_95": 7.287937254417599,
            "loss_sequences_upper_95": 7.669120380071204,
            "loss_tokens_lower_95": 6.947730901498403,
            "loss_tokens_upper_95": 7.244270053714536,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 11.10274750477559,
            "data_time": 0.2178250402212143,
            "batch_time": 0.23528461158275604,
            "samples_per_second": 1093386.223480484,
            "samples_per_second_per_gpu": 136673.2779350605,
            "loss_sequences_lower_95": 10.598868004051415,
            "loss_sequences_upper_95": 11.728631797996727,
            "loss_tokens_lower_95": 10.023752923659337,
            "loss_tokens_upper_95": 11.897264626585406,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.986097236958946,
            "data_time": 0.35637684166431427,
            "batch_time": 0.3908732980489731,
            "samples_per_second": 2467412.517035622,
            "samples_per_second_per_gpu": 308426.56462945277,
            "loss_sequences_lower_95": 7.105435236488901,
            "loss_sequences_upper_95": 7.411832427978515,
            "loss_tokens_lower_95": 6.805986697723928,
            "loss_tokens_upper_95": 7.049905169128469,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.078606294422615,
            "data_time": 0.3502407819032669,
            "batch_time": 0.38502858579158783,
            "samples_per_second": 1874327.4414457898,
            "samples_per_second_per_gpu": 234290.93018072372,
            "loss_sequences_lower_95": 7.187515965903677,
            "loss_sequences_upper_95": 7.467480803699028,
            "loss_tokens_lower_95": 6.9197941671492025,
            "loss_tokens_upper_95": 7.1265011255607185,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.111143955370275,
            "data_time": 0.3139146417379379,
            "batch_time": 0.3483198583126068,
            "samples_per_second": 2096787.11913846,
            "samples_per_second_per_gpu": 262098.3898923075,
            "loss_sequences_lower_95": 7.3081237979051545,
            "loss_sequences_upper_95": 7.710380163425352,
            "loss_tokens_lower_95": 6.904917601486127,
            "loss_tokens_upper_95": 7.224391880660581,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.167384926865741,
            "data_time": 0.2941610664129257,
            "batch_time": 0.32840852439403534,
            "samples_per_second": 2335731.2493367093,
            "samples_per_second_per_gpu": 291966.40616708866,
            "loss_sequences_lower_95": 7.240612662710794,
            "loss_sequences_upper_95": 7.5097677463438455,
            "loss_tokens_lower_95": 7.020698846834842,
            "loss_tokens_upper_95": 7.206672473562841,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.7103335546410605,
            "data_time": 0.32105568051338196,
            "batch_time": 0.3549838811159134,
            "samples_per_second": 1990570.6123063564,
            "samples_per_second_per_gpu": 248821.32653829455,
            "loss_sequences_lower_95": 6.7314194768111895,
            "loss_sequences_upper_95": 6.904190575262034,
            "loss_tokens_lower_95": 6.607722218667739,
            "loss_tokens_upper_95": 6.738476002668909,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.271843561312047,
            "data_time": 0.30844029784202576,
            "batch_time": 0.3437727689743042,
            "samples_per_second": 2134728.05566299,
            "samples_per_second_per_gpu": 266841.00695787376,
            "loss_sequences_lower_95": 6.42163056164253,
            "loss_sequences_upper_95": 6.636403972346608,
            "loss_tokens_lower_95": 6.158333039010699,
            "loss_tokens_upper_95": 6.292207808261754,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-2.0/params.txt",
    "uuid": "1289b53d-adc8-4794-83d7-34f1218907c2",
    "creation_date": "2023_12_14-05_00_57"
}