{
    "name": "rw_original-d=512_l=8_h=4-2.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 3156561920,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 2.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "631312384",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=512_l=8_h=4-2.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.9209626654783887,
            "data_time": 0.029869887977838516,
            "batch_time": 0.3280114494264126,
            "samples_per_second": 1720372.0535572548,
            "samples_per_second_per_gpu": 215046.50669465685,
            "loss_sequences_lower_95": 3.8359105936686198,
            "loss_sequences_upper_95": 4.00693946838379,
            "loss_tokens_lower_95": 3.9063414573669433,
            "loss_tokens_upper_95": 3.9353884124755862,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7117773968886847,
            "data_time": 0.0015590894985986542,
            "batch_time": 0.015580705201527939,
            "samples_per_second": 2213920.4525193106,
            "samples_per_second_per_gpu": 276740.05656491383,
            "loss_sequences_lower_95": 3.709383630616242,
            "loss_sequences_upper_95": 3.714114706496547,
            "loss_tokens_lower_95": 3.7008316875,
            "loss_tokens_upper_95": 3.72276878125,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.259190108338181,
            "data_time": 0.010034857749938965,
            "batch_time": 0.023690855026245117,
            "samples_per_second": 2220764.692083728,
            "samples_per_second_per_gpu": 277595.586510466,
            "loss_sequences_lower_95": 3.199639998455437,
            "loss_sequences_upper_95": 3.3347474576989002,
            "loss_tokens_lower_95": 3.245861526041667,
            "loss_tokens_upper_95": 3.27245021875,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.85483687813749,
            "data_time": 0.0015672837433062102,
            "batch_time": 0.015056421097956206,
            "samples_per_second": 2311336.794516754,
            "samples_per_second_per_gpu": 288917.09931459423,
            "loss_sequences_lower_95": 3.8117177835051543,
            "loss_sequences_upper_95": 3.9001337487918812,
            "loss_tokens_lower_95": 3.8417878854166667,
            "loss_tokens_upper_95": 3.867947083333333,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.783889471392039,
            "data_time": 0.00939226245500177,
            "batch_time": 0.023447634214424043,
            "samples_per_second": 2154394.9932008646,
            "samples_per_second_per_gpu": 269299.3741501081,
            "loss_sequences_lower_95": 3.7209646500783635,
            "loss_sequences_upper_95": 3.8663095198435116,
            "loss_tokens_lower_95": 3.7721533645833336,
            "loss_tokens_upper_95": 3.7954075208333333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9509183101148366,
            "data_time": 0.003664098032142805,
            "batch_time": 0.017425206044445866,
            "samples_per_second": 2273479.2166239647,
            "samples_per_second_per_gpu": 284184.9020779956,
            "loss_sequences_lower_95": 3.901353312478756,
            "loss_sequences_upper_95": 4.004983364911731,
            "loss_tokens_lower_95": 3.9382123541666667,
            "loss_tokens_upper_95": 3.96366478125,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.75573686869777,
            "data_time": 0.001567574834745821,
            "batch_time": 0.01506885113179392,
            "samples_per_second": 2314870.3878546795,
            "samples_per_second_per_gpu": 289358.79848183494,
            "loss_sequences_lower_95": 3.722546974649235,
            "loss_sequences_upper_95": 3.7883292211415815,
            "loss_tokens_lower_95": 3.73965303125,
            "loss_tokens_upper_95": 3.7725630104166665,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.2088084589374,
            "data_time": 0.0016234703813821688,
            "batch_time": 0.01511122989933743,
            "samples_per_second": 2317021.9088680046,
            "samples_per_second_per_gpu": 289627.7386085006,
            "loss_sequences_lower_95": 4.18372985520288,
            "loss_sequences_upper_95": 4.23619667252945,
            "loss_tokens_lower_95": 4.1968374375,
            "loss_tokens_upper_95": 4.220589760416667,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.901604031159626,
            "data_time": 0.012036441810547359,
            "batch_time": 0.025980568121350002,
            "samples_per_second": 2173856.34983208,
            "samples_per_second_per_gpu": 271732.04372901,
            "loss_sequences_lower_95": 3.8092262081983614,
            "loss_sequences_upper_95": 4.012665961040714,
            "loss_tokens_lower_95": 3.8893149791666666,
            "loss_tokens_upper_95": 3.913742145833333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.061053892369327,
            "data_time": 0.009824085980653763,
            "batch_time": 0.023909502662718296,
            "samples_per_second": 2167557.061921067,
            "samples_per_second_per_gpu": 270944.6327401334,
            "loss_sequences_lower_95": 4.939533821678915,
            "loss_sequences_upper_95": 5.209809193969244,
            "loss_tokens_lower_95": 5.04741365625,
            "loss_tokens_upper_95": 5.074530145833334,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9603746543440965,
            "data_time": 0.0012655713584985625,
            "batch_time": 0.014652604892110786,
            "samples_per_second": 2335068.0611309386,
            "samples_per_second_per_gpu": 291883.5076413673,
            "loss_sequences_lower_95": 3.946893776444943,
            "loss_sequences_upper_95": 3.9745302676827805,
            "loss_tokens_lower_95": 3.9484791666666665,
            "loss_tokens_upper_95": 3.972576270833333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8459966187673724,
            "data_time": 0.0025347303490555356,
            "batch_time": 0.015991849367267188,
            "samples_per_second": 2316569.946700179,
            "samples_per_second_per_gpu": 289571.2433375224,
            "loss_sequences_lower_95": 3.8178410159503855,
            "loss_sequences_upper_95": 3.8758071384644417,
            "loss_tokens_lower_95": 3.8339393124999996,
            "loss_tokens_upper_95": 3.8580591875,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.279658535672975,
            "data_time": 0.010034207770004574,
            "batch_time": 0.023935087113512363,
            "samples_per_second": 2156500.6179452627,
            "samples_per_second_per_gpu": 269562.57724315784,
            "loss_sequences_lower_95": 4.187247440732759,
            "loss_sequences_upper_95": 4.391310772692698,
            "loss_tokens_lower_95": 4.266350572916666,
            "loss_tokens_upper_95": 4.29275209375,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5782469877643157,
            "data_time": 0.009753361166235935,
            "batch_time": 0.02357444440226156,
            "samples_per_second": 2182817.056050944,
            "samples_per_second_per_gpu": 272852.132006368,
            "loss_sequences_lower_95": 3.4878508651329394,
            "loss_sequences_upper_95": 3.6850368150140023,
            "loss_tokens_lower_95": 3.56588375,
            "loss_tokens_upper_95": 3.5907685104166664,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.673619324510748,
            "data_time": 0.07794643299920219,
            "batch_time": 0.09519356489181519,
            "samples_per_second": 1010415.2055958753,
            "samples_per_second_per_gpu": 126301.90069948441,
            "loss_sequences_lower_95": 4.585645441575484,
            "loss_sequences_upper_95": 4.785051363164729,
            "loss_tokens_lower_95": 4.648227180134167,
            "loss_tokens_upper_95": 4.699210401014849,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9271391376461997,
            "data_time": 0.014327045191418041,
            "batch_time": 0.02814488112926483,
            "samples_per_second": 2136287.6186043876,
            "samples_per_second_per_gpu": 267035.95232554845,
            "loss_sequences_lower_95": 3.856695102880717,
            "loss_sequences_upper_95": 3.9967975049255196,
            "loss_tokens_lower_95": 3.91351815625,
            "loss_tokens_upper_95": 3.940438229166667,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.923981750860692,
            "data_time": 0.012580537547667822,
            "batch_time": 0.026498567312955856,
            "samples_per_second": 2205288.645189752,
            "samples_per_second_per_gpu": 275661.080648719,
            "loss_sequences_lower_95": 5.83678287717472,
            "loss_sequences_upper_95": 6.039972729041269,
            "loss_tokens_lower_95": 5.912285572916667,
            "loss_tokens_upper_95": 5.935390802083333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.292259665786243,
            "data_time": 0.03659818321466446,
            "batch_time": 0.05262830853462219,
            "samples_per_second": 1745371.7511767605,
            "samples_per_second_per_gpu": 218171.46889709507,
            "loss_sequences_lower_95": 4.143376147160764,
            "loss_sequences_upper_95": 4.553019514240202,
            "loss_tokens_lower_95": 4.278097621730116,
            "loss_tokens_upper_95": 4.306775865398469,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.849838150089206,
            "data_time": 0.0021171953724294155,
            "batch_time": 0.01574097706136508,
            "samples_per_second": 2268084.211552328,
            "samples_per_second_per_gpu": 283510.526444041,
            "loss_sequences_lower_95": 4.832349742068438,
            "loss_sequences_upper_95": 4.8678846690953925,
            "loss_tokens_lower_95": 4.831698709674547,
            "loss_tokens_upper_95": 4.867519678687152,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.3667548342640004,
            "data_time": 0.002229736156904014,
            "batch_time": 0.01591183971257726,
            "samples_per_second": 2254742.7911509816,
            "samples_per_second_per_gpu": 281842.8488938727,
            "loss_sequences_lower_95": 3.3658108502881645,
            "loss_sequences_upper_95": 3.3915107622205487,
            "loss_tokens_lower_95": 3.3466206415318243,
            "loss_tokens_upper_95": 3.3660305191521003,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.325450176147003,
            "data_time": 0.0031641639705877276,
            "batch_time": 0.016787755950007317,
            "samples_per_second": 2257830.401610748,
            "samples_per_second_per_gpu": 282228.8002013435,
            "loss_sequences_lower_95": 5.566628917321091,
            "loss_sequences_upper_95": 5.867670453287376,
            "loss_tokens_lower_95": 4.803512216222153,
            "loss_tokens_upper_95": 5.022123852416905,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.388217676917712,
            "data_time": 0.004346372916343364,
            "batch_time": 0.018062183831600433,
            "samples_per_second": 2235615.025374555,
            "samples_per_second_per_gpu": 279451.8781718194,
            "loss_sequences_lower_95": 5.5155267578125,
            "loss_sequences_upper_95": 5.710743815104167,
            "loss_tokens_lower_95": 5.064042403694969,
            "loss_tokens_upper_95": 5.2036986414111635,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5238788614135697,
            "data_time": 0.004612308102315727,
            "batch_time": 0.018587061722354112,
            "samples_per_second": 2191395.3838003734,
            "samples_per_second_per_gpu": 273924.4229750467,
            "loss_sequences_lower_95": 3.568897113698825,
            "loss_sequences_upper_95": 3.633327897703042,
            "loss_tokens_lower_95": 3.426304125125028,
            "loss_tokens_upper_95": 3.458539176184945,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.9172620513222434,
            "data_time": 0.023680693336895535,
            "batch_time": 0.038538481507982524,
            "samples_per_second": 1984088.1787585742,
            "samples_per_second_per_gpu": 248011.02234482177,
            "loss_sequences_lower_95": 2.8905694302645597,
            "loss_sequences_upper_95": 3.0079508833451705,
            "loss_tokens_lower_95": 2.841875858170412,
            "loss_tokens_upper_95": 2.893401393857704,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7515954620984138,
            "data_time": 0.019741643220186234,
            "batch_time": 0.03391818143427372,
            "samples_per_second": 1987305.5592368303,
            "samples_per_second_per_gpu": 248413.1949046038,
            "loss_sequences_lower_95": 3.738471044423629,
            "loss_sequences_upper_95": 3.9336939846739476,
            "loss_tokens_lower_95": 3.6274753975012697,
            "loss_tokens_upper_95": 3.722890545658429,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.386505232652028,
            "data_time": 0.016642329020377915,
            "batch_time": 0.030517335121448223,
            "samples_per_second": 2077231.083461911,
            "samples_per_second_per_gpu": 259653.88543273887,
            "loss_sequences_lower_95": 4.352859568277995,
            "loss_sequences_upper_95": 4.456876892089844,
            "loss_tokens_lower_95": 4.239783288227171,
            "loss_tokens_upper_95": 4.476585224090313,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.933734616062038,
            "data_time": 0.001838152010414312,
            "batch_time": 0.015544008239612443,
            "samples_per_second": 2252717.5903950366,
            "samples_per_second_per_gpu": 281589.6987993796,
            "loss_sequences_lower_95": 6.9482096914829485,
            "loss_sequences_upper_95": 7.028456765015255,
            "loss_tokens_lower_95": 6.783591696927708,
            "loss_tokens_upper_95": 6.866655175683262,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.986434958398543,
            "data_time": 0.002953749175039714,
            "batch_time": 0.01678693894571906,
            "samples_per_second": 2223909.0174516584,
            "samples_per_second_per_gpu": 277988.6271814573,
            "loss_sequences_lower_95": 5.529884620949074,
            "loss_sequences_upper_95": 5.816289276225799,
            "loss_tokens_lower_95": 4.256031748038005,
            "loss_tokens_upper_95": 4.393377990445476,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.515283530991232,
            "data_time": 0.005181970628532204,
            "batch_time": 0.019089130533708108,
            "samples_per_second": 2185545.064363544,
            "samples_per_second_per_gpu": 273193.133045443,
            "loss_sequences_lower_95": 4.957150945565806,
            "loss_sequences_upper_95": 5.271375074158756,
            "loss_tokens_lower_95": 4.094518779054231,
            "loss_tokens_upper_95": 4.250816590911382,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.163680729800707,
            "data_time": 0.02471887000969478,
            "batch_time": 0.03885906721864428,
            "samples_per_second": 2020840.4942029847,
            "samples_per_second_per_gpu": 252605.0617753731,
            "loss_sequences_lower_95": 6.074940859999287,
            "loss_sequences_upper_95": 6.251016235351563,
            "loss_tokens_lower_95": 6.074457065808719,
            "loss_tokens_upper_95": 6.250166620402576,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.846410131454468,
            "data_time": 0.04868196982603807,
            "batch_time": 0.06348721339152409,
            "samples_per_second": 1749768.0929115152,
            "samples_per_second_per_gpu": 218721.0116139394,
            "loss_sequences_lower_95": 3.709058135986328,
            "loss_sequences_upper_95": 4.105970771789551,
            "loss_tokens_lower_95": 3.526855588854959,
            "loss_tokens_upper_95": 3.992770817028175,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.246864736751256,
            "data_time": 0.003489037964241636,
            "batch_time": 0.017254668023201098,
            "samples_per_second": 2229670.518108704,
            "samples_per_second_per_gpu": 278708.814763588,
            "loss_sequences_lower_95": 5.197720908532977,
            "loss_sequences_upper_95": 5.295482786211777,
            "loss_tokens_lower_95": 5.197622240236374,
            "loss_tokens_upper_95": 5.296424052024692,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.384038469422362,
            "data_time": 0.004909490099933369,
            "batch_time": 0.018762153280111937,
            "samples_per_second": 2206973.585700246,
            "samples_per_second_per_gpu": 275871.69821253075,
            "loss_sequences_lower_95": 5.3308721318975225,
            "loss_sequences_upper_95": 5.4367709772983215,
            "loss_tokens_lower_95": 5.328906230004862,
            "loss_tokens_upper_95": 5.438070331295429,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.86321385504761,
            "data_time": 0.0035542306323288328,
            "batch_time": 0.01728241953427539,
            "samples_per_second": 2228485.5825286233,
            "samples_per_second_per_gpu": 278560.6978160779,
            "loss_sequences_lower_95": 3.99236480746098,
            "loss_sequences_upper_95": 4.118567565320236,
            "loss_tokens_lower_95": 3.7123177559200484,
            "loss_tokens_upper_95": 3.773383740049814,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.839783174037933,
            "data_time": 0.010988831520080566,
            "batch_time": 0.025005093775689602,
            "samples_per_second": 2120179.5143123167,
            "samples_per_second_per_gpu": 265022.4392890396,
            "loss_sequences_lower_95": 6.038972485351563,
            "loss_sequences_upper_95": 6.583462158203125,
            "loss_tokens_lower_95": 5.1989035242208175,
            "loss_tokens_upper_95": 5.559386504177697,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.22638900578022,
            "data_time": 0.16395051777362823,
            "batch_time": 0.1805906444787979,
            "samples_per_second": 791914.5789441218,
            "samples_per_second_per_gpu": 98989.32236801523,
            "loss_sequences_lower_95": 3.986845076084137,
            "loss_sequences_upper_95": 4.520384216308594,
            "loss_tokens_lower_95": 3.8042066026007992,
            "loss_tokens_upper_95": 4.561104574970815,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.365978812349254,
            "data_time": 0.028104662895202637,
            "batch_time": 0.04251935887844004,
            "samples_per_second": 1830803.9425745753,
            "samples_per_second_per_gpu": 228850.4928218219,
            "loss_sequences_lower_95": 4.601063730524874,
            "loss_sequences_upper_95": 5.113959108549973,
            "loss_tokens_lower_95": 3.57413654176869,
            "loss_tokens_upper_95": 3.94375792012005,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.163589979999187,
            "data_time": 0.0030715540051460266,
            "batch_time": 0.01696669475899802,
            "samples_per_second": 2206502.3253786773,
            "samples_per_second_per_gpu": 275812.79067233467,
            "loss_sequences_lower_95": 3.1442310764884365,
            "loss_sequences_upper_95": 3.1830918431646595,
            "loss_tokens_lower_95": 3.1434822665806577,
            "loss_tokens_upper_95": 3.183505996964081,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.728466858055899,
            "data_time": 0.00267751734063382,
            "batch_time": 0.016372246477647397,
            "samples_per_second": 2253014.1858804873,
            "samples_per_second_per_gpu": 281626.7732350609,
            "loss_sequences_lower_95": 3.6986035611082135,
            "loss_sequences_upper_95": 3.881320404224481,
            "loss_tokens_lower_95": 3.520909144797553,
            "loss_tokens_upper_95": 3.698271256522284,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4912410909002953,
            "data_time": 0.01784743865331014,
            "batch_time": 0.03179390562905206,
            "samples_per_second": 2019783.6855942656,
            "samples_per_second_per_gpu": 252472.9606992832,
            "loss_sequences_lower_95": 3.3345913170894863,
            "loss_sequences_upper_95": 3.7405519813845007,
            "loss_tokens_lower_95": 3.232910105434421,
            "loss_tokens_upper_95": 3.541214212281741,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8596790195358204,
            "data_time": 0.004691171273589134,
            "batch_time": 0.01852441094815731,
            "samples_per_second": 2196651.98311888,
            "samples_per_second_per_gpu": 274581.49788986,
            "loss_sequences_lower_95": 3.888886861341999,
            "loss_sequences_upper_95": 4.036303672399122,
            "loss_tokens_lower_95": 3.71423225345121,
            "loss_tokens_upper_95": 3.8620464376891075,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.230733502928804,
            "data_time": 0.031415567511603945,
            "batch_time": 0.04647067614964077,
            "samples_per_second": 1871025.5229977802,
            "samples_per_second_per_gpu": 233878.19037472253,
            "loss_sequences_lower_95": 3.0730934515231993,
            "loss_sequences_upper_95": 3.5540935609398816,
            "loss_tokens_lower_95": 2.9600063902713347,
            "loss_tokens_upper_95": 3.334180932099434,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.64993057014418,
            "data_time": 0.0024061668314459416,
            "batch_time": 0.016211554997452155,
            "samples_per_second": 2234640.9539981056,
            "samples_per_second_per_gpu": 279330.1192497632,
            "loss_sequences_lower_95": 5.640383926394654,
            "loss_sequences_upper_95": 5.659429395644754,
            "loss_tokens_lower_95": 5.640414303563838,
            "loss_tokens_upper_95": 5.659383585701515,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.4989639092417597,
            "data_time": 0.048723667318170724,
            "batch_time": 0.06383940956809304,
            "samples_per_second": 1740065.6052348877,
            "samples_per_second_per_gpu": 217508.20065436096,
            "loss_sequences_lower_95": 1.4261629937921914,
            "loss_sequences_upper_95": 1.6360261083806602,
            "loss_tokens_lower_95": 1.2788243523151628,
            "loss_tokens_upper_95": 1.577310049472836,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.630225480355682,
            "data_time": 0.0017248851880243819,
            "batch_time": 0.015438838744003635,
            "samples_per_second": 2249914.175952813,
            "samples_per_second_per_gpu": 281239.27199410164,
            "loss_sequences_lower_95": 6.0602598127129195,
            "loss_sequences_upper_95": 6.111532609407757,
            "loss_tokens_lower_95": 4.9694034816247585,
            "loss_tokens_upper_95": 5.021619983075435,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.674432243347168,
            "data_time": 0.006308522016283066,
            "batch_time": 0.02002149252664475,
            "samples_per_second": 2215695.0947615504,
            "samples_per_second_per_gpu": 276961.8868451938,
            "loss_sequences_lower_95": 5.650845874023437,
            "loss_sequences_upper_95": 5.883922436523438,
            "loss_tokens_lower_95": 5.470437081581109,
            "loss_tokens_upper_95": 5.690197461239608,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.103095113712809,
            "data_time": 0.023085278979802535,
            "batch_time": 0.03750832404120494,
            "samples_per_second": 2002323.0108163075,
            "samples_per_second_per_gpu": 250290.37635203844,
            "loss_sequences_lower_95": 4.959384792162025,
            "loss_sequences_upper_95": 5.251356532884681,
            "loss_tokens_lower_95": 4.95889624554178,
            "loss_tokens_upper_95": 5.246326519510021,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.656790805585457,
            "data_time": 0.004651551146105111,
            "batch_time": 0.018397836800081183,
            "samples_per_second": 2224566.0326890177,
            "samples_per_second_per_gpu": 278070.7540861272,
            "loss_sequences_lower_95": 6.563984522964016,
            "loss_sequences_upper_95": 6.749397601503315,
            "loss_tokens_lower_95": 6.560740116003788,
            "loss_tokens_upper_95": 6.752743271336411,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.4275718422730763,
            "data_time": 0.004119735765964427,
            "batch_time": 0.018206912152310635,
            "samples_per_second": 2190584.838384018,
            "samples_per_second_per_gpu": 273823.1047980022,
            "loss_sequences_lower_95": 1.4931980875651043,
            "loss_sequences_upper_95": 1.5850503580729167,
            "loss_tokens_lower_95": 1.3152022215136054,
            "loss_tokens_upper_95": 1.3858852564463287,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.1742081494558425,
            "data_time": 0.02325833056654249,
            "batch_time": 0.03730610013008118,
            "samples_per_second": 1958918.5182948927,
            "samples_per_second_per_gpu": 244864.8147868616,
            "loss_sequences_lower_95": 5.860970575241815,
            "loss_sequences_upper_95": 6.487642415364583,
            "loss_tokens_lower_95": 5.859987909226191,
            "loss_tokens_upper_95": 6.489600641159784,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.7709073275327682,
            "data_time": 0.1516675055027008,
            "batch_time": 0.16851378977298737,
            "samples_per_second": 957918.2952986979,
            "samples_per_second_per_gpu": 119739.78691233724,
            "loss_sequences_lower_95": 2.5199618697166444,
            "loss_sequences_upper_95": 3.6773320376873015,
            "loss_tokens_lower_95": 2.2015699123107284,
            "loss_tokens_upper_95": 2.7449303569990335,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.408632063388825,
            "data_time": 0.005941916079748245,
            "batch_time": 0.019976350050123912,
            "samples_per_second": 2175133.220928713,
            "samples_per_second_per_gpu": 271891.65261608915,
            "loss_sequences_lower_95": 7.3444531005859375,
            "loss_sequences_upper_95": 7.669037817382812,
            "loss_tokens_lower_95": 7.135592418999841,
            "loss_tokens_upper_95": 7.421484209760998,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.277524571895599,
            "data_time": 0.005994085281614273,
            "batch_time": 0.02026600970162286,
            "samples_per_second": 2162686.1569956867,
            "samples_per_second_per_gpu": 270335.76962446084,
            "loss_sequences_lower_95": 7.406906091308594,
            "loss_sequences_upper_95": 7.649556201171875,
            "loss_tokens_lower_95": 6.995907838808995,
            "loss_tokens_upper_95": 7.200090064295662,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.348236541799199,
            "data_time": 0.004454982081384563,
            "batch_time": 0.018177992045680016,
            "samples_per_second": 2233021.5340682226,
            "samples_per_second_per_gpu": 279127.6917585278,
            "loss_sequences_lower_95": 6.328079150996271,
            "loss_sequences_upper_95": 6.368644535833264,
            "loss_tokens_lower_95": 6.328317136953151,
            "loss_tokens_upper_95": 6.3681017628540895,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.740769625442552,
            "data_time": 0.0085635603014436,
            "batch_time": 0.0225981196608068,
            "samples_per_second": 2135853.541917111,
            "samples_per_second_per_gpu": 266981.69273963885,
            "loss_sequences_lower_95": 4.661873353644633,
            "loss_sequences_upper_95": 4.817464858375937,
            "loss_tokens_lower_95": 4.6605510377664165,
            "loss_tokens_upper_95": 4.817427112210181,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.872647797107697,
            "data_time": 0.0062691645016745914,
            "batch_time": 0.02037430802981059,
            "samples_per_second": 2171151.595335774,
            "samples_per_second_per_gpu": 271393.94941697177,
            "loss_sequences_lower_95": 6.798767932128906,
            "loss_sequences_upper_95": 6.948056408691406,
            "loss_tokens_lower_95": 6.8002487060546875,
            "loss_tokens_upper_95": 6.943977270507813,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.451795908890474,
            "data_time": 0.0023887037554850875,
            "batch_time": 0.016076046667838143,
            "samples_per_second": 2250409.2432171325,
            "samples_per_second_per_gpu": 281301.15540214156,
            "loss_sequences_lower_95": 3.9351502261707663,
            "loss_sequences_upper_95": 4.018647345449976,
            "loss_tokens_lower_95": 2.854140878273898,
            "loss_tokens_upper_95": 2.9124001817573477,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.576593555621247,
            "data_time": 0.0197452221597944,
            "batch_time": 0.034703881399972096,
            "samples_per_second": 2008406.5154268835,
            "samples_per_second_per_gpu": 251050.81442836043,
            "loss_sequences_lower_95": 5.410328765413654,
            "loss_sequences_upper_95": 5.741856976409457,
            "loss_tokens_lower_95": 5.4124850714384625,
            "loss_tokens_upper_95": 5.740779170705311,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.422821271185781,
            "data_time": 0.011278461664915085,
            "batch_time": 0.02518413309007883,
            "samples_per_second": 2181671.5575029273,
            "samples_per_second_per_gpu": 272708.9446878659,
            "loss_sequences_lower_95": 5.306895895565257,
            "loss_sequences_upper_95": 5.533872632793352,
            "loss_tokens_lower_95": 5.3083977673100495,
            "loss_tokens_upper_95": 5.534601643880208,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.2851072092574745,
            "data_time": 0.0022371624314388534,
            "batch_time": 0.016058829999858363,
            "samples_per_second": 2230345.488530567,
            "samples_per_second_per_gpu": 278793.18606632086,
            "loss_sequences_lower_95": 4.746028547969122,
            "loss_sequences_upper_95": 4.83586010164177,
            "loss_tokens_lower_95": 3.6076765896519962,
            "loss_tokens_upper_95": 3.6842344640745868,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.004818452098382,
            "data_time": 0.027531810104846954,
            "batch_time": 0.04229255517323812,
            "samples_per_second": 1948889.5336771805,
            "samples_per_second_per_gpu": 243611.19170964757,
            "loss_sequences_lower_95": 5.938263472299727,
            "loss_sequences_upper_95": 6.069074568168197,
            "loss_tokens_lower_95": 5.939393930586557,
            "loss_tokens_upper_95": 6.0694640144469245,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.0739670808162165,
            "data_time": 0.0042861571969857876,
            "batch_time": 0.01809720605866522,
            "samples_per_second": 2219994.281699823,
            "samples_per_second_per_gpu": 277499.2852124779,
            "loss_sequences_lower_95": 5.035162297520069,
            "loss_sequences_upper_95": 5.1121240234375,
            "loss_tokens_lower_95": 5.036046206039756,
            "loss_tokens_upper_95": 5.112228772457951,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.732144260869442,
            "data_time": 0.024456362290815874,
            "batch_time": 0.03841665441339666,
            "samples_per_second": 1917965.8722094286,
            "samples_per_second_per_gpu": 239745.73402617857,
            "loss_sequences_lower_95": 5.556107722902761,
            "loss_sequences_upper_95": 5.904044290190761,
            "loss_tokens_lower_95": 5.55922878228345,
            "loss_tokens_upper_95": 5.907233999307873,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.788646268844604,
            "data_time": 0.08206019550561905,
            "batch_time": 0.09709995239973068,
            "samples_per_second": 1429836.3769021342,
            "samples_per_second_per_gpu": 178729.54711276677,
            "loss_sequences_lower_95": 4.470307922363281,
            "loss_sequences_upper_95": 5.336496772766114,
            "loss_tokens_lower_95": 3.961709170871311,
            "loss_tokens_upper_95": 5.114523294236925,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8398008426030477,
            "data_time": 0.08167329430580139,
            "batch_time": 0.0969197154045105,
            "samples_per_second": 1355190.6444166505,
            "samples_per_second_per_gpu": 169398.83055208132,
            "loss_sequences_lower_95": 3.67938014348348,
            "loss_sequences_upper_95": 4.516837959289551,
            "loss_tokens_lower_95": 2.971179662125834,
            "loss_tokens_upper_95": 4.155412438210476,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.406542431096792,
            "data_time": 0.003679174475981391,
            "batch_time": 0.017505539944061045,
            "samples_per_second": 2223549.3176213093,
            "samples_per_second_per_gpu": 277943.66470266366,
            "loss_sequences_lower_95": 5.384866114575662,
            "loss_sequences_upper_95": 5.427878284931885,
            "loss_tokens_lower_95": 5.38531074535162,
            "loss_tokens_upper_95": 5.428178660599227,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 0.8073466346118756,
            "data_time": 0.0016071565155779508,
            "batch_time": 0.015384048419134362,
            "samples_per_second": 2239785.943879456,
            "samples_per_second_per_gpu": 279973.242984932,
            "loss_sequences_lower_95": 0.9700795938840996,
            "loss_sequences_upper_95": 0.997646401230617,
            "loss_tokens_lower_95": 0.6374220725434983,
            "loss_tokens_upper_95": 0.650652357702373,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.3680029478598765,
            "data_time": 0.03819833695888519,
            "batch_time": 0.05280652269721031,
            "samples_per_second": 1915324.636235266,
            "samples_per_second_per_gpu": 239415.57952940825,
            "loss_sequences_lower_95": 5.3967387161855624,
            "loss_sequences_upper_95": 5.793834896538201,
            "loss_tokens_lower_95": 5.027396967978538,
            "loss_tokens_upper_95": 5.322252419456982,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.516660496995256,
            "data_time": 0.12473770550319127,
            "batch_time": 0.14052477337065197,
            "samples_per_second": 1048085.4472139286,
            "samples_per_second_per_gpu": 131010.68090174107,
            "loss_sequences_lower_95": 8.043365849675359,
            "loss_sequences_upper_95": 9.225603505727406,
            "loss_tokens_lower_95": 7.449967410240644,
            "loss_tokens_upper_95": 9.273416778187693,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.25543801958968,
            "data_time": 0.030133991014389766,
            "batch_time": 0.04463585501625424,
            "samples_per_second": 1914937.934474647,
            "samples_per_second_per_gpu": 239367.24180933088,
            "loss_sequences_lower_95": 5.23373824328911,
            "loss_sequences_upper_95": 5.598487323667944,
            "loss_tokens_lower_95": 4.844884077349664,
            "loss_tokens_upper_95": 5.094621218592304,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.416557793210193,
            "data_time": 0.03152726093928019,
            "batch_time": 0.0465671732312157,
            "samples_per_second": 1870472.1597312433,
            "samples_per_second_per_gpu": 233809.0199664054,
            "loss_sequences_lower_95": 5.380843250925948,
            "loss_sequences_upper_95": 5.70496317235435,
            "loss_tokens_lower_95": 5.041893706663393,
            "loss_tokens_upper_95": 5.250421191967427,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.3865845741295235,
            "data_time": 0.03181724605106172,
            "batch_time": 0.04670805306661697,
            "samples_per_second": 1880518.9795646153,
            "samples_per_second_per_gpu": 235064.8724455769,
            "loss_sequences_lower_95": 5.375490123469655,
            "loss_sequences_upper_95": 5.803097924953554,
            "loss_tokens_lower_95": 4.907342989078673,
            "loss_tokens_upper_95": 5.232487676443897,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.5741484688549505,
            "data_time": 0.03283241816929409,
            "batch_time": 0.04705425387337094,
            "samples_per_second": 1953877.7684632007,
            "samples_per_second_per_gpu": 244234.72105790008,
            "loss_sequences_lower_95": 5.5214311646252145,
            "loss_sequences_upper_95": 5.834882354736328,
            "loss_tokens_lower_95": 5.2347567739516405,
            "loss_tokens_upper_95": 5.430155540775287,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.742881235869034,
            "data_time": 0.03273915361475061,
            "batch_time": 0.04748538394033173,
            "samples_per_second": 1946723.2219099528,
            "samples_per_second_per_gpu": 243340.4027387441,
            "loss_sequences_lower_95": 4.6807848628263296,
            "loss_sequences_upper_95": 4.9203190394810274,
            "loss_tokens_lower_95": 4.477592257878766,
            "loss_tokens_upper_95": 4.624333092644048,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.289434044826321,
            "data_time": 0.03223807187307449,
            "batch_time": 0.04707396881920951,
            "samples_per_second": 1901208.4246577404,
            "samples_per_second_per_gpu": 237651.05308221755,
            "loss_sequences_lower_95": 4.276497194243641,
            "loss_sequences_upper_95": 4.534705194612829,
            "loss_tokens_lower_95": 4.018932797901598,
            "loss_tokens_upper_95": 4.1429394001469255,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/params.txt",
    "uuid": "f859cec4-21b6-4818-a08b-3315a17437e2",
    "creation_date": "2023_12_14-05_01_12"
}