{
    "name": "rw_original-d=1024_l=24_h=8-0.5",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 4116162560,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.5
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "823232512",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=1024_l=24_h=8-0.5",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.3194817662239076,
            "data_time": 0.044789206236600876,
            "batch_time": 0.4463574290275574,
            "samples_per_second": 693175.1390960854,
            "samples_per_second_per_gpu": 86646.89238701068,
            "loss_sequences_lower_95": 3.2563382720947267,
            "loss_sequences_upper_95": 3.3833251698811853,
            "loss_tokens_lower_95": 3.3061235618591307,
            "loss_tokens_upper_95": 3.332616475423177,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2496822678154627,
            "data_time": 0.0010589674282412787,
            "batch_time": 0.03690264668321171,
            "samples_per_second": 893913.6344216607,
            "samples_per_second_per_gpu": 111739.20430270759,
            "loss_sequences_lower_95": 3.247303378201508,
            "loss_sequences_upper_95": 3.252072119966117,
            "loss_tokens_lower_95": 3.2391630364583333,
            "loss_tokens_upper_95": 3.260194364583333,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8870770673362576,
            "data_time": 0.009653864860534668,
            "batch_time": 0.04492034149169922,
            "samples_per_second": 869751.0954364797,
            "samples_per_second_per_gpu": 108718.88692955996,
            "loss_sequences_lower_95": 2.840930661571269,
            "loss_sequences_upper_95": 2.9453122633330673,
            "loss_tokens_lower_95": 2.87478709375,
            "loss_tokens_upper_95": 2.89952621875,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3146669759455416,
            "data_time": 0.0016168698080276187,
            "batch_time": 0.036983194908029156,
            "samples_per_second": 904262.748813381,
            "samples_per_second_per_gpu": 113032.84360167262,
            "loss_sequences_lower_95": 3.2813391893524484,
            "loss_sequences_upper_95": 3.349159602528995,
            "loss_tokens_lower_95": 3.302932203125,
            "loss_tokens_upper_95": 3.3264038958333333,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.308159417873002,
            "data_time": 0.010120663509900827,
            "batch_time": 0.0455240177443303,
            "samples_per_second": 865173.4550733548,
            "samples_per_second_per_gpu": 108146.68188416935,
            "loss_sequences_lower_95": 3.2564359272570327,
            "loss_sequences_upper_95": 3.3745802248082675,
            "loss_tokens_lower_95": 3.297338729166667,
            "loss_tokens_upper_95": 3.3188102135416666,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.416031663552342,
            "data_time": 0.004015554228554602,
            "batch_time": 0.0394670976244885,
            "samples_per_second": 899187.9065967038,
            "samples_per_second_per_gpu": 112398.48832458798,
            "loss_sequences_lower_95": 3.3720532425725485,
            "loss_sequences_upper_95": 3.464478269326776,
            "loss_tokens_lower_95": 3.4038878854166668,
            "loss_tokens_upper_95": 3.4278522395833333,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.988749580918526,
            "data_time": 0.0017882472733688976,
            "batch_time": 0.03721921090006245,
            "samples_per_second": 905472.9136006737,
            "samples_per_second_per_gpu": 113184.11420008421,
            "loss_sequences_lower_95": 2.96097524214764,
            "loss_sequences_upper_95": 3.0161032714843747,
            "loss_tokens_lower_95": 2.9750466145833334,
            "loss_tokens_upper_95": 3.0030832291666667,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.777078174571092,
            "data_time": 0.0017186177420047775,
            "batch_time": 0.037655066466640595,
            "samples_per_second": 905031.1885972781,
            "samples_per_second_per_gpu": 113128.89857465976,
            "loss_sequences_lower_95": 3.7555275175883507,
            "loss_sequences_upper_95": 3.8008811150196333,
            "loss_tokens_lower_95": 3.76567228125,
            "loss_tokens_upper_95": 3.7884706875000003,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.405811643697382,
            "data_time": 0.010659233918265692,
            "batch_time": 0.04642520915894281,
            "samples_per_second": 860162.0057711757,
            "samples_per_second_per_gpu": 107520.25072139697,
            "loss_sequences_lower_95": 3.3258606329196834,
            "loss_sequences_upper_95": 3.5037177729412794,
            "loss_tokens_lower_95": 3.3944926927083334,
            "loss_tokens_upper_95": 3.4172067552083334,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.4565746181096015,
            "data_time": 0.0096067413687706,
            "batch_time": 0.04523643385618925,
            "samples_per_second": 871225.4050806196,
            "samples_per_second_per_gpu": 108903.17563507745,
            "loss_sequences_lower_95": 4.3550499475049405,
            "loss_sequences_upper_95": 4.583343626482213,
            "loss_tokens_lower_95": 4.443327322916667,
            "loss_tokens_upper_95": 4.46993,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.456936136506854,
            "data_time": 0.001310929300753049,
            "batch_time": 0.03670460820426742,
            "samples_per_second": 907121.634382849,
            "samples_per_second_per_gpu": 113390.20429785612,
            "loss_sequences_lower_95": 3.4468316990822427,
            "loss_sequences_upper_95": 3.4673658293862206,
            "loss_tokens_lower_95": 3.4460291770833336,
            "loss_tokens_upper_95": 3.4679894166666667,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3045454137477215,
            "data_time": 0.002578796891745282,
            "batch_time": 0.038038553147391414,
            "samples_per_second": 902559.7873744044,
            "samples_per_second_per_gpu": 112819.97342180055,
            "loss_sequences_lower_95": 3.283614722096587,
            "loss_sequences_upper_95": 3.326483197512104,
            "loss_tokens_lower_95": 3.2934485989583333,
            "loss_tokens_upper_95": 3.3156522447916665,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.817569542608203,
            "data_time": 0.010251008474779694,
            "batch_time": 0.046557162590177635,
            "samples_per_second": 862261.1108191446,
            "samples_per_second_per_gpu": 107782.63885239308,
            "loss_sequences_lower_95": 3.7395913516773898,
            "loss_sequences_upper_95": 3.91535736764904,
            "loss_tokens_lower_95": 3.8046935416666665,
            "loss_tokens_upper_95": 3.8301084270833337,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.060231884968014,
            "data_time": 0.009769486241131664,
            "batch_time": 0.045925421543805245,
            "samples_per_second": 868249.987160875,
            "samples_per_second_per_gpu": 108531.24839510937,
            "loss_sequences_lower_95": 2.979853370184811,
            "loss_sequences_upper_95": 3.153205358812134,
            "loss_tokens_lower_95": 3.048819114583333,
            "loss_tokens_upper_95": 3.0716335781249997,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.018286081877622,
            "data_time": 0.08854106494358607,
            "batch_time": 0.12476812090192523,
            "samples_per_second": 502027.34349488147,
            "samples_per_second_per_gpu": 62753.41793686018,
            "loss_sequences_lower_95": 3.9508095481178978,
            "loss_sequences_upper_95": 4.092888598008589,
            "loss_tokens_lower_95": 3.995599824732,
            "loss_tokens_upper_95": 4.041346558657559,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.315048374170465,
            "data_time": 0.013680521737445484,
            "batch_time": 0.04909250004725023,
            "samples_per_second": 853801.018884312,
            "samples_per_second_per_gpu": 106725.127360539,
            "loss_sequences_lower_95": 3.2526240846506016,
            "loss_sequences_upper_95": 3.3756373814174108,
            "loss_tokens_lower_95": 3.3028810052083335,
            "loss_tokens_upper_95": 3.3269921718749997,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.2719457772287655,
            "data_time": 0.012926574796438217,
            "batch_time": 0.0485701858997345,
            "samples_per_second": 863015.6787368477,
            "samples_per_second_per_gpu": 107876.95984210596,
            "loss_sequences_lower_95": 5.186277694500846,
            "loss_sequences_upper_95": 5.383782580534198,
            "loss_tokens_lower_95": 5.26032546875,
            "loss_tokens_upper_95": 5.283505197916667,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6609216635344457,
            "data_time": 0.03582412749528885,
            "batch_time": 0.07159135118126869,
            "samples_per_second": 773326.1828392075,
            "samples_per_second_per_gpu": 96665.77285490093,
            "loss_sequences_lower_95": 3.5148606659936124,
            "loss_sequences_upper_95": 3.91826256298628,
            "loss_tokens_lower_95": 3.646904198068087,
            "loss_tokens_upper_95": 3.674895814989434,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.9348409558154125,
            "data_time": 0.0015920330017194118,
            "batch_time": 0.03707788161644903,
            "samples_per_second": 899774.1693164761,
            "samples_per_second_per_gpu": 112471.77116455951,
            "loss_sequences_lower_95": 3.9151139772646344,
            "loss_sequences_upper_95": 3.954717738525495,
            "loss_tokens_lower_95": 3.914568717118733,
            "loss_tokens_upper_95": 3.954732711742006,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.902431641435082,
            "data_time": 0.0019974959124425414,
            "batch_time": 0.03751935183432451,
            "samples_per_second": 897306.8885812412,
            "samples_per_second_per_gpu": 112163.36107265516,
            "loss_sequences_lower_95": 2.9007420445457206,
            "loss_sequences_upper_95": 2.925609636791102,
            "loss_tokens_lower_95": 2.8823127992515705,
            "loss_tokens_upper_95": 2.900934125369664,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.234862857795179,
            "data_time": 0.0031104970944590663,
            "batch_time": 0.038618716061059086,
            "samples_per_second": 895693.4786839555,
            "samples_per_second_per_gpu": 111961.68483549444,
            "loss_sequences_lower_95": 4.509750413782032,
            "loss_sequences_upper_95": 4.806558768119686,
            "loss_tokens_lower_95": 3.668375296203017,
            "loss_tokens_upper_95": 3.8849049427976827,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.123008259537319,
            "data_time": 0.003341917027818396,
            "batch_time": 0.03882700648713619,
            "samples_per_second": 893152.6030359663,
            "samples_per_second_per_gpu": 111644.07537949579,
            "loss_sequences_lower_95": 4.208648470052084,
            "loss_sequences_upper_95": 4.4037094482421875,
            "loss_tokens_lower_95": 3.8924759544516507,
            "loss_tokens_upper_95": 4.0344964745479555,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8935755589769316,
            "data_time": 0.004466622483496573,
            "batch_time": 0.039960322516716086,
            "samples_per_second": 889784.3008546013,
            "samples_per_second_per_gpu": 111223.03760682516,
            "loss_sequences_lower_95": 2.9375741398994264,
            "loss_sequences_upper_95": 2.9948131130354434,
            "loss_tokens_lower_95": 2.802110506807989,
            "loss_tokens_upper_95": 2.832235889202804,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.402633009173653,
            "data_time": 0.0227450430393219,
            "batch_time": 0.05862760969570705,
            "samples_per_second": 834513.8244684115,
            "samples_per_second_per_gpu": 104314.22805855144,
            "loss_sequences_lower_95": 2.380234534523704,
            "loss_sequences_upper_95": 2.48230656710538,
            "loss_tokens_lower_95": 2.3347046515985586,
            "loss_tokens_upper_95": 2.382572767339766,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2248465119575966,
            "data_time": 0.020713014528155327,
            "batch_time": 0.05601998046040535,
            "samples_per_second": 822947.8328638661,
            "samples_per_second_per_gpu": 102868.47910798326,
            "loss_sequences_lower_95": 3.210195163026148,
            "loss_sequences_upper_95": 3.3923517219387755,
            "loss_tokens_lower_95": 3.105399907963778,
            "loss_tokens_upper_95": 3.1941415506516586,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6261360557874043,
            "data_time": 0.01729304515398466,
            "batch_time": 0.05250754723182091,
            "samples_per_second": 836587.869425179,
            "samples_per_second_per_gpu": 104573.48367814737,
            "loss_sequences_lower_95": 3.595824442545573,
            "loss_sequences_upper_95": 3.6877460123697916,
            "loss_tokens_lower_95": 3.4970436501709283,
            "loss_tokens_upper_95": 3.709082023705481,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.417315337391349,
            "data_time": 0.0013556228638070737,
            "batch_time": 0.03685192836104105,
            "samples_per_second": 900278.139857862,
            "samples_per_second_per_gpu": 112534.76748223275,
            "loss_sequences_lower_95": 5.423981330120319,
            "loss_sequences_upper_95": 5.505883018952069,
            "loss_tokens_lower_95": 5.280819834694085,
            "loss_tokens_upper_95": 5.363373890221104,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.247490893886346,
            "data_time": 0.003047773101985855,
            "batch_time": 0.03849823082853484,
            "samples_per_second": 896000.8720048392,
            "samples_per_second_per_gpu": 112000.1090006049,
            "loss_sequences_lower_95": 4.740637710519913,
            "loss_sequences_upper_95": 5.033626795296717,
            "loss_tokens_lower_95": 3.559776218840472,
            "loss_tokens_upper_95": 3.692909893482842,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.888775833637641,
            "data_time": 0.004829091397491661,
            "batch_time": 0.04021232152307356,
            "samples_per_second": 886327.317537432,
            "samples_per_second_per_gpu": 110790.914692179,
            "loss_sequences_lower_95": 4.28244058133799,
            "loss_sequences_upper_95": 4.606047459104362,
            "loss_tokens_lower_95": 3.498556048826234,
            "loss_tokens_upper_95": 3.649910733868284,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.987273760582214,
            "data_time": 0.02431644286428179,
            "batch_time": 0.060077665107590814,
            "samples_per_second": 830320.0841662113,
            "samples_per_second_per_gpu": 103790.01052077641,
            "loss_sequences_lower_95": 5.904581649989298,
            "loss_sequences_upper_95": 6.069058659523045,
            "loss_tokens_lower_95": 5.905629824285638,
            "loss_tokens_upper_95": 6.067171697747217,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2825271892547607,
            "data_time": 0.050713186080639176,
            "batch_time": 0.08643008653934185,
            "samples_per_second": 750959.2493502101,
            "samples_per_second_per_gpu": 93869.90616877626,
            "loss_sequences_lower_95": 3.149679862976074,
            "loss_sequences_upper_95": 3.4960289459228515,
            "loss_tokens_lower_95": 2.976001063911561,
            "loss_tokens_upper_95": 3.4206348398717017,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.402397187119741,
            "data_time": 0.0034138783355432056,
            "batch_time": 0.038901680817633316,
            "samples_per_second": 896392.4877449991,
            "samples_per_second_per_gpu": 112049.06096812489,
            "loss_sequences_lower_95": 4.350766868942714,
            "loss_sequences_upper_95": 4.454838794719966,
            "loss_tokens_lower_95": 4.349042603913447,
            "loss_tokens_upper_95": 4.455307084925953,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.850216029024241,
            "data_time": 0.004796935916919304,
            "batch_time": 0.04038723685807544,
            "samples_per_second": 888348.0303227295,
            "samples_per_second_per_gpu": 111043.50379034119,
            "loss_sequences_lower_95": 4.7911165704500664,
            "loss_sequences_upper_95": 4.907342154388053,
            "loss_tokens_lower_95": 4.7900126989116245,
            "loss_tokens_upper_95": 4.908197566351863,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3106001054501246,
            "data_time": 0.003463893420752902,
            "batch_time": 0.03890801145500035,
            "samples_per_second": 890793.0830229536,
            "samples_per_second_per_gpu": 111349.1353778692,
            "loss_sequences_lower_95": 3.442077397625646,
            "loss_sequences_upper_95": 3.5737368644905296,
            "loss_tokens_lower_95": 3.1565509794519264,
            "loss_tokens_upper_95": 3.214060138590766,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.216492115020752,
            "data_time": 0.010513676330447197,
            "batch_time": 0.046016507782042027,
            "samples_per_second": 860038.6268864316,
            "samples_per_second_per_gpu": 107504.82836080395,
            "loss_sequences_lower_95": 5.3789037109375,
            "loss_sequences_upper_95": 5.9107961914062495,
            "loss_tokens_lower_95": 4.650263206537475,
            "loss_tokens_upper_95": 5.00125707377668,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.698959067463875,
            "data_time": 0.1435866355895996,
            "batch_time": 0.18268483877182007,
            "samples_per_second": 492522.6173272729,
            "samples_per_second_per_gpu": 61565.32716590911,
            "loss_sequences_lower_95": 3.4712211012840273,
            "loss_sequences_upper_95": 3.9645351290702817,
            "loss_tokens_lower_95": 3.249285432662087,
            "loss_tokens_upper_95": 4.070608994056438,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.031805367990472,
            "data_time": 0.027656580539459876,
            "batch_time": 0.06357245242342036,
            "samples_per_second": 768692.9478768435,
            "samples_per_second_per_gpu": 96086.61848460544,
            "loss_sequences_lower_95": 4.29242188903107,
            "loss_sequences_upper_95": 4.804716614471085,
            "loss_tokens_lower_95": 3.219764160706736,
            "loss_tokens_upper_95": 3.5743757142558303,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.1456399731575635,
            "data_time": 0.0029443779753314126,
            "batch_time": 0.03839742143948873,
            "samples_per_second": 892926.7498889344,
            "samples_per_second_per_gpu": 111615.8437361168,
            "loss_sequences_lower_95": 2.125430856473897,
            "loss_sequences_upper_95": 2.165826925308541,
            "loss_tokens_lower_95": 2.1254331496252457,
            "loss_tokens_upper_95": 2.1658772519812826,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.67708429997827,
            "data_time": 0.0022387854295226554,
            "batch_time": 0.03775923123505858,
            "samples_per_second": 897439.5352102049,
            "samples_per_second_per_gpu": 112179.94190127561,
            "loss_sequences_lower_95": 2.6504126180289393,
            "loss_sequences_upper_95": 2.7943368992804554,
            "loss_tokens_lower_95": 2.526234000467098,
            "loss_tokens_upper_95": 2.666543117388295,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.144734821695111,
            "data_time": 0.01836923427051968,
            "batch_time": 0.054197337892320424,
            "samples_per_second": 813848.2504877911,
            "samples_per_second_per_gpu": 101731.03131097389,
            "loss_sequences_lower_95": 2.9975203014555434,
            "loss_sequences_upper_95": 3.412090678791423,
            "loss_tokens_lower_95": 2.878503694631345,
            "loss_tokens_upper_95": 3.170450317495736,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5404077993195737,
            "data_time": 0.004599379748106003,
            "batch_time": 0.03998211398720741,
            "samples_per_second": 887746.5974573174,
            "samples_per_second_per_gpu": 110968.32468216468,
            "loss_sequences_lower_95": 3.5853314261974645,
            "loss_sequences_upper_95": 3.740129088151144,
            "loss_tokens_lower_95": 3.3912791195868004,
            "loss_tokens_upper_95": 3.5329177730763575,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.746313728210403,
            "data_time": 0.029626213368915376,
            "batch_time": 0.06635749623889015,
            "samples_per_second": 797713.2137823394,
            "samples_per_second_per_gpu": 99714.15172279242,
            "loss_sequences_lower_95": 2.5901694134968083,
            "loss_sequences_upper_95": 3.0364336060314643,
            "loss_tokens_lower_95": 2.471453862400273,
            "loss_tokens_upper_95": 2.8184051961727667,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.082449093607098,
            "data_time": 0.0018109752328882586,
            "batch_time": 0.037294278276285744,
            "samples_per_second": 897749.6502480189,
            "samples_per_second_per_gpu": 112218.70628100236,
            "loss_sequences_lower_95": 5.073327927304211,
            "loss_sequences_upper_95": 5.091501513193264,
            "loss_tokens_lower_95": 5.073266098532207,
            "loss_tokens_upper_95": 5.091544314722319,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.1559249064297352,
            "data_time": 0.045301836187189276,
            "batch_time": 0.08215241865678267,
            "samples_per_second": 721894.6782627371,
            "samples_per_second_per_gpu": 90236.83478284214,
            "loss_sequences_lower_95": 1.1100827300432816,
            "loss_sequences_upper_95": 1.2496826320018584,
            "loss_tokens_lower_95": 0.9910800430157289,
            "loss_tokens_upper_95": 1.2191139119674386,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.238300737757353,
            "data_time": 0.0011742012598688425,
            "batch_time": 0.03669983434607177,
            "samples_per_second": 899143.9418478467,
            "samples_per_second_per_gpu": 112392.99273098084,
            "loss_sequences_lower_95": 4.56751706425249,
            "loss_sequences_upper_95": 4.6090340015723275,
            "loss_tokens_lower_95": 3.727217111943907,
            "loss_tokens_upper_95": 3.7686394584139262,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.168209335565567,
            "data_time": 0.005715601027957977,
            "batch_time": 0.04125308848562695,
            "samples_per_second": 881754.7970010422,
            "samples_per_second_per_gpu": 110219.34962513027,
            "loss_sequences_lower_95": 5.161255639648437,
            "loss_sequences_upper_95": 5.375267712402344,
            "loss_tokens_lower_95": 4.961523928424691,
            "loss_tokens_upper_95": 5.159740946593436,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.782611880095109,
            "data_time": 0.022021620960558874,
            "batch_time": 0.05787737692816783,
            "samples_per_second": 829110.6764546777,
            "samples_per_second_per_gpu": 103638.83455683471,
            "loss_sequences_lower_95": 4.6192126464843755,
            "loss_sequences_upper_95": 4.945262981912364,
            "loss_tokens_lower_95": 4.61891750169837,
            "loss_tokens_upper_95": 4.943943415102751,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.759892907287135,
            "data_time": 0.004497359316033053,
            "batch_time": 0.04016056082334863,
            "samples_per_second": 886240.9342628821,
            "samples_per_second_per_gpu": 110780.11678286026,
            "loss_sequences_lower_95": 6.6782454057173295,
            "loss_sequences_upper_95": 6.839745816317472,
            "loss_tokens_lower_95": 6.679736698035038,
            "loss_tokens_upper_95": 6.838540575432055,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.375376674691836,
            "data_time": 0.004078345412903643,
            "batch_time": 0.03964388909492087,
            "samples_per_second": 891798.001922163,
            "samples_per_second_per_gpu": 111474.75024027037,
            "loss_sequences_lower_95": 1.4204825358072917,
            "loss_sequences_upper_95": 1.4826013916015623,
            "loss_tokens_lower_95": 1.2876174121992547,
            "loss_tokens_upper_95": 1.3542067139355742,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.884216651462373,
            "data_time": 0.022199864898409163,
            "batch_time": 0.05758813662188394,
            "samples_per_second": 797725.7998424825,
            "samples_per_second_per_gpu": 99715.72498031032,
            "loss_sequences_lower_95": 5.531993553524925,
            "loss_sequences_upper_95": 6.22917221795945,
            "loss_tokens_lower_95": 5.53922360374814,
            "loss_tokens_upper_95": 6.231216619582403,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.4133430793881416,
            "data_time": 0.15336453914642334,
            "batch_time": 0.1947633922100067,
            "samples_per_second": 466108.180206996,
            "samples_per_second_per_gpu": 58263.5225258745,
            "loss_sequences_lower_95": 2.2198487341403963,
            "loss_sequences_upper_95": 3.3209598839282988,
            "loss_tokens_lower_95": 1.8767659232542686,
            "loss_tokens_upper_95": 2.3835878415451837,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.204798488616944,
            "data_time": 0.006008886628680759,
            "batch_time": 0.04161943613536774,
            "samples_per_second": 881858.9964896637,
            "samples_per_second_per_gpu": 110232.37456120797,
            "loss_sequences_lower_95": 7.152392370605469,
            "loss_sequences_upper_95": 7.430998852539062,
            "loss_tokens_lower_95": 6.960640544859084,
            "loss_tokens_upper_95": 7.208027496596077,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.00831917476654,
            "data_time": 0.0057631683728051565,
            "batch_time": 0.04124274140312558,
            "samples_per_second": 884009.5781473307,
            "samples_per_second_per_gpu": 110501.19726841633,
            "loss_sequences_lower_95": 7.127460021972657,
            "loss_sequences_upper_95": 7.353241870117187,
            "loss_tokens_lower_95": 6.73897941639021,
            "loss_tokens_upper_95": 6.944096466966074,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.695696040603326,
            "data_time": 0.00336578506291112,
            "batch_time": 0.0388678981308953,
            "samples_per_second": 891815.6547097706,
            "samples_per_second_per_gpu": 111476.95683872132,
            "loss_sequences_lower_95": 5.670082207378268,
            "loss_sequences_upper_95": 5.72096507487743,
            "loss_tokens_lower_95": 5.670353962165815,
            "loss_tokens_upper_95": 5.7215873183718156,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.344660708431824,
            "data_time": 0.008232210337935617,
            "batch_time": 0.04375791405623053,
            "samples_per_second": 870793.9311755392,
            "samples_per_second_per_gpu": 108849.2413969424,
            "loss_sequences_lower_95": 3.259723322292627,
            "loss_sequences_upper_95": 3.427191668391777,
            "loss_tokens_lower_95": 3.2590983166672665,
            "loss_tokens_upper_95": 3.428747146067348,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.510516190052033,
            "data_time": 0.005696205865769159,
            "batch_time": 0.04135890092168536,
            "samples_per_second": 882276.9879225614,
            "samples_per_second_per_gpu": 110284.62349032017,
            "loss_sequences_lower_95": 5.4448048828125,
            "loss_sequences_upper_95": 5.577855078125,
            "loss_tokens_lower_95": 5.445509558105469,
            "loss_tokens_upper_95": 5.577191552734375,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.724631733948483,
            "data_time": 0.0017738596478121492,
            "batch_time": 0.03741914090226864,
            "samples_per_second": 895936.9784989407,
            "samples_per_second_per_gpu": 111992.12231236759,
            "loss_sequences_lower_95": 3.1537804979452457,
            "loss_sequences_upper_95": 3.22643725609035,
            "loss_tokens_lower_95": 2.1864552659975796,
            "loss_tokens_upper_95": 2.239438370129143,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.566484115016994,
            "data_time": 0.018452019350869316,
            "batch_time": 0.05408331326075962,
            "samples_per_second": 822389.0081037816,
            "samples_per_second_per_gpu": 102798.6260129727,
            "loss_sequences_lower_95": 4.411866760253907,
            "loss_sequences_upper_95": 4.71794702330632,
            "loss_tokens_lower_95": 4.411922044896367,
            "loss_tokens_upper_95": 4.720059363521746,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.716979525136012,
            "data_time": 0.01043744944036007,
            "batch_time": 0.04611060302704573,
            "samples_per_second": 871903.3457101076,
            "samples_per_second_per_gpu": 108987.91821376345,
            "loss_sequences_lower_95": 4.609473817114737,
            "loss_sequences_upper_95": 4.822346550436581,
            "loss_tokens_lower_95": 4.613319247376685,
            "loss_tokens_upper_95": 4.821680393592984,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7195136353259666,
            "data_time": 0.0018529167590245034,
            "batch_time": 0.037402503935806033,
            "samples_per_second": 896404.7558391538,
            "samples_per_second_per_gpu": 112050.59447989422,
            "loss_sequences_lower_95": 4.23249375626331,
            "loss_sequences_upper_95": 4.326137353399411,
            "loss_tokens_lower_95": 3.0095939201295963,
            "loss_tokens_upper_95": 3.084146198605968,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.8145663120128495,
            "data_time": 0.02870031197865804,
            "batch_time": 0.0656574343641599,
            "samples_per_second": 803044.3089473433,
            "samples_per_second_per_gpu": 100380.53861841791,
            "loss_sequences_lower_95": 5.735160157541749,
            "loss_sequences_upper_95": 5.888917016730737,
            "loss_tokens_lower_95": 5.73660711056341,
            "loss_tokens_upper_95": 5.887539672851562,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.249003663923398,
            "data_time": 0.0033570206354534816,
            "batch_time": 0.03887778935414968,
            "samples_per_second": 892059.635173526,
            "samples_per_second_per_gpu": 111507.45439669075,
            "loss_sequences_lower_95": 4.219081419031919,
            "loss_sequences_upper_95": 4.278228539516437,
            "loss_tokens_lower_95": 4.2201724215166285,
            "loss_tokens_upper_95": 4.278426330753058,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.762819853222486,
            "data_time": 0.023530030250549318,
            "batch_time": 0.05954548879103227,
            "samples_per_second": 787923.5044698518,
            "samples_per_second_per_gpu": 98490.43805873148,
            "loss_sequences_lower_95": 4.58320162874981,
            "loss_sequences_upper_95": 4.944851410504684,
            "loss_tokens_lower_95": 4.578927353053418,
            "loss_tokens_upper_95": 4.944623624931261,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5016992489496865,
            "data_time": 0.08195517957210541,
            "batch_time": 0.11927132308483124,
            "samples_per_second": 635131.9450410582,
            "samples_per_second_per_gpu": 79391.49313013228,
            "loss_sequences_lower_95": 2.2776707140604655,
            "loss_sequences_upper_95": 2.8582308133443197,
            "loss_tokens_lower_95": 2.044658798641629,
            "loss_tokens_upper_95": 2.7637863636016844,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.638452923297882,
            "data_time": 0.07407569140195847,
            "batch_time": 0.11043228209018707,
            "samples_per_second": 640940.1788977874,
            "samples_per_second_per_gpu": 80117.52236222342,
            "loss_sequences_lower_95": 2.4746139272054037,
            "loss_sequences_upper_95": 3.2229140345255534,
            "loss_tokens_lower_95": 2.005118097883932,
            "loss_tokens_upper_95": 2.9239082679319917,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.182046585757181,
            "data_time": 0.0031265188063362755,
            "batch_time": 0.038618756154483035,
            "samples_per_second": 894224.9465797445,
            "samples_per_second_per_gpu": 111778.11832246806,
            "loss_sequences_lower_95": 5.161457465597386,
            "loss_sequences_upper_95": 5.202405896193851,
            "loss_tokens_lower_95": 5.162090908044919,
            "loss_tokens_upper_95": 5.20270233109352,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.6459137504098585,
            "data_time": 0.0011405508141940218,
            "batch_time": 0.03663549444248953,
            "samples_per_second": 899795.5894389227,
            "samples_per_second_per_gpu": 112474.44867986534,
            "loss_sequences_lower_95": 0.7402879964487665,
            "loss_sequences_upper_95": 0.7590041426605347,
            "loss_tokens_lower_95": 0.5487678839029954,
            "loss_tokens_upper_95": 0.5588236933948085,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.2117742140462076,
            "data_time": 0.03985461965203285,
            "batch_time": 0.09428196772933006,
            "samples_per_second": 790789.3068438607,
            "samples_per_second_per_gpu": 98848.66335548258,
            "loss_sequences_lower_95": 4.26644350787786,
            "loss_sequences_upper_95": 4.625998982106607,
            "loss_tokens_lower_95": 3.8985817714570286,
            "loss_tokens_upper_95": 4.147116405962141,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.33197290832932,
            "data_time": 0.12024889673505511,
            "batch_time": 0.15715574082874118,
            "samples_per_second": 492477.12900622847,
            "samples_per_second_per_gpu": 61559.64112577856,
            "loss_sequences_lower_95": 6.833542849566485,
            "loss_sequences_upper_95": 8.1084591221165,
            "loss_tokens_lower_95": 6.344449945143712,
            "loss_tokens_upper_95": 8.014721698525511,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.120283138461229,
            "data_time": 0.030580341815948486,
            "batch_time": 0.0668049000558399,
            "samples_per_second": 801583.2040298751,
            "samples_per_second_per_gpu": 100197.90050373439,
            "loss_sequences_lower_95": 4.121640293772628,
            "loss_sequences_upper_95": 4.453713412401153,
            "loss_tokens_lower_95": 3.7477173652360176,
            "loss_tokens_upper_95": 3.9581052716305716,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.245632732786784,
            "data_time": 0.03034208785919916,
            "batch_time": 0.06710371516999744,
            "samples_per_second": 798020.4759520957,
            "samples_per_second_per_gpu": 99752.55949401196,
            "loss_sequences_lower_95": 4.239181890720275,
            "loss_sequences_upper_95": 4.529995522847989,
            "loss_tokens_lower_95": 3.9114620401555356,
            "loss_tokens_upper_95": 4.091885048504016,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.207567800835865,
            "data_time": 0.029922425746917725,
            "batch_time": 0.06595391035079956,
            "samples_per_second": 806511.7170326655,
            "samples_per_second_per_gpu": 100813.96462908319,
            "loss_sequences_lower_95": 4.209360969357374,
            "loss_sequences_upper_95": 4.5723779073575646,
            "loss_tokens_lower_95": 3.7989514307567207,
            "loss_tokens_upper_95": 4.066683628286282,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.400550833562526,
            "data_time": 0.02949346247173491,
            "batch_time": 0.06509714183353242,
            "samples_per_second": 812654.8145255063,
            "samples_per_second_per_gpu": 101581.8518156883,
            "loss_sequences_lower_95": 4.377203304011647,
            "loss_sequences_upper_95": 4.662864610625476,
            "loss_tokens_lower_95": 4.084242823710694,
            "loss_tokens_upper_95": 4.253752977602949,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.629851667036921,
            "data_time": 0.03236208138642488,
            "batch_time": 0.06936153364770206,
            "samples_per_second": 801586.3231749157,
            "samples_per_second_per_gpu": 100198.29039686447,
            "loss_sequences_lower_95": 3.5604915642590256,
            "loss_sequences_upper_95": 3.779363189128615,
            "loss_tokens_lower_95": 3.4113015387357413,
            "loss_tokens_upper_95": 3.5405623674685582,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3093196110027594,
            "data_time": 0.03119774375643049,
            "batch_time": 0.06717075620378767,
            "samples_per_second": 806320.1880407727,
            "samples_per_second_per_gpu": 100790.02350509659,
            "loss_sequences_lower_95": 3.3209047177942788,
            "loss_sequences_upper_95": 3.5519911417147005,
            "loss_tokens_lower_95": 3.045551754869274,
            "loss_tokens_upper_95": 3.1550601477735083,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-0.5/params.txt",
    "uuid": "858c5303-b800-4fbb-a48e-c19c4c997a21",
    "creation_date": "2023_12_14-05_15_05"
}