{
    "name": "rw_original-d=1024_l=24_h=8-8.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 65858600960,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 8.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "13171720192",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=1024_l=24_h=8-8.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 2.7661501785119373,
            "data_time": 0.04422101005911827,
            "batch_time": 0.41740159690380096,
            "samples_per_second": 694003.0421394209,
            "samples_per_second_per_gpu": 86750.38026742761,
            "loss_sequences_lower_95": 2.708879019419352,
            "loss_sequences_upper_95": 2.823866335550944,
            "loss_tokens_lower_95": 2.7536727142333985,
            "loss_tokens_upper_95": 2.778229242960612,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.912887989185122,
            "data_time": 0.0011107225635513217,
            "batch_time": 0.036833597475412726,
            "samples_per_second": 896398.080530409,
            "samples_per_second_per_gpu": 112049.76006630113,
            "loss_sequences_lower_95": 2.9104218597424674,
            "loss_sequences_upper_95": 2.915345525204708,
            "loss_tokens_lower_95": 2.9027697447916667,
            "loss_tokens_upper_95": 2.922953078125,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.589055066206017,
            "data_time": 0.010014677047729492,
            "batch_time": 0.04568922519683838,
            "samples_per_second": 864871.0457578086,
            "samples_per_second_per_gpu": 108108.88071972608,
            "loss_sequences_lower_95": 2.551726915009168,
            "loss_sequences_upper_95": 2.6342418935347576,
            "loss_tokens_lower_95": 2.577666010416667,
            "loss_tokens_upper_95": 2.6003937760416664,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9304052403538496,
            "data_time": 0.0016089937600650285,
            "batch_time": 0.036979469813798606,
            "samples_per_second": 904618.7518490602,
            "samples_per_second_per_gpu": 113077.34398113252,
            "loss_sequences_lower_95": 2.9041366029719717,
            "loss_sequences_upper_95": 2.9573997060244848,
            "loss_tokens_lower_95": 2.919664848958333,
            "loss_tokens_upper_95": 2.9410952760416667,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9649346433200563,
            "data_time": 0.009965347578801007,
            "batch_time": 0.045667143931901785,
            "samples_per_second": 860618.7760793437,
            "samples_per_second_per_gpu": 107577.34700991797,
            "loss_sequences_lower_95": 2.922171026723205,
            "loss_sequences_upper_95": 3.0159805950467793,
            "loss_tokens_lower_95": 2.9545861718750004,
            "loss_tokens_upper_95": 2.97514434375,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.985927843058863,
            "data_time": 0.0039762776830922,
            "batch_time": 0.039464998504389885,
            "samples_per_second": 898340.9802398665,
            "samples_per_second_per_gpu": 112292.62252998332,
            "loss_sequences_lower_95": 2.950093623199567,
            "loss_sequences_upper_95": 3.023812507302643,
            "loss_tokens_lower_95": 2.9747565729166663,
            "loss_tokens_upper_95": 2.9969401458333333,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.437937759538086,
            "data_time": 0.0016346102243146538,
            "batch_time": 0.037026190543835935,
            "samples_per_second": 905522.6779448229,
            "samples_per_second_per_gpu": 113190.33474310287,
            "loss_sequences_lower_95": 2.4126478196747447,
            "loss_sequences_upper_95": 2.46292604033801,
            "loss_tokens_lower_95": 2.4255204687500003,
            "loss_tokens_upper_95": 2.4509947864583332,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4045739594304747,
            "data_time": 0.0017424040586566487,
            "batch_time": 0.03729634113463334,
            "samples_per_second": 904555.7128083226,
            "samples_per_second_per_gpu": 113069.46410104033,
            "loss_sequences_lower_95": 3.389146995664267,
            "loss_sequences_upper_95": 3.4211796977257856,
            "loss_tokens_lower_95": 3.393795125,
            "loss_tokens_upper_95": 3.4153737708333334,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0325645029060238,
            "data_time": 0.010058660355825273,
            "batch_time": 0.045735346892523385,
            "samples_per_second": 862597.4370705675,
            "samples_per_second_per_gpu": 107824.67963382094,
            "loss_sequences_lower_95": 2.9691582656488187,
            "loss_sequences_upper_95": 3.104908690413809,
            "loss_tokens_lower_95": 3.021756359375,
            "loss_tokens_upper_95": 3.04330453125,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.011545850825404,
            "data_time": 0.010496553033590317,
            "batch_time": 0.046514165587723255,
            "samples_per_second": 862212.6559860581,
            "samples_per_second_per_gpu": 107776.58199825726,
            "loss_sequences_lower_95": 3.9416598957046687,
            "loss_sequences_upper_95": 4.096884179398005,
            "loss_tokens_lower_95": 3.99906125,
            "loss_tokens_upper_95": 4.02408621875,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0875631881704457,
            "data_time": 0.001387496097802005,
            "batch_time": 0.036795748655487,
            "samples_per_second": 906553.6538356539,
            "samples_per_second_per_gpu": 113319.20672945674,
            "loss_sequences_lower_95": 3.077248489319241,
            "loss_sequences_upper_95": 3.098161648194463,
            "loss_tokens_lower_95": 3.0770561302083332,
            "loss_tokens_upper_95": 3.0981740729166667,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9105384543457413,
            "data_time": 0.002645425455060033,
            "batch_time": 0.0380733394305176,
            "samples_per_second": 902724.5583370425,
            "samples_per_second_per_gpu": 112840.5697921303,
            "loss_sequences_lower_95": 2.891688984366865,
            "loss_sequences_upper_95": 2.9302187270196662,
            "loss_tokens_lower_95": 2.9002261510416667,
            "loss_tokens_upper_95": 2.920960994791667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4335543219506377,
            "data_time": 0.009886114964843267,
            "batch_time": 0.045615739030800316,
            "samples_per_second": 854942.443209087,
            "samples_per_second_per_gpu": 106867.80540113588,
            "loss_sequences_lower_95": 3.376467864556922,
            "loss_sequences_upper_95": 3.499371176945994,
            "loss_tokens_lower_95": 3.4219098072916667,
            "loss_tokens_upper_95": 3.4452671458333333,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.689638839962526,
            "data_time": 0.010201454162597656,
            "batch_time": 0.04555200295619281,
            "samples_per_second": 865818.862787979,
            "samples_per_second_per_gpu": 108227.35784849737,
            "loss_sequences_lower_95": 2.6211697642526413,
            "loss_sequences_upper_95": 2.765432720611634,
            "loss_tokens_lower_95": 2.67874840625,
            "loss_tokens_upper_95": 2.700378541666667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.487568047913638,
            "data_time": 0.08732653515679496,
            "batch_time": 0.12288605315344674,
            "samples_per_second": 510112.83606922935,
            "samples_per_second_per_gpu": 63764.10450865367,
            "loss_sequences_lower_95": 3.4189996979453348,
            "loss_sequences_upper_95": 3.560479363528165,
            "loss_tokens_lower_95": 3.467296513644132,
            "loss_tokens_upper_95": 3.508838818290017,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8718009439918806,
            "data_time": 0.01470583135431463,
            "batch_time": 0.05023675479672172,
            "samples_per_second": 852427.5413444929,
            "samples_per_second_per_gpu": 106553.44266806162,
            "loss_sequences_lower_95": 2.816136307341017,
            "loss_sequences_upper_95": 2.9266057984822,
            "loss_tokens_lower_95": 2.860271859375,
            "loss_tokens_upper_95": 2.883135541666667,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.9918017827741075,
            "data_time": 0.013150202731291452,
            "batch_time": 0.04893199851115545,
            "samples_per_second": 860990.1454732682,
            "samples_per_second_per_gpu": 107623.76818415853,
            "loss_sequences_lower_95": 4.932024067498763,
            "loss_sequences_upper_95": 5.061516142268923,
            "loss_tokens_lower_95": 4.9803871979166665,
            "loss_tokens_upper_95": 5.003118885416666,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.10780538691849,
            "data_time": 0.038789600133895874,
            "batch_time": 0.07447084039449692,
            "samples_per_second": 772021.8103207502,
            "samples_per_second_per_gpu": 96502.72629009378,
            "loss_sequences_lower_95": 3.0083474831502945,
            "loss_sequences_upper_95": 3.273189006867956,
            "loss_tokens_lower_95": 3.095432744260694,
            "loss_tokens_upper_95": 3.120323368760406,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.708559012246631,
            "data_time": 0.0016269817574963755,
            "batch_time": 0.03709600523012374,
            "samples_per_second": 899777.6781472861,
            "samples_per_second_per_gpu": 112472.20976841076,
            "loss_sequences_lower_95": 1.7019702089323547,
            "loss_sequences_upper_95": 1.7153923553001709,
            "loss_tokens_lower_95": 1.701951111648314,
            "loss_tokens_upper_95": 1.715308180508697,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5851507043368405,
            "data_time": 0.001639751681856289,
            "batch_time": 0.03706824039198031,
            "samples_per_second": 900188.4154589769,
            "samples_per_second_per_gpu": 112523.55193237211,
            "loss_sequences_lower_95": 2.585000920936753,
            "loss_sequences_upper_95": 2.609338147942703,
            "loss_tokens_lower_95": 2.563313367486081,
            "loss_tokens_upper_95": 2.580958304853629,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1972117933005193,
            "data_time": 0.003174632366370975,
            "batch_time": 0.03908953702640803,
            "samples_per_second": 886229.5337829802,
            "samples_per_second_per_gpu": 110778.69172287252,
            "loss_sequences_lower_95": 3.4313805696814477,
            "loss_sequences_upper_95": 3.705326774788173,
            "loss_tokens_lower_95": 2.680371384601352,
            "loss_tokens_upper_95": 2.8790786724985855,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0355346443398545,
            "data_time": 0.0036009382060233582,
            "batch_time": 0.03943042790002011,
            "samples_per_second": 887782.8097249076,
            "samples_per_second_per_gpu": 110972.85121561345,
            "loss_sequences_lower_95": 3.0567051513671877,
            "loss_sequences_upper_95": 3.2404734049479167,
            "loss_tokens_lower_95": 2.882215132419418,
            "loss_tokens_upper_95": 3.0174960753242925,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.290963822824653,
            "data_time": 0.004753498649884853,
            "batch_time": 0.04020387590560798,
            "samples_per_second": 890768.4652845687,
            "samples_per_second_per_gpu": 111346.05816057109,
            "loss_sequences_lower_95": 2.326169464134465,
            "loss_sequences_upper_95": 2.3749769463875334,
            "loss_tokens_lower_95": 2.21491021536113,
            "loss_tokens_upper_95": 2.243210135151486,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.969763049212369,
            "data_time": 0.02690852327006204,
            "batch_time": 0.06337751873901912,
            "samples_per_second": 829843.3601900905,
            "samples_per_second_per_gpu": 103730.42002376131,
            "loss_sequences_lower_95": 1.9519012451171875,
            "loss_sequences_upper_95": 2.0443295912309125,
            "loss_tokens_lower_95": 1.9098553746661797,
            "loss_tokens_upper_95": 1.9523174025937218,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.726049619791459,
            "data_time": 0.020964594557881355,
            "batch_time": 0.056923069059848785,
            "samples_per_second": 811470.7130991665,
            "samples_per_second_per_gpu": 101433.83913739581,
            "loss_sequences_lower_95": 2.7017324517697703,
            "loss_sequences_upper_95": 2.8660416770467956,
            "loss_tokens_lower_95": 2.6311849663591738,
            "loss_tokens_upper_95": 2.7147042609068213,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.816895532608032,
            "data_time": 0.016427399256290533,
            "batch_time": 0.0522353205925379,
            "samples_per_second": 824795.2193877443,
            "samples_per_second_per_gpu": 103099.40242346804,
            "loss_sequences_lower_95": 2.7948582560221356,
            "loss_sequences_upper_95": 2.887307556152344,
            "loss_tokens_lower_95": 2.6875933709633806,
            "loss_tokens_upper_95": 2.8724736349249653,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.2400742142649746,
            "data_time": 0.001389043113634162,
            "batch_time": 0.03683695792213292,
            "samples_per_second": 901036.8963065183,
            "samples_per_second_per_gpu": 112629.6120383148,
            "loss_sequences_lower_95": 4.240990962243492,
            "loss_sequences_upper_95": 4.318471405042813,
            "loss_tokens_lower_95": 4.12192551491053,
            "loss_tokens_upper_95": 4.199286893335802,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.43336236369655,
            "data_time": 0.0031721221920627874,
            "batch_time": 0.038646138354435866,
            "samples_per_second": 897275.6469133478,
            "samples_per_second_per_gpu": 112159.45586416847,
            "loss_sequences_lower_95": 3.8090191670941183,
            "loss_sequences_upper_95": 4.066150590787432,
            "loss_tokens_lower_95": 2.8752350879083646,
            "loss_tokens_upper_95": 2.993956344285735,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.259107254877839,
            "data_time": 0.0053380884834238005,
            "batch_time": 0.04097587192380751,
            "samples_per_second": 879957.3028650484,
            "samples_per_second_per_gpu": 109994.66285813105,
            "loss_sequences_lower_95": 3.537086866984188,
            "loss_sequences_upper_95": 3.8214585235908167,
            "loss_tokens_lower_95": 2.92969147053485,
            "loss_tokens_upper_95": 3.0687458561481527,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.7772953238117095,
            "data_time": 0.023223093577793667,
            "batch_time": 0.0588196154151644,
            "samples_per_second": 830464.0812891775,
            "samples_per_second_per_gpu": 103808.01016114719,
            "loss_sequences_lower_95": 5.6858246128308725,
            "loss_sequences_upper_95": 5.863056688003888,
            "loss_tokens_lower_95": 5.685604147715111,
            "loss_tokens_upper_95": 5.8629007696561075,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9164803552627565,
            "data_time": 0.047080205037043646,
            "batch_time": 0.08443727860083947,
            "samples_per_second": 716626.263071221,
            "samples_per_second_per_gpu": 89578.28288390262,
            "loss_sequences_lower_95": 2.775701202392578,
            "loss_sequences_upper_95": 3.127282745361328,
            "loss_tokens_lower_95": 2.63801479168996,
            "loss_tokens_upper_95": 3.0539858031571443,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.4166565778738645,
            "data_time": 0.0033715641571700206,
            "batch_time": 0.03887538822150669,
            "samples_per_second": 896519.8947247576,
            "samples_per_second_per_gpu": 112064.9868405947,
            "loss_sequences_lower_95": 1.4005079536867484,
            "loss_sequences_upper_95": 1.4333344255523526,
            "loss_tokens_lower_95": 1.4002571472980105,
            "loss_tokens_upper_95": 1.432979656363949,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.9881492141423707,
            "data_time": 0.004982592814509281,
            "batch_time": 0.040473331246057107,
            "samples_per_second": 890538.5253577189,
            "samples_per_second_per_gpu": 111317.31566971487,
            "loss_sequences_lower_95": 1.9580612145126304,
            "loss_sequences_upper_95": 2.018816378856662,
            "loss_tokens_lower_95": 1.9578862970525568,
            "loss_tokens_upper_95": 2.0180901954644628,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.964700910419842,
            "data_time": 0.003561746224471354,
            "batch_time": 0.03890293926708641,
            "samples_per_second": 892817.4575789109,
            "samples_per_second_per_gpu": 111602.18219736386,
            "loss_sequences_lower_95": 3.0923176264218073,
            "loss_sequences_upper_95": 3.220475355611143,
            "loss_tokens_lower_95": 2.8133427150867356,
            "loss_tokens_upper_95": 2.869259047473028,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.82998064994812,
            "data_time": 0.010341358371078968,
            "batch_time": 0.04609304293990135,
            "samples_per_second": 855256.2685059976,
            "samples_per_second_per_gpu": 106907.0335632497,
            "loss_sequences_lower_95": 4.9895185546875,
            "loss_sequences_upper_95": 5.522001538085938,
            "loss_tokens_lower_95": 4.286093331519378,
            "loss_tokens_upper_95": 4.6406076306305835,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2376371920108795,
            "data_time": 0.15914501249790192,
            "batch_time": 0.19926224648952484,
            "samples_per_second": 470599.34164300165,
            "samples_per_second_per_gpu": 58824.91770537521,
            "loss_sequences_lower_95": 2.9911541640758514,
            "loss_sequences_upper_95": 3.4947436571121218,
            "loss_tokens_lower_95": 2.8009723970259746,
            "loss_tokens_upper_95": 3.5836472873030036,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.0768770178159075,
            "data_time": 0.026000847207738997,
            "batch_time": 0.06115150451660156,
            "samples_per_second": 780099.1635561866,
            "samples_per_second_per_gpu": 97512.39544452332,
            "loss_sequences_lower_95": 4.405528706517713,
            "loss_sequences_upper_95": 5.018183548149021,
            "loss_tokens_lower_95": 3.0540324351967656,
            "loss_tokens_upper_95": 3.4168370398485943,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.9535498448520208,
            "data_time": 0.003032086624039544,
            "batch_time": 0.03840318529142274,
            "samples_per_second": 894576.2625469551,
            "samples_per_second_per_gpu": 111822.0328183694,
            "loss_sequences_lower_95": 1.9295382158471903,
            "loss_sequences_upper_95": 1.976878325602747,
            "loss_tokens_lower_95": 1.9295968031976982,
            "loss_tokens_upper_95": 1.9776280901014363,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0035843384360237,
            "data_time": 0.0025503830619222857,
            "batch_time": 0.03799215036628317,
            "samples_per_second": 898984.3003106465,
            "samples_per_second_per_gpu": 112373.03753883082,
            "loss_sequences_lower_95": 1.9801723889603142,
            "loss_sequences_upper_95": 2.098899519431945,
            "loss_tokens_lower_95": 1.8842711356535387,
            "loss_tokens_upper_95": 2.0006615249683857,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9005273414618804,
            "data_time": 0.018054672413402133,
            "batch_time": 0.05340885784890917,
            "samples_per_second": 819001.9544522475,
            "samples_per_second_per_gpu": 102375.24430653093,
            "loss_sequences_lower_95": 2.7776158874288144,
            "loss_sequences_upper_95": 3.208319404797676,
            "loss_tokens_lower_95": 2.631155110375954,
            "loss_tokens_upper_95": 2.912611410333314,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.364128126529793,
            "data_time": 0.00503123328089714,
            "batch_time": 0.040390662103891375,
            "samples_per_second": 887369.9357674534,
            "samples_per_second_per_gpu": 110921.24197093167,
            "loss_sequences_lower_95": 3.4176396322513813,
            "loss_sequences_upper_95": 3.575563450356403,
            "loss_tokens_lower_95": 3.212456236079593,
            "loss_tokens_upper_95": 3.3516023323904225,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.3620045548532067,
            "data_time": 0.02940371206828526,
            "batch_time": 0.06514516330900647,
            "samples_per_second": 808568.5142120956,
            "samples_per_second_per_gpu": 101071.06427651196,
            "loss_sequences_lower_95": 2.2195961510262836,
            "loss_sequences_upper_95": 2.6311204212467847,
            "loss_tokens_lower_95": 2.0803174714279797,
            "loss_tokens_upper_95": 2.3980826746581236,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.5269022047054674,
            "data_time": 0.001940199436305096,
            "batch_time": 0.037421598036290166,
            "samples_per_second": 896759.1049975278,
            "samples_per_second_per_gpu": 112094.88812469097,
            "loss_sequences_lower_95": 5.516981286882376,
            "loss_sequences_upper_95": 5.536697573889779,
            "loss_tokens_lower_95": 5.517190342365348,
            "loss_tokens_upper_95": 5.536502681395654,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.1276941062177268,
            "data_time": 0.050326932560313835,
            "batch_time": 0.08647418455644088,
            "samples_per_second": 731135.7945638945,
            "samples_per_second_per_gpu": 91391.97432048681,
            "loss_sequences_lower_95": 1.071226673681759,
            "loss_sequences_upper_95": 1.250593129870961,
            "loss_tokens_lower_95": 0.9460975373657601,
            "loss_tokens_upper_95": 1.1949389642190298,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.679584291799139,
            "data_time": 0.0012476483339635845,
            "batch_time": 0.03670270527398396,
            "samples_per_second": 900267.7137744381,
            "samples_per_second_per_gpu": 112533.46422180477,
            "loss_sequences_lower_95": 5.073457133615042,
            "loss_sequences_upper_95": 5.120374041863208,
            "loss_tokens_lower_95": 4.075184719535783,
            "loss_tokens_upper_95": 4.122941017891683,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.784540551662445,
            "data_time": 0.006120709672806755,
            "batch_time": 0.0415322000072116,
            "samples_per_second": 885419.2178221114,
            "samples_per_second_per_gpu": 110677.40222776393,
            "loss_sequences_lower_95": 4.78773173828125,
            "loss_sequences_upper_95": 4.929227429199218,
            "loss_tokens_lower_95": 4.640403338061315,
            "loss_tokens_upper_95": 4.783450409431192,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.041906714439392,
            "data_time": 0.021273687734442243,
            "batch_time": 0.0579996775772612,
            "samples_per_second": 814255.7789084227,
            "samples_per_second_per_gpu": 101781.97236355284,
            "loss_sequences_lower_95": 1.9790792150082797,
            "loss_sequences_upper_95": 2.1058226477581523,
            "loss_tokens_lower_95": 1.9787130107050357,
            "loss_tokens_upper_95": 2.1055408610468325,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.372239596915968,
            "data_time": 0.00473381596875478,
            "batch_time": 0.04014408121626061,
            "samples_per_second": 890666.0342554122,
            "samples_per_second_per_gpu": 111333.25428192652,
            "loss_sequences_lower_95": 6.298503750887784,
            "loss_sequences_upper_95": 6.444936652906013,
            "loss_tokens_lower_95": 6.299530066287879,
            "loss_tokens_upper_95": 6.443932162198154,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.385287911693255,
            "data_time": 0.004310407220049107,
            "batch_time": 0.039838227819889147,
            "samples_per_second": 891552.1172843522,
            "samples_per_second_per_gpu": 111444.01466054402,
            "loss_sequences_lower_95": 1.426306717936198,
            "loss_sequences_upper_95": 1.4781270833333333,
            "loss_tokens_lower_95": 1.2982023532069078,
            "loss_tokens_upper_95": 1.374700513017707,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.640840213639396,
            "data_time": 0.023524001240730286,
            "batch_time": 0.058463599000658305,
            "samples_per_second": 805114.0600456654,
            "samples_per_second_per_gpu": 100639.25750570817,
            "loss_sequences_lower_95": 5.317190566289993,
            "loss_sequences_upper_95": 5.962687189011347,
            "loss_tokens_lower_95": 5.322694353376116,
            "loss_tokens_upper_95": 5.972812078566779,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.7923589758574963,
            "data_time": 0.15531301498413086,
            "batch_time": 0.19358086585998535,
            "samples_per_second": 490187.59488601814,
            "samples_per_second_per_gpu": 61273.44936075227,
            "loss_sequences_lower_95": 1.6490980952978134,
            "loss_sequences_upper_95": 2.309429895877838,
            "loss_tokens_lower_95": 1.4102060723058956,
            "loss_tokens_upper_95": 1.8033468250392637,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.354125180244446,
            "data_time": 0.00595617861974807,
            "batch_time": 0.04132450762249175,
            "samples_per_second": 886573.833981615,
            "samples_per_second_per_gpu": 110821.72924770188,
            "loss_sequences_lower_95": 7.2901969848632815,
            "loss_sequences_upper_95": 7.614962475585937,
            "loss_tokens_lower_95": 7.075764932648583,
            "loss_tokens_upper_95": 7.361622229768137,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.949130415916443,
            "data_time": 0.0058416979653494695,
            "batch_time": 0.04124841425153944,
            "samples_per_second": 885310.1735305322,
            "samples_per_second_per_gpu": 110663.77169131652,
            "loss_sequences_lower_95": 7.084768432617188,
            "loss_sequences_upper_95": 7.332367687988281,
            "loss_tokens_lower_95": 6.65221290187234,
            "loss_tokens_upper_95": 6.874928933928236,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.0710875331816,
            "data_time": 0.003562744963527922,
            "batch_time": 0.039000258876328485,
            "samples_per_second": 893254.0795269257,
            "samples_per_second_per_gpu": 111656.75994086571,
            "loss_sequences_lower_95": 6.052321095583305,
            "loss_sequences_upper_95": 6.089234323274598,
            "loss_tokens_lower_95": 6.052914988319226,
            "loss_tokens_upper_95": 6.088783035508507,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.7765889349071662,
            "data_time": 0.0087037432229771,
            "batch_time": 0.04420772347925653,
            "samples_per_second": 868879.9353788432,
            "samples_per_second_per_gpu": 108609.9919223554,
            "loss_sequences_lower_95": 1.7439101071218557,
            "loss_sequences_upper_95": 1.8088579356578822,
            "loss_tokens_lower_95": 1.7439135573426698,
            "loss_tokens_upper_95": 1.8095610775339623,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 8.116771845340729,
            "data_time": 0.005918385963591319,
            "batch_time": 0.04133633157563588,
            "samples_per_second": 885109.8769723679,
            "samples_per_second_per_gpu": 110638.73462154598,
            "loss_sequences_lower_95": 8.030425024414061,
            "loss_sequences_upper_95": 8.2071072265625,
            "loss_tokens_lower_95": 8.028455200195312,
            "loss_tokens_upper_95": 8.204897167968749,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5457914250463163,
            "data_time": 0.0017324256536415258,
            "batch_time": 0.03715585580620288,
            "samples_per_second": 899651.1214696995,
            "samples_per_second_per_gpu": 112456.39018371244,
            "loss_sequences_lower_95": 2.9794104962452694,
            "loss_sequences_upper_95": 3.0527672981832428,
            "loss_tokens_lower_95": 2.018172381381169,
            "loss_tokens_upper_95": 2.069033341842448,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.12466172673809,
            "data_time": 0.01879481247493199,
            "batch_time": 0.054361425127301896,
            "samples_per_second": 820245.1232586142,
            "samples_per_second_per_gpu": 102530.64040732678,
            "loss_sequences_lower_95": 2.0378495799961374,
            "loss_sequences_upper_95": 2.2141549750940124,
            "loss_tokens_lower_95": 2.0381719788508628,
            "loss_tokens_upper_95": 2.2131640647774313,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.9036126737501107,
            "data_time": 0.0101613225415349,
            "batch_time": 0.04591162130236626,
            "samples_per_second": 872698.5772786686,
            "samples_per_second_per_gpu": 109087.32215983358,
            "loss_sequences_lower_95": 1.861868319043926,
            "loss_sequences_upper_95": 1.9458109836952358,
            "loss_tokens_lower_95": 1.8619236276663986,
            "loss_tokens_upper_95": 1.9454002051259958,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7207674939396775,
            "data_time": 0.0022194023041702503,
            "batch_time": 0.03784352244362589,
            "samples_per_second": 892619.2696178274,
            "samples_per_second_per_gpu": 111577.40870222842,
            "loss_sequences_lower_95": 4.369480678668734,
            "loss_sequences_upper_95": 4.483404003527027,
            "loss_tokens_lower_95": 2.9003903815485574,
            "loss_tokens_upper_95": 2.9779153811475787,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.056965270370402,
            "data_time": 0.027729183435440063,
            "batch_time": 0.06408628076314926,
            "samples_per_second": 817731.6944286149,
            "samples_per_second_per_gpu": 102216.46180357687,
            "loss_sequences_lower_95": 5.988127596416171,
            "loss_sequences_upper_95": 6.124451894608755,
            "loss_tokens_lower_95": 5.985510334640584,
            "loss_tokens_upper_95": 6.123478626069568,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5454076649581254,
            "data_time": 0.003103877161885356,
            "batch_time": 0.038549669641860386,
            "samples_per_second": 894464.5152850791,
            "samples_per_second_per_gpu": 111808.06441063489,
            "loss_sequences_lower_95": 3.515127371225153,
            "loss_sequences_upper_95": 3.5771414641867354,
            "loss_tokens_lower_95": 3.514938061448777,
            "loss_tokens_upper_95": 3.5764234070384173,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.038119934716271,
            "data_time": 0.02371956001628529,
            "batch_time": 0.05941927216269753,
            "samples_per_second": 781909.5421158387,
            "samples_per_second_per_gpu": 97738.69276447984,
            "loss_sequences_lower_95": 1.9496042233068966,
            "loss_sequences_upper_95": 2.1307980583709423,
            "loss_tokens_lower_95": 1.9501299071080476,
            "loss_tokens_upper_95": 2.130562843396826,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.7106298118829728,
            "data_time": 0.08067350089550018,
            "batch_time": 0.11699959635734558,
            "samples_per_second": 645234.6889458247,
            "samples_per_second_per_gpu": 80654.3361182281,
            "loss_sequences_lower_95": 1.5703749306996664,
            "loss_sequences_upper_95": 1.9598705863952637,
            "loss_tokens_lower_95": 1.392789888381958,
            "loss_tokens_upper_95": 1.8361019505394829,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.785671121875445,
            "data_time": 0.07629532366991043,
            "batch_time": 0.11265004426240921,
            "samples_per_second": 652685.2727621519,
            "samples_per_second_per_gpu": 81585.65909526899,
            "loss_sequences_lower_95": 1.6742771434783936,
            "loss_sequences_upper_95": 2.143066291809082,
            "loss_tokens_lower_95": 1.3568380677298215,
            "loss_tokens_upper_95": 1.9488171909632308,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.622091275778365,
            "data_time": 0.003207182729854662,
            "batch_time": 0.03869746796449119,
            "samples_per_second": 894336.7347033775,
            "samples_per_second_per_gpu": 111792.09183792218,
            "loss_sequences_lower_95": 4.585284440008285,
            "loss_sequences_upper_95": 4.659604600055228,
            "loss_tokens_lower_95": 4.584335247146539,
            "loss_tokens_upper_95": 4.660117849088734,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.35330353701031936,
            "data_time": 0.0011495123138443386,
            "batch_time": 0.03659596968303624,
            "samples_per_second": 900875.4952371377,
            "samples_per_second_per_gpu": 112609.4369046422,
            "loss_sequences_lower_95": 0.40267294916532387,
            "loss_sequences_upper_95": 0.41315749342591723,
            "loss_tokens_lower_95": 0.3063147493711762,
            "loss_tokens_upper_95": 0.3124021033823868,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4453197351590856,
            "data_time": 0.03972352296113968,
            "batch_time": 0.07625538110733032,
            "samples_per_second": 792827.2704468712,
            "samples_per_second_per_gpu": 99103.4088058589,
            "loss_sequences_lower_95": 3.495578435462291,
            "loss_sequences_upper_95": 3.822962651290293,
            "loss_tokens_lower_95": 3.145348848227171,
            "loss_tokens_upper_95": 3.393937408364806,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.305336172516282,
            "data_time": 0.12314704486301967,
            "batch_time": 0.15862183343796504,
            "samples_per_second": 503263.2158331007,
            "samples_per_second_per_gpu": 62907.90197913759,
            "loss_sequences_lower_95": 5.884471759280643,
            "loss_sequences_upper_95": 6.868579019082559,
            "loss_tokens_lower_95": 5.132714005458502,
            "loss_tokens_upper_95": 7.375861386899595,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.346246082608293,
            "data_time": 0.02908924080076672,
            "batch_time": 0.06486790804635911,
            "samples_per_second": 808380.3530642133,
            "samples_per_second_per_gpu": 101047.54413302666,
            "loss_sequences_lower_95": 3.3587633086413873,
            "loss_sequences_upper_95": 3.6444685680110283,
            "loss_tokens_lower_95": 3.024976049989487,
            "loss_tokens_upper_95": 3.230689950851556,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5095856851193963,
            "data_time": 0.03038012697583153,
            "batch_time": 0.0658600926399231,
            "samples_per_second": 810941.2971320456,
            "samples_per_second_per_gpu": 101367.6621415057,
            "loss_sequences_lower_95": 3.504569793329006,
            "loss_sequences_upper_95": 3.761450120879383,
            "loss_tokens_lower_95": 3.2079335825264597,
            "loss_tokens_upper_95": 3.389228763000982,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4030818045139313,
            "data_time": 0.03150649013973418,
            "batch_time": 0.06780031749180385,
            "samples_per_second": 805053.2035237269,
            "samples_per_second_per_gpu": 100631.65044046586,
            "loss_sequences_lower_95": 3.4216693971215224,
            "loss_sequences_upper_95": 3.744752167492378,
            "loss_tokens_lower_95": 3.0563508782323714,
            "loss_tokens_upper_95": 3.320359173117183,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7943065050171643,
            "data_time": 0.03152464968817575,
            "batch_time": 0.06719001701899938,
            "samples_per_second": 807902.7598858292,
            "samples_per_second_per_gpu": 100987.84498572865,
            "loss_sequences_lower_95": 3.8023488858850993,
            "loss_sequences_upper_95": 4.086118093351039,
            "loss_tokens_lower_95": 3.4833861258913794,
            "loss_tokens_upper_95": 3.66440104546948,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7086859578671665,
            "data_time": 0.03054939964671194,
            "batch_time": 0.06730791374489113,
            "samples_per_second": 803061.2140482217,
            "samples_per_second_per_gpu": 100382.65175602771,
            "loss_sequences_lower_95": 3.7025049529460645,
            "loss_sequences_upper_95": 3.9738821467997867,
            "loss_tokens_lower_95": 3.378143897155601,
            "loss_tokens_upper_95": 3.567093322700351,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.693211622354461,
            "data_time": 0.030618116969153994,
            "batch_time": 0.06677360761733282,
            "samples_per_second": 804179.2627293542,
            "samples_per_second_per_gpu": 100522.40784116928,
            "loss_sequences_lower_95": 2.7161969905946313,
            "loss_sequences_upper_95": 2.933232516777225,
            "loss_tokens_lower_95": 2.4424692727169983,
            "loss_tokens_upper_95": 2.5505081715736138,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-8.0/params.txt",
    "uuid": "e04bb43f-f3f1-497b-ada4-152175942f61",
    "creation_date": "2023_12_14-05_27_28"
}