{
    "name": "rw_original-d=96_l=8_h=4-16.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 3382179840,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 16.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "676435968",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=96_l=8_h=4-16.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 4.909283729394277,
            "data_time": 0.14025506377220154,
            "batch_time": 1.321423590183258,
            "samples_per_second": 372078.9881033851,
            "samples_per_second_per_gpu": 46509.873512923135,
            "loss_sequences_lower_95": 4.80139902750651,
            "loss_sequences_upper_95": 5.01866808573405,
            "loss_tokens_lower_95": 4.893989448547363,
            "loss_tokens_upper_95": 4.9244762166341145,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.516044718771621,
            "data_time": 0.018830312107442623,
            "batch_time": 0.06393333950713291,
            "samples_per_second": 4683832.096866936,
            "samples_per_second_per_gpu": 585479.012108367,
            "loss_sequences_lower_95": 4.513763709720115,
            "loss_sequences_upper_95": 4.518293983605689,
            "loss_tokens_lower_95": 4.504493364583333,
            "loss_tokens_upper_95": 4.52748978125,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.9693170644799056,
            "data_time": 0.0988062173128128,
            "batch_time": 0.1439715251326561,
            "samples_per_second": 4137763.8400177094,
            "samples_per_second_per_gpu": 517220.4800022137,
            "loss_sequences_lower_95": 3.919672154017857,
            "loss_sequences_upper_95": 4.029843382543447,
            "loss_tokens_lower_95": 3.9547122916666666,
            "loss_tokens_upper_95": 3.9835750416666666,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.647515194588101,
            "data_time": 0.013772118248437581,
            "batch_time": 0.057690440039885674,
            "samples_per_second": 5379897.484923646,
            "samples_per_second_per_gpu": 672487.1856154557,
            "loss_sequences_lower_95": 4.613982442010309,
            "loss_sequences_upper_95": 4.681503332393685,
            "loss_tokens_lower_95": 4.634908708333334,
            "loss_tokens_upper_95": 4.66006540625,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.568872288628168,
            "data_time": 0.10077522695064545,
            "batch_time": 0.1450454592704773,
            "samples_per_second": 4101077.283686157,
            "samples_per_second_per_gpu": 512634.6604607696,
            "loss_sequences_lower_95": 4.517324574270463,
            "loss_sequences_upper_95": 4.630435994880505,
            "loss_tokens_lower_95": 4.55698578125,
            "loss_tokens_upper_95": 4.580775552083333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.883194647866151,
            "data_time": 0.0351969450712204,
            "batch_time": 0.07832531134287517,
            "samples_per_second": 5014270.419437333,
            "samples_per_second_per_gpu": 626783.8024296666,
            "loss_sequences_lower_95": 4.840224823475208,
            "loss_sequences_upper_95": 4.929357868663962,
            "loss_tokens_lower_95": 4.870091125,
            "loss_tokens_upper_95": 4.896316197916667,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.118589365725614,
            "data_time": 0.01426951065659523,
            "batch_time": 0.05699890851974487,
            "samples_per_second": 5154373.195336578,
            "samples_per_second_per_gpu": 644296.6494170723,
            "loss_sequences_lower_95": 5.083343710140306,
            "loss_sequences_upper_95": 5.152806261957909,
            "loss_tokens_lower_95": 5.101134708333333,
            "loss_tokens_upper_95": 5.136319791666667,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.859179572559776,
            "data_time": 0.01420275004286515,
            "batch_time": 0.057923816536602224,
            "samples_per_second": 5282673.897565059,
            "samples_per_second_per_gpu": 660334.2371956324,
            "loss_sequences_lower_95": 4.838328615837696,
            "loss_sequences_upper_95": 4.881573237074607,
            "loss_tokens_lower_95": 4.8471096354166665,
            "loss_tokens_upper_95": 4.87128896875,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.678035419161727,
            "data_time": 0.10037468373775482,
            "batch_time": 0.14544633030891418,
            "samples_per_second": 4106937.434176266,
            "samples_per_second_per_gpu": 513367.1792720333,
            "loss_sequences_lower_95": 4.6039014862804875,
            "loss_sequences_upper_95": 4.767841543802401,
            "loss_tokens_lower_95": 4.6658205312500005,
            "loss_tokens_upper_95": 4.690100666666667,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.817259889346338,
            "data_time": 0.10281836986541748,
            "batch_time": 0.14860397577285767,
            "samples_per_second": 4067867.9291693186,
            "samples_per_second_per_gpu": 508483.49114616483,
            "loss_sequences_lower_95": 5.72583409486552,
            "loss_sequences_upper_95": 5.926986947643898,
            "loss_tokens_lower_95": 5.8041869479166674,
            "loss_tokens_upper_95": 5.830127249999999,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.878535245989747,
            "data_time": 0.010712789050463972,
            "batch_time": 0.05388503084922659,
            "samples_per_second": 5425281.085285276,
            "samples_per_second_per_gpu": 678160.1356606595,
            "loss_sequences_lower_95": 4.868086079360524,
            "loss_sequences_upper_95": 4.889423842846376,
            "loss_tokens_lower_95": 4.865936239583333,
            "loss_tokens_upper_95": 4.891136447916667,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.719612801189176,
            "data_time": 0.028950273990631104,
            "batch_time": 0.07124065607786179,
            "samples_per_second": 4792195.291513778,
            "samples_per_second_per_gpu": 599024.4114392223,
            "loss_sequences_lower_95": 4.697961847765255,
            "loss_sequences_upper_95": 4.742248286033163,
            "loss_tokens_lower_95": 4.706858635416667,
            "loss_tokens_upper_95": 4.732046677083333,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.971517301718071,
            "data_time": 0.10166394710540771,
            "batch_time": 0.17243505269289017,
            "samples_per_second": 4189480.0748803886,
            "samples_per_second_per_gpu": 523685.00936004857,
            "loss_sequences_lower_95": 4.893807209626172,
            "loss_sequences_upper_95": 5.065080809061232,
            "loss_tokens_lower_95": 4.958573135416667,
            "loss_tokens_upper_95": 4.984699229166666,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.502776660647266,
            "data_time": 0.1041019931435585,
            "batch_time": 0.149161659181118,
            "samples_per_second": 4118400.3651273753,
            "samples_per_second_per_gpu": 514800.0456409219,
            "loss_sequences_lower_95": 4.4281019036007505,
            "loss_sequences_upper_95": 4.587617744399186,
            "loss_tokens_lower_95": 4.490250729166667,
            "loss_tokens_upper_95": 4.515605927083333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.699045116251165,
            "data_time": 0.16320474445819855,
            "batch_time": 0.18520487844944,
            "samples_per_second": 1155804.8699567427,
            "samples_per_second_per_gpu": 144475.60874459284,
            "loss_sequences_lower_95": 5.62282244942405,
            "loss_sequences_upper_95": 5.7799973574551675,
            "loss_tokens_lower_95": 5.672405624389649,
            "loss_tokens_upper_95": 5.7261455709284,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.990931434464525,
            "data_time": 0.10151984542608261,
            "batch_time": 0.13638495653867722,
            "samples_per_second": 3345732.493731674,
            "samples_per_second_per_gpu": 418216.5617164592,
            "loss_sequences_lower_95": 4.917641900510204,
            "loss_sequences_upper_95": 5.065273558502643,
            "loss_tokens_lower_95": 4.977094979166667,
            "loss_tokens_upper_95": 5.004865041666666,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.256032149835753,
            "data_time": 0.09929514676332474,
            "batch_time": 0.13632603734731674,
            "samples_per_second": 3693139.1990203,
            "samples_per_second_per_gpu": 461642.3998775375,
            "loss_sequences_lower_95": 6.180899651761419,
            "loss_sequences_upper_95": 6.351599507596059,
            "loss_tokens_lower_95": 6.2446092916666665,
            "loss_tokens_upper_95": 6.267648489583333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.410866795993242,
            "data_time": 0.1707106977701187,
            "batch_time": 0.201243594288826,
            "samples_per_second": 2066201.8943784349,
            "samples_per_second_per_gpu": 258275.23679730436,
            "loss_sequences_lower_95": 5.2802477226882685,
            "loss_sequences_upper_95": 5.634484500572329,
            "loss_tokens_lower_95": 5.396085970519019,
            "loss_tokens_upper_95": 5.4254548244789005,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.556151654336242,
            "data_time": 0.026711693947965447,
            "batch_time": 0.07118842195380817,
            "samples_per_second": 4519407.790564379,
            "samples_per_second_per_gpu": 564925.9738205473,
            "loss_sequences_lower_95": 5.536633556362163,
            "loss_sequences_upper_95": 5.575113415334354,
            "loss_tokens_lower_95": 5.5365317412717205,
            "loss_tokens_upper_95": 5.575337784080793,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.201913465733691,
            "data_time": 0.027978736162185668,
            "batch_time": 0.07201579101383686,
            "samples_per_second": 4515630.169479329,
            "samples_per_second_per_gpu": 564453.7711849161,
            "loss_sequences_lower_95": 4.203818308806139,
            "loss_sequences_upper_95": 4.22979446286534,
            "loss_tokens_lower_95": 4.1906281878664435,
            "loss_tokens_upper_95": 4.2117945906307215,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.432912160047226,
            "data_time": 0.05397368305259281,
            "batch_time": 0.09563593897554609,
            "samples_per_second": 4398539.61545558,
            "samples_per_second_per_gpu": 549817.4519319475,
            "loss_sequences_lower_95": 6.8043589653696275,
            "loss_sequences_upper_95": 7.05217394713923,
            "loss_tokens_lower_95": 6.306881729136883,
            "loss_tokens_upper_95": 6.498846413733023,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.274160932858785,
            "data_time": 0.041076733420292534,
            "batch_time": 0.08472302556037903,
            "samples_per_second": 4654846.810204979,
            "samples_per_second_per_gpu": 581855.8512756224,
            "loss_sequences_lower_95": 6.613868001302083,
            "loss_sequences_upper_95": 6.789250862630208,
            "loss_tokens_lower_95": 6.182084745970912,
            "loss_tokens_upper_95": 6.309168754913522,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.335572740722552,
            "data_time": 0.06939452886581421,
            "batch_time": 0.10944850494464238,
            "samples_per_second": 4079683.17433437,
            "samples_per_second_per_gpu": 509960.39679179626,
            "loss_sequences_lower_95": 4.447027055741803,
            "loss_sequences_upper_95": 4.517551637870782,
            "loss_tokens_lower_95": 4.309314250395251,
            "loss_tokens_upper_95": 4.344544559456974,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.230948211930015,
            "data_time": 0.3491401672363281,
            "batch_time": 0.3916342258453369,
            "samples_per_second": 2165790.543600754,
            "samples_per_second_per_gpu": 270723.81795009424,
            "loss_sequences_lower_95": 4.2301469698819245,
            "loss_sequences_upper_95": 4.3622978002374815,
            "loss_tokens_lower_95": 4.190474269059257,
            "loss_tokens_upper_95": 4.253403621951526,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.493750478783432,
            "data_time": 0.38582831621170044,
            "batch_time": 0.4322165548801422,
            "samples_per_second": 2514738.7074766946,
            "samples_per_second_per_gpu": 314342.3384345868,
            "loss_sequences_lower_95": 4.5361837332589285,
            "loss_sequences_upper_95": 4.7435982093032525,
            "loss_tokens_lower_95": 4.4448266518915025,
            "loss_tokens_upper_95": 4.550542577331584,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.908688658078511,
            "data_time": 0.1836095154285431,
            "batch_time": 0.21515384316444397,
            "samples_per_second": 2670856.162193393,
            "samples_per_second_per_gpu": 333857.0202741741,
            "loss_sequences_lower_95": 4.911389495849609,
            "loss_sequences_upper_95": 5.01903705851237,
            "loss_tokens_lower_95": 4.784330298726727,
            "loss_tokens_upper_95": 5.026009999806861,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.4197728002079,
            "data_time": 0.02512531206011772,
            "batch_time": 0.06940424125641584,
            "samples_per_second": 4525584.360859739,
            "samples_per_second_per_gpu": 565698.0451074673,
            "loss_sequences_lower_95": 8.502028189742138,
            "loss_sequences_upper_95": 8.57457567756262,
            "loss_tokens_lower_95": 8.364678328366143,
            "loss_tokens_upper_95": 8.440807891412634,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.355914295522452,
            "data_time": 0.04707112312316895,
            "batch_time": 0.08964862078428268,
            "samples_per_second": 4459442.920470093,
            "samples_per_second_per_gpu": 557430.3650587617,
            "loss_sequences_lower_95": 6.470145372589831,
            "loss_sequences_upper_95": 6.740306271125974,
            "loss_tokens_lower_95": 5.216419266398445,
            "loss_tokens_upper_95": 5.356794914779178,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.081453020662171,
            "data_time": 0.08960237801074981,
            "batch_time": 0.13189601600170137,
            "samples_per_second": 4203174.915246978,
            "samples_per_second_per_gpu": 525396.8644058723,
            "loss_sequences_lower_95": 5.80253427134439,
            "loss_sequences_upper_95": 6.107275567689446,
            "loss_tokens_lower_95": 4.977117894318227,
            "loss_tokens_upper_95": 5.140453825830913,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.269896618307453,
            "data_time": 0.39515097439289093,
            "batch_time": 0.43671151995658875,
            "samples_per_second": 2232509.4730762746,
            "samples_per_second_per_gpu": 279063.6841345343,
            "loss_sequences_lower_95": 6.157653042170555,
            "loss_sequences_upper_95": 6.385183269901363,
            "loss_tokens_lower_95": 6.1545905823032605,
            "loss_tokens_upper_95": 6.386615380413456,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.551657423973084,
            "data_time": 0.3496350049972534,
            "batch_time": 0.37538953125476837,
            "samples_per_second": 1639198.1312967152,
            "samples_per_second_per_gpu": 204899.7664120894,
            "loss_sequences_lower_95": 4.484629844665528,
            "loss_sequences_upper_95": 4.8897014923095705,
            "loss_tokens_lower_95": 4.299878890527169,
            "loss_tokens_upper_95": 4.7849556965563504,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.486358913940929,
            "data_time": 0.051219578832387924,
            "batch_time": 0.09493285417556763,
            "samples_per_second": 4549709.424966635,
            "samples_per_second_per_gpu": 568713.6781208294,
            "loss_sequences_lower_95": 5.440374137386451,
            "loss_sequences_upper_95": 5.531822668444537,
            "loss_tokens_lower_95": 5.440067324838472,
            "loss_tokens_upper_95": 5.532887104077373,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.9301988959410075,
            "data_time": 0.08032343089580536,
            "batch_time": 0.12346133589744568,
            "samples_per_second": 4456336.0299243955,
            "samples_per_second_per_gpu": 557042.0037405494,
            "loss_sequences_lower_95": 5.8761177681587835,
            "loss_sequences_upper_95": 5.982379725250819,
            "loss_tokens_lower_95": 5.875625147964016,
            "loss_tokens_upper_95": 5.983437338039389,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.679955529051066,
            "data_time": 0.05262613296508789,
            "batch_time": 0.09416286647319794,
            "samples_per_second": 4235835.139709511,
            "samples_per_second_per_gpu": 529479.3924636889,
            "loss_sequences_lower_95": 4.918913207609324,
            "loss_sequences_upper_95": 5.037580414980617,
            "loss_tokens_lower_95": 4.645627364342662,
            "loss_tokens_upper_95": 4.707644424375454,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.297689709663391,
            "data_time": 0.1950993314385414,
            "batch_time": 0.2406006008386612,
            "samples_per_second": 3949797.9421368064,
            "samples_per_second_per_gpu": 493724.7427671008,
            "loss_sequences_lower_95": 6.887895544433594,
            "loss_sequences_upper_95": 7.393997973632812,
            "loss_tokens_lower_95": 6.0686934162495865,
            "loss_tokens_upper_95": 6.4099025279461035,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.175943613052368,
            "data_time": 0.1669236570596695,
            "batch_time": 0.1849842220544815,
            "samples_per_second": 825066.34558044,
            "samples_per_second_per_gpu": 103133.293197555,
            "loss_sequences_lower_95": 4.900421094894409,
            "loss_sequences_upper_95": 5.590843772888183,
            "loss_tokens_lower_95": 4.666961047293126,
            "loss_tokens_upper_95": 5.492408778749663,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.741913406328223,
            "data_time": 0.35156527161598206,
            "batch_time": 0.38724738359451294,
            "samples_per_second": 2303918.761342246,
            "samples_per_second_per_gpu": 287989.84516778076,
            "loss_sequences_lower_95": 5.396305163153287,
            "loss_sequences_upper_95": 5.950827675304193,
            "loss_tokens_lower_95": 4.48489836098242,
            "loss_tokens_upper_95": 4.896365951658505,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.093543858442727,
            "data_time": 0.050605184502071805,
            "batch_time": 0.09535289969709185,
            "samples_per_second": 4601450.850159898,
            "samples_per_second_per_gpu": 575181.3562699873,
            "loss_sequences_lower_95": 5.078055896150748,
            "loss_sequences_upper_95": 5.109016959080794,
            "loss_tokens_lower_95": 5.078057688008546,
            "loss_tokens_upper_95": 5.108949764413363,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.582920983751816,
            "data_time": 0.03229716987836929,
            "batch_time": 0.07544492256073725,
            "samples_per_second": 4414333.664627059,
            "samples_per_second_per_gpu": 551791.7080783824,
            "loss_sequences_lower_95": 5.699649580265743,
            "loss_sequences_upper_95": 5.926335785131235,
            "loss_tokens_lower_95": 5.44421392059048,
            "loss_tokens_upper_95": 5.665274694250137,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.0692109153384255,
            "data_time": 0.2007669284939766,
            "batch_time": 0.23022080212831497,
            "samples_per_second": 1743095.3696770314,
            "samples_per_second_per_gpu": 217886.92120962893,
            "loss_sequences_lower_95": 3.9619852031106912,
            "loss_sequences_upper_95": 4.312505779336224,
            "loss_tokens_lower_95": 3.882098710503874,
            "loss_tokens_upper_95": 4.210737546343808,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.4343346245381055,
            "data_time": 0.0900255560874939,
            "batch_time": 0.1362198382616043,
            "samples_per_second": 4051507.5585598475,
            "samples_per_second_per_gpu": 506438.44481998094,
            "loss_sequences_lower_95": 4.499553446178843,
            "loss_sequences_upper_95": 4.638430759637678,
            "loss_tokens_lower_95": 4.3504957421284045,
            "loss_tokens_upper_95": 4.505249101411477,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.099140577199982,
            "data_time": 0.3360833525657654,
            "batch_time": 0.37172816693782806,
            "samples_per_second": 1972667.9987757816,
            "samples_per_second_per_gpu": 246583.4998469727,
            "loss_sequences_lower_95": 3.911679895912729,
            "loss_sequences_upper_95": 4.390551655466964,
            "loss_tokens_lower_95": 3.906835051345203,
            "loss_tokens_upper_95": 4.314443656087505,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.482630497838383,
            "data_time": 0.028597015572192155,
            "batch_time": 0.07237096677379548,
            "samples_per_second": 4428316.065266384,
            "samples_per_second_per_gpu": 553539.508158298,
            "loss_sequences_lower_95": 5.471993060721519,
            "loss_sequences_upper_95": 5.493305829134576,
            "loss_tokens_lower_95": 5.471846205178536,
            "loss_tokens_upper_95": 5.493354198183386,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.501327886164767,
            "data_time": 0.32297374308109283,
            "batch_time": 0.3499184846878052,
            "samples_per_second": 1674884.6909134546,
            "samples_per_second_per_gpu": 209360.58636418183,
            "loss_sequences_lower_95": 2.4169146639629475,
            "loss_sequences_upper_95": 2.71709704352814,
            "loss_tokens_lower_95": 2.2778271334442173,
            "loss_tokens_upper_95": 2.638674786195784,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.3990206410987325,
            "data_time": 0.025843174358208973,
            "batch_time": 0.06997879862785339,
            "samples_per_second": 4436263.78810955,
            "samples_per_second_per_gpu": 554532.9735136938,
            "loss_sequences_lower_95": 6.261087301002358,
            "loss_sequences_upper_95": 6.308498867842636,
            "loss_tokens_lower_95": 5.3000045091876204,
            "loss_tokens_upper_95": 5.347760602030948,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.770720132350922,
            "data_time": 0.10615234076976776,
            "batch_time": 0.15089546516537666,
            "samples_per_second": 4378777.024408604,
            "samples_per_second_per_gpu": 547347.1280510755,
            "loss_sequences_lower_95": 5.788978686523437,
            "loss_sequences_upper_95": 6.03370126953125,
            "loss_tokens_lower_95": 5.643513827460364,
            "loss_tokens_upper_95": 5.8709276475190935,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.379456901550293,
            "data_time": 0.36899007856845856,
            "batch_time": 0.4122900366783142,
            "samples_per_second": 1962006.5921301912,
            "samples_per_second_per_gpu": 245250.8240162739,
            "loss_sequences_lower_95": 5.225370642620584,
            "loss_sequences_upper_95": 5.532029100501019,
            "loss_tokens_lower_95": 5.226477501910666,
            "loss_tokens_upper_95": 5.52813940960428,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.262232525059671,
            "data_time": 0.06524848689635594,
            "batch_time": 0.10526531686385472,
            "samples_per_second": 4017183.2849270087,
            "samples_per_second_per_gpu": 502147.9106158761,
            "loss_sequences_lower_95": 8.157190496271307,
            "loss_sequences_upper_95": 8.367550233783145,
            "loss_tokens_lower_95": 8.155402073715672,
            "loss_tokens_upper_95": 8.36794698079427,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 1.7137412339051565,
            "data_time": 0.0663347914814949,
            "batch_time": 0.11023526142040889,
            "samples_per_second": 4529859.208264339,
            "samples_per_second_per_gpu": 566232.4010330423,
            "loss_sequences_lower_95": 1.8861353068033855,
            "loss_sequences_upper_95": 2.0119986694335936,
            "loss_tokens_lower_95": 1.6616878548294318,
            "loss_tokens_upper_95": 1.7426234360931872,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.365150185993739,
            "data_time": 0.3336387574672699,
            "batch_time": 0.37489737570285797,
            "samples_per_second": 2374718.0628912686,
            "samples_per_second_per_gpu": 296839.7578614086,
            "loss_sequences_lower_95": 6.0534666079566595,
            "loss_sequences_upper_95": 6.680465131487165,
            "loss_tokens_lower_95": 6.04802740187872,
            "loss_tokens_upper_95": 6.679485836937315,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.5826258659362793,
            "data_time": 0.16445095837116241,
            "batch_time": 0.1830432265996933,
            "samples_per_second": 792869.0423456009,
            "samples_per_second_per_gpu": 99108.63029320011,
            "loss_sequences_lower_95": 3.3212796211242677,
            "loss_sequences_upper_95": 4.543620145320892,
            "loss_tokens_lower_95": 3.0020294755758696,
            "loss_tokens_upper_95": 3.5879050028692814,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.72513651227951,
            "data_time": 0.0968100056052208,
            "batch_time": 0.1410168819129467,
            "samples_per_second": 4394025.0469426755,
            "samples_per_second_per_gpu": 549253.1308678344,
            "loss_sequences_lower_95": 7.811939379882812,
            "loss_sequences_upper_95": 8.117633557128906,
            "loss_tokens_lower_95": 7.57275438131213,
            "loss_tokens_upper_95": 7.848370495584814,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.286749509334564,
            "data_time": 0.10161778703331947,
            "batch_time": 0.14652792364358902,
            "samples_per_second": 4397274.121084054,
            "samples_per_second_per_gpu": 549659.2651355068,
            "loss_sequences_lower_95": 7.503740991210938,
            "loss_sequences_upper_95": 7.733639099121094,
            "loss_tokens_lower_95": 7.175258688814472,
            "loss_tokens_upper_95": 7.369012319749155,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.532303360776936,
            "data_time": 0.03872626150647799,
            "batch_time": 0.08300425857305527,
            "samples_per_second": 4590704.583593798,
            "samples_per_second_per_gpu": 573838.0729492247,
            "loss_sequences_lower_95": 5.512336475706085,
            "loss_sequences_upper_95": 5.55249846788049,
            "loss_tokens_lower_95": 5.512282605991242,
            "loss_tokens_upper_95": 5.552694533345206,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.449962037316482,
            "data_time": 0.12364888191223145,
            "batch_time": 0.16392298539479574,
            "samples_per_second": 3877451.6570225395,
            "samples_per_second_per_gpu": 484681.45712781744,
            "loss_sequences_lower_95": 5.34727786953365,
            "loss_sequences_upper_95": 5.550836809805827,
            "loss_tokens_lower_95": 5.34725832140757,
            "loss_tokens_upper_95": 5.549737670710925,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.717115724563599,
            "data_time": 0.0918944962322712,
            "batch_time": 0.136172104626894,
            "samples_per_second": 4335129.623995627,
            "samples_per_second_per_gpu": 541891.2029994534,
            "loss_sequences_lower_95": 7.64184033203125,
            "loss_sequences_upper_95": 7.7931582031249995,
            "loss_tokens_lower_95": 7.64079365234375,
            "loss_tokens_upper_95": 7.792232568359375,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.287604086875013,
            "data_time": 0.027583251396814983,
            "batch_time": 0.07142943676028933,
            "samples_per_second": 4492368.788790695,
            "samples_per_second_per_gpu": 561546.0985988369,
            "loss_sequences_lower_95": 5.207408533881268,
            "loss_sequences_upper_95": 5.299680561509579,
            "loss_tokens_lower_95": 4.1835236081266265,
            "loss_tokens_upper_95": 4.247552324387876,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.797261037933293,
            "data_time": 0.21379442725862777,
            "batch_time": 0.24596776281084334,
            "samples_per_second": 1891185.4111120496,
            "samples_per_second_per_gpu": 236398.1763890062,
            "loss_sequences_lower_95": 5.636166222415753,
            "loss_sequences_upper_95": 5.954046619472219,
            "loss_tokens_lower_95": 5.632102932147125,
            "loss_tokens_upper_95": 5.957031363871559,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.894840782763911,
            "data_time": 0.186937615275383,
            "batch_time": 0.2325773686170578,
            "samples_per_second": 3766369.828456923,
            "samples_per_second_per_gpu": 470796.22855711536,
            "loss_sequences_lower_95": 5.778994703105852,
            "loss_sequences_upper_95": 6.009028667375153,
            "loss_tokens_lower_95": 5.782442614985447,
            "loss_tokens_upper_95": 6.009272831935509,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.072139550641143,
            "data_time": 0.029416090808808804,
            "batch_time": 0.07344197249040008,
            "samples_per_second": 4472730.85557224,
            "samples_per_second_per_gpu": 559091.35694653,
            "loss_sequences_lower_95": 5.799301176132093,
            "loss_sequences_upper_95": 5.89059089432231,
            "loss_tokens_lower_95": 4.971426132822525,
            "loss_tokens_upper_95": 5.052335844988452,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.586985903442221,
            "data_time": 0.3408096879720688,
            "batch_time": 0.3784886747598648,
            "samples_per_second": 2528712.641774761,
            "samples_per_second_per_gpu": 316089.0802218451,
            "loss_sequences_lower_95": 5.517346352874917,
            "loss_sequences_upper_95": 5.6596278437861685,
            "loss_tokens_lower_95": 5.515726822141617,
            "loss_tokens_upper_95": 5.656675744435144,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.853615047734812,
            "data_time": 0.046475658049950234,
            "batch_time": 0.09076316425433525,
            "samples_per_second": 4393030.711861099,
            "samples_per_second_per_gpu": 549128.8389826374,
            "loss_sequences_lower_95": 6.8328542204224005,
            "loss_sequences_upper_95": 6.873797648485283,
            "loss_tokens_lower_95": 6.833295375812309,
            "loss_tokens_upper_95": 6.873511563455658,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.724261598679626,
            "data_time": 0.36544154584407806,
            "batch_time": 0.4056600481271744,
            "samples_per_second": 2039253.8514818395,
            "samples_per_second_per_gpu": 254906.73143522994,
            "loss_sequences_lower_95": 5.5442821910080395,
            "loss_sequences_upper_95": 5.896972804393583,
            "loss_tokens_lower_95": 5.548465173221328,
            "loss_tokens_upper_95": 5.898947484284928,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.58662994702657,
            "data_time": 0.28771884739398956,
            "batch_time": 0.3083762526512146,
            "samples_per_second": 1229053.0380306493,
            "samples_per_second_per_gpu": 153631.62975383116,
            "loss_sequences_lower_95": 5.380103594462077,
            "loss_sequences_upper_95": 6.189104741414388,
            "loss_tokens_lower_95": 4.804987992180719,
            "loss_tokens_upper_95": 6.210356627570258,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.7865468184153235,
            "data_time": 0.30977240204811096,
            "batch_time": 0.33060401678085327,
            "samples_per_second": 1026152.5893954721,
            "samples_per_second_per_gpu": 128269.07367443401,
            "loss_sequences_lower_95": 4.7419189453125,
            "loss_sequences_upper_95": 5.678738098144531,
            "loss_tokens_lower_95": 3.913205624698253,
            "loss_tokens_upper_95": 5.3907676611053805,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.536782450528489,
            "data_time": 0.04309257971388953,
            "batch_time": 0.08591525256633759,
            "samples_per_second": 4342367.199350203,
            "samples_per_second_per_gpu": 542795.8999187754,
            "loss_sequences_lower_95": 7.521109901394514,
            "loss_sequences_upper_95": 7.552728506995582,
            "loss_tokens_lower_95": 7.521638942263439,
            "loss_tokens_upper_95": 7.5521932241807805,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 1.9404310537539746,
            "data_time": 0.023034954565581276,
            "batch_time": 0.06766948325144671,
            "samples_per_second": 4465348.001014736,
            "samples_per_second_per_gpu": 558168.500126842,
            "loss_sequences_lower_95": 2.5929871352599503,
            "loss_sequences_upper_95": 2.630171113143678,
            "loss_tokens_lower_95": 1.8828304363970718,
            "loss_tokens_upper_95": 1.9053433554655728,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.265336093001478,
            "data_time": 0.32827743887901306,
            "batch_time": 0.36007995903491974,
            "samples_per_second": 1898148.1879493275,
            "samples_per_second_per_gpu": 237268.52349366594,
            "loss_sequences_lower_95": 6.526237487792969,
            "loss_sequences_upper_95": 6.971375701183409,
            "loss_tokens_lower_95": 6.064737985714506,
            "loss_tokens_upper_95": 6.373011624236214,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 10.526681745374525,
            "data_time": 0.21396778523921967,
            "batch_time": 0.2374451458454132,
            "samples_per_second": 1082468.2097468958,
            "samples_per_second_per_gpu": 135308.52621836198,
            "loss_sequences_lower_95": 9.96535735259185,
            "loss_sequences_upper_95": 11.336016515783362,
            "loss_tokens_lower_95": 9.33635316071687,
            "loss_tokens_upper_95": 11.389270904917774,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.070435175081578,
            "data_time": 0.3160054683685303,
            "batch_time": 0.3494742065668106,
            "samples_per_second": 2316502.2823677603,
            "samples_per_second_per_gpu": 289562.78529597004,
            "loss_sequences_lower_95": 6.28633180943931,
            "loss_sequences_upper_95": 6.677492262677449,
            "loss_tokens_lower_95": 5.877514849663583,
            "loss_tokens_upper_95": 6.1304432723139195,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.237980054646004,
            "data_time": 0.33619022369384766,
            "batch_time": 0.37061935663223267,
            "samples_per_second": 2106514.013740934,
            "samples_per_second_per_gpu": 263314.25171761675,
            "loss_sequences_lower_95": 6.45488595729921,
            "loss_sequences_upper_95": 6.814516225675257,
            "loss_tokens_lower_95": 6.063791279721766,
            "loss_tokens_upper_95": 6.27872395909684,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.200722048922283,
            "data_time": 0.33967335522174835,
            "batch_time": 0.37487857043743134,
            "samples_per_second": 2245640.1634289715,
            "samples_per_second_per_gpu": 280705.02042862144,
            "loss_sequences_lower_95": 6.4477686812238,
            "loss_sequences_upper_95": 6.90238604661895,
            "loss_tokens_lower_95": 5.987583911933043,
            "loss_tokens_upper_95": 6.317076717155154,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.369932703855561,
            "data_time": 0.3271920680999756,
            "batch_time": 0.36106859147548676,
            "samples_per_second": 2284224.951766546,
            "samples_per_second_per_gpu": 285528.11897081824,
            "loss_sequences_lower_95": 6.544285881228563,
            "loss_sequences_upper_95": 6.877438428925305,
            "loss_tokens_lower_95": 6.206325799579561,
            "loss_tokens_upper_95": 6.398159362222547,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.929343389428181,
            "data_time": 0.36338694393634796,
            "batch_time": 0.3982912451028824,
            "samples_per_second": 2056876.1027693273,
            "samples_per_second_per_gpu": 257109.51284616592,
            "loss_sequences_lower_95": 5.998806440460016,
            "loss_sequences_upper_95": 6.232933641516643,
            "loss_tokens_lower_95": 5.794786395875987,
            "loss_tokens_upper_95": 5.935207354581018,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.410135010393654,
            "data_time": 0.3482039272785187,
            "batch_time": 0.3847668617963791,
            "samples_per_second": 1853773.682367629,
            "samples_per_second_per_gpu": 231721.7102959536,
            "loss_sequences_lower_95": 5.6426473384950215,
            "loss_sequences_upper_95": 5.924263409870427,
            "loss_tokens_lower_95": 5.284481462383212,
            "loss_tokens_upper_95": 5.4211714511754066,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-16.0/params.txt",
    "uuid": "0aa2aeb1-3917-47c2-888e-606aba020ab2",
    "creation_date": "2023_12_14-05_01_02"
}