{
    "name": "rw_original-d=576_l=24_h=8-8.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 24588380160,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 8.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "1229419008",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=576_l=24_h=8-8.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.141026610136032,
            "data_time": 0.032289281487464905,
            "batch_time": 0.3842555247247219,
            "samples_per_second": 840650.4934444368,
            "samples_per_second_per_gpu": 105081.3116805546,
            "loss_sequences_lower_95": 3.063578421274821,
            "loss_sequences_upper_95": 3.218921305338542,
            "loss_tokens_lower_95": 3.126965357462565,
            "loss_tokens_upper_95": 3.1546387736002606,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.145112633027742,
            "data_time": 0.001062811120769247,
            "batch_time": 0.03046719399608758,
            "samples_per_second": 1087178.7439464238,
            "samples_per_second_per_gpu": 135897.34299330297,
            "loss_sequences_lower_95": 3.142680318297862,
            "loss_sequences_upper_95": 3.147565090839304,
            "loss_tokens_lower_95": 3.1347181822916665,
            "loss_tokens_upper_95": 3.1555264687499998,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.829436677329394,
            "data_time": 0.008738954544067383,
            "batch_time": 0.03821976470947266,
            "samples_per_second": 1051383.9680613985,
            "samples_per_second_per_gpu": 131422.9960076748,
            "loss_sequences_lower_95": 2.7728382593271688,
            "loss_sequences_upper_95": 2.9006837619080836,
            "loss_tokens_lower_95": 2.8168765729166667,
            "loss_tokens_upper_95": 2.841951609375,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2537648603105054,
            "data_time": 0.0013658590614795685,
            "batch_time": 0.030154849255555553,
            "samples_per_second": 1112598.8269983653,
            "samples_per_second_per_gpu": 139074.85337479567,
            "loss_sequences_lower_95": 3.21209416277384,
            "loss_sequences_upper_95": 3.2973823997261595,
            "loss_tokens_lower_95": 3.241441057291667,
            "loss_tokens_upper_95": 3.2661585625,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2224949187999345,
            "data_time": 0.008462036748331382,
            "batch_time": 0.037587283616996855,
            "samples_per_second": 1061816.2223262535,
            "samples_per_second_per_gpu": 132727.0277907817,
            "loss_sequences_lower_95": 3.160408439558538,
            "loss_sequences_upper_95": 3.3034454718626685,
            "loss_tokens_lower_95": 3.211339453125,
            "loss_tokens_upper_95": 3.2334690260416665,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.341554005886729,
            "data_time": 0.0033457522159037385,
            "batch_time": 0.031780162583226745,
            "samples_per_second": 1121223.3877642492,
            "samples_per_second_per_gpu": 140152.92347053115,
            "loss_sequences_lower_95": 3.292319362492565,
            "loss_sequences_upper_95": 3.3958169632424795,
            "loss_tokens_lower_95": 3.3288390677083335,
            "loss_tokens_upper_95": 3.354054630208333,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.082008869514173,
            "data_time": 0.0014319660029636899,
            "batch_time": 0.030690828310918263,
            "samples_per_second": 1099093.5618462982,
            "samples_per_second_per_gpu": 137386.69523078727,
            "loss_sequences_lower_95": 3.049562634526467,
            "loss_sequences_upper_95": 3.1141479741310585,
            "loss_tokens_lower_95": 3.0652488229166663,
            "loss_tokens_upper_95": 3.0993060416666665,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6849789923023804,
            "data_time": 0.0015861781657564745,
            "batch_time": 0.030081410130812305,
            "samples_per_second": 1122810.22512062,
            "samples_per_second_per_gpu": 140351.2781400775,
            "loss_sequences_lower_95": 3.6584699361910995,
            "loss_sequences_upper_95": 3.7136720692899217,
            "loss_tokens_lower_95": 3.6733086875,
            "loss_tokens_upper_95": 3.6966946875,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3144529306791664,
            "data_time": 0.008357067902882894,
            "batch_time": 0.03713284030793205,
            "samples_per_second": 1073933.92260274,
            "samples_per_second_per_gpu": 134241.7403253425,
            "loss_sequences_lower_95": 3.2236405659497267,
            "loss_sequences_upper_95": 3.424191079488615,
            "loss_tokens_lower_95": 3.3027377239583333,
            "loss_tokens_upper_95": 3.326143265625,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.3783453047982315,
            "data_time": 0.007988675497472286,
            "batch_time": 0.0367059288546443,
            "samples_per_second": 1090881.90372524,
            "samples_per_second_per_gpu": 136360.237965655,
            "loss_sequences_lower_95": 4.2585506997089615,
            "loss_sequences_upper_95": 4.52700132588624,
            "loss_tokens_lower_95": 4.364839947916667,
            "loss_tokens_upper_95": 4.392042979166666,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3752775309558634,
            "data_time": 0.0011399208470284693,
            "batch_time": 0.02989513562391411,
            "samples_per_second": 1117113.480719751,
            "samples_per_second_per_gpu": 139639.18508996887,
            "loss_sequences_lower_95": 3.3615518899033847,
            "loss_sequences_upper_95": 3.3895774336306874,
            "loss_tokens_lower_95": 3.3640890937499996,
            "loss_tokens_upper_95": 3.3868381093749997,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.209068553242173,
            "data_time": 0.002361828838160989,
            "batch_time": 0.030935807390872087,
            "samples_per_second": 1119640.265152711,
            "samples_per_second_per_gpu": 139955.03314408887,
            "loss_sequences_lower_95": 3.180574123981544,
            "loss_sequences_upper_95": 3.2389876512227977,
            "loss_tokens_lower_95": 3.1976693333333333,
            "loss_tokens_upper_95": 3.220484588541667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.742129951171411,
            "data_time": 0.00834225665910442,
            "batch_time": 0.036904363292950416,
            "samples_per_second": 1075290.909915679,
            "samples_per_second_per_gpu": 134411.36373945986,
            "loss_sequences_lower_95": 3.6486845190578254,
            "loss_sequences_upper_95": 3.856958837296368,
            "loss_tokens_lower_95": 3.7287845416666667,
            "loss_tokens_upper_95": 3.75518621875,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9541582488236746,
            "data_time": 0.007745920424442367,
            "batch_time": 0.03617304444787987,
            "samples_per_second": 1083867.5584591504,
            "samples_per_second_per_gpu": 135483.4448073938,
            "loss_sequences_lower_95": 2.864568401888286,
            "loss_sequences_upper_95": 3.059100994413108,
            "loss_tokens_lower_95": 2.9424854218750003,
            "loss_tokens_upper_95": 2.965666296875,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8701072064313022,
            "data_time": 0.06824526616505214,
            "batch_time": 0.10287559883935112,
            "samples_per_second": 536849.240156044,
            "samples_per_second_per_gpu": 67106.1550195055,
            "loss_sequences_lower_95": 3.785838742689653,
            "loss_sequences_upper_95": 3.980499657717618,
            "loss_tokens_lower_95": 3.8485158746892756,
            "loss_tokens_upper_95": 3.892091248252175,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.295284976069503,
            "data_time": 0.011478867043148388,
            "batch_time": 0.0413464063947851,
            "samples_per_second": 1029460.2372854046,
            "samples_per_second_per_gpu": 128682.52966067557,
            "loss_sequences_lower_95": 3.2300456678206997,
            "loss_sequences_upper_95": 3.3604293511838326,
            "loss_tokens_lower_95": 3.2817124479166666,
            "loss_tokens_upper_95": 3.3086123802083334,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.205301551516893,
            "data_time": 0.010827449460824331,
            "batch_time": 0.039662446826696396,
            "samples_per_second": 1078986.379292129,
            "samples_per_second_per_gpu": 134873.29741151613,
            "loss_sequences_lower_95": 5.111457514699971,
            "loss_sequences_upper_95": 5.333266398648788,
            "loss_tokens_lower_95": 5.1934415000000005,
            "loss_tokens_upper_95": 5.216983375,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.509860333849172,
            "data_time": 0.02942429855465889,
            "batch_time": 0.061232760548591614,
            "samples_per_second": 902293.5320706045,
            "samples_per_second_per_gpu": 112786.69150882556,
            "loss_sequences_lower_95": 3.3555522230804944,
            "loss_sequences_upper_95": 3.783774010079806,
            "loss_tokens_lower_95": 3.4959323195160414,
            "loss_tokens_upper_95": 3.524032986750368,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.120173917131162,
            "data_time": 0.001391141859979999,
            "batch_time": 0.029994496384079743,
            "samples_per_second": 1114054.0216805246,
            "samples_per_second_per_gpu": 139256.75271006557,
            "loss_sequences_lower_95": 3.1018609837340656,
            "loss_sequences_upper_95": 3.1389751921690285,
            "loss_tokens_lower_95": 3.101332511950755,
            "loss_tokens_upper_95": 3.1391732100016916,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.799840710880604,
            "data_time": 0.0016452741281242128,
            "batch_time": 0.030420738563036464,
            "samples_per_second": 1106609.940049692,
            "samples_per_second_per_gpu": 138326.2425062115,
            "loss_sequences_lower_95": 2.79677544255533,
            "loss_sequences_upper_95": 2.8216413086520986,
            "loss_tokens_lower_95": 2.7793962932304743,
            "loss_tokens_upper_95": 2.7977468868391133,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9487460706247495,
            "data_time": 0.00261503767000966,
            "batch_time": 0.032469047228195665,
            "samples_per_second": 1103552.0751701626,
            "samples_per_second_per_gpu": 137944.00939627032,
            "loss_sequences_lower_95": 4.2166121492309285,
            "loss_sequences_upper_95": 4.509526258783065,
            "loss_tokens_lower_95": 3.3788531812622855,
            "loss_tokens_upper_95": 3.5890389061383132,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.814837711515526,
            "data_time": 0.003202505092671577,
            "batch_time": 0.03253336916578577,
            "samples_per_second": 1081812.7761543172,
            "samples_per_second_per_gpu": 135226.59701928965,
            "loss_sequences_lower_95": 3.889243937174479,
            "loss_sequences_upper_95": 4.085211409505208,
            "loss_tokens_lower_95": 3.5881209094929245,
            "loss_tokens_upper_95": 3.728755951503538,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.632538549280058,
            "data_time": 0.0037459568380410497,
            "batch_time": 0.03232297566501623,
            "samples_per_second": 1105442.9416639507,
            "samples_per_second_per_gpu": 138180.36770799384,
            "loss_sequences_lower_95": 2.6739629159830485,
            "loss_sequences_upper_95": 2.728358914112846,
            "loss_tokens_lower_95": 2.5458388726385794,
            "loss_tokens_upper_95": 2.575027652413448,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.2260120651938697,
            "data_time": 0.020487353205680847,
            "batch_time": 0.05375194123813084,
            "samples_per_second": 976085.4593426128,
            "samples_per_second_per_gpu": 122010.6824178266,
            "loss_sequences_lower_95": 2.205645332336426,
            "loss_sequences_upper_95": 2.304631417014382,
            "loss_tokens_lower_95": 2.1610480780132018,
            "loss_tokens_upper_95": 2.2066736763705532,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.061978043342123,
            "data_time": 0.016781583428382874,
            "batch_time": 0.04584649205207825,
            "samples_per_second": 1005432.2103314601,
            "samples_per_second_per_gpu": 125679.02629143251,
            "loss_sequences_lower_95": 3.051009527712452,
            "loss_sequences_upper_95": 3.2305888833805008,
            "loss_tokens_lower_95": 2.951655686674848,
            "loss_tokens_upper_95": 3.039307758018788,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.273421419461568,
            "data_time": 0.015094932837364001,
            "batch_time": 0.04464932741262974,
            "samples_per_second": 1004331.335378496,
            "samples_per_second_per_gpu": 125541.416922312,
            "loss_sequences_lower_95": 3.243285481770833,
            "loss_sequences_upper_95": 3.3364744517008464,
            "loss_tokens_lower_95": 3.151065489741869,
            "loss_tokens_upper_95": 3.3440533726412816,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.956523772198154,
            "data_time": 0.0013497445705591394,
            "batch_time": 0.030534933148588607,
            "samples_per_second": 1093899.6776674297,
            "samples_per_second_per_gpu": 136737.4597084287,
            "loss_sequences_lower_95": 4.958261936927932,
            "loss_sequences_upper_95": 5.044843215608237,
            "loss_tokens_lower_95": 4.821823383896149,
            "loss_tokens_upper_95": 4.910088433289335,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.970226773158067,
            "data_time": 0.0028797014047635483,
            "batch_time": 0.03203220375432264,
            "samples_per_second": 1091530.1670278183,
            "samples_per_second_per_gpu": 136441.27087847728,
            "loss_sequences_lower_95": 4.431138225516888,
            "loss_sequences_upper_95": 4.717742601388231,
            "loss_tokens_lower_95": 3.3284462189912576,
            "loss_tokens_upper_95": 3.4576234684111746,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6953322494803027,
            "data_time": 0.0045138464586154834,
            "batch_time": 0.033421843035801035,
            "samples_per_second": 1087378.8208122985,
            "samples_per_second_per_gpu": 135922.3526015373,
            "loss_sequences_lower_95": 4.047597881226002,
            "loss_sequences_upper_95": 4.3733239496120415,
            "loss_tokens_lower_95": 3.3194328138235116,
            "loss_tokens_upper_95": 3.466037935624395,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.250648670544908,
            "data_time": 0.01963184561048235,
            "batch_time": 0.04922462786946978,
            "samples_per_second": 1010722.6924033243,
            "samples_per_second_per_gpu": 126340.33655041554,
            "loss_sequences_lower_95": 6.163920739578875,
            "loss_sequences_upper_95": 6.33708352563588,
            "loss_tokens_lower_95": 6.164521838323167,
            "loss_tokens_upper_95": 6.336536257669806,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1656605219841003,
            "data_time": 0.041430615461789645,
            "batch_time": 0.07218628663283128,
            "samples_per_second": 901136.0500497944,
            "samples_per_second_per_gpu": 112642.0062562243,
            "loss_sequences_lower_95": 3.018579933166504,
            "loss_sequences_upper_95": 3.3632968826293945,
            "loss_tokens_lower_95": 2.876574210233467,
            "loss_tokens_upper_95": 3.311638012171218,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5393131643137155,
            "data_time": 0.002979258574347311,
            "batch_time": 0.03202947248954227,
            "samples_per_second": 1095024.6984784438,
            "samples_per_second_per_gpu": 136878.08730980547,
            "loss_sequences_lower_95": 3.4888007235258605,
            "loss_sequences_upper_95": 3.588574880953333,
            "loss_tokens_lower_95": 3.48866783930799,
            "loss_tokens_upper_95": 3.5883169590023667,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8474864573092074,
            "data_time": 0.004352506961169095,
            "batch_time": 0.033636964359252335,
            "samples_per_second": 1082550.0398752992,
            "samples_per_second_per_gpu": 135318.7549844124,
            "loss_sequences_lower_95": 3.7808138160799296,
            "loss_sequences_upper_95": 3.914844559803056,
            "loss_tokens_lower_95": 3.7793740362343877,
            "loss_tokens_upper_95": 3.9145261172482853,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2028324098789396,
            "data_time": 0.003179834419398792,
            "batch_time": 0.033016798048245984,
            "samples_per_second": 1063826.3732381326,
            "samples_per_second_per_gpu": 132978.29665476657,
            "loss_sequences_lower_95": 3.3344905351009926,
            "loss_sequences_upper_95": 3.4672970909807703,
            "loss_tokens_lower_95": 3.0529060782967035,
            "loss_tokens_upper_95": 3.1115269869474202,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.140875759840012,
            "data_time": 0.009066477417945862,
            "batch_time": 0.037837118841707706,
            "samples_per_second": 1063360.3093122812,
            "samples_per_second_per_gpu": 132920.03866403515,
            "loss_sequences_lower_95": 5.323111572265625,
            "loss_sequences_upper_95": 5.870084204101563,
            "loss_tokens_lower_95": 4.563176670238563,
            "loss_tokens_upper_95": 4.919538480504944,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.57078255712986,
            "data_time": 0.12730753421783447,
            "batch_time": 0.16087301075458527,
            "samples_per_second": 591000.6830509831,
            "samples_per_second_per_gpu": 73875.08538137289,
            "loss_sequences_lower_95": 3.3377334117889403,
            "loss_sequences_upper_95": 3.830509090423584,
            "loss_tokens_lower_95": 3.1019097251453616,
            "loss_tokens_upper_95": 3.9438521900396237,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.108894236471461,
            "data_time": 0.024018330776945075,
            "batch_time": 0.05273271367904988,
            "samples_per_second": 962440.1684349275,
            "samples_per_second_per_gpu": 120305.02105436593,
            "loss_sequences_lower_95": 4.399220381111935,
            "loss_sequences_upper_95": 4.973308694773706,
            "loss_tokens_lower_95": 3.1541686048367916,
            "loss_tokens_upper_95": 3.513203452539283,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.2052604256888673,
            "data_time": 0.0023941542539331648,
            "batch_time": 0.03147342925270399,
            "samples_per_second": 1090331.4966103933,
            "samples_per_second_per_gpu": 136291.43707629916,
            "loss_sequences_lower_95": 2.1821676658407054,
            "loss_sequences_upper_95": 2.228247598270601,
            "loss_tokens_lower_95": 2.1818321564667293,
            "loss_tokens_upper_95": 2.228608870933336,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.4618619464026468,
            "data_time": 0.0022571975980380456,
            "batch_time": 0.03142417129454429,
            "samples_per_second": 1093305.986604059,
            "samples_per_second_per_gpu": 136663.2483255074,
            "loss_sequences_lower_95": 2.436539664584041,
            "loss_sequences_upper_95": 2.5718041551568867,
            "loss_tokens_lower_95": 2.3201638706052905,
            "loss_tokens_upper_95": 2.4534929020318765,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.045840845439897,
            "data_time": 0.01595313184791141,
            "batch_time": 0.046004972524113126,
            "samples_per_second": 973881.6338556393,
            "samples_per_second_per_gpu": 121735.20423195491,
            "loss_sequences_lower_95": 2.9092775952685006,
            "loss_sequences_upper_95": 3.3120472666981455,
            "loss_tokens_lower_95": 2.794678699870996,
            "loss_tokens_upper_95": 3.0838703843645203,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4679512481757198,
            "data_time": 0.004070230200886727,
            "batch_time": 0.0326750174164772,
            "samples_per_second": 1097557.9016040429,
            "samples_per_second_per_gpu": 137194.73770050536,
            "loss_sequences_lower_95": 3.5121742061864887,
            "loss_sequences_upper_95": 3.6658544382337706,
            "loss_tokens_lower_95": 3.321324397023134,
            "loss_tokens_upper_95": 3.4627827254317953,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.623303076842936,
            "data_time": 0.025683255422682988,
            "batch_time": 0.055001633507864814,
            "samples_per_second": 994428.6800643565,
            "samples_per_second_per_gpu": 124303.58500804457,
            "loss_sequences_lower_95": 2.4821029988730827,
            "loss_sequences_upper_95": 2.923978465940894,
            "loss_tokens_lower_95": 2.3361948234222063,
            "loss_tokens_upper_95": 2.668619934380735,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.576360769762137,
            "data_time": 0.0016729339713151103,
            "batch_time": 0.031363021455856975,
            "samples_per_second": 1077042.5900344558,
            "samples_per_second_per_gpu": 134630.32375430697,
            "loss_sequences_lower_95": 5.566775044462018,
            "loss_sequences_upper_95": 5.586071013030731,
            "loss_tokens_lower_95": 5.566664436402906,
            "loss_tokens_upper_95": 5.58593124874975,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.253078489048967,
            "data_time": 0.03967918482693759,
            "batch_time": 0.07019799839366567,
            "samples_per_second": 878312.770476157,
            "samples_per_second_per_gpu": 109789.09630951962,
            "loss_sequences_lower_95": 1.2004695281241704,
            "loss_sequences_upper_95": 1.3682745072448137,
            "loss_tokens_lower_95": 1.068276855718638,
            "loss_tokens_upper_95": 1.3174459623338743,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.007594842393443,
            "data_time": 0.0011483233637501987,
            "batch_time": 0.030211895554383462,
            "samples_per_second": 1097273.656194871,
            "samples_per_second_per_gpu": 137159.20702435888,
            "loss_sequences_lower_95": 5.423505910557521,
            "loss_sequences_upper_95": 5.4709624770702305,
            "loss_tokens_lower_95": 4.373807809477756,
            "loss_tokens_upper_95": 4.4226414047388785,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.913328790426254,
            "data_time": 0.0049609967640468055,
            "batch_time": 0.03450685692211938,
            "samples_per_second": 1067773.4892885918,
            "samples_per_second_per_gpu": 133471.68616107397,
            "loss_sequences_lower_95": 4.918664624023437,
            "loss_sequences_upper_95": 5.103646130371094,
            "loss_tokens_lower_95": 4.720768908908545,
            "loss_tokens_upper_95": 4.892993296234532,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.197604104746943,
            "data_time": 0.018428855023141635,
            "batch_time": 0.048965609679787844,
            "samples_per_second": 981827.0135917371,
            "samples_per_second_per_gpu": 122728.37669896714,
            "loss_sequences_lower_95": 3.0819815063476566,
            "loss_sequences_upper_95": 3.3170329815408457,
            "loss_tokens_lower_95": 3.082882305642833,
            "loss_tokens_upper_95": 3.3153916931152345,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.027655293363513,
            "data_time": 0.0036066196050988622,
            "batch_time": 0.032849084181957935,
            "samples_per_second": 1081651.7151460964,
            "samples_per_second_per_gpu": 135206.46439326205,
            "loss_sequences_lower_95": 6.944591933741714,
            "loss_sequences_upper_95": 7.107664794921875,
            "loss_tokens_lower_95": 6.944390961618135,
            "loss_tokens_upper_95": 7.108751664595171,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.5627770528793334,
            "data_time": 0.00362596296249552,
            "batch_time": 0.03241220940934851,
            "samples_per_second": 1102062.7181729665,
            "samples_per_second_per_gpu": 137757.8397716208,
            "loss_sequences_lower_95": 1.6121496663411459,
            "loss_sequences_upper_95": 1.674887870279948,
            "loss_tokens_lower_95": 1.4660770636379552,
            "loss_tokens_upper_95": 1.5450812707895658,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.109709511484419,
            "data_time": 0.021049446293285916,
            "batch_time": 0.04994587174483708,
            "samples_per_second": 983054.8204342985,
            "samples_per_second_per_gpu": 122881.85255428731,
            "loss_sequences_lower_95": 5.74168689546131,
            "loss_sequences_upper_95": 6.472939453125,
            "loss_tokens_lower_95": 5.745647423153832,
            "loss_tokens_upper_95": 6.4761801438104545,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.036253195255995,
            "data_time": 0.12929348647594452,
            "batch_time": 0.16379790008068085,
            "samples_per_second": 579636.4149243669,
            "samples_per_second_per_gpu": 72454.55186554586,
            "loss_sequences_lower_95": 1.8652529805898665,
            "loss_sequences_upper_95": 2.5515814483165737,
            "loss_tokens_lower_95": 1.619590347132732,
            "loss_tokens_upper_95": 2.0504161276276576,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.44591618680954,
            "data_time": 0.00505110432231237,
            "batch_time": 0.03373566742927309,
            "samples_per_second": 1095462.7454929133,
            "samples_per_second_per_gpu": 136932.84318661416,
            "loss_sequences_lower_95": 7.388962194824218,
            "loss_sequences_upper_95": 7.68998896484375,
            "loss_tokens_lower_95": 7.176454764170896,
            "loss_tokens_upper_95": 7.442418896566994,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.238010907649994,
            "data_time": 0.004961214368305509,
            "batch_time": 0.033836816038404195,
            "samples_per_second": 1089847.6621149816,
            "samples_per_second_per_gpu": 136230.9577643727,
            "loss_sequences_lower_95": 7.358692736816407,
            "loss_sequences_upper_95": 7.598998291015625,
            "loss_tokens_lower_95": 6.957164677680583,
            "loss_tokens_upper_95": 7.1624692531962,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.99377815267043,
            "data_time": 0.0033500666602399435,
            "batch_time": 0.03197585475883356,
            "samples_per_second": 1103167.657794125,
            "samples_per_second_per_gpu": 137895.9572242656,
            "loss_sequences_lower_95": 5.975530480011733,
            "loss_sequences_upper_95": 6.012104873580498,
            "loss_tokens_lower_95": 5.97555917451496,
            "loss_tokens_upper_95": 6.011885842691921,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3776544832414195,
            "data_time": 0.007557702568722635,
            "batch_time": 0.03645266074788534,
            "samples_per_second": 1070854.5425582612,
            "samples_per_second_per_gpu": 133856.81781978265,
            "loss_sequences_lower_95": 3.301660878171203,
            "loss_sequences_upper_95": 3.453014686689948,
            "loss_tokens_lower_95": 3.299454311950965,
            "loss_tokens_upper_95": 3.4538815171670985,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.138312848091125,
            "data_time": 0.00519149691339523,
            "batch_time": 0.034582760125871685,
            "samples_per_second": 1071577.2013807185,
            "samples_per_second_per_gpu": 133947.1501725898,
            "loss_sequences_lower_95": 7.0671747802734375,
            "loss_sequences_upper_95": 7.21233125,
            "loss_tokens_lower_95": 7.067206469726563,
            "loss_tokens_upper_95": 7.212974694824219,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.6651857352809403,
            "data_time": 0.0018041674266014748,
            "batch_time": 0.03098175156093951,
            "samples_per_second": 1091565.8197474969,
            "samples_per_second_per_gpu": 136445.7274684371,
            "loss_sequences_lower_95": 3.1065018829085855,
            "loss_sequences_upper_95": 3.180804569240776,
            "loss_tokens_lower_95": 2.1229086657664338,
            "loss_tokens_upper_95": 2.1753494013404855,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.424272264562436,
            "data_time": 0.01642704350607736,
            "batch_time": 0.04600285972867693,
            "samples_per_second": 994525.5413834968,
            "samples_per_second_per_gpu": 124315.6926729371,
            "loss_sequences_lower_95": 3.296053968970455,
            "loss_sequences_upper_95": 3.553540710904705,
            "loss_tokens_lower_95": 3.296835201889721,
            "loss_tokens_upper_95": 3.5524945842685987,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.553848310545379,
            "data_time": 0.008982154540717602,
            "batch_time": 0.03840070776641369,
            "samples_per_second": 1066751.5395664277,
            "samples_per_second_per_gpu": 133343.94244580346,
            "loss_sequences_lower_95": 3.4466525029201134,
            "loss_sequences_upper_95": 3.6584979188208484,
            "loss_tokens_lower_95": 3.4519554168102786,
            "loss_tokens_upper_95": 3.657442866306679,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7368173304915833,
            "data_time": 0.0018810081195759756,
            "batch_time": 0.0305266311151381,
            "samples_per_second": 1109230.7439313652,
            "samples_per_second_per_gpu": 138653.84299142065,
            "loss_sequences_lower_95": 4.32201800407898,
            "loss_sequences_upper_95": 4.426806866935989,
            "loss_tokens_lower_95": 2.9635438052205156,
            "loss_tokens_upper_95": 3.0403511529287495,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.085910411108108,
            "data_time": 0.022460003693898518,
            "batch_time": 0.0519602025548617,
            "samples_per_second": 1009815.9054262467,
            "samples_per_second_per_gpu": 126226.98817828084,
            "loss_sequences_lower_95": 5.991904380838707,
            "loss_sequences_upper_95": 6.176583587807953,
            "loss_tokens_lower_95": 5.9914989491619135,
            "loss_tokens_upper_95": 6.175136424877025,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.915657591163565,
            "data_time": 0.003310837879576817,
            "batch_time": 0.03223373339726375,
            "samples_per_second": 1096144.074188725,
            "samples_per_second_per_gpu": 137018.00927359064,
            "loss_sequences_lower_95": 3.886591169724771,
            "loss_sequences_upper_95": 3.9451433710220756,
            "loss_tokens_lower_95": 3.8859114956637044,
            "loss_tokens_upper_95": 3.944466310086965,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.491319356034103,
            "data_time": 0.020378344709222968,
            "batch_time": 0.04956375468860973,
            "samples_per_second": 975515.2404199424,
            "samples_per_second_per_gpu": 121939.4050524928,
            "loss_sequences_lower_95": 3.3594289242642597,
            "loss_sequences_upper_95": 3.6267526274745907,
            "loss_tokens_lower_95": 3.3549344553530793,
            "loss_tokens_upper_95": 3.6273282393668462,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.9638355267544587,
            "data_time": 0.06750085949897766,
            "batch_time": 0.09833236783742905,
            "samples_per_second": 782601.2135781801,
            "samples_per_second_per_gpu": 97825.15169727251,
            "loss_sequences_lower_95": 1.7645493284861247,
            "loss_sequences_upper_95": 2.313516807556152,
            "loss_tokens_lower_95": 1.5493942870034112,
            "loss_tokens_upper_95": 2.1112356609768335,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.9310521711905797,
            "data_time": 0.06461405009031296,
            "batch_time": 0.09521295130252838,
            "samples_per_second": 784695.0530456376,
            "samples_per_second_per_gpu": 98086.8816307047,
            "loss_sequences_lower_95": 1.8031188996632892,
            "loss_sequences_upper_95": 2.3883458328247067,
            "loss_tokens_lower_95": 1.4497662833567417,
            "loss_tokens_upper_95": 2.104856893989477,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5486458254492512,
            "data_time": 0.0028231727719096452,
            "batch_time": 0.03202219343381885,
            "samples_per_second": 1088940.2964241616,
            "samples_per_second_per_gpu": 136117.5370530202,
            "loss_sequences_lower_95": 3.5256022614828795,
            "loss_sequences_upper_95": 3.572691810394422,
            "loss_tokens_lower_95": 3.5249430170747424,
            "loss_tokens_upper_95": 3.5725546817470546,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.5790657677801962,
            "data_time": 0.0011365149797835027,
            "batch_time": 0.030347351449454118,
            "samples_per_second": 1091828.1266266357,
            "samples_per_second_per_gpu": 136478.51582832946,
            "loss_sequences_lower_95": 0.6543904658569762,
            "loss_sequences_upper_95": 0.668350576387583,
            "loss_tokens_lower_95": 0.5011544511108852,
            "loss_tokens_upper_95": 0.5094547509349471,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.36744036261491,
            "data_time": 0.034913282841444016,
            "batch_time": 0.06668096780776978,
            "samples_per_second": 952586.3085856132,
            "samples_per_second_per_gpu": 119073.28857320164,
            "loss_sequences_lower_95": 4.408862941471611,
            "loss_sequences_upper_95": 4.842030827079233,
            "loss_tokens_lower_95": 4.067550502506294,
            "loss_tokens_upper_95": 4.39362406157885,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.1015878625818205,
            "data_time": 0.09569270270211357,
            "batch_time": 0.12650495483761742,
            "samples_per_second": 599412.4893032508,
            "samples_per_second_per_gpu": 74926.56116290635,
            "loss_sequences_lower_95": 6.680077094000739,
            "loss_sequences_upper_95": 7.759340626484639,
            "loss_tokens_lower_95": 5.905328011218413,
            "loss_tokens_upper_95": 8.12824548791956,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.3348156155609505,
            "data_time": 0.026044357390630813,
            "batch_time": 0.05803629614057995,
            "samples_per_second": 932722.3779206462,
            "samples_per_second_per_gpu": 116590.29724008078,
            "loss_sequences_lower_95": 4.297002876095656,
            "loss_sequences_upper_95": 4.660231241365759,
            "loss_tokens_lower_95": 3.9933909962415894,
            "loss_tokens_upper_95": 4.272583718263772,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.417660438432926,
            "data_time": 0.026372858456202915,
            "batch_time": 0.05638734783445086,
            "samples_per_second": 979780.3853505075,
            "samples_per_second_per_gpu": 122472.54816881343,
            "loss_sequences_lower_95": 4.350684114781822,
            "loss_sequences_upper_95": 4.638447105593798,
            "loss_tokens_lower_95": 4.1468997375982175,
            "loss_tokens_upper_95": 4.378447172859007,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.410558289870983,
            "data_time": 0.02614260571343558,
            "batch_time": 0.05544974406560262,
            "samples_per_second": 1001659.321175751,
            "samples_per_second_per_gpu": 125207.41514696888,
            "loss_sequences_lower_95": 4.431776763171684,
            "loss_sequences_upper_95": 4.838185045195789,
            "loss_tokens_lower_95": 3.9832609030760313,
            "loss_tokens_upper_95": 4.330642582617525,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.529112705370275,
            "data_time": 0.027351827848525273,
            "batch_time": 0.05706397692362467,
            "samples_per_second": 978049.3870062584,
            "samples_per_second_per_gpu": 122256.1733757823,
            "loss_sequences_lower_95": 4.444380932319454,
            "loss_sequences_upper_95": 4.728926086425782,
            "loss_tokens_lower_95": 4.281456958616262,
            "loss_tokens_upper_95": 4.495203768055759,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9434386277050706,
            "data_time": 0.027260541915893555,
            "batch_time": 0.05663451736356005,
            "samples_per_second": 1011138.5514870525,
            "samples_per_second_per_gpu": 126392.31893588156,
            "loss_sequences_lower_95": 3.8342670701305317,
            "loss_sequences_upper_95": 4.081085821116193,
            "loss_tokens_lower_95": 3.71038523805389,
            "loss_tokens_upper_95": 3.885806533800088,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0706040016034755,
            "data_time": 0.02606512819017683,
            "batch_time": 0.05687508980433146,
            "samples_per_second": 961091.6989783217,
            "samples_per_second_per_gpu": 120136.46237229022,
            "loss_sequences_lower_95": 3.070119648444943,
            "loss_sequences_upper_95": 3.3001407065042634,
            "loss_tokens_lower_95": 2.8522976473025916,
            "loss_tokens_upper_95": 2.9748111802573085,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-8.0/params.txt",
    "uuid": "fff6723e-b3cf-425b-a488-fdbacacc0774",
    "creation_date": "2023_12_14-05_03_42"
}