{
    "name": "rw_original-d=576_l=24_h=8-2.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 6147095040,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 2.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "1229419008",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=576_l=24_h=8-2.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.4650466005007425,
            "data_time": 0.03499507158994675,
            "batch_time": 0.37876682355999947,
            "samples_per_second": 834046.5494391824,
            "samples_per_second_per_gpu": 104255.8186798978,
            "loss_sequences_lower_95": 3.392845064798991,
            "loss_sequences_upper_95": 3.5384493255615235,
            "loss_tokens_lower_95": 3.4513942273457845,
            "loss_tokens_upper_95": 3.4785220019022622,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.383059219847898,
            "data_time": 0.0011306883511710323,
            "batch_time": 0.030891904998543072,
            "samples_per_second": 1073518.5357958942,
            "samples_per_second_per_gpu": 134189.81697448678,
            "loss_sequences_lower_95": 3.3806559488935166,
            "loss_sequences_upper_95": 3.3854372551441783,
            "loss_tokens_lower_95": 3.3722968333333334,
            "loss_tokens_upper_95": 3.3937407395833334,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0091400613590165,
            "data_time": 0.009965697288513183,
            "batch_time": 0.0389993371963501,
            "samples_per_second": 1066488.1601322752,
            "samples_per_second_per_gpu": 133311.0200165344,
            "loss_sequences_lower_95": 2.954205758231027,
            "loss_sequences_upper_95": 3.078094407684949,
            "loss_tokens_lower_95": 2.996347,
            "loss_tokens_upper_95": 3.022060677083333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4910204772850904,
            "data_time": 0.0015885446612772189,
            "batch_time": 0.02999110039519636,
            "samples_per_second": 1123446.5164700607,
            "samples_per_second_per_gpu": 140430.8145587576,
            "loss_sequences_lower_95": 3.450774675821521,
            "loss_sequences_upper_95": 3.532856002335696,
            "loss_tokens_lower_95": 3.4787036510416667,
            "loss_tokens_upper_95": 3.503430427083333,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.44782719495583,
            "data_time": 0.009688370731247373,
            "batch_time": 0.04035523593188282,
            "samples_per_second": 1010940.4571167349,
            "samples_per_second_per_gpu": 126367.55713959187,
            "loss_sequences_lower_95": 3.3896086403405836,
            "loss_sequences_upper_95": 3.5238568276834585,
            "loss_tokens_lower_95": 3.4365108072916666,
            "loss_tokens_upper_95": 3.4588768854166667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.562404037737506,
            "data_time": 0.003706906152808148,
            "batch_time": 0.032400763877060104,
            "samples_per_second": 1111399.6761489797,
            "samples_per_second_per_gpu": 138924.95951862246,
            "loss_sequences_lower_95": 3.513778344212058,
            "loss_sequences_upper_95": 3.6162080081444383,
            "loss_tokens_lower_95": 3.5501976041666667,
            "loss_tokens_upper_95": 3.5745431875,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.178740245906674,
            "data_time": 0.0015943489883696663,
            "batch_time": 0.030100113124287342,
            "samples_per_second": 1121051.3613502625,
            "samples_per_second_per_gpu": 140131.42016878282,
            "loss_sequences_lower_95": 3.1484400460379462,
            "loss_sequences_upper_95": 3.208713548309949,
            "loss_tokens_lower_95": 3.164400489583333,
            "loss_tokens_upper_95": 3.194013395833333,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9069437815381596,
            "data_time": 0.0017383196323000703,
            "batch_time": 0.030849784450618756,
            "samples_per_second": 1113230.9982024312,
            "samples_per_second_per_gpu": 139153.8747753039,
            "loss_sequences_lower_95": 3.8821972144960735,
            "loss_sequences_upper_95": 3.9335729098494765,
            "loss_tokens_lower_95": 3.89526403125,
            "loss_tokens_upper_95": 3.9184871145833333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.553232953315828,
            "data_time": 0.011541833007146441,
            "batch_time": 0.042005063049376956,
            "samples_per_second": 1040907.6392008069,
            "samples_per_second_per_gpu": 130113.45490010086,
            "loss_sequences_lower_95": 3.4651351866683338,
            "loss_sequences_upper_95": 3.661226995204522,
            "loss_tokens_lower_95": 3.5413341979166666,
            "loss_tokens_upper_95": 3.564964197916667,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.646064483129931,
            "data_time": 0.009832550771534443,
            "batch_time": 0.03915326949208975,
            "samples_per_second": 1062860.2987591843,
            "samples_per_second_per_gpu": 132857.53734489804,
            "loss_sequences_lower_95": 4.527692928992712,
            "loss_sequences_upper_95": 4.79288731752177,
            "loss_tokens_lower_95": 4.632351489583333,
            "loss_tokens_upper_95": 4.65979671875,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6173859329048503,
            "data_time": 0.0012650821071090552,
            "batch_time": 0.02970318083372401,
            "samples_per_second": 1125757.9686568282,
            "samples_per_second_per_gpu": 140719.74608210352,
            "loss_sequences_lower_95": 3.605900113060162,
            "loss_sequences_upper_95": 3.6294302105103124,
            "loss_tokens_lower_95": 3.60593853125,
            "loss_tokens_upper_95": 3.6289243125,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4594411926436353,
            "data_time": 0.0026071627471568088,
            "batch_time": 0.0316288653857305,
            "samples_per_second": 1103798.2286470882,
            "samples_per_second_per_gpu": 137974.77858088602,
            "loss_sequences_lower_95": 3.4354669929196167,
            "loss_sequences_upper_95": 3.4846955146455905,
            "loss_tokens_lower_95": 3.4479043750000002,
            "loss_tokens_upper_95": 3.47101025,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9568342069583298,
            "data_time": 0.009573530302688538,
            "batch_time": 0.03888447199885553,
            "samples_per_second": 1055462.9331826875,
            "samples_per_second_per_gpu": 131932.86664783594,
            "loss_sequences_lower_95": 3.8676601077190034,
            "loss_sequences_upper_95": 4.06632349969891,
            "loss_tokens_lower_95": 3.943686239583333,
            "loss_tokens_upper_95": 3.9696893125,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.206076952687592,
            "data_time": 0.009525473849231978,
            "batch_time": 0.03858599529798287,
            "samples_per_second": 1071767.3126674325,
            "samples_per_second_per_gpu": 133970.91408342906,
            "loss_sequences_lower_95": 3.1188556927527524,
            "loss_sequences_upper_95": 3.3079592578522785,
            "loss_tokens_lower_95": 3.1942453385416667,
            "loss_tokens_upper_95": 3.217990875,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.263173336332494,
            "data_time": 0.08249365431921822,
            "batch_time": 0.11593502759933472,
            "samples_per_second": 541355.0031337633,
            "samples_per_second_per_gpu": 67669.37539172042,
            "loss_sequences_lower_95": 4.187020440535112,
            "loss_sequences_upper_95": 4.352065797285601,
            "loss_tokens_lower_95": 4.239249454845082,
            "loss_tokens_upper_95": 4.287644126198508,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4835539489723852,
            "data_time": 0.013665447180921381,
            "batch_time": 0.042757585644721985,
            "samples_per_second": 1048773.097580042,
            "samples_per_second_per_gpu": 131096.63719750525,
            "loss_sequences_lower_95": 3.4182897273027515,
            "loss_sequences_upper_95": 3.546554245128576,
            "loss_tokens_lower_95": 3.471028765625,
            "loss_tokens_upper_95": 3.4957185208333335,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.389041682975588,
            "data_time": 0.01290253053108851,
            "batch_time": 0.04220056161284447,
            "samples_per_second": 1055060.404493229,
            "samples_per_second_per_gpu": 131882.55056165362,
            "loss_sequences_lower_95": 5.295961791670103,
            "loss_sequences_upper_95": 5.512579844935273,
            "loss_tokens_lower_95": 5.377311427083333,
            "loss_tokens_upper_95": 5.400955020833333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8512905797020336,
            "data_time": 0.03545035794377327,
            "batch_time": 0.06739122420549393,
            "samples_per_second": 888356.2219151454,
            "samples_per_second_per_gpu": 111044.52773939318,
            "loss_sequences_lower_95": 3.6927577597195986,
            "loss_sequences_upper_95": 4.132962586449795,
            "loss_tokens_lower_95": 3.8371635061795595,
            "loss_tokens_upper_95": 3.865530783231141,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.01086738486182,
            "data_time": 0.0016821107587401578,
            "batch_time": 0.030599723101205328,
            "samples_per_second": 1099549.4003431639,
            "samples_per_second_per_gpu": 137443.67504289548,
            "loss_sequences_lower_95": 3.994417069661551,
            "loss_sequences_upper_95": 4.027565879370816,
            "loss_tokens_lower_95": 3.994494091664738,
            "loss_tokens_upper_95": 4.0273265234987,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0337716320600436,
            "data_time": 0.0018957760778202373,
            "batch_time": 0.03094496359680868,
            "samples_per_second": 1093041.4218288339,
            "samples_per_second_per_gpu": 136630.17772860423,
            "loss_sequences_lower_95": 3.030652723405447,
            "loss_sequences_upper_95": 3.055965054194259,
            "loss_tokens_lower_95": 3.014135429153371,
            "loss_tokens_upper_95": 3.0331417947967148,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.735107493938338,
            "data_time": 0.00329096827385675,
            "batch_time": 0.03235902772772211,
            "samples_per_second": 1091836.8257160892,
            "samples_per_second_per_gpu": 136479.60321451115,
            "loss_sequences_lower_95": 5.012707294649711,
            "loss_sequences_upper_95": 5.325225527352976,
            "loss_tokens_lower_95": 4.145420929384084,
            "loss_tokens_upper_95": 4.37265599405081,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.571846172402302,
            "data_time": 0.0039737419562136875,
            "batch_time": 0.03316870981708486,
            "samples_per_second": 1082526.8903259712,
            "samples_per_second_per_gpu": 135315.8612907464,
            "loss_sequences_lower_95": 4.679894425455729,
            "loss_sequences_upper_95": 4.875476375325521,
            "loss_tokens_lower_95": 4.279114276238207,
            "loss_tokens_upper_95": 4.4217071786556605,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.131399776648174,
            "data_time": 0.004685266345155006,
            "batch_time": 0.03356494515190297,
            "samples_per_second": 1092358.1348927375,
            "samples_per_second_per_gpu": 136544.7668615922,
            "loss_sequences_lower_95": 3.17704396049031,
            "loss_sequences_upper_95": 3.2376262349554583,
            "loss_tokens_lower_95": 3.0354153873959446,
            "loss_tokens_upper_95": 3.066679717748766,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.506071484088898,
            "data_time": 0.023207839046205794,
            "batch_time": 0.05363105663231441,
            "samples_per_second": 1000385.0671847746,
            "samples_per_second_per_gpu": 125048.13339809682,
            "loss_sequences_lower_95": 2.483554021661932,
            "loss_sequences_upper_95": 2.588615361993963,
            "loss_tokens_lower_95": 2.4408795223218784,
            "loss_tokens_upper_95": 2.4889072302038193,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4143156898264984,
            "data_time": 0.020570699125528336,
            "batch_time": 0.050037916749715805,
            "samples_per_second": 994782.0144454536,
            "samples_per_second_per_gpu": 124347.7518056817,
            "loss_sequences_lower_95": 3.4023281424386163,
            "loss_sequences_upper_95": 3.5958711585220025,
            "loss_tokens_lower_95": 3.2837597854603926,
            "loss_tokens_upper_95": 3.3757121567154704,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.100319178899129,
            "data_time": 0.016372540058233798,
            "batch_time": 0.04590079111930651,
            "samples_per_second": 1007083.5455210652,
            "samples_per_second_per_gpu": 125885.44319013316,
            "loss_sequences_lower_95": 4.065966328938802,
            "loss_sequences_upper_95": 4.164836649576823,
            "loss_tokens_lower_95": 3.973426756605377,
            "loss_tokens_upper_95": 4.189457666800255,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.1947412827779,
            "data_time": 0.0015226122596439632,
            "batch_time": 0.030377871618767375,
            "samples_per_second": 1101928.3343271988,
            "samples_per_second_per_gpu": 137741.04179089985,
            "loss_sequences_lower_95": 5.200885658232247,
            "loss_sequences_upper_95": 5.27579199550342,
            "loss_tokens_lower_95": 5.06984876696211,
            "loss_tokens_upper_95": 5.1466339319843275,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.478210484991571,
            "data_time": 0.0029599776604031557,
            "batch_time": 0.032600178614558786,
            "samples_per_second": 1085909.9316046054,
            "samples_per_second_per_gpu": 135738.74145057568,
            "loss_sequences_lower_95": 5.003656437421086,
            "loss_sequences_upper_95": 5.303943949998027,
            "loss_tokens_lower_95": 3.7654146532271797,
            "loss_tokens_upper_95": 3.903305865849937,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.086818003613795,
            "data_time": 0.00510314670768944,
            "batch_time": 0.03424212900367943,
            "samples_per_second": 1080706.8966597763,
            "samples_per_second_per_gpu": 135088.36208247204,
            "loss_sequences_lower_95": 4.52395351787476,
            "loss_sequences_upper_95": 4.864802421075086,
            "loss_tokens_lower_95": 3.667744611732111,
            "loss_tokens_upper_95": 3.8241901684515165,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.491165167664828,
            "data_time": 0.023484564253262112,
            "batch_time": 0.05338679254055023,
            "samples_per_second": 1003851.2534060037,
            "samples_per_second_per_gpu": 125481.40667575046,
            "loss_sequences_lower_95": 5.381297183798872,
            "loss_sequences_upper_95": 5.598457538482806,
            "loss_tokens_lower_95": 5.384083619836258,
            "loss_tokens_upper_95": 5.597626540649971,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.458750169277191,
            "data_time": 0.04914341981594379,
            "batch_time": 0.08162393936744103,
            "samples_per_second": 872761.9571473843,
            "samples_per_second_per_gpu": 109095.24464342304,
            "loss_sequences_lower_95": 3.3271132049560546,
            "loss_sequences_upper_95": 3.684354354858398,
            "loss_tokens_lower_95": 3.1511811722156615,
            "loss_tokens_upper_95": 3.6082584879266126,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.84628372085009,
            "data_time": 0.0035403391578689917,
            "batch_time": 0.03242883628618985,
            "samples_per_second": 1098962.1233367606,
            "samples_per_second_per_gpu": 137370.26541709507,
            "loss_sequences_lower_95": 4.799039942605713,
            "loss_sequences_upper_95": 4.8933398962264905,
            "loss_tokens_lower_95": 4.798039515796923,
            "loss_tokens_upper_95": 4.8944115911071835,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.748558288124328,
            "data_time": 0.004945033526148337,
            "batch_time": 0.033990280857677364,
            "samples_per_second": 1088110.0310710894,
            "samples_per_second_per_gpu": 136013.75388388618,
            "loss_sequences_lower_95": 4.7003231814022834,
            "loss_sequences_upper_95": 4.796456911679079,
            "loss_tokens_lower_95": 4.69791983589591,
            "loss_tokens_upper_95": 4.798842031617911,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4778880449711176,
            "data_time": 0.0035269705832133283,
            "batch_time": 0.03233342139818756,
            "samples_per_second": 1093224.5759537793,
            "samples_per_second_per_gpu": 136653.0719942224,
            "loss_sequences_lower_95": 3.6052921266109563,
            "loss_sequences_upper_95": 3.734712705726333,
            "loss_tokens_lower_95": 3.323778295842825,
            "loss_tokens_upper_95": 3.382376397093795,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.4548301177024845,
            "data_time": 0.011014294810593128,
            "batch_time": 0.04002491291612387,
            "samples_per_second": 1052558.8633115664,
            "samples_per_second_per_gpu": 131569.8579139458,
            "loss_sequences_lower_95": 5.63200615234375,
            "loss_sequences_upper_95": 6.181511938476563,
            "loss_tokens_lower_95": 4.867910687833492,
            "loss_tokens_upper_95": 5.228539829014311,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6528637558221817,
            "data_time": 0.1613018661737442,
            "batch_time": 0.1969575434923172,
            "samples_per_second": 554125.8350860337,
            "samples_per_second_per_gpu": 69265.72938575421,
            "loss_sequences_lower_95": 3.439769262075424,
            "loss_sequences_upper_95": 3.8982110142707826,
            "loss_tokens_lower_95": 3.1916762779498926,
            "loss_tokens_upper_95": 4.037137875611754,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.294084757909007,
            "data_time": 0.03025293857493299,
            "batch_time": 0.06048621015345797,
            "samples_per_second": 922409.728058843,
            "samples_per_second_per_gpu": 115301.21600735537,
            "loss_sequences_lower_95": 4.570863491365279,
            "loss_sequences_upper_95": 5.137777981812926,
            "loss_tokens_lower_95": 3.3765701104715684,
            "loss_tokens_upper_95": 3.7405329587091316,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.3054073668817807,
            "data_time": 0.0030592410928673213,
            "batch_time": 0.031978405805097684,
            "samples_per_second": 1093779.5502723278,
            "samples_per_second_per_gpu": 136722.44378404098,
            "loss_sequences_lower_95": 2.284001850946442,
            "loss_sequences_upper_95": 2.3266887193165684,
            "loss_tokens_lower_95": 2.283505575664182,
            "loss_tokens_upper_95": 2.3272170560544314,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9680873133082315,
            "data_time": 0.002629118184040441,
            "batch_time": 0.03142756855113126,
            "samples_per_second": 1101803.6114615821,
            "samples_per_second_per_gpu": 137725.45143269777,
            "loss_sequences_lower_95": 2.9419120138602515,
            "loss_sequences_upper_95": 3.095915247337716,
            "loss_tokens_lower_95": 2.8025985712210626,
            "loss_tokens_upper_95": 2.9537718435990477,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.233364490143982,
            "data_time": 0.017945786317189533,
            "batch_time": 0.04866375360223982,
            "samples_per_second": 952942.149408028,
            "samples_per_second_per_gpu": 119117.7686760035,
            "loss_sequences_lower_95": 3.0787110772325006,
            "loss_sequences_upper_95": 3.4902771722702752,
            "loss_tokens_lower_95": 2.983428994601353,
            "loss_tokens_upper_95": 3.282847065885899,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.637823511600118,
            "data_time": 0.004861201718449592,
            "batch_time": 0.034770553931593895,
            "samples_per_second": 1054720.659763903,
            "samples_per_second_per_gpu": 131840.08247048786,
            "loss_sequences_lower_95": 3.6714681696120266,
            "loss_sequences_upper_95": 3.8196537827730364,
            "loss_tokens_lower_95": 3.4947171153371364,
            "loss_tokens_upper_95": 3.6386906872478564,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9659821296610485,
            "data_time": 0.03170252697808402,
            "batch_time": 0.06230969372249785,
            "samples_per_second": 969245.9885169385,
            "samples_per_second_per_gpu": 121155.74856461732,
            "loss_sequences_lower_95": 2.8196262220057045,
            "loss_sequences_upper_95": 3.2890018463134765,
            "loss_tokens_lower_95": 2.714417414953036,
            "loss_tokens_upper_95": 3.0917926931459014,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.128972615783609,
            "data_time": 0.0018338923734328863,
            "batch_time": 0.030627547003867574,
            "samples_per_second": 1101609.31009382,
            "samples_per_second_per_gpu": 137701.1637617275,
            "loss_sequences_lower_95": 5.116859299594293,
            "loss_sequences_upper_95": 5.140801216884002,
            "loss_tokens_lower_95": 5.1169898335135775,
            "loss_tokens_upper_95": 5.141039897432612,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.3275732791539534,
            "data_time": 0.04865588274869052,
            "batch_time": 0.07952085408297452,
            "samples_per_second": 895010.7593743473,
            "samples_per_second_per_gpu": 111876.34492179341,
            "loss_sequences_lower_95": 1.2711140308565305,
            "loss_sequences_upper_95": 1.460706303420576,
            "loss_tokens_lower_95": 1.1291125107201974,
            "loss_tokens_upper_95": 1.3924929850177121,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.7833400906819215,
            "data_time": 0.0013003255875292487,
            "batch_time": 0.030377227085434083,
            "samples_per_second": 1093150.5304730595,
            "samples_per_second_per_gpu": 136643.81630913244,
            "loss_sequences_lower_95": 5.164709651795073,
            "loss_sequences_upper_95": 5.211200168697589,
            "loss_tokens_lower_95": 4.192703820116054,
            "loss_tokens_upper_95": 4.240117093810445,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.251828119754792,
            "data_time": 0.005704543893299405,
            "batch_time": 0.03468266791767544,
            "samples_per_second": 1082799.0854666454,
            "samples_per_second_per_gpu": 135349.88568333068,
            "loss_sequences_lower_95": 5.252034399414062,
            "loss_sequences_upper_95": 5.462885900878906,
            "loss_tokens_lower_95": 5.032719290394795,
            "loss_tokens_upper_95": 5.230254697394021,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.102438126439633,
            "data_time": 0.02251574144525043,
            "batch_time": 0.051691736205149504,
            "samples_per_second": 1018097.2674299956,
            "samples_per_second_per_gpu": 127262.15842874945,
            "loss_sequences_lower_95": 4.9632093612007475,
            "loss_sequences_upper_95": 5.249154994798744,
            "loss_tokens_lower_95": 4.960974811056386,
            "loss_tokens_upper_95": 5.24242062775985,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.164223030119231,
            "data_time": 0.004636669374374022,
            "batch_time": 0.03342680112425103,
            "samples_per_second": 1093821.6146199498,
            "samples_per_second_per_gpu": 136727.70182749373,
            "loss_sequences_lower_95": 6.085661898526278,
            "loss_sequences_upper_95": 6.2421705766157665,
            "loss_tokens_lower_95": 6.088435779918324,
            "loss_tokens_upper_95": 6.239790982333097,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.4720027463436127,
            "data_time": 0.004124981608796627,
            "batch_time": 0.033033462915014716,
            "samples_per_second": 1093382.4692680805,
            "samples_per_second_per_gpu": 136672.80865851007,
            "loss_sequences_lower_95": 1.5170597493489584,
            "loss_sequences_upper_95": 1.5772949951171875,
            "loss_tokens_lower_95": 1.384805152529762,
            "loss_tokens_upper_95": 1.4555757771858744,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.294753083728609,
            "data_time": 0.022359813962663923,
            "batch_time": 0.05171516750540052,
            "samples_per_second": 970859.2707369229,
            "samples_per_second_per_gpu": 121357.40884211536,
            "loss_sequences_lower_95": 5.96491229829334,
            "loss_sequences_upper_95": 6.6228495134626115,
            "loss_tokens_lower_95": 5.965342450823102,
            "loss_tokens_upper_95": 6.625670151483445,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.341750092804432,
            "data_time": 0.15701797604560852,
            "batch_time": 0.19471614062786102,
            "samples_per_second": 555296.3705611074,
            "samples_per_second_per_gpu": 69412.04632013843,
            "loss_sequences_lower_95": 2.1497712969779967,
            "loss_sequences_upper_95": 3.116809570789337,
            "loss_tokens_lower_95": 1.802840409426345,
            "loss_tokens_upper_95": 2.3253592516712307,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.532628540992737,
            "data_time": 0.006053728243661305,
            "batch_time": 0.035354701299515984,
            "samples_per_second": 1074119.974593332,
            "samples_per_second_per_gpu": 134264.9968241665,
            "loss_sequences_lower_95": 7.484506372070313,
            "loss_sequences_upper_95": 7.770928503417969,
            "loss_tokens_lower_95": 7.2657470703125,
            "loss_tokens_upper_95": 7.52454885621563,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.447899099349976,
            "data_time": 0.005748276672666035,
            "batch_time": 0.034803384353244114,
            "samples_per_second": 1082579.9949352953,
            "samples_per_second_per_gpu": 135322.4993669119,
            "loss_sequences_lower_95": 7.583522753906251,
            "loss_sequences_upper_95": 7.846810766601563,
            "loss_tokens_lower_95": 7.14642324815348,
            "loss_tokens_upper_95": 7.365840093147534,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.381312342006569,
            "data_time": 0.003979329042211424,
            "batch_time": 0.03313715705106091,
            "samples_per_second": 1082414.1971069316,
            "samples_per_second_per_gpu": 135301.77463836645,
            "loss_sequences_lower_95": 5.357458762086721,
            "loss_sequences_upper_95": 5.404502630793245,
            "loss_tokens_lower_95": 5.358347931573186,
            "loss_tokens_upper_95": 5.4047600383291785,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.152037474416917,
            "data_time": 0.008889555570942398,
            "batch_time": 0.03850987958764022,
            "samples_per_second": 1046904.5916391117,
            "samples_per_second_per_gpu": 130863.07395488897,
            "loss_sequences_lower_95": 4.060636111961166,
            "loss_sequences_upper_95": 4.241780233309932,
            "loss_tokens_lower_95": 4.057043644543251,
            "loss_tokens_upper_95": 4.242048103578629,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.055937035083771,
            "data_time": 0.005694081858983116,
            "batch_time": 0.03448736856854151,
            "samples_per_second": 1086822.838116401,
            "samples_per_second_per_gpu": 135852.85476455014,
            "loss_sequences_lower_95": 6.9725007080078125,
            "loss_sequences_upper_95": 7.139264196777344,
            "loss_tokens_lower_95": 6.97507216796875,
            "loss_tokens_upper_95": 7.138,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0056011747050757,
            "data_time": 0.0018887686594222129,
            "batch_time": 0.030688915180574067,
            "samples_per_second": 1102326.913611053,
            "samples_per_second_per_gpu": 137790.86420138163,
            "loss_sequences_lower_95": 3.472696929325331,
            "loss_sequences_upper_95": 3.5520852796830655,
            "loss_tokens_lower_95": 2.429306052013128,
            "loss_tokens_upper_95": 2.485176182822431,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.3868908908829765,
            "data_time": 0.017431959084102087,
            "batch_time": 0.04672354119164603,
            "samples_per_second": 1007800.6963680887,
            "samples_per_second_per_gpu": 125975.08704601109,
            "loss_sequences_lower_95": 5.205719962048886,
            "loss_sequences_upper_95": 5.565225401921059,
            "loss_tokens_lower_95": 5.205763870922487,
            "loss_tokens_upper_95": 5.567426983278189,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.471251181995167,
            "data_time": 0.01082591712474823,
            "batch_time": 0.040416051633656025,
            "samples_per_second": 1062957.544680179,
            "samples_per_second_per_gpu": 132869.6930850224,
            "loss_sequences_lower_95": 5.343165402879903,
            "loss_sequences_upper_95": 5.596804881376379,
            "loss_tokens_lower_95": 5.34439058191636,
            "loss_tokens_upper_95": 5.59542387120864,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.953883524991121,
            "data_time": 0.002258602635029943,
            "batch_time": 0.031117917955383774,
            "samples_per_second": 1099165.600229903,
            "samples_per_second_per_gpu": 137395.70002873786,
            "loss_sequences_lower_95": 4.507848611894025,
            "loss_sequences_upper_95": 4.60671336257516,
            "loss_tokens_lower_95": 3.205194745401345,
            "loss_tokens_upper_95": 3.2816853484620743,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.947459150243689,
            "data_time": 0.02683087686697642,
            "batch_time": 0.05752421170473099,
            "samples_per_second": 976448.1417776543,
            "samples_per_second_per_gpu": 122056.01772220679,
            "loss_sequences_lower_95": 5.885570853097098,
            "loss_sequences_upper_95": 6.007618204752604,
            "loss_tokens_lower_95": 5.886022965365617,
            "loss_tokens_upper_95": 6.006821760551008,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.116988799477207,
            "data_time": 0.0037192401722965077,
            "batch_time": 0.032519334401839815,
            "samples_per_second": 1096640.6971321232,
            "samples_per_second_per_gpu": 137080.0871415154,
            "loss_sequences_lower_95": 4.08379057064698,
            "loss_sequences_upper_95": 4.1501581986453555,
            "loss_tokens_lower_95": 4.083673763916762,
            "loss_tokens_upper_95": 4.15081081565367,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.379952334663243,
            "data_time": 0.02397280606356534,
            "batch_time": 0.0546812967820601,
            "samples_per_second": 936714.819134002,
            "samples_per_second_per_gpu": 117089.35239175025,
            "loss_sequences_lower_95": 5.1961428447834495,
            "loss_sequences_upper_95": 5.566718744074256,
            "loss_tokens_lower_95": 5.189032693511074,
            "loss_tokens_upper_95": 5.567599383604179,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.601491226752599,
            "data_time": 0.07902718335390091,
            "batch_time": 0.11120208352804184,
            "samples_per_second": 749934.9712833243,
            "samples_per_second_per_gpu": 93741.87141041554,
            "loss_sequences_lower_95": 2.4243102137247723,
            "loss_sequences_upper_95": 2.9182584126790365,
            "loss_tokens_lower_95": 2.170810476938884,
            "loss_tokens_upper_95": 2.8546874152289496,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.637724135319392,
            "data_time": 0.07958637177944183,
            "batch_time": 0.1156986653804779,
            "samples_per_second": 673828.2155878388,
            "samples_per_second_per_gpu": 84228.52694847985,
            "loss_sequences_lower_95": 2.4919723955790203,
            "loss_sequences_upper_95": 3.0450626373291017,
            "loss_tokens_lower_95": 2.040331495477912,
            "loss_tokens_upper_95": 2.9051304677898964,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.099436085073165,
            "data_time": 0.0036289451962853263,
            "batch_time": 0.03246854753477423,
            "samples_per_second": 1096309.6236918087,
            "samples_per_second_per_gpu": 137038.70296147608,
            "loss_sequences_lower_95": 4.0672320421805965,
            "loss_sequences_upper_95": 4.131215252209131,
            "loss_tokens_lower_95": 4.068127862090391,
            "loss_tokens_upper_95": 4.132040598536451,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.789724759686442,
            "data_time": 0.0012739910550529638,
            "batch_time": 0.03018533744129884,
            "samples_per_second": 1098610.898592378,
            "samples_per_second_per_gpu": 137326.36232404725,
            "loss_sequences_lower_95": 0.9164980761257949,
            "loss_sequences_upper_95": 0.9375140928059607,
            "loss_tokens_lower_95": 0.6566982759287583,
            "loss_tokens_upper_95": 0.6677322020215193,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.410384080541416,
            "data_time": 0.03803997114300728,
            "batch_time": 0.07073581963777542,
            "samples_per_second": 932633.2546891163,
            "samples_per_second_per_gpu": 116579.15683613953,
            "loss_sequences_lower_95": 4.443747837336983,
            "loss_sequences_upper_95": 4.798413698692021,
            "loss_tokens_lower_95": 4.113911517824245,
            "loss_tokens_upper_95": 4.362908004502119,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.305073931410506,
            "data_time": 0.11977179845174153,
            "batch_time": 0.15596732639131092,
            "samples_per_second": 502396.3121236504,
            "samples_per_second_per_gpu": 62799.5390154563,
            "loss_sequences_lower_95": 6.849583177308778,
            "loss_sequences_upper_95": 7.983943403089368,
            "loss_tokens_lower_95": 6.290881244047188,
            "loss_tokens_upper_95": 8.06241138599537,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.294756197347874,
            "data_time": 0.030715053989773707,
            "batch_time": 0.061339015052432104,
            "samples_per_second": 958449.370265376,
            "samples_per_second_per_gpu": 119806.171283172,
            "loss_sequences_lower_95": 4.281407881946099,
            "loss_sequences_upper_95": 4.613421900679426,
            "loss_tokens_lower_95": 3.9220399068282172,
            "loss_tokens_upper_95": 4.1282339331239495,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.445279541538983,
            "data_time": 0.030946342718033565,
            "batch_time": 0.060370172773088725,
            "samples_per_second": 991691.8748468622,
            "samples_per_second_per_gpu": 123961.48435585777,
            "loss_sequences_lower_95": 4.426889614942597,
            "loss_sequences_upper_95": 4.7237248490496375,
            "loss_tokens_lower_95": 4.106453718702357,
            "loss_tokens_upper_95": 4.281997911505365,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.386294063998432,
            "data_time": 0.029154899574461438,
            "batch_time": 0.060147699855622794,
            "samples_per_second": 961396.0959791022,
            "samples_per_second_per_gpu": 120174.51199738777,
            "loss_sequences_lower_95": 4.358631450373952,
            "loss_sequences_upper_95": 4.733399786600253,
            "loss_tokens_lower_95": 3.981079045405944,
            "loss_tokens_upper_95": 4.2467846300630745,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.573456274300087,
            "data_time": 0.030924002329508465,
            "batch_time": 0.06135004475003197,
            "samples_per_second": 962554.3835475402,
            "samples_per_second_per_gpu": 120319.29794344252,
            "loss_sequences_lower_95": 4.530993940772079,
            "loss_sequences_upper_95": 4.823935950674662,
            "loss_tokens_lower_95": 4.263997224706727,
            "loss_tokens_upper_95": 4.42745415518217,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.939517575021116,
            "data_time": 0.0323296034777606,
            "batch_time": 0.06275454273930302,
            "samples_per_second": 995372.6532554632,
            "samples_per_second_per_gpu": 124421.5816569329,
            "loss_sequences_lower_95": 3.8607460353685465,
            "loss_sequences_upper_95": 4.088955612656492,
            "loss_tokens_lower_95": 3.705791829665107,
            "loss_tokens_upper_95": 3.8384936773362095,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5174855749781537,
            "data_time": 0.03303902773630051,
            "batch_time": 0.06412330695561,
            "samples_per_second": 957955.9066795781,
            "samples_per_second_per_gpu": 119744.48833494727,
            "loss_sequences_lower_95": 3.534649574465868,
            "loss_sequences_upper_95": 3.7807973349966653,
            "loss_tokens_lower_95": 3.2439530531852774,
            "loss_tokens_upper_95": 3.3530098197803646,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/params.txt",
    "uuid": "fff6723e-b3cf-425b-a488-fdbacacc0773",
    "creation_date": "2023_12_14-05_03_42"
}