{
    "name": "rpj-d=576_l=24_h=8-8.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 24588380160,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 8.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "4917676032",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=576_l=24_h=8-8.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 2.7381879736979804,
            "data_time": 0.035477347671985626,
            "batch_time": 0.37100300565361977,
            "samples_per_second": 833430.4806327301,
            "samples_per_second_per_gpu": 104178.81007909126,
            "loss_sequences_lower_95": 2.670294545491536,
            "loss_sequences_upper_95": 2.8023062260945637,
            "loss_tokens_lower_95": 2.7264751625061034,
            "loss_tokens_upper_95": 2.749708652496338,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2388579791867804,
            "data_time": 0.0011065182280737358,
            "batch_time": 0.03122784050117394,
            "samples_per_second": 1061924.2762681022,
            "samples_per_second_per_gpu": 132740.53453351278,
            "loss_sequences_lower_95": 3.23622286181308,
            "loss_sequences_upper_95": 3.241464623848516,
            "loss_tokens_lower_95": 3.2282733854166668,
            "loss_tokens_upper_95": 3.249448192708333,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.673392364443565,
            "data_time": 0.010243830680847167,
            "batch_time": 0.04007408237457275,
            "samples_per_second": 1043645.4954600094,
            "samples_per_second_per_gpu": 130455.68693250118,
            "loss_sequences_lower_95": 2.6482447846081794,
            "loss_sequences_upper_95": 2.6984626427475287,
            "loss_tokens_lower_95": 2.6620526614583335,
            "loss_tokens_upper_95": 2.684831421875,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.074265238624258,
            "data_time": 0.0016373242006490105,
            "batch_time": 0.030604854814316098,
            "samples_per_second": 1103142.7132069673,
            "samples_per_second_per_gpu": 137892.83915087092,
            "loss_sequences_lower_95": 3.0617892084809926,
            "loss_sequences_upper_95": 3.086403486428802,
            "loss_tokens_lower_95": 3.063589140625,
            "loss_tokens_upper_95": 3.0847067447916663,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.236005185820663,
            "data_time": 0.009796657410275888,
            "batch_time": 0.03915380101754846,
            "samples_per_second": 1054692.517636252,
            "samples_per_second_per_gpu": 131836.5647045315,
            "loss_sequences_lower_95": 3.2012530977517186,
            "loss_sequences_upper_95": 3.2697124224816227,
            "loss_tokens_lower_95": 3.225231140625,
            "loss_tokens_upper_95": 3.246515505208333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.013083901557689,
            "data_time": 0.003825505142626555,
            "batch_time": 0.03290401463923247,
            "samples_per_second": 1096631.4302949216,
            "samples_per_second_per_gpu": 137078.9287868652,
            "loss_sequences_lower_95": 2.971739320216902,
            "loss_sequences_upper_95": 3.0543414958069555,
            "loss_tokens_lower_95": 3.00219240625,
            "loss_tokens_upper_95": 3.0238758020833334,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.6970271320975556,
            "data_time": 0.0016517785983995745,
            "batch_time": 0.030415970388477816,
            "samples_per_second": 1111865.1356858322,
            "samples_per_second_per_gpu": 138983.14196072903,
            "loss_sequences_lower_95": 1.675238904057717,
            "loss_sequences_upper_95": 1.7189445103236607,
            "loss_tokens_lower_95": 1.6873462265624999,
            "loss_tokens_upper_95": 1.7070258072916666,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6169428990149375,
            "data_time": 0.0017940475570761673,
            "batch_time": 0.03138443108434609,
            "samples_per_second": 1105398.9694281337,
            "samples_per_second_per_gpu": 138174.8711785167,
            "loss_sequences_lower_95": 3.6080153693553663,
            "loss_sequences_upper_95": 3.6256626308900524,
            "loss_tokens_lower_95": 3.6061128958333333,
            "loss_tokens_upper_95": 3.627464395833333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4025693841096833,
            "data_time": 0.013039445120190818,
            "batch_time": 0.05145884127843948,
            "samples_per_second": 1037813.5358426181,
            "samples_per_second_per_gpu": 129726.69198032726,
            "loss_sequences_lower_95": 3.3594088546628873,
            "loss_sequences_upper_95": 3.4493058429500922,
            "loss_tokens_lower_95": 3.391757609375,
            "loss_tokens_upper_95": 3.4134929375,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.010216738866723,
            "data_time": 0.009617241099476814,
            "batch_time": 0.038841940462589264,
            "samples_per_second": 1067693.6698607458,
            "samples_per_second_per_gpu": 133461.70873259322,
            "loss_sequences_lower_95": 3.9706468423835846,
            "loss_sequences_upper_95": 4.04429015510167,
            "loss_tokens_lower_95": 3.9983123333333332,
            "loss_tokens_upper_95": 4.022413531250001,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1658441078439057,
            "data_time": 0.001314471407634456,
            "batch_time": 0.02997430130761759,
            "samples_per_second": 1116647.9191210263,
            "samples_per_second_per_gpu": 139580.9898901283,
            "loss_sequences_lower_95": 3.1573286351625667,
            "loss_sequences_upper_95": 3.1743691623055708,
            "loss_tokens_lower_95": 3.15524890625,
            "loss_tokens_upper_95": 3.176492390625,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0735710393086615,
            "data_time": 0.002677891276261888,
            "batch_time": 0.03151524692252713,
            "samples_per_second": 1106367.5484643416,
            "samples_per_second_per_gpu": 138295.9435580427,
            "loss_sequences_lower_95": 3.063293268917899,
            "loss_sequences_upper_95": 3.0836122359498646,
            "loss_tokens_lower_95": 3.063211375,
            "loss_tokens_upper_95": 3.084082619791667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.579451630613625,
            "data_time": 0.00997790423306552,
            "batch_time": 0.03971734348493131,
            "samples_per_second": 1037610.5334272395,
            "samples_per_second_per_gpu": 129701.31667840494,
            "loss_sequences_lower_95": 3.5426140961250474,
            "loss_sequences_upper_95": 3.6145878975579997,
            "loss_tokens_lower_95": 3.568275385416667,
            "loss_tokens_upper_95": 3.5904502291666667,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9611788985685514,
            "data_time": 0.010551935173125856,
            "batch_time": 0.040791390901542754,
            "samples_per_second": 1057354.532007332,
            "samples_per_second_per_gpu": 132169.3165009165,
            "loss_sequences_lower_95": 2.898146600975767,
            "loss_sequences_upper_95": 3.0223046887430782,
            "loss_tokens_lower_95": 2.9500968489583332,
            "loss_tokens_upper_95": 2.9721540104166664,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.758507316762751,
            "data_time": 0.08655778850827898,
            "batch_time": 0.12161806651524135,
            "samples_per_second": 544551.5150094058,
            "samples_per_second_per_gpu": 68068.93937617572,
            "loss_sequences_lower_95": 3.6992100889032535,
            "loss_sequences_upper_95": 3.8179801507429643,
            "loss_tokens_lower_95": 3.7384667483243073,
            "loss_tokens_upper_95": 3.7789110617204145,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.6431888925785922,
            "data_time": 0.014427334070205688,
            "batch_time": 0.04383184693076394,
            "samples_per_second": 1036972.8902646488,
            "samples_per_second_per_gpu": 129621.6112830811,
            "loss_sequences_lower_95": 2.54909225775271,
            "loss_sequences_upper_95": 2.736648172539803,
            "loss_tokens_lower_95": 2.632370828125,
            "loss_tokens_upper_95": 2.6536739010416666,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.502545281261756,
            "data_time": 0.01363334059715271,
            "batch_time": 0.04306120301286379,
            "samples_per_second": 1048637.2942810406,
            "samples_per_second_per_gpu": 131079.66178513007,
            "loss_sequences_lower_95": 5.442367883848327,
            "loss_sequences_upper_95": 5.556157191133122,
            "loss_tokens_lower_95": 5.490988520833333,
            "loss_tokens_upper_95": 5.513892989583333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.284054222654124,
            "data_time": 0.03674762323498726,
            "batch_time": 0.06895816326141357,
            "samples_per_second": 890685.7038533362,
            "samples_per_second_per_gpu": 111335.71298166702,
            "loss_sequences_lower_95": 3.216905056062292,
            "loss_sequences_upper_95": 3.3353198067086645,
            "loss_tokens_lower_95": 3.2720368619825018,
            "loss_tokens_upper_95": 3.296092161585073,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.182808982003297,
            "data_time": 0.0016990657004788687,
            "batch_time": 0.030820828907039008,
            "samples_per_second": 1092003.1833860453,
            "samples_per_second_per_gpu": 136500.39792325566,
            "loss_sequences_lower_95": 5.159420705517377,
            "loss_sequences_upper_95": 5.206372748384312,
            "loss_tokens_lower_95": 5.158887539390756,
            "loss_tokens_upper_95": 5.206790398892608,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.000814457738383,
            "data_time": 0.001955276936482472,
            "batch_time": 0.03112203755955787,
            "samples_per_second": 1088453.0575976952,
            "samples_per_second_per_gpu": 136056.6321997119,
            "loss_sequences_lower_95": 2.988686505932894,
            "loss_sequences_upper_95": 3.013959354111171,
            "loss_tokens_lower_95": 2.9847093234681927,
            "loss_tokens_upper_95": 3.0036106236883593,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6431262574361414,
            "data_time": 0.0030894960354904766,
            "batch_time": 0.03223320473824662,
            "samples_per_second": 1089258.6097203442,
            "samples_per_second_per_gpu": 136157.32621504302,
            "loss_sequences_lower_95": 3.8944958225082664,
            "loss_sequences_upper_95": 4.179301095853507,
            "loss_tokens_lower_95": 3.102538754197567,
            "loss_tokens_upper_95": 3.3099049776998455,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.870160163712998,
            "data_time": 0.004001883274697243,
            "batch_time": 0.033969519303200094,
            "samples_per_second": 1056524.5903640264,
            "samples_per_second_per_gpu": 132065.5737955033,
            "loss_sequences_lower_95": 3.946533935546875,
            "loss_sequences_upper_95": 4.145245353190104,
            "loss_tokens_lower_95": 3.6287715089426102,
            "loss_tokens_upper_95": 3.770868661556604,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.7253915876760186,
            "data_time": 0.004917325657057726,
            "batch_time": 0.03423242007984834,
            "samples_per_second": 1078046.2686584094,
            "samples_per_second_per_gpu": 134755.78358230117,
            "loss_sequences_lower_95": 2.76837358105987,
            "loss_sequences_upper_95": 2.82543563090824,
            "loss_tokens_lower_95": 2.6369267228688735,
            "loss_tokens_upper_95": 2.666702612283419,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.0502717370336705,
            "data_time": 0.024748523320470537,
            "batch_time": 0.05454940455300467,
            "samples_per_second": 1008573.9435478871,
            "samples_per_second_per_gpu": 126071.7429434859,
            "loss_sequences_lower_95": 2.031053934964267,
            "loss_sequences_upper_95": 2.132801354148171,
            "loss_tokens_lower_95": 1.9863184882973899,
            "loss_tokens_upper_95": 2.0295862358913634,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1749765921612174,
            "data_time": 0.020733172073960304,
            "batch_time": 0.05017115920782089,
            "samples_per_second": 995564.6255783406,
            "samples_per_second_per_gpu": 124445.57819729258,
            "loss_sequences_lower_95": 3.167086374711017,
            "loss_sequences_upper_95": 3.3582836540377867,
            "loss_tokens_lower_95": 3.0453809849991536,
            "loss_tokens_upper_95": 3.134847155075745,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3537626520792645,
            "data_time": 0.017675537329453688,
            "batch_time": 0.04745663129366361,
            "samples_per_second": 1000695.1904151699,
            "samples_per_second_per_gpu": 125086.89880189624,
            "loss_sequences_lower_95": 3.323728759765625,
            "loss_sequences_upper_95": 3.430532409667969,
            "loss_tokens_lower_95": 3.2208675426959403,
            "loss_tokens_upper_95": 3.416010102412314,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.954270754117416,
            "data_time": 0.001514819555073293,
            "batch_time": 0.030620381912135338,
            "samples_per_second": 1092389.2096851831,
            "samples_per_second_per_gpu": 136548.6512106479,
            "loss_sequences_lower_95": 4.958382732533463,
            "loss_sequences_upper_95": 5.037206949553418,
            "loss_tokens_lower_95": 4.826291914404057,
            "loss_tokens_upper_95": 4.90556482445112,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.132297134258931,
            "data_time": 0.003107725373850573,
            "batch_time": 0.03239399474739228,
            "samples_per_second": 1095942.3373831077,
            "samples_per_second_per_gpu": 136992.79217288847,
            "loss_sequences_lower_95": 4.603523084852431,
            "loss_sequences_upper_95": 4.889694481024437,
            "loss_tokens_lower_95": 3.4596838678261244,
            "loss_tokens_upper_95": 3.590374323457723,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.817314103500021,
            "data_time": 0.00526597854253408,
            "batch_time": 0.03426833893801715,
            "samples_per_second": 1080040.5878554978,
            "samples_per_second_per_gpu": 135005.07348193723,
            "loss_sequences_lower_95": 4.199196429464191,
            "loss_sequences_upper_95": 4.521880895204511,
            "loss_tokens_lower_95": 3.4276911639213252,
            "loss_tokens_upper_95": 3.575941733661564,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.095145593495129,
            "data_time": 0.02424334841115134,
            "batch_time": 0.054276802710124424,
            "samples_per_second": 1001169.6661817512,
            "samples_per_second_per_gpu": 125146.2082727189,
            "loss_sequences_lower_95": 6.004917761397689,
            "loss_sequences_upper_95": 6.187071931743186,
            "loss_tokens_lower_95": 6.004094525341574,
            "loss_tokens_upper_95": 6.184961188329409,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3244866585731505,
            "data_time": 0.05014886764379648,
            "batch_time": 0.08189813448832585,
            "samples_per_second": 879294.3795497587,
            "samples_per_second_per_gpu": 109911.79744371984,
            "loss_sequences_lower_95": 3.188239059448242,
            "loss_sequences_upper_95": 3.557991859436035,
            "loss_tokens_lower_95": 3.0158219229983088,
            "loss_tokens_upper_95": 3.467190404356272,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.465170569405219,
            "data_time": 0.003545245266155719,
            "batch_time": 0.0326028145651632,
            "samples_per_second": 1092355.2135316848,
            "samples_per_second_per_gpu": 136544.4016914606,
            "loss_sequences_lower_95": 4.406987819456883,
            "loss_sequences_upper_95": 4.522266699518616,
            "loss_tokens_lower_95": 4.406989068897134,
            "loss_tokens_upper_95": 4.522308905610287,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.448733764823395,
            "data_time": 0.005213767824904176,
            "batch_time": 0.03525685213907313,
            "samples_per_second": 1053501.8345145786,
            "samples_per_second_per_gpu": 131687.72931432232,
            "loss_sequences_lower_95": 5.377641207658297,
            "loss_sequences_upper_95": 5.5191290378180025,
            "loss_tokens_lower_95": 5.374746441665387,
            "loss_tokens_upper_95": 5.520714352269528,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.442831610180478,
            "data_time": 0.003748805167608076,
            "batch_time": 0.03311108126753607,
            "samples_per_second": 1074534.9791604797,
            "samples_per_second_per_gpu": 134316.87239505997,
            "loss_sequences_lower_95": 3.5888048506146286,
            "loss_sequences_upper_95": 3.712717202801959,
            "loss_tokens_lower_95": 3.263828720485807,
            "loss_tokens_upper_95": 3.319294141045458,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.339612569570542,
            "data_time": 0.010985888540744781,
            "batch_time": 0.040767136961221695,
            "samples_per_second": 1040862.9466247003,
            "samples_per_second_per_gpu": 130107.86832808754,
            "loss_sequences_lower_95": 5.518550073242187,
            "loss_sequences_upper_95": 6.071919970703124,
            "loss_tokens_lower_95": 4.727151749798354,
            "loss_tokens_upper_95": 5.09060914944056,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5527003407478333,
            "data_time": 0.1572660207748413,
            "batch_time": 0.19150230288505554,
            "samples_per_second": 564583.7685395052,
            "samples_per_second_per_gpu": 70572.97106743815,
            "loss_sequences_lower_95": 3.3352393388748167,
            "loss_sequences_upper_95": 3.7779140174388885,
            "loss_tokens_lower_95": 3.0864592015058143,
            "loss_tokens_upper_95": 3.945665223571076,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.702010155409232,
            "data_time": 0.027314338278263173,
            "batch_time": 0.05638341193503522,
            "samples_per_second": 946114.7978972088,
            "samples_per_second_per_gpu": 118264.3497371511,
            "loss_sequences_lower_95": 5.125480371234061,
            "loss_sequences_upper_95": 5.879405493023752,
            "loss_tokens_lower_95": 3.370837815208392,
            "loss_tokens_upper_95": 3.8267175581974,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.286677202578453,
            "data_time": 0.002983062838514646,
            "batch_time": 0.03229394637876087,
            "samples_per_second": 1079659.1210006857,
            "samples_per_second_per_gpu": 134957.39012508572,
            "loss_sequences_lower_95": 2.2611173523935806,
            "loss_sequences_upper_95": 2.3130080130174204,
            "loss_tokens_lower_95": 2.2606080114815423,
            "loss_tokens_upper_95": 2.313075421001256,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.569382694615341,
            "data_time": 0.0024751558477766624,
            "batch_time": 0.03144524816306562,
            "samples_per_second": 1096301.4404616454,
            "samples_per_second_per_gpu": 137037.68005770567,
            "loss_sequences_lower_95": 2.5449858831475596,
            "loss_sequences_upper_95": 2.6850740296156363,
            "loss_tokens_lower_95": 2.4218729795872447,
            "loss_tokens_upper_95": 2.5596810391294316,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1271779650733587,
            "data_time": 0.0181035233868493,
            "batch_time": 0.04822466274102529,
            "samples_per_second": 978260.703206933,
            "samples_per_second_per_gpu": 122282.58790086662,
            "loss_sequences_lower_95": 2.973443150782323,
            "loss_sequences_upper_95": 3.378780798160986,
            "loss_tokens_lower_95": 2.8727481302125493,
            "loss_tokens_upper_95": 3.1662295585865228,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.585582258292983,
            "data_time": 0.004713552072644234,
            "batch_time": 0.033854884281754496,
            "samples_per_second": 1079193.4922443777,
            "samples_per_second_per_gpu": 134899.18653054722,
            "loss_sequences_lower_95": 3.6193128048385703,
            "loss_sequences_upper_95": 3.772567958575868,
            "loss_tokens_lower_95": 3.4438941779868464,
            "loss_tokens_upper_95": 3.588224608061754,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.728255224664037,
            "data_time": 0.0315896471341451,
            "batch_time": 0.06255585522878737,
            "samples_per_second": 955195.1304077462,
            "samples_per_second_per_gpu": 119399.39130096827,
            "loss_sequences_lower_95": 2.5864391699069884,
            "loss_sequences_upper_95": 3.0525682030654533,
            "loss_tokens_lower_95": 2.452317638303872,
            "loss_tokens_upper_95": 2.810148010751746,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.145465820377935,
            "data_time": 0.002037114778900451,
            "batch_time": 0.031224523788921557,
            "samples_per_second": 1088177.9760160525,
            "samples_per_second_per_gpu": 136022.24700200657,
            "loss_sequences_lower_95": 4.126092816219494,
            "loss_sequences_upper_95": 4.164177298350294,
            "loss_tokens_lower_95": 4.12650196054836,
            "loss_tokens_upper_95": 4.164377289520404,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.7898816955321043,
            "data_time": 0.0494125713001598,
            "batch_time": 0.07973696101795544,
            "samples_per_second": 888806.4359814036,
            "samples_per_second_per_gpu": 111100.80449767545,
            "loss_sequences_lower_95": 0.741206537635581,
            "loss_sequences_upper_95": 0.8705247230900144,
            "loss_tokens_lower_95": 0.65979571933024,
            "loss_tokens_upper_95": 0.8404469172810581,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.496350961230086,
            "data_time": 0.0013430398098081475,
            "batch_time": 0.030919017381060555,
            "samples_per_second": 1076710.9615728627,
            "samples_per_second_per_gpu": 134588.87019660784,
            "loss_sequences_lower_95": 4.866171066316169,
            "loss_sequences_upper_95": 4.9139342775484804,
            "loss_tokens_lower_95": 3.923991664651837,
            "loss_tokens_upper_95": 3.9702233377659573,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.461336771249771,
            "data_time": 0.005752057310134646,
            "batch_time": 0.034600075275178936,
            "samples_per_second": 1085136.9307254446,
            "samples_per_second_per_gpu": 135642.11634068057,
            "loss_sequences_lower_95": 5.504179235839843,
            "loss_sequences_upper_95": 5.771197375488281,
            "loss_tokens_lower_95": 5.123845137428702,
            "loss_tokens_upper_95": 5.376946573043552,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.479909670871237,
            "data_time": 0.02304906158123986,
            "batch_time": 0.05432756068342823,
            "samples_per_second": 961478.1670154503,
            "samples_per_second_per_gpu": 120184.77087693129,
            "loss_sequences_lower_95": 5.296207341733186,
            "loss_sequences_upper_95": 5.6676671301800265,
            "loss_tokens_lower_95": 5.2973711096722145,
            "loss_tokens_upper_95": 5.657375647503397,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.913965865698728,
            "data_time": 0.004698675440018435,
            "batch_time": 0.033873125372162784,
            "samples_per_second": 1079462.2831864737,
            "samples_per_second_per_gpu": 134932.7853983092,
            "loss_sequences_lower_95": 5.843672809022845,
            "loss_sequences_upper_95": 5.984263916015625,
            "loss_tokens_lower_95": 5.842897403601444,
            "loss_tokens_upper_95": 5.985788222804214,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.9414086665312449,
            "data_time": 0.004216950624547106,
            "batch_time": 0.03332364939628763,
            "samples_per_second": 1089073.741088687,
            "samples_per_second_per_gpu": 136134.21763608587,
            "loss_sequences_lower_95": 0.9760912170410156,
            "loss_sequences_upper_95": 1.0258995442708332,
            "loss_tokens_lower_95": 0.8746765698466886,
            "loss_tokens_upper_95": 0.9290929379564326,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.69045094989595,
            "data_time": 0.022852908287729536,
            "batch_time": 0.0528536183493478,
            "samples_per_second": 947685.4596043214,
            "samples_per_second_per_gpu": 118460.68245054017,
            "loss_sequences_lower_95": 6.308160051618303,
            "loss_sequences_upper_95": 7.070803629557291,
            "loss_tokens_lower_95": 6.311295805431548,
            "loss_tokens_upper_95": 7.07458475748698,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.1204074285924435,
            "data_time": 0.161229208111763,
            "batch_time": 0.19516855478286743,
            "samples_per_second": 563929.805516241,
            "samples_per_second_per_gpu": 70491.22568953013,
            "loss_sequences_lower_95": 1.8851796954870224,
            "loss_sequences_upper_95": 2.6978903651237487,
            "loss_tokens_lower_95": 1.6539515387151658,
            "loss_tokens_upper_95": 2.1223116530585537,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.563875238895417,
            "data_time": 0.006017611140296573,
            "batch_time": 0.0358153051800198,
            "samples_per_second": 1054422.302392287,
            "samples_per_second_per_gpu": 131802.78779903587,
            "loss_sequences_lower_95": 7.4893948974609375,
            "loss_sequences_upper_95": 7.81857392578125,
            "loss_tokens_lower_95": 7.296754829936019,
            "loss_tokens_upper_95": 7.585297157558693,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.708131039142609,
            "data_time": 0.006158626741833157,
            "batch_time": 0.03616374163400559,
            "samples_per_second": 1051804.741882574,
            "samples_per_second_per_gpu": 131475.59273532176,
            "loss_sequences_lower_95": 6.795573168945313,
            "loss_sequences_upper_95": 7.018234143066406,
            "loss_tokens_lower_95": 6.47123970868412,
            "loss_tokens_upper_95": 6.668224333570668,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.5133138557348405,
            "data_time": 0.00421662553895676,
            "batch_time": 0.033344260824962606,
            "samples_per_second": 1083978.6504930754,
            "samples_per_second_per_gpu": 135497.33131163442,
            "loss_sequences_lower_95": 4.470666066040898,
            "loss_sequences_upper_95": 4.556318077831671,
            "loss_tokens_lower_95": 4.4706938848134214,
            "loss_tokens_upper_95": 4.555172744837936,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.664491891128493,
            "data_time": 0.008827594831991052,
            "batch_time": 0.03791571599839317,
            "samples_per_second": 1062318.9069131524,
            "samples_per_second_per_gpu": 132789.86336414405,
            "loss_sequences_lower_95": 5.553478103848646,
            "loss_sequences_upper_95": 5.773404854160667,
            "loss_tokens_lower_95": 5.550799560546875,
            "loss_tokens_upper_95": 5.773326202251944,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.455586034774781,
            "data_time": 0.005845601123476785,
            "batch_time": 0.03536771592639741,
            "samples_per_second": 1063614.3530745523,
            "samples_per_second_per_gpu": 132951.79413431903,
            "loss_sequences_lower_95": 5.3561815185546875,
            "loss_sequences_upper_95": 5.559570043945312,
            "loss_tokens_lower_95": 5.3578148925781255,
            "loss_tokens_upper_95": 5.5581802612304685,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.874931383104739,
            "data_time": 0.001999278690503991,
            "batch_time": 0.030991118751987394,
            "samples_per_second": 1094626.6440904236,
            "samples_per_second_per_gpu": 136828.33051130295,
            "loss_sequences_lower_95": 3.3379696277051796,
            "loss_sequences_upper_95": 3.432535902761353,
            "loss_tokens_lower_95": 2.2966009476442864,
            "loss_tokens_upper_95": 2.3592339414350767,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.346393649257831,
            "data_time": 0.019447359016963414,
            "batch_time": 0.0490749546459743,
            "samples_per_second": 992558.2551033918,
            "samples_per_second_per_gpu": 124069.78188792398,
            "loss_sequences_lower_95": 5.155935224846227,
            "loss_sequences_upper_95": 5.535187792422167,
            "loss_tokens_lower_95": 5.156571744093254,
            "loss_tokens_upper_95": 5.53377551178434,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.5268865080440746,
            "data_time": 0.010912172496318817,
            "batch_time": 0.04026301298290491,
            "samples_per_second": 1060143.3839378194,
            "samples_per_second_per_gpu": 132517.92299222742,
            "loss_sequences_lower_95": 5.389229963714001,
            "loss_sequences_upper_95": 5.661191525926776,
            "loss_tokens_lower_95": 5.392361498066023,
            "loss_tokens_upper_95": 5.658883331897212,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.924841489963449,
            "data_time": 0.0023187879145994995,
            "batch_time": 0.03136149457944396,
            "samples_per_second": 1091811.4517733532,
            "samples_per_second_per_gpu": 136476.43147166914,
            "loss_sequences_lower_95": 3.1788211754176845,
            "loss_sequences_upper_95": 3.2681663439594293,
            "loss_tokens_lower_95": 2.4857600740206953,
            "loss_tokens_upper_95": 2.5540892753624846,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.532457827260255,
            "data_time": 0.027251469592253368,
            "batch_time": 0.05863455682992935,
            "samples_per_second": 962780.9415301004,
            "samples_per_second_per_gpu": 120347.61769126255,
            "loss_sequences_lower_95": 4.350268804963934,
            "loss_sequences_upper_95": 4.706141993608424,
            "loss_tokens_lower_95": 4.351146120747561,
            "loss_tokens_upper_95": 4.708424869920842,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7549698114395142,
            "data_time": 0.003516809844272041,
            "batch_time": 0.03254883804600754,
            "samples_per_second": 1088377.8784162814,
            "samples_per_second_per_gpu": 136047.23480203518,
            "loss_sequences_lower_95": 3.7149181942254392,
            "loss_sequences_upper_95": 3.794434303027045,
            "loss_tokens_lower_95": 3.714066442087156,
            "loss_tokens_upper_95": 3.7942550201882645,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.297577098735328,
            "data_time": 0.024365910616787997,
            "batch_time": 0.05339300198988481,
            "samples_per_second": 971157.9936322751,
            "samples_per_second_per_gpu": 121394.7492040344,
            "loss_sequences_lower_95": 5.115808594342574,
            "loss_sequences_upper_95": 5.48017129249943,
            "loss_tokens_lower_95": 5.1139348298600575,
            "loss_tokens_upper_95": 5.4818355375123256,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.931920981903871,
            "data_time": 0.08010225743055344,
            "batch_time": 0.11141084134578705,
            "samples_per_second": 758725.984230297,
            "samples_per_second_per_gpu": 94840.74802878713,
            "loss_sequences_lower_95": 1.715465834935506,
            "loss_sequences_upper_95": 2.323274777730306,
            "loss_tokens_lower_95": 1.5378249486287436,
            "loss_tokens_upper_95": 2.1956986003451875,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.8867589682340622,
            "data_time": 0.08185751736164093,
            "batch_time": 0.11305882781744003,
            "samples_per_second": 757343.8283680354,
            "samples_per_second_per_gpu": 94667.97854600442,
            "loss_sequences_lower_95": 1.7644077587127684,
            "loss_sequences_upper_95": 2.383030732472738,
            "loss_tokens_lower_95": 1.4177866989307188,
            "loss_tokens_upper_95": 2.1178651316782062,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.596480428347356,
            "data_time": 0.003578110397949578,
            "batch_time": 0.032731672958320135,
            "samples_per_second": 1085793.1854844566,
            "samples_per_second_per_gpu": 135724.14818555707,
            "loss_sequences_lower_95": 3.5753787739322536,
            "loss_sequences_upper_95": 3.618206195634665,
            "loss_tokens_lower_95": 3.5746310995604746,
            "loss_tokens_upper_95": 3.6176433993579713,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.514181147713203,
            "data_time": 0.0012670585588109833,
            "batch_time": 0.030426381225137272,
            "samples_per_second": 1090876.5063985665,
            "samples_per_second_per_gpu": 136359.56329982082,
            "loss_sequences_lower_95": 0.5844207911845851,
            "loss_sequences_upper_95": 0.5989538499923066,
            "loss_tokens_lower_95": 0.4480909948898398,
            "loss_tokens_upper_95": 0.4563860605019097,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.5296003030041072,
            "data_time": 0.04052117466926575,
            "batch_time": 0.07090120762586594,
            "samples_per_second": 969797.8996540302,
            "samples_per_second_per_gpu": 121224.73745675378,
            "loss_sequences_lower_95": 1.4515943422092228,
            "loss_sequences_upper_95": 1.6730438322532835,
            "loss_tokens_lower_95": 1.3574738395913932,
            "loss_tokens_upper_95": 1.476576985916966,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9893177779945166,
            "data_time": 0.12210528055826823,
            "batch_time": 0.15573683239164807,
            "samples_per_second": 570561.4752244202,
            "samples_per_second_per_gpu": 71320.18440305252,
            "loss_sequences_lower_95": 3.5232412596006655,
            "loss_sequences_upper_95": 4.4873790019267314,
            "loss_tokens_lower_95": 3.3826781849802274,
            "loss_tokens_upper_95": 4.482620060296706,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.3789580097285712,
            "data_time": 0.03069329261779785,
            "batch_time": 0.06015855073928833,
            "samples_per_second": 982187.2501243384,
            "samples_per_second_per_gpu": 122773.4062655423,
            "loss_sequences_lower_95": 1.3247358857131586,
            "loss_sequences_upper_95": 1.5073127025511208,
            "loss_tokens_lower_95": 1.2372684873383621,
            "loss_tokens_upper_95": 1.331978124260802,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.4300255550117027,
            "data_time": 0.03103338252930414,
            "batch_time": 0.061781255971817745,
            "samples_per_second": 963270.9671758312,
            "samples_per_second_per_gpu": 120408.8708969789,
            "loss_sequences_lower_95": 1.3993050133309712,
            "loss_sequences_upper_95": 1.567593686173602,
            "loss_tokens_lower_95": 1.279647977105811,
            "loss_tokens_upper_95": 1.3601629526293317,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.3712926763223439,
            "data_time": 0.030685592265356155,
            "batch_time": 0.06221446820667812,
            "samples_per_second": 941644.6781152801,
            "samples_per_second_per_gpu": 117705.58476441001,
            "loss_sequences_lower_95": 1.245541619091499,
            "loss_sequences_upper_95": 1.4506974406358673,
            "loss_tokens_lower_95": 1.28557664499398,
            "loss_tokens_upper_95": 1.4137745572617164,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.4986860323243025,
            "data_time": 0.03116085131963094,
            "batch_time": 0.06341197660991124,
            "samples_per_second": 926408.9587913854,
            "samples_per_second_per_gpu": 115801.11984892318,
            "loss_sequences_lower_95": 1.4579437488462865,
            "loss_sequences_upper_95": 1.6111013645079078,
            "loss_tokens_lower_95": 1.3534582262841341,
            "loss_tokens_upper_95": 1.4307019218848873,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.240952289622763,
            "data_time": 0.03374314896854354,
            "batch_time": 0.06354965692684975,
            "samples_per_second": 996008.5577429486,
            "samples_per_second_per_gpu": 124501.06971786858,
            "loss_sequences_lower_95": 1.1943669810798598,
            "loss_sequences_upper_95": 1.2961601589037024,
            "loss_tokens_lower_95": 1.1817977375968851,
            "loss_tokens_upper_95": 1.2388447628372696,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.179234093282281,
            "data_time": 0.030597059499649776,
            "batch_time": 0.060501708870842344,
            "samples_per_second": 979647.0213363712,
            "samples_per_second_per_gpu": 122455.8776670464,
            "loss_sequences_lower_95": 1.1658630068709213,
            "loss_sequences_upper_95": 1.2814085285838057,
            "loss_tokens_lower_95": 1.0521321879473704,
            "loss_tokens_upper_95": 1.1019096388308094,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-8.0/params.txt",
    "uuid": "f12112c2-d9a0-468d-a160-411cdc6c0b3b",
    "creation_date": "2023_12_14-07_01_38"
}