{
    "name": "rpj-d=576_l=24_h=8-0.5",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 1536773760,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.5
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "307354752",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=576_l=24_h=8-0.5",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.324601093928019,
            "data_time": 0.03405388072133064,
            "batch_time": 0.3612387105822563,
            "samples_per_second": 829746.575641053,
            "samples_per_second_per_gpu": 103718.32195513163,
            "loss_sequences_lower_95": 3.252635040283203,
            "loss_sequences_upper_95": 3.392109260559082,
            "loss_tokens_lower_95": 3.3122993405659993,
            "loss_tokens_upper_95": 3.336916332244873,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7329098405267933,
            "data_time": 0.0012035170400899666,
            "batch_time": 0.03063020211871763,
            "samples_per_second": 1083070.7992864673,
            "samples_per_second_per_gpu": 135383.8499108084,
            "loss_sequences_lower_95": 3.730379907043024,
            "loss_sequences_upper_95": 3.7354242219026106,
            "loss_tokens_lower_95": 3.721897489583333,
            "loss_tokens_upper_95": 3.7441659062499997,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0332854212546834,
            "data_time": 0.009803147315979003,
            "batch_time": 0.03877431011199951,
            "samples_per_second": 1068491.8672851163,
            "samples_per_second_per_gpu": 133561.48341063954,
            "loss_sequences_lower_95": 3.005930319027025,
            "loss_sequences_upper_95": 3.0606257535973374,
            "loss_tokens_lower_95": 3.02155984375,
            "loss_tokens_upper_95": 3.045389973958333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.577171716935856,
            "data_time": 0.0015962193474957818,
            "batch_time": 0.030150882213523512,
            "samples_per_second": 1117193.0776985344,
            "samples_per_second_per_gpu": 139649.1347123168,
            "loss_sequences_lower_95": 3.564391047841495,
            "loss_sequences_upper_95": 3.5894129550579894,
            "loss_tokens_lower_95": 3.565911760416667,
            "loss_tokens_upper_95": 3.588144916666667,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7201558533608305,
            "data_time": 0.009268982951859553,
            "batch_time": 0.03811895039926962,
            "samples_per_second": 1070505.7434294159,
            "samples_per_second_per_gpu": 133813.21792867698,
            "loss_sequences_lower_95": 3.686056369385263,
            "loss_sequences_upper_95": 3.7534496237449875,
            "loss_tokens_lower_95": 3.7089564166666666,
            "loss_tokens_upper_95": 3.731252958333333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5132586623761703,
            "data_time": 0.003798753023147583,
            "batch_time": 0.032511184396951096,
            "samples_per_second": 1110184.5089364646,
            "samples_per_second_per_gpu": 138773.06361705807,
            "loss_sequences_lower_95": 3.4702179141793423,
            "loss_sequences_upper_95": 3.556222316345173,
            "loss_tokens_lower_95": 3.501830822916667,
            "loss_tokens_upper_95": 3.52471625,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.145192228142096,
            "data_time": 0.0015511658607843845,
            "batch_time": 0.029997494053490593,
            "samples_per_second": 1123725.361221546,
            "samples_per_second_per_gpu": 140465.67015269326,
            "loss_sequences_lower_95": 2.1210241300621813,
            "loss_sequences_upper_95": 2.1689940957828444,
            "loss_tokens_lower_95": 2.1344138333333333,
            "loss_tokens_upper_95": 2.1563716093750003,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.019457249117147,
            "data_time": 0.0016928315411946207,
            "batch_time": 0.030448593090189796,
            "samples_per_second": 1117060.9008510322,
            "samples_per_second_per_gpu": 139632.61260637903,
            "loss_sequences_lower_95": 4.010447449689137,
            "loss_sequences_upper_95": 4.028769101767016,
            "loss_tokens_lower_95": 4.008613354166667,
            "loss_tokens_upper_95": 4.0303337812499995,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.889659545285915,
            "data_time": 0.010981654363965232,
            "batch_time": 0.040107166010235985,
            "samples_per_second": 1055926.2435692544,
            "samples_per_second_per_gpu": 131990.7804461568,
            "loss_sequences_lower_95": 3.8499865648223133,
            "loss_sequences_upper_95": 3.933777246242616,
            "loss_tokens_lower_95": 3.878510895833333,
            "loss_tokens_upper_95": 3.9009955416666666,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.4975603917841855,
            "data_time": 0.009465780109167099,
            "batch_time": 0.03949909936636686,
            "samples_per_second": 1045975.4547744667,
            "samples_per_second_per_gpu": 130746.93184680834,
            "loss_sequences_lower_95": 4.4703331265053725,
            "loss_sequences_upper_95": 4.523350349999228,
            "loss_tokens_lower_95": 4.485444802083333,
            "loss_tokens_upper_95": 4.5097630625,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.681317841139202,
            "data_time": 0.0013018599207285627,
            "batch_time": 0.029823567253754676,
            "samples_per_second": 1122340.63793985,
            "samples_per_second_per_gpu": 140292.57974248126,
            "loss_sequences_lower_95": 3.6727582825133616,
            "loss_sequences_upper_95": 3.689679894402237,
            "loss_tokens_lower_95": 3.6702408229166665,
            "loss_tokens_upper_95": 3.6924216666666667,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6073231907597885,
            "data_time": 0.002580183332508351,
            "batch_time": 0.031387020805098434,
            "samples_per_second": 1109798.4886627882,
            "samples_per_second_per_gpu": 138724.81108284852,
            "loss_sequences_lower_95": 3.596672122292795,
            "loss_sequences_upper_95": 3.617726876903504,
            "loss_tokens_lower_95": 3.5962444375,
            "loss_tokens_upper_95": 3.6184776145833335,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9969222129721187,
            "data_time": 0.009143407165768589,
            "batch_time": 0.038242045127356,
            "samples_per_second": 1058264.5376393879,
            "samples_per_second_per_gpu": 132283.06720492349,
            "loss_sequences_lower_95": 3.959629886774214,
            "loss_sequences_upper_95": 4.033789012978575,
            "loss_tokens_lower_95": 3.985455052083333,
            "loss_tokens_upper_95": 4.00827209375,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4690064621554373,
            "data_time": 0.009487456059550859,
            "batch_time": 0.038610445075776,
            "samples_per_second": 1059071.1970738117,
            "samples_per_second_per_gpu": 132383.89963422646,
            "loss_sequences_lower_95": 3.4064095491304416,
            "loss_sequences_upper_95": 3.5297800720587764,
            "loss_tokens_lower_95": 3.4572142604166665,
            "loss_tokens_upper_95": 3.48065421875,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.345641255378723,
            "data_time": 0.08104032278060913,
            "batch_time": 0.13123062678745814,
            "samples_per_second": 532250.1322322752,
            "samples_per_second_per_gpu": 66531.2665290344,
            "loss_sequences_lower_95": 4.280952176180753,
            "loss_sequences_upper_95": 4.409846167130904,
            "loss_tokens_lower_95": 4.324967011538419,
            "loss_tokens_upper_95": 4.366885271939364,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.148265469873612,
            "data_time": 0.014161982319571755,
            "batch_time": 0.04392132298512892,
            "samples_per_second": 1024598.3652236593,
            "samples_per_second_per_gpu": 128074.79565295741,
            "loss_sequences_lower_95": 3.044898937881514,
            "loss_sequences_upper_95": 3.250748205601995,
            "loss_tokens_lower_95": 3.1370371197916667,
            "loss_tokens_upper_95": 3.1594531302083335,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.93927904169289,
            "data_time": 0.012268087516228357,
            "batch_time": 0.042215739687283836,
            "samples_per_second": 1038963.4963776119,
            "samples_per_second_per_gpu": 129870.43704720149,
            "loss_sequences_lower_95": 5.891300912016614,
            "loss_sequences_upper_95": 5.984250610673648,
            "loss_tokens_lower_95": 5.9278095,
            "loss_tokens_upper_95": 5.950652197916666,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.002089676309804,
            "data_time": 0.03522522374987602,
            "batch_time": 0.06667539104819298,
            "samples_per_second": 917852.4049262056,
            "samples_per_second_per_gpu": 114731.5506157757,
            "loss_sequences_lower_95": 3.9450704543316952,
            "loss_sequences_upper_95": 4.073082526785428,
            "loss_tokens_lower_95": 3.989537085861456,
            "loss_tokens_upper_95": 4.0147306536064775,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.023420766808847,
            "data_time": 0.0017110678899804118,
            "batch_time": 0.03085682644659403,
            "samples_per_second": 1092776.8853799351,
            "samples_per_second_per_gpu": 136597.1106724919,
            "loss_sequences_lower_95": 5.007358504842615,
            "loss_sequences_upper_95": 5.04017309121742,
            "loss_tokens_lower_95": 5.007254937197336,
            "loss_tokens_upper_95": 5.039627226022825,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5069580129408022,
            "data_time": 0.0018150531657182487,
            "batch_time": 0.03051755757658345,
            "samples_per_second": 1107125.382484419,
            "samples_per_second_per_gpu": 138390.67281055238,
            "loss_sequences_lower_95": 3.495481244398526,
            "loss_sequences_upper_95": 3.5214715771883087,
            "loss_tokens_lower_95": 3.4918453732358326,
            "loss_tokens_upper_95": 3.51198724187942,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.026716092931363,
            "data_time": 0.0030791611181577344,
            "batch_time": 0.031808851351724496,
            "samples_per_second": 1105117.751862148,
            "samples_per_second_per_gpu": 138139.7189827685,
            "loss_sequences_lower_95": 5.247744115253749,
            "loss_sequences_upper_95": 5.533589736729452,
            "loss_tokens_lower_95": 4.52489215231862,
            "loss_tokens_upper_95": 4.734921835444216,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.413223149975141,
            "data_time": 0.0034685734104602894,
            "batch_time": 0.03253895139440577,
            "samples_per_second": 1088211.8329526884,
            "samples_per_second_per_gpu": 136026.47911908606,
            "loss_sequences_lower_95": 5.584019140625,
            "loss_sequences_upper_95": 5.792824479166666,
            "loss_tokens_lower_95": 5.040423189367138,
            "loss_tokens_upper_95": 5.184192155562107,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.463539291639234,
            "data_time": 0.004371519002439749,
            "batch_time": 0.03353249721038036,
            "samples_per_second": 1082665.8321726262,
            "samples_per_second_per_gpu": 135333.22902157827,
            "loss_sequences_lower_95": 3.5073301232882392,
            "loss_sequences_upper_95": 3.5714073198505023,
            "loss_tokens_lower_95": 3.368114643578792,
            "loss_tokens_upper_95": 3.400860823159665,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.4905626849694684,
            "data_time": 0.022805218185697283,
            "batch_time": 0.05303831611360822,
            "samples_per_second": 992070.1184850764,
            "samples_per_second_per_gpu": 124008.76481063454,
            "loss_sequences_lower_95": 2.468811909068714,
            "loss_sequences_upper_95": 2.5850810380415483,
            "loss_tokens_lower_95": 2.4215543985563843,
            "loss_tokens_upper_95": 2.4695079376826565,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.758920156712435,
            "data_time": 0.020537056028842926,
            "batch_time": 0.050202880054712296,
            "samples_per_second": 990862.528665554,
            "samples_per_second_per_gpu": 123857.81608319425,
            "loss_sequences_lower_95": 3.752478151905293,
            "loss_sequences_upper_95": 3.957860375229193,
            "loss_tokens_lower_95": 3.6216250343813474,
            "loss_tokens_upper_95": 3.7187676799466827,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.242059935728709,
            "data_time": 0.016207910501039945,
            "batch_time": 0.045812709209246516,
            "samples_per_second": 1006344.9908702951,
            "samples_per_second_per_gpu": 125793.12385878689,
            "loss_sequences_lower_95": 4.2102725830078125,
            "loss_sequences_upper_95": 4.302457234700521,
            "loss_tokens_lower_95": 4.1086980378819336,
            "loss_tokens_upper_95": 4.333907843402349,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.275441374398567,
            "data_time": 0.001418178915191431,
            "batch_time": 0.03016147298168073,
            "samples_per_second": 1106398.9525684188,
            "samples_per_second_per_gpu": 138299.86907105235,
            "loss_sequences_lower_95": 7.297220874495595,
            "loss_sequences_upper_95": 7.3722897110747505,
            "loss_tokens_lower_95": 7.115677297350566,
            "loss_tokens_upper_95": 7.195619732936356,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.076644423734459,
            "data_time": 0.0028316290586586764,
            "batch_time": 0.03190732422291032,
            "samples_per_second": 1101326.1941105116,
            "samples_per_second_per_gpu": 137665.77426381395,
            "loss_sequences_lower_95": 5.6457859848484855,
            "loss_sequences_upper_95": 5.949250212903777,
            "loss_tokens_lower_95": 4.276894994695873,
            "loss_tokens_upper_95": 4.417487254129768,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.567705666551004,
            "data_time": 0.005059816950076335,
            "batch_time": 0.03374704235308879,
            "samples_per_second": 1092214.8876190817,
            "samples_per_second_per_gpu": 136526.8609523852,
            "loss_sequences_lower_95": 5.0362843900817245,
            "loss_sequences_upper_95": 5.3781943259385665,
            "loss_tokens_lower_95": 4.10423834900119,
            "loss_tokens_upper_95": 4.26588887512857,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.772938841554128,
            "data_time": 0.022724419832229614,
            "batch_time": 0.05213915024484907,
            "samples_per_second": 1012006.7622382061,
            "samples_per_second_per_gpu": 126500.84527977576,
            "loss_sequences_lower_95": 5.705935459920805,
            "loss_sequences_upper_95": 5.8399938435314995,
            "loss_tokens_lower_95": 5.708272437526755,
            "loss_tokens_upper_95": 5.837144700141803,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.838224997520447,
            "data_time": 0.04833194384208092,
            "batch_time": 0.07898668142465445,
            "samples_per_second": 898264.9709029018,
            "samples_per_second_per_gpu": 112283.12136286273,
            "loss_sequences_lower_95": 3.6955844039916994,
            "loss_sequences_upper_95": 4.094266647338867,
            "loss_tokens_lower_95": 3.5255608956161253,
            "loss_tokens_upper_95": 4.008871159698541,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.582353211962213,
            "data_time": 0.0031884876984516293,
            "batch_time": 0.03232349427931148,
            "samples_per_second": 1091712.097356272,
            "samples_per_second_per_gpu": 136464.012169534,
            "loss_sequences_lower_95": 5.535164496305655,
            "loss_sequences_upper_95": 5.629975221100947,
            "loss_tokens_lower_95": 5.5350610676416965,
            "loss_tokens_upper_95": 5.630226371085754,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.310578581742045,
            "data_time": 0.00464079002766197,
            "batch_time": 0.03365457699699464,
            "samples_per_second": 1090942.495286843,
            "samples_per_second_per_gpu": 136367.81191085538,
            "loss_sequences_lower_95": 5.267415124641687,
            "loss_sequences_upper_95": 5.354037887985641,
            "loss_tokens_lower_95": 5.265728224895706,
            "loss_tokens_upper_95": 5.355121764386901,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.057226321772471,
            "data_time": 0.0033525887627323552,
            "batch_time": 0.03261671390698229,
            "samples_per_second": 1077782.4542674916,
            "samples_per_second_per_gpu": 134722.80678343645,
            "loss_sequences_lower_95": 4.20992156019408,
            "loss_sequences_upper_95": 4.334203384283188,
            "loss_tokens_lower_95": 3.8774672081904424,
            "loss_tokens_upper_95": 3.9367732824194595,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.98378252696991,
            "data_time": 0.010017318651080132,
            "batch_time": 0.03955007717013359,
            "samples_per_second": 1047116.1700547268,
            "samples_per_second_per_gpu": 130889.52125684085,
            "loss_sequences_lower_95": 6.179882836914063,
            "loss_sequences_upper_95": 6.744364123535156,
            "loss_tokens_lower_95": 5.3120181898499546,
            "loss_tokens_upper_95": 5.679053542252647,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.282965153455734,
            "data_time": 0.14800626039505005,
            "batch_time": 0.18473030626773834,
            "samples_per_second": 535242.7400840753,
            "samples_per_second_per_gpu": 66905.3425105094,
            "loss_sequences_lower_95": 3.986374408006668,
            "loss_sequences_upper_95": 4.634843730926514,
            "loss_tokens_lower_95": 3.7856174688229616,
            "loss_tokens_upper_95": 4.60515803194594,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.41126926260433,
            "data_time": 0.026994859918634942,
            "batch_time": 0.056593182239126655,
            "samples_per_second": 924940.8266166531,
            "samples_per_second_per_gpu": 115617.60332708164,
            "loss_sequences_lower_95": 5.87017119835163,
            "loss_sequences_upper_95": 6.673164753530218,
            "loss_tokens_lower_95": 3.9826834797993587,
            "loss_tokens_upper_95": 4.449823679273534,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3377502246838358,
            "data_time": 0.0029069936523834863,
            "batch_time": 0.03156601182288594,
            "samples_per_second": 1101076.7507519193,
            "samples_per_second_per_gpu": 137634.59384398992,
            "loss_sequences_lower_95": 3.317751599233085,
            "loss_sequences_upper_95": 3.3578720472743284,
            "loss_tokens_lower_95": 3.31738844938599,
            "loss_tokens_upper_95": 3.3577167955951017,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6731892931788352,
            "data_time": 0.0027304186023479754,
            "batch_time": 0.03148092065247908,
            "samples_per_second": 1103450.153557365,
            "samples_per_second_per_gpu": 137931.26919467063,
            "loss_sequences_lower_95": 3.64137979393072,
            "loss_sequences_upper_95": 3.821729842218732,
            "loss_tokens_lower_95": 3.4650757859494625,
            "loss_tokens_upper_95": 3.642039395734597,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5047420198882455,
            "data_time": 0.018043282959196303,
            "batch_time": 0.04715325435002645,
            "samples_per_second": 1002072.1254137564,
            "samples_per_second_per_gpu": 125259.01567671954,
            "loss_sequences_lower_95": 3.343991413046589,
            "loss_sequences_upper_95": 3.7394156822791467,
            "loss_tokens_lower_95": 3.243518783470528,
            "loss_tokens_upper_95": 3.5457186293976934,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.929422825927704,
            "data_time": 0.004779515787959099,
            "batch_time": 0.033800899237394336,
            "samples_per_second": 1082404.7399938267,
            "samples_per_second_per_gpu": 135300.59249922834,
            "loss_sequences_lower_95": 3.9658128360546567,
            "loss_sequences_upper_95": 4.11355793628527,
            "loss_tokens_lower_95": 3.7852950354707726,
            "loss_tokens_upper_95": 3.9335400792806565,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2797524173085284,
            "data_time": 0.030572939486730667,
            "batch_time": 0.06053954079037621,
            "samples_per_second": 991799.9419706983,
            "samples_per_second_per_gpu": 123974.99274633729,
            "loss_sequences_lower_95": 3.092940484023676,
            "loss_sequences_upper_95": 3.5955287933349607,
            "loss_tokens_lower_95": 2.9862395991314488,
            "loss_tokens_upper_95": 3.3771904651336917,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.273235940675684,
            "data_time": 0.00199107294124553,
            "batch_time": 0.030908959034772285,
            "samples_per_second": 1096617.3873157627,
            "samples_per_second_per_gpu": 137077.17341447034,
            "loss_sequences_lower_95": 4.261255141653331,
            "loss_sequences_upper_95": 4.285397870589743,
            "loss_tokens_lower_95": 4.261045949424259,
            "loss_tokens_upper_95": 4.2853583021291755,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.3095707433316315,
            "data_time": 0.04179896007884632,
            "batch_time": 0.07221497188914906,
            "samples_per_second": 898919.407819512,
            "samples_per_second_per_gpu": 112364.925977439,
            "loss_sequences_lower_95": 1.2409539657889062,
            "loss_sequences_upper_95": 1.4355614930680654,
            "loss_tokens_lower_95": 1.1146270470829067,
            "loss_tokens_upper_95": 1.3782493567881589,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.011628748342176,
            "data_time": 0.0013664801107767942,
            "batch_time": 0.03036157553273894,
            "samples_per_second": 1095955.057318881,
            "samples_per_second_per_gpu": 136994.38216486012,
            "loss_sequences_lower_95": 6.444273904284591,
            "loss_sequences_upper_95": 6.500362300592899,
            "loss_tokens_lower_95": 5.3213030585106385,
            "loss_tokens_upper_95": 5.3737160058027085,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.384998586177826,
            "data_time": 0.005588048034244114,
            "batch_time": 0.03559977052703736,
            "samples_per_second": 1051125.9682781913,
            "samples_per_second_per_gpu": 131390.7460347739,
            "loss_sequences_lower_95": 6.381198669433593,
            "loss_sequences_upper_95": 6.717237353515625,
            "loss_tokens_lower_95": 6.0381439232586525,
            "loss_tokens_upper_95": 6.325678014202074,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.603980872942054,
            "data_time": 0.022372878203957766,
            "batch_time": 0.05255906258599233,
            "samples_per_second": 992897.2167576903,
            "samples_per_second_per_gpu": 124112.15209471129,
            "loss_sequences_lower_95": 5.448188463293987,
            "loss_sequences_upper_95": 5.762750297214674,
            "loss_tokens_lower_95": 5.451078013544497,
            "loss_tokens_upper_95": 5.757446262525475,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.127125332030383,
            "data_time": 0.0045093603162880405,
            "batch_time": 0.03330189995018833,
            "samples_per_second": 1093588.56922769,
            "samples_per_second_per_gpu": 136698.57115346126,
            "loss_sequences_lower_95": 5.077253306995739,
            "loss_sequences_upper_95": 5.174959864760891,
            "loss_tokens_lower_95": 5.077893288352272,
            "loss_tokens_upper_95": 5.175966658158736,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.20293137160937,
            "data_time": 0.004070819375362802,
            "batch_time": 0.03319936863919522,
            "samples_per_second": 1088902.8028610258,
            "samples_per_second_per_gpu": 136112.85035762822,
            "loss_sequences_lower_95": 1.2616823872884115,
            "loss_sequences_upper_95": 1.3403236165364583,
            "loss_tokens_lower_95": 1.1066051869966738,
            "loss_tokens_upper_95": 1.1691039462660064,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.562875953174773,
            "data_time": 0.02369300595351628,
            "batch_time": 0.054240761058671136,
            "samples_per_second": 948860.4355562198,
            "samples_per_second_per_gpu": 118607.55444452747,
            "loss_sequences_lower_95": 6.23491705031622,
            "loss_sequences_upper_95": 6.8925685192289805,
            "loss_tokens_lower_95": 6.230138622465588,
            "loss_tokens_upper_95": 6.8977938116164434,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.5407062098383904,
            "data_time": 0.15131425857543945,
            "batch_time": 0.18508461117744446,
            "samples_per_second": 524756.6880008058,
            "samples_per_second_per_gpu": 65594.58600010072,
            "loss_sequences_lower_95": 2.320515424013138,
            "loss_sequences_upper_95": 3.4171019792556763,
            "loss_tokens_lower_95": 1.9693970890635066,
            "loss_tokens_upper_95": 2.524534484234053,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.468340895175934,
            "data_time": 0.005538357628716363,
            "batch_time": 0.03480505895993066,
            "samples_per_second": 1073974.8499889092,
            "samples_per_second_per_gpu": 134246.85624861365,
            "loss_sequences_lower_95": 7.394902575683593,
            "loss_sequences_upper_95": 7.769072143554688,
            "loss_tokens_lower_95": 7.15751370657519,
            "loss_tokens_upper_95": 7.4867104673950395,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.571258388519287,
            "data_time": 0.005453721871451726,
            "batch_time": 0.034804066022237144,
            "samples_per_second": 1070526.0826874233,
            "samples_per_second_per_gpu": 133815.76033592792,
            "loss_sequences_lower_95": 6.657736401367187,
            "loss_sequences_upper_95": 6.892998620605469,
            "loss_tokens_lower_95": 6.335972831317288,
            "loss_tokens_upper_95": 6.523951002186874,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.344876626894212,
            "data_time": 0.004098204386274153,
            "batch_time": 0.03308818683177731,
            "samples_per_second": 1087858.1807257566,
            "samples_per_second_per_gpu": 135982.27259071957,
            "loss_sequences_lower_95": 4.317688401593404,
            "loss_sequences_upper_95": 4.371539226842734,
            "loss_tokens_lower_95": 4.318350977806529,
            "loss_tokens_upper_95": 4.371230724103252,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.072268546085387,
            "data_time": 0.008190782408699889,
            "batch_time": 0.03773596855808961,
            "samples_per_second": 1048495.9520560938,
            "samples_per_second_per_gpu": 131061.99400701173,
            "loss_sequences_lower_95": 4.989579189318116,
            "loss_sequences_upper_95": 5.154511482484879,
            "loss_tokens_lower_95": 4.986405679963518,
            "loss_tokens_upper_95": 5.154066535258257,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.74884217119217,
            "data_time": 0.005677159343447004,
            "batch_time": 0.03462473929874481,
            "samples_per_second": 1083594.2265378588,
            "samples_per_second_per_gpu": 135449.27831723235,
            "loss_sequences_lower_95": 6.6797861328125,
            "loss_sequences_upper_95": 6.822013586425781,
            "loss_tokens_lower_95": 6.678432788085937,
            "loss_tokens_upper_95": 6.822774096679688,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9918194238149343,
            "data_time": 0.0021016321921393643,
            "batch_time": 0.030929079037758263,
            "samples_per_second": 1100489.6543398364,
            "samples_per_second_per_gpu": 137561.20679247956,
            "loss_sequences_lower_95": 4.610279867845317,
            "loss_sequences_upper_95": 4.716936882834673,
            "loss_tokens_lower_95": 3.2264797610827327,
            "loss_tokens_upper_95": 3.2972800216215927,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.8328881868675575,
            "data_time": 0.018191887651171003,
            "batch_time": 0.04840125015803746,
            "samples_per_second": 986319.8203290185,
            "samples_per_second_per_gpu": 123289.97754112731,
            "loss_sequences_lower_95": 5.6614426570152165,
            "loss_sequences_upper_95": 6.004978498772009,
            "loss_tokens_lower_95": 5.660784695753411,
            "loss_tokens_upper_95": 6.001286304530813,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.762811793533026,
            "data_time": 0.010986477136611938,
            "batch_time": 0.04036009218543768,
            "samples_per_second": 1061240.4277932853,
            "samples_per_second_per_gpu": 132655.05347416067,
            "loss_sequences_lower_95": 5.641768104702819,
            "loss_sequences_upper_95": 5.882985121783088,
            "loss_tokens_lower_95": 5.643570269416361,
            "loss_tokens_upper_95": 5.878509018841912,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.247399322523385,
            "data_time": 0.0021867995919153433,
            "batch_time": 0.030934183083047982,
            "samples_per_second": 1102455.1664877483,
            "samples_per_second_per_gpu": 137806.89581096853,
            "loss_sequences_lower_95": 4.640038304052361,
            "loss_sequences_upper_95": 4.742613197086778,
            "loss_tokens_lower_95": 3.578551226193313,
            "loss_tokens_upper_95": 3.6605768553291234,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.635121842540761,
            "data_time": 0.02774316817522049,
            "batch_time": 0.05787966897090276,
            "samples_per_second": 989514.1660582019,
            "samples_per_second_per_gpu": 123689.27075727524,
            "loss_sequences_lower_95": 4.5361240770450975,
            "loss_sequences_upper_95": 4.730204401571284,
            "loss_tokens_lower_95": 4.537052820599269,
            "loss_tokens_upper_95": 4.730458327197524,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.694598974729532,
            "data_time": 0.003442347559154543,
            "batch_time": 0.03252208625877297,
            "samples_per_second": 1089089.198807075,
            "samples_per_second_per_gpu": 136136.14985088437,
            "loss_sequences_lower_95": 4.656245505423357,
            "loss_sequences_upper_95": 4.733093194524082,
            "loss_tokens_lower_95": 4.656692611274369,
            "loss_tokens_upper_95": 4.7331382149512615,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.025091233762723,
            "data_time": 0.02230781858617609,
            "batch_time": 0.051795363426208496,
            "samples_per_second": 967567.2726338683,
            "samples_per_second_per_gpu": 120945.90907923353,
            "loss_sequences_lower_95": 5.858932880290504,
            "loss_sequences_upper_95": 6.192067459254589,
            "loss_tokens_lower_95": 5.85748572488433,
            "loss_tokens_upper_95": 6.19257056967726,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.860690633455913,
            "data_time": 0.0729222223162651,
            "batch_time": 0.10537878423929214,
            "samples_per_second": 772815.8699672717,
            "samples_per_second_per_gpu": 96601.98374590896,
            "loss_sequences_lower_95": 4.50730931599935,
            "loss_sequences_upper_95": 5.418348731994628,
            "loss_tokens_lower_95": 4.013974719577366,
            "loss_tokens_upper_95": 5.253165223863389,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.024783551692963,
            "data_time": 0.0756060928106308,
            "batch_time": 0.10906973481178284,
            "samples_per_second": 744794.0422934564,
            "samples_per_second_per_gpu": 93099.25528668205,
            "loss_sequences_lower_95": 3.793520762125651,
            "loss_sequences_upper_95": 4.649177729288737,
            "loss_tokens_lower_95": 3.1000560117571543,
            "loss_tokens_upper_95": 4.356000835975904,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6236739248520147,
            "data_time": 0.0036319806478948297,
            "batch_time": 0.03250659473367548,
            "samples_per_second": 1095020.9170000765,
            "samples_per_second_per_gpu": 136877.61462500956,
            "loss_sequences_lower_95": 3.6093402234444034,
            "loss_sequences_upper_95": 3.638130307092231,
            "loss_tokens_lower_95": 3.60929573160208,
            "loss_tokens_upper_95": 3.638467645434462,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.0440324317353553,
            "data_time": 0.0012893798053916748,
            "batch_time": 0.030343846277348062,
            "samples_per_second": 1093748.6248279093,
            "samples_per_second_per_gpu": 136718.57810348866,
            "loss_sequences_lower_95": 1.2299391436542604,
            "loss_sequences_upper_95": 1.2615011755079755,
            "loss_tokens_lower_95": 0.8329228411672914,
            "loss_tokens_upper_95": 0.8489013252267744,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.222332860541156,
            "data_time": 0.040268536657094955,
            "batch_time": 0.07062583416700363,
            "samples_per_second": 968942.7826673845,
            "samples_per_second_per_gpu": 121117.84783342306,
            "loss_sequences_lower_95": 2.124185300812008,
            "loss_sequences_upper_95": 2.4232480867641177,
            "loss_tokens_lower_95": 1.9753375599466885,
            "loss_tokens_upper_95": 2.121584346118122,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.024169106741209,
            "data_time": 0.11797999200366792,
            "batch_time": 0.14998886698768252,
            "samples_per_second": 578791.6939834259,
            "samples_per_second_per_gpu": 72348.96174792823,
            "loss_sequences_lower_95": 3.614154815673828,
            "loss_sequences_upper_95": 4.496554452019769,
            "loss_tokens_lower_95": 3.5283743964301215,
            "loss_tokens_upper_95": 4.431778396794825,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.0482608297976053,
            "data_time": 0.03011446339743478,
            "batch_time": 0.060813900970277335,
            "samples_per_second": 975469.7907067771,
            "samples_per_second_per_gpu": 121933.72383834714,
            "loss_sequences_lower_95": 1.9891150823453578,
            "loss_sequences_upper_95": 2.234898743978361,
            "loss_tokens_lower_95": 1.852493297939445,
            "loss_tokens_upper_95": 1.9711226608494532,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.107661996672793,
            "data_time": 0.03117040793100993,
            "batch_time": 0.06141731568745205,
            "samples_per_second": 971743.3071878997,
            "samples_per_second_per_gpu": 121467.91339848746,
            "loss_sequences_lower_95": 2.082695663266066,
            "loss_sequences_upper_95": 2.3054356365669064,
            "loss_tokens_lower_95": 1.8970693705243273,
            "loss_tokens_upper_95": 1.9958781721388676,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.062003592528948,
            "data_time": 0.031333676406315396,
            "batch_time": 0.06112662667319888,
            "samples_per_second": 985120.1538552353,
            "samples_per_second_per_gpu": 123140.01923190441,
            "loss_sequences_lower_95": 1.8939946197881932,
            "loss_sequences_upper_95": 2.171622741513136,
            "loss_tokens_lower_95": 1.9442435073194453,
            "loss_tokens_upper_95": 2.0991325321383965,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.1980899674136465,
            "data_time": 0.029754982108161562,
            "batch_time": 0.060147135030655635,
            "samples_per_second": 966575.557911685,
            "samples_per_second_per_gpu": 120821.94473896062,
            "loss_sequences_lower_95": 2.166499463523307,
            "loss_sequences_upper_95": 2.375679230108494,
            "loss_tokens_lower_95": 1.9879644471165545,
            "loss_tokens_upper_95": 2.081368937744901,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.7749450791696584,
            "data_time": 0.03080834871456947,
            "batch_time": 0.060977788619053216,
            "samples_per_second": 993967.5543092259,
            "samples_per_second_per_gpu": 124245.94428865323,
            "loss_sequences_lower_95": 1.7234668613220594,
            "loss_sequences_upper_95": 1.8502427663862335,
            "loss_tokens_lower_95": 1.7039488114443186,
            "loss_tokens_upper_95": 1.7746587943751828,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.6357695329480055,
            "data_time": 0.030130210376921155,
            "batch_time": 0.06201712858109247,
            "samples_per_second": 954476.3536663158,
            "samples_per_second_per_gpu": 119309.54420828947,
            "loss_sequences_lower_95": 1.6115243353494784,
            "loss_sequences_upper_95": 1.7479183057459389,
            "loss_tokens_lower_95": 1.4796921385247137,
            "loss_tokens_upper_95": 1.5413783040658906,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.5/params.txt",
    "uuid": "52e4f27d-003f-42e4-a8af-2f0e88584dc7",
    "creation_date": "2023_12_14-06_46_29"
}