{
    "name": "rpj-d=576_l=24_h=8-16.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 49176760320,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 16.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "9835352064",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=576_l=24_h=8-16.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 2.6708955099185308,
            "data_time": 0.03777863085269928,
            "batch_time": 0.3804401233792305,
            "samples_per_second": 832417.938073023,
            "samples_per_second_per_gpu": 104052.24225912787,
            "loss_sequences_lower_95": 2.604340337117513,
            "loss_sequences_upper_95": 2.734243621826172,
            "loss_tokens_lower_95": 2.6593092473347983,
            "loss_tokens_upper_95": 2.6822673988342283,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.168113735949615,
            "data_time": 0.0012147823305737845,
            "batch_time": 0.030763279860758393,
            "samples_per_second": 1079145.9972461553,
            "samples_per_second_per_gpu": 134893.2496557694,
            "loss_sequences_lower_95": 3.1654563399275286,
            "loss_sequences_upper_95": 3.170747973873222,
            "loss_tokens_lower_95": 3.1573840104166666,
            "loss_tokens_upper_95": 3.178686557291667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.628360326436101,
            "data_time": 0.009339391708374023,
            "batch_time": 0.038834141731262206,
            "samples_per_second": 1054949.4043500961,
            "samples_per_second_per_gpu": 131868.67554376202,
            "loss_sequences_lower_95": 2.604929062201052,
            "loss_sequences_upper_95": 2.6520942656847897,
            "loss_tokens_lower_95": 2.6172042291666666,
            "loss_tokens_upper_95": 2.639630640625,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0247558788909124,
            "data_time": 0.0015960562189942912,
            "batch_time": 0.030033114415250327,
            "samples_per_second": 1121325.662013016,
            "samples_per_second_per_gpu": 140165.707751627,
            "loss_sequences_lower_95": 3.0142068802351805,
            "loss_sequences_upper_95": 3.035046004147874,
            "loss_tokens_lower_95": 3.0141424739583336,
            "loss_tokens_upper_95": 3.035005223958333,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.170689233208868,
            "data_time": 0.009202897311206833,
            "batch_time": 0.03897289355912532,
            "samples_per_second": 1041207.8633800352,
            "samples_per_second_per_gpu": 130150.9829225044,
            "loss_sequences_lower_95": 3.1379392045092924,
            "loss_sequences_upper_95": 3.2031138806624715,
            "loss_tokens_lower_95": 3.1600570677083333,
            "loss_tokens_upper_95": 3.181041479166667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9522719994278526,
            "data_time": 0.00353380033503408,
            "batch_time": 0.032255642116069794,
            "samples_per_second": 1112345.3908091655,
            "samples_per_second_per_gpu": 139043.1738511457,
            "loss_sequences_lower_95": 2.9113990851116376,
            "loss_sequences_upper_95": 2.9930091468919953,
            "loss_tokens_lower_95": 2.941544046875,
            "loss_tokens_upper_95": 2.96293228125,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.6492706703896425,
            "data_time": 0.0015863552568009395,
            "batch_time": 0.030111174696226883,
            "samples_per_second": 1121817.6952400266,
            "samples_per_second_per_gpu": 140227.21190500332,
            "loss_sequences_lower_95": 1.6278145876514667,
            "loss_sequences_upper_95": 1.670928392059949,
            "loss_tokens_lower_95": 1.6398192083333334,
            "loss_tokens_upper_95": 1.6590869114583333,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5681165564372277,
            "data_time": 0.0019227513106285104,
            "batch_time": 0.030262881868905675,
            "samples_per_second": 1126736.978201732,
            "samples_per_second_per_gpu": 140842.1222752165,
            "loss_sequences_lower_95": 3.560124969322644,
            "loss_sequences_upper_95": 3.5759462532722512,
            "loss_tokens_lower_95": 3.5574121979166664,
            "loss_tokens_upper_95": 3.5785633645833332,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.347779233765796,
            "data_time": 0.00993772820820884,
            "batch_time": 0.04295528029638623,
            "samples_per_second": 1070553.8785310565,
            "samples_per_second_per_gpu": 133819.23481638206,
            "loss_sequences_lower_95": 3.3072090272980974,
            "loss_sequences_upper_95": 3.39312942008662,
            "loss_tokens_lower_95": 3.3368261458333333,
            "loss_tokens_upper_95": 3.358799411458333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9828853489381992,
            "data_time": 0.009420079179108143,
            "batch_time": 0.0392185403034091,
            "samples_per_second": 1052244.6483099707,
            "samples_per_second_per_gpu": 131530.58103874634,
            "loss_sequences_lower_95": 3.9557047485834054,
            "loss_sequences_upper_95": 4.007509567803545,
            "loss_tokens_lower_95": 3.971072041666667,
            "loss_tokens_upper_95": 3.9952980729166665,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.101965350681288,
            "data_time": 0.0012396961801768851,
            "batch_time": 0.029558428936574385,
            "samples_per_second": 1129847.980635049,
            "samples_per_second_per_gpu": 141230.99757938113,
            "loss_sequences_lower_95": 3.0938482317219407,
            "loss_sequences_upper_95": 3.1100292045318283,
            "loss_tokens_lower_95": 3.09146125,
            "loss_tokens_upper_95": 3.1122441822916667,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.009340160144264,
            "data_time": 0.002471660991989504,
            "batch_time": 0.031147308294024696,
            "samples_per_second": 1114499.2439141038,
            "samples_per_second_per_gpu": 139312.40548926298,
            "loss_sequences_lower_95": 2.9999183486372867,
            "loss_sequences_upper_95": 3.0184798285545082,
            "loss_tokens_lower_95": 2.999148427083333,
            "loss_tokens_upper_95": 3.019825411458333,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5274525477001197,
            "data_time": 0.009547982762453585,
            "batch_time": 0.038658763108988525,
            "samples_per_second": 1059334.6832371044,
            "samples_per_second_per_gpu": 132416.83540463806,
            "loss_sequences_lower_95": 3.4958220259413033,
            "loss_sequences_upper_95": 3.5579398195835448,
            "loss_tokens_lower_95": 3.5163863541666665,
            "loss_tokens_upper_95": 3.5384585416666665,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9056796954751256,
            "data_time": 0.009889753691228737,
            "batch_time": 0.039359529654818225,
            "samples_per_second": 1048080.4451326943,
            "samples_per_second_per_gpu": 131010.05564158679,
            "loss_sequences_lower_95": 2.845093345253628,
            "loss_sequences_upper_95": 2.9645323277490934,
            "loss_tokens_lower_95": 2.894756859375,
            "loss_tokens_upper_95": 2.9166638177083337,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6963840181177314,
            "data_time": 0.07642858369009835,
            "batch_time": 0.11403827156339373,
            "samples_per_second": 507793.78316239343,
            "samples_per_second_per_gpu": 63474.22289529918,
            "loss_sequences_lower_95": 3.638959555192427,
            "loss_sequences_upper_95": 3.7543931007385254,
            "loss_tokens_lower_95": 3.6764893618496983,
            "loss_tokens_upper_95": 3.716831753470681,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.5735055215157154,
            "data_time": 0.013256529515439814,
            "batch_time": 0.04264701090075753,
            "samples_per_second": 1046628.6974005814,
            "samples_per_second_per_gpu": 130828.58717507268,
            "loss_sequences_lower_95": 2.4819302973177275,
            "loss_sequences_upper_95": 2.6652722016715447,
            "loss_tokens_lower_95": 2.5629681041666665,
            "loss_tokens_upper_95": 2.5838176875,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.497764952264549,
            "data_time": 0.012124398102362951,
            "batch_time": 0.04192932198445002,
            "samples_per_second": 1054667.9828347538,
            "samples_per_second_per_gpu": 131833.49785434423,
            "loss_sequences_lower_95": 5.443982448930162,
            "loss_sequences_upper_95": 5.54833641354201,
            "loss_tokens_lower_95": 5.486065052083333,
            "loss_tokens_upper_95": 5.50956340625,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.225174554058763,
            "data_time": 0.035578809678554535,
            "batch_time": 0.06557630747556686,
            "samples_per_second": 939237.1826582527,
            "samples_per_second_per_gpu": 117404.64783228158,
            "loss_sequences_lower_95": 3.1806114134241326,
            "loss_sequences_upper_95": 3.265160851400407,
            "loss_tokens_lower_95": 3.2134565259589523,
            "loss_tokens_upper_95": 3.2370733667592533,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9792431796688823,
            "data_time": 0.0016659148597500048,
            "batch_time": 0.031132005443605584,
            "samples_per_second": 1082161.721678616,
            "samples_per_second_per_gpu": 135270.215209827,
            "loss_sequences_lower_95": 3.9621575841671413,
            "loss_sequences_upper_95": 3.9968866976080686,
            "loss_tokens_lower_95": 3.9615232705900154,
            "loss_tokens_upper_95": 3.9968877199317228,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9424527752947887,
            "data_time": 0.001877060266816692,
            "batch_time": 0.030991928118049718,
            "samples_per_second": 1092308.4582362066,
            "samples_per_second_per_gpu": 136538.55727952582,
            "loss_sequences_lower_95": 2.933927664940064,
            "loss_sequences_upper_95": 2.958909712994486,
            "loss_tokens_lower_95": 2.924687111445235,
            "loss_tokens_upper_95": 2.943571157472445,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7476774860819275,
            "data_time": 0.002995558441190872,
            "batch_time": 0.031741624053753734,
            "samples_per_second": 1105171.9175032545,
            "samples_per_second_per_gpu": 138146.4896879068,
            "loss_sequences_lower_95": 4.025129105068346,
            "loss_sequences_upper_95": 4.3193604092244335,
            "loss_tokens_lower_95": 3.150439947572299,
            "loss_tokens_upper_95": 3.3596800798003037,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.67990553445369,
            "data_time": 0.0039421811382821265,
            "batch_time": 0.03324076263828481,
            "samples_per_second": 1079158.384936281,
            "samples_per_second_per_gpu": 134894.79811703513,
            "loss_sequences_lower_95": 3.7694156168619792,
            "loss_sequences_upper_95": 3.967998714192708,
            "loss_tokens_lower_95": 3.4409443543632072,
            "loss_tokens_upper_95": 3.581209745970912,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.6540682952091315,
            "data_time": 0.0044402144793112,
            "batch_time": 0.033533245549842064,
            "samples_per_second": 1087064.1887381363,
            "samples_per_second_per_gpu": 135883.02359226704,
            "loss_sequences_lower_95": 2.6961200196423074,
            "loss_sequences_upper_95": 2.753055880809029,
            "loss_tokens_lower_95": 2.565938162952118,
            "loss_tokens_upper_95": 2.595320673468606,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.9841987019235439,
            "data_time": 0.022019430994987488,
            "batch_time": 0.05219861013548715,
            "samples_per_second": 1005264.8634050784,
            "samples_per_second_per_gpu": 125658.1079256348,
            "loss_sequences_lower_95": 1.9660523362593219,
            "loss_sequences_upper_95": 2.0659944152832033,
            "loss_tokens_lower_95": 1.9202693552857,
            "loss_tokens_upper_95": 1.9635649186234,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.062920066288539,
            "data_time": 0.019657472148537636,
            "batch_time": 0.04890574887394905,
            "samples_per_second": 1004956.2047699131,
            "samples_per_second_per_gpu": 125619.52559623914,
            "loss_sequences_lower_95": 3.0454015988719707,
            "loss_sequences_upper_95": 3.2277664558254946,
            "loss_tokens_lower_95": 2.946491932284614,
            "loss_tokens_upper_95": 3.0351835170531483,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1592000166575116,
            "data_time": 0.016304184228946,
            "batch_time": 0.045822313198676475,
            "samples_per_second": 1006484.1546489823,
            "samples_per_second_per_gpu": 125810.51933112279,
            "loss_sequences_lower_95": 3.1375874582926433,
            "loss_sequences_upper_95": 3.242363210042318,
            "loss_tokens_lower_95": 3.0192431323755216,
            "loss_tokens_upper_95": 3.2087893491917105,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.482130876656875,
            "data_time": 0.0014846189532877363,
            "batch_time": 0.03075177178421824,
            "samples_per_second": 1088909.1406887267,
            "samples_per_second_per_gpu": 136113.64258609083,
            "loss_sequences_lower_95": 5.492060476231485,
            "loss_sequences_upper_95": 5.569892896741671,
            "loss_tokens_lower_95": 5.341367717565937,
            "loss_tokens_upper_95": 5.421595371553825,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.014690760360021,
            "data_time": 0.0028250587466579157,
            "batch_time": 0.033022549128372394,
            "samples_per_second": 1057695.445101899,
            "samples_per_second_per_gpu": 132211.9306377374,
            "loss_sequences_lower_95": 4.485716643156828,
            "loss_sequences_upper_95": 4.765228620843856,
            "loss_tokens_lower_95": 3.3587923110560003,
            "loss_tokens_upper_95": 3.48784684710455,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.714311499441036,
            "data_time": 0.004878415449245556,
            "batch_time": 0.03424300897765804,
            "samples_per_second": 1070160.7152918696,
            "samples_per_second_per_gpu": 133770.0894114837,
            "loss_sequences_lower_95": 4.09152584141025,
            "loss_sequences_upper_95": 4.408081242167501,
            "loss_tokens_lower_95": 3.3415671118077404,
            "loss_tokens_upper_95": 3.489701125867215,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.56079663969066,
            "data_time": 0.02307042053767613,
            "batch_time": 0.05236129675592695,
            "samples_per_second": 1021096.8746999858,
            "samples_per_second_per_gpu": 127637.10933749823,
            "loss_sequences_lower_95": 5.463682368030287,
            "loss_sequences_upper_95": 5.65834491329106,
            "loss_tokens_lower_95": 5.462579652272403,
            "loss_tokens_upper_95": 5.6584455655590045,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2783968949317934,
            "data_time": 0.047455659279456504,
            "batch_time": 0.07829435513569759,
            "samples_per_second": 896978.245522477,
            "samples_per_second_per_gpu": 112122.28069030962,
            "loss_sequences_lower_95": 3.148338287353516,
            "loss_sequences_upper_95": 3.5088303298950194,
            "loss_tokens_lower_95": 2.9811500617557862,
            "loss_tokens_upper_95": 3.4268625385646105,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8333063099987066,
            "data_time": 0.0034025122539153617,
            "batch_time": 0.03251021966368631,
            "samples_per_second": 1093982.59313402,
            "samples_per_second_per_gpu": 136747.8241417525,
            "loss_sequences_lower_95": 3.7914078680251246,
            "loss_sequences_upper_95": 3.874228120801881,
            "loss_tokens_lower_95": 3.7913787748088854,
            "loss_tokens_upper_95": 3.8750085586657175,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.2310123111652995,
            "data_time": 0.004807632860118374,
            "batch_time": 0.033758822700911205,
            "samples_per_second": 1090879.1593182716,
            "samples_per_second_per_gpu": 136359.89491478394,
            "loss_sequences_lower_95": 4.169838075380068,
            "loss_sequences_upper_95": 4.290634587668279,
            "loss_tokens_lower_95": 4.168024056549447,
            "loss_tokens_upper_95": 4.292614896057279,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.384087296188591,
            "data_time": 0.0034978335670982014,
            "batch_time": 0.03215901666274596,
            "samples_per_second": 1099995.9631832365,
            "samples_per_second_per_gpu": 137499.49539790457,
            "loss_sequences_lower_95": 3.53426266609001,
            "loss_sequences_upper_95": 3.6611258057437603,
            "loss_tokens_lower_95": 3.196926779886543,
            "loss_tokens_upper_95": 3.2519507587017196,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.248185178756714,
            "data_time": 0.009957175701856613,
            "batch_time": 0.039623087272047997,
            "samples_per_second": 1040556.6535935424,
            "samples_per_second_per_gpu": 130069.5816991928,
            "loss_sequences_lower_95": 5.4417152465820315,
            "loss_sequences_upper_95": 6.01492802734375,
            "loss_tokens_lower_95": 4.640265171295913,
            "loss_tokens_upper_95": 5.004071541766629,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4279278069734573,
            "data_time": 0.14940635859966278,
            "batch_time": 0.18523573875427246,
            "samples_per_second": 562647.6505728824,
            "samples_per_second_per_gpu": 70330.9563216103,
            "loss_sequences_lower_95": 3.223518407344818,
            "loss_sequences_upper_95": 3.6470055103302,
            "loss_tokens_lower_95": 3.0185947111283222,
            "loss_tokens_upper_95": 3.809992033859779,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.60540962013705,
            "data_time": 0.026300194415640323,
            "batch_time": 0.05600628700662166,
            "samples_per_second": 938196.2042108212,
            "samples_per_second_per_gpu": 117274.52552635265,
            "loss_sequences_lower_95": 5.0442104208058325,
            "loss_sequences_upper_95": 5.861909397169091,
            "loss_tokens_lower_95": 3.246512380984886,
            "loss_tokens_upper_95": 3.7032260830265287,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.0993288878844574,
            "data_time": 0.002939046464032597,
            "batch_time": 0.031914003607299596,
            "samples_per_second": 1092390.1058601618,
            "samples_per_second_per_gpu": 136548.76323252023,
            "loss_sequences_lower_95": 2.078680843888229,
            "loss_sequences_upper_95": 2.120137281905308,
            "loss_tokens_lower_95": 2.0783078121586938,
            "loss_tokens_upper_95": 2.119840782768813,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.4006877783701173,
            "data_time": 0.0023769772603246283,
            "batch_time": 0.03130472765720117,
            "samples_per_second": 1100082.8704003415,
            "samples_per_second_per_gpu": 137510.35880004268,
            "loss_sequences_lower_95": 2.374620755279085,
            "loss_sequences_upper_95": 2.511450086342301,
            "loss_tokens_lower_95": 2.2634825258328015,
            "loss_tokens_upper_95": 2.3959604294596475,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0808300037523764,
            "data_time": 0.018673837184906006,
            "batch_time": 0.04778871602482266,
            "samples_per_second": 997488.1821699432,
            "samples_per_second_per_gpu": 124686.0227712429,
            "loss_sequences_lower_95": 2.9382290291698863,
            "loss_sequences_upper_95": 3.32438481928228,
            "loss_tokens_lower_95": 2.8359898739231615,
            "loss_tokens_upper_95": 3.1235027472031103,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5448140846807936,
            "data_time": 0.004598230868577957,
            "batch_time": 0.033648588508367536,
            "samples_per_second": 1079316.6302996776,
            "samples_per_second_per_gpu": 134914.5787874597,
            "loss_sequences_lower_95": 3.5838756486009027,
            "loss_sequences_upper_95": 3.7370608551820244,
            "loss_tokens_lower_95": 3.394952465422235,
            "loss_tokens_upper_95": 3.5378629975888805,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.671140974614678,
            "data_time": 0.029826692172459195,
            "batch_time": 0.06017429204214187,
            "samples_per_second": 964193.0931770538,
            "samples_per_second_per_gpu": 120524.13664713173,
            "loss_sequences_lower_95": 2.539879622110506,
            "loss_sequences_upper_95": 2.9862724908968294,
            "loss_tokens_lower_95": 2.418926782903625,
            "loss_tokens_upper_95": 2.7692004852543843,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.131357974518679,
            "data_time": 0.0019278195777798575,
            "batch_time": 0.03073173606581465,
            "samples_per_second": 1101634.6924024273,
            "samples_per_second_per_gpu": 137704.33655030341,
            "loss_sequences_lower_95": 4.1141093746092965,
            "loss_sequences_upper_95": 4.148029713364548,
            "loss_tokens_lower_95": 4.114376596022329,
            "loss_tokens_upper_95": 4.1479881816207,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.849696181931542,
            "data_time": 0.04736412221735174,
            "batch_time": 0.07862704450433905,
            "samples_per_second": 858765.1975558682,
            "samples_per_second_per_gpu": 107345.64969448352,
            "loss_sequences_lower_95": 0.8023781193112864,
            "loss_sequences_upper_95": 0.9364744093811628,
            "loss_tokens_lower_95": 0.7119949887450423,
            "loss_tokens_upper_95": 0.9069443420597572,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5320956725572517,
            "data_time": 0.0012674410854780864,
            "batch_time": 0.030225279532554064,
            "samples_per_second": 1097884.731052707,
            "samples_per_second_per_gpu": 137235.59138158837,
            "loss_sequences_lower_95": 3.784191838230477,
            "loss_sequences_upper_95": 3.8231461895636794,
            "loss_tokens_lower_95": 3.1328749335106383,
            "loss_tokens_upper_95": 3.1693684598646037,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.360035999536514,
            "data_time": 0.0056614554117596335,
            "batch_time": 0.034571983984538486,
            "samples_per_second": 1084140.3050576954,
            "samples_per_second_per_gpu": 135517.53813221192,
            "loss_sequences_lower_95": 4.369471166992187,
            "loss_sequences_upper_95": 4.600874279785156,
            "loss_tokens_lower_95": 4.096571155379931,
            "loss_tokens_upper_95": 4.312789664449197,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.020695055049399,
            "data_time": 0.021315936314857613,
            "batch_time": 0.05154388435816361,
            "samples_per_second": 993784.0318196481,
            "samples_per_second_per_gpu": 124223.00397745601,
            "loss_sequences_lower_95": 4.8516554724651835,
            "loss_sequences_upper_95": 5.188132111922554,
            "loss_tokens_lower_95": 4.8531799183721125,
            "loss_tokens_upper_95": 5.1852820620329485,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.569547632607547,
            "data_time": 0.004392066992909075,
            "batch_time": 0.03455814683293722,
            "samples_per_second": 1052095.4681862122,
            "samples_per_second_per_gpu": 131511.93352327653,
            "loss_sequences_lower_95": 7.466784113103693,
            "loss_sequences_upper_95": 7.668911373254025,
            "loss_tokens_lower_95": 7.469049682617188,
            "loss_tokens_upper_95": 7.66859199292732,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.2634082938432694,
            "data_time": 0.004175577074923414,
            "batch_time": 0.033048264206723965,
            "samples_per_second": 1094942.9467213743,
            "samples_per_second_per_gpu": 136867.8683401718,
            "loss_sequences_lower_95": 1.301022560628255,
            "loss_sequences_upper_95": 1.3583427754720052,
            "loss_tokens_lower_95": 1.1831484117084332,
            "loss_tokens_upper_95": 1.2523816753263806,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.729722915376936,
            "data_time": 0.022377816694123403,
            "batch_time": 0.05258341559341976,
            "samples_per_second": 949756.324301097,
            "samples_per_second_per_gpu": 118719.54053763712,
            "loss_sequences_lower_95": 5.379998706635974,
            "loss_sequences_upper_95": 6.086836562383743,
            "loss_tokens_lower_95": 5.383066740490141,
            "loss_tokens_upper_95": 6.084645371210008,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.044671203941107,
            "data_time": 0.14974910020828247,
            "batch_time": 0.18680207431316376,
            "samples_per_second": 568487.4508182629,
            "samples_per_second_per_gpu": 71060.93135228286,
            "loss_sequences_lower_95": 1.894786012172699,
            "loss_sequences_upper_95": 2.6813055872917175,
            "loss_tokens_lower_95": 1.5940833652142397,
            "loss_tokens_upper_95": 2.054552514774283,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.520485695838929,
            "data_time": 0.00565432177649604,
            "batch_time": 0.03443019352261982,
            "samples_per_second": 1089358.2695076342,
            "samples_per_second_per_gpu": 136169.78368845428,
            "loss_sequences_lower_95": 7.444039843750001,
            "loss_sequences_upper_95": 7.775049487304687,
            "loss_tokens_lower_95": 7.251159957137003,
            "loss_tokens_upper_95": 7.545947554793253,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.467433343410492,
            "data_time": 0.0058010343521360365,
            "batch_time": 0.03482409933256724,
            "samples_per_second": 1082214.9595974402,
            "samples_per_second_per_gpu": 135276.86994968003,
            "loss_sequences_lower_95": 6.543305114746094,
            "loss_sequences_upper_95": 6.745720336914062,
            "loss_tokens_lower_95": 6.254256604633513,
            "loss_tokens_upper_95": 6.428225822620649,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.2839546730324125,
            "data_time": 0.0035810660359054105,
            "batch_time": 0.032641588325883236,
            "samples_per_second": 1086067.06369625,
            "samples_per_second_per_gpu": 135758.38296203126,
            "loss_sequences_lower_95": 5.23544632147272,
            "loss_sequences_upper_95": 5.332216364738727,
            "loss_tokens_lower_95": 5.236396423810446,
            "loss_tokens_upper_95": 5.332462584201098,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.183090996266144,
            "data_time": 0.008216227649561949,
            "batch_time": 0.03719254058653494,
            "samples_per_second": 1070373.7786155571,
            "samples_per_second_per_gpu": 133796.72232694464,
            "loss_sequences_lower_95": 5.0913016745571715,
            "loss_sequences_upper_95": 5.274012064897153,
            "loss_tokens_lower_95": 5.086315758583549,
            "loss_tokens_upper_95": 5.272622722854263,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.783786820411682,
            "data_time": 0.005764965027097672,
            "batch_time": 0.03467484504457504,
            "samples_per_second": 1087579.3405737856,
            "samples_per_second_per_gpu": 135947.4175717232,
            "loss_sequences_lower_95": 4.668381469726563,
            "loss_sequences_upper_95": 4.9074801879882815,
            "loss_tokens_lower_95": 4.666695874023437,
            "loss_tokens_upper_95": 4.905392114257812,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.863042715397666,
            "data_time": 0.00191980444866678,
            "batch_time": 0.030556924581978406,
            "samples_per_second": 1107401.881432534,
            "samples_per_second_per_gpu": 138425.23517906675,
            "loss_sequences_lower_95": 3.314955218557829,
            "loss_sequences_upper_95": 3.4088061454144984,
            "loss_tokens_lower_95": 2.289395938419784,
            "loss_tokens_upper_95": 2.3533761785567653,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.220134750230988,
            "data_time": 0.01758377892630441,
            "batch_time": 0.04678535120827811,
            "samples_per_second": 1006640.7918733644,
            "samples_per_second_per_gpu": 125830.09898417055,
            "loss_sequences_lower_95": 4.078973297574628,
            "loss_sequences_upper_95": 4.361880971424615,
            "loss_tokens_lower_95": 4.080921514710384,
            "loss_tokens_upper_95": 4.358976244570604,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.732367562312706,
            "data_time": 0.010606188327074051,
            "batch_time": 0.04007546603679657,
            "samples_per_second": 1059263.6699198615,
            "samples_per_second_per_gpu": 132407.95873998269,
            "loss_sequences_lower_95": 4.620841327742035,
            "loss_sequences_upper_95": 4.8408447983685665,
            "loss_tokens_lower_95": 4.622959092084099,
            "loss_tokens_upper_95": 4.841012704886642,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.7626883930525623,
            "data_time": 0.0020762659603728445,
            "batch_time": 0.030776481206311556,
            "samples_per_second": 1104234.1453037842,
            "samples_per_second_per_gpu": 138029.26816297302,
            "loss_sequences_lower_95": 3.007207650241137,
            "loss_sequences_upper_95": 3.0883675968170645,
            "loss_tokens_lower_95": 2.350115879306513,
            "loss_tokens_upper_95": 2.4144074563735014,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.47282139207951,
            "data_time": 0.025830323497454327,
            "batch_time": 0.05694345384836197,
            "samples_per_second": 978765.5568983002,
            "samples_per_second_per_gpu": 122345.69461228752,
            "loss_sequences_lower_95": 4.285118942664414,
            "loss_sequences_upper_95": 4.6549275796880165,
            "loss_tokens_lower_95": 4.286143840809978,
            "loss_tokens_upper_95": 4.65406316525091,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6095287913576177,
            "data_time": 0.003271918011526776,
            "batch_time": 0.032184602140070316,
            "samples_per_second": 1093683.54785994,
            "samples_per_second_per_gpu": 136710.4434824925,
            "loss_sequences_lower_95": 3.567827469478689,
            "loss_sequences_upper_95": 3.650836065916476,
            "loss_tokens_lower_95": 3.567689541224675,
            "loss_tokens_upper_95": 3.6505370048499617,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.738070036601094,
            "data_time": 0.02370148788798939,
            "batch_time": 0.05330924987792969,
            "samples_per_second": 963403.3237044495,
            "samples_per_second_per_gpu": 120425.41546305618,
            "loss_sequences_lower_95": 4.586658618519607,
            "loss_sequences_upper_95": 4.889078543950053,
            "loss_tokens_lower_95": 4.582367010023987,
            "loss_tokens_upper_95": 4.892415330016497,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.9573695500691732,
            "data_time": 0.0745215192437172,
            "batch_time": 0.1077972799539566,
            "samples_per_second": 752556.4978275306,
            "samples_per_second_per_gpu": 94069.56222844133,
            "loss_sequences_lower_95": 1.7592088731129965,
            "loss_sequences_upper_95": 2.298725503285726,
            "loss_tokens_lower_95": 1.6012951824400161,
            "loss_tokens_upper_95": 2.2327171484629313,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.8451067407925923,
            "data_time": 0.07570350170135498,
            "batch_time": 0.1082567498087883,
            "samples_per_second": 753659.3920174736,
            "samples_per_second_per_gpu": 94207.4240021842,
            "loss_sequences_lower_95": 1.6979275131225586,
            "loss_sequences_upper_95": 2.201363360087077,
            "loss_tokens_lower_95": 1.4352739612707932,
            "loss_tokens_upper_95": 2.119968881499901,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.368138617022575,
            "data_time": 0.003139248449989036,
            "batch_time": 0.03223085529738275,
            "samples_per_second": 1090425.603793684,
            "samples_per_second_per_gpu": 136303.2004742105,
            "loss_sequences_lower_95": 2.3553190655490615,
            "loss_sequences_upper_95": 2.381031445600147,
            "loss_tokens_lower_95": 2.3549515134561396,
            "loss_tokens_upper_95": 2.3813305268547498,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.4526165710841882,
            "data_time": 0.0012569218890280901,
            "batch_time": 0.029967153603842795,
            "samples_per_second": 1106340.711165842,
            "samples_per_second_per_gpu": 138292.58889573024,
            "loss_sequences_lower_95": 0.4988102948707195,
            "loss_sequences_upper_95": 0.5097816920676652,
            "loss_tokens_lower_95": 0.40496105906522967,
            "loss_tokens_upper_95": 0.4121575763042844,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.418397526102742,
            "data_time": 0.03711071237921715,
            "batch_time": 0.06787030771374702,
            "samples_per_second": 977235.5819542425,
            "samples_per_second_per_gpu": 122154.44774428032,
            "loss_sequences_lower_95": 1.3368968483031265,
            "loss_sequences_upper_95": 1.5467738834891733,
            "loss_tokens_lower_95": 1.259848153646793,
            "loss_tokens_upper_95": 1.373184972475858,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.050575736003953,
            "data_time": 0.11374640464782715,
            "batch_time": 0.1489429473876953,
            "samples_per_second": 519512.8721402697,
            "samples_per_second_per_gpu": 64939.109017533716,
            "loss_sequences_lower_95": 3.5644189679944835,
            "loss_sequences_upper_95": 4.616093372654271,
            "loss_tokens_lower_95": 3.4005030690887827,
            "loss_tokens_upper_95": 4.602691603295597,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.2955613525175467,
            "data_time": 0.029520307268415178,
            "batch_time": 0.05940088771638416,
            "samples_per_second": 986645.5425854499,
            "samples_per_second_per_gpu": 123330.69282318123,
            "loss_sequences_lower_95": 1.2351438615380264,
            "loss_sequences_upper_95": 1.4077293326215046,
            "loss_tokens_lower_95": 1.1623229600603184,
            "loss_tokens_upper_95": 1.253565414082212,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.340933810646941,
            "data_time": 0.030728487741379512,
            "batch_time": 0.06112228404907953,
            "samples_per_second": 974938.1617060703,
            "samples_per_second_per_gpu": 121867.27021325879,
            "loss_sequences_lower_95": 1.3061820681502179,
            "loss_sequences_upper_95": 1.4626129150390625,
            "loss_tokens_lower_95": 1.200514616484769,
            "loss_tokens_upper_95": 1.278288265285087,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.2737884466967933,
            "data_time": 0.029296256247020903,
            "batch_time": 0.06251281499862671,
            "samples_per_second": 900012.8533654968,
            "samples_per_second_per_gpu": 112501.6066706871,
            "loss_sequences_lower_95": 1.160057772659674,
            "loss_sequences_upper_95": 1.3536963834995177,
            "loss_tokens_lower_95": 1.1898350422241142,
            "loss_tokens_upper_95": 1.3110904880060827,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.4073286703447017,
            "data_time": 0.031253624530065624,
            "batch_time": 0.0619382517678397,
            "samples_per_second": 950237.6899438429,
            "samples_per_second_per_gpu": 118779.71124298037,
            "loss_sequences_lower_95": 1.3607280265994188,
            "loss_sequences_upper_95": 1.5087113706077018,
            "loss_tokens_lower_95": 1.2690373952515028,
            "loss_tokens_upper_95": 1.3460137673256183,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.187653936954759,
            "data_time": 0.031745698716905385,
            "batch_time": 0.06319169350612311,
            "samples_per_second": 962099.4585659371,
            "samples_per_second_per_gpu": 120262.43232074214,
            "loss_sequences_lower_95": 1.1471832393859485,
            "loss_sequences_upper_95": 1.250290841049289,
            "loss_tokens_lower_95": 1.128545513125183,
            "loss_tokens_upper_95": 1.183723359466401,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.1092784899037058,
            "data_time": 0.031176666418711346,
            "batch_time": 0.06132603543145316,
            "samples_per_second": 977895.0851161696,
            "samples_per_second_per_gpu": 122236.8856395212,
            "loss_sequences_lower_95": 1.0870389240544016,
            "loss_sequences_upper_95": 1.1944953871936332,
            "loss_tokens_lower_95": 0.9983862378268159,
            "loss_tokens_upper_95": 1.0476027303722122,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-16.0/params.txt",
    "uuid": "93705682-6eca-4e3d-86c9-e62c0fe7bd72",
    "creation_date": "2023_12_14-07_11_42"
}