{
    "name": "rpj-d=96_l=8_h=4-1.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 211386240,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 1.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "42277248",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=96_l=8_h=4-1.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 5.543149820963541,
            "data_time": 0.13073554635047913,
            "batch_time": 1.2670944929122925,
            "samples_per_second": 369260.25073499826,
            "samples_per_second_per_gpu": 46157.53134187478,
            "loss_sequences_lower_95": 5.447965927124024,
            "loss_sequences_upper_95": 5.641647872924805,
            "loss_tokens_lower_95": 5.52927012125651,
            "loss_tokens_upper_95": 5.55657667795817,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.38702091510821,
            "data_time": 0.01945252444515956,
            "batch_time": 0.06441812126186922,
            "samples_per_second": 4661868.32987862,
            "samples_per_second_per_gpu": 582733.5412348275,
            "loss_sequences_lower_95": 5.384729507847151,
            "loss_sequences_upper_95": 5.389294304197697,
            "loss_tokens_lower_95": 5.3754558125,
            "loss_tokens_upper_95": 5.398452229166666,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.726752202364863,
            "data_time": 0.09833941608667374,
            "batch_time": 0.14313144236803055,
            "samples_per_second": 4152377.7737434763,
            "samples_per_second_per_gpu": 519047.22171793453,
            "loss_sequences_lower_95": 5.712150106624681,
            "loss_sequences_upper_95": 5.7417587591677295,
            "loss_tokens_lower_95": 5.714833458333334,
            "loss_tokens_upper_95": 5.738886,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.324275449280886,
            "data_time": 0.014035139428941827,
            "batch_time": 0.05791996105721122,
            "samples_per_second": 5392442.791056996,
            "samples_per_second_per_gpu": 674055.3488821245,
            "loss_sequences_lower_95": 5.31558103455219,
            "loss_sequences_upper_95": 5.333075114771263,
            "loss_tokens_lower_95": 5.3130183125,
            "loss_tokens_upper_95": 5.3356430625,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.363246600156889,
            "data_time": 0.09844282269477844,
            "batch_time": 0.14425038546323776,
            "samples_per_second": 3996047.1245729756,
            "samples_per_second_per_gpu": 499505.89057162195,
            "loss_sequences_lower_95": 5.3332513440165155,
            "loss_sequences_upper_95": 5.394634960042484,
            "loss_tokens_lower_95": 5.351827312499999,
            "loss_tokens_upper_95": 5.3748955,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.458552127464704,
            "data_time": 0.03635097543398539,
            "batch_time": 0.07954223205645879,
            "samples_per_second": 4927895.934366259,
            "samples_per_second_per_gpu": 615986.9917957823,
            "loss_sequences_lower_95": 5.434167646437904,
            "loss_sequences_upper_95": 5.482008055810779,
            "loss_tokens_lower_95": 5.446721156250001,
            "loss_tokens_upper_95": 5.47055565625,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.486899437612417,
            "data_time": 0.01330367773771286,
            "batch_time": 0.05584478750824928,
            "samples_per_second": 5232130.506693011,
            "samples_per_second_per_gpu": 654016.3133366264,
            "loss_sequences_lower_95": 5.460483567841199,
            "loss_sequences_upper_95": 5.513811443718112,
            "loss_tokens_lower_95": 5.47401825,
            "loss_tokens_upper_95": 5.50007715625,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.2319508338848335,
            "data_time": 0.01425871723576596,
            "batch_time": 0.05799690045808491,
            "samples_per_second": 5293072.613402122,
            "samples_per_second_per_gpu": 661634.0766752652,
            "loss_sequences_lower_95": 5.2241152139234295,
            "loss_sequences_upper_95": 5.239807918848167,
            "loss_tokens_lower_95": 5.220710125,
            "loss_tokens_upper_95": 5.243582333333333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.518061531268485,
            "data_time": 0.09700985252857208,
            "batch_time": 0.14224082231521606,
            "samples_per_second": 4088940.306023622,
            "samples_per_second_per_gpu": 511117.53825295274,
            "loss_sequences_lower_95": 5.477707144884559,
            "loss_sequences_upper_95": 5.55843848251715,
            "loss_tokens_lower_95": 5.50614534375,
            "loss_tokens_upper_95": 5.53004528125,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.793416990121834,
            "data_time": 0.09941363334655762,
            "batch_time": 0.1443888023495674,
            "samples_per_second": 4271173.6342172045,
            "samples_per_second_per_gpu": 533896.7042771506,
            "loss_sequences_lower_95": 5.779730803599,
            "loss_sequences_upper_95": 5.807149677502779,
            "loss_tokens_lower_95": 5.7813159999999995,
            "loss_tokens_upper_95": 5.805270895833333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.693551619575336,
            "data_time": 0.010733660438965106,
            "batch_time": 0.0540612550645039,
            "samples_per_second": 5391536.904732277,
            "samples_per_second_per_gpu": 673942.1130915346,
            "loss_sequences_lower_95": 5.686987134053378,
            "loss_sequences_upper_95": 5.700044806512951,
            "loss_tokens_lower_95": 5.682061552083334,
            "loss_tokens_upper_95": 5.7054204791666665,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.535658556935391,
            "data_time": 0.02250431627035141,
            "batch_time": 0.06454323083162308,
            "samples_per_second": 5122174.85747767,
            "samples_per_second_per_gpu": 640271.8571847087,
            "loss_sequences_lower_95": 5.526473659656784,
            "loss_sequences_upper_95": 5.545133466931096,
            "loss_tokens_lower_95": 5.523770510416667,
            "loss_tokens_upper_95": 5.547203197916667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.3825308991252285,
            "data_time": 0.10039012879133224,
            "batch_time": 0.14446759223937988,
            "samples_per_second": 4107605.366019821,
            "samples_per_second_per_gpu": 513450.6707524776,
            "loss_sequences_lower_95": 5.3421594708734785,
            "loss_sequences_upper_95": 5.4249800923871705,
            "loss_tokens_lower_95": 5.37099878125,
            "loss_tokens_upper_95": 5.3939071145833335,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.832309031437953,
            "data_time": 0.09569559246301651,
            "batch_time": 0.14013807475566864,
            "samples_per_second": 4186326.704002828,
            "samples_per_second_per_gpu": 523290.8380003535,
            "loss_sequences_lower_95": 5.782297393115135,
            "loss_sequences_upper_95": 5.883618114339359,
            "loss_tokens_lower_95": 5.820750927083333,
            "loss_tokens_upper_95": 5.844460197916667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.113015489144758,
            "data_time": 0.14895756542682648,
            "batch_time": 0.16900669038295746,
            "samples_per_second": 1176025.2336744154,
            "samples_per_second_per_gpu": 147003.15420930192,
            "loss_sequences_lower_95": 7.069022785533559,
            "loss_sequences_upper_95": 7.157060605829413,
            "loss_tokens_lower_95": 7.089838426763361,
            "loss_tokens_upper_95": 7.1363832473754885,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.313413061136407,
            "data_time": 0.09777127206325531,
            "batch_time": 0.13242673128843307,
            "samples_per_second": 3335036.563839935,
            "samples_per_second_per_gpu": 416879.5704799919,
            "loss_sequences_lower_95": 5.226072759600492,
            "loss_sequences_upper_95": 5.405946879484215,
            "loss_tokens_lower_95": 5.301596177083334,
            "loss_tokens_upper_95": 5.325419208333333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.740183790000573,
            "data_time": 0.09507465362548828,
            "batch_time": 0.13170147687196732,
            "samples_per_second": 3709303.0629839567,
            "samples_per_second_per_gpu": 463662.8828729946,
            "loss_sequences_lower_95": 6.694590304331918,
            "loss_sequences_upper_95": 6.784798912511337,
            "loss_tokens_lower_95": 6.729272885416667,
            "loss_tokens_upper_95": 6.750815958333333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.584652486394663,
            "data_time": 0.1753092110157013,
            "batch_time": 0.205137699842453,
            "samples_per_second": 2216112.941079799,
            "samples_per_second_per_gpu": 277014.1176349749,
            "loss_sequences_lower_95": 6.545867106953605,
            "loss_sequences_upper_95": 6.6240387713322875,
            "loss_tokens_lower_95": 6.571590736264088,
            "loss_tokens_upper_95": 6.597574903144212,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.149063892776072,
            "data_time": 0.02702391418543729,
            "batch_time": 0.07165198976343329,
            "samples_per_second": 4503123.366197493,
            "samples_per_second_per_gpu": 562890.4207746866,
            "loss_sequences_lower_95": 5.131238830940215,
            "loss_sequences_upper_95": 5.166486269428322,
            "loss_tokens_lower_95": 5.130979550188719,
            "loss_tokens_upper_95": 5.166726661533257,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.163760313306807,
            "data_time": 0.030070813745260237,
            "batch_time": 0.07406249605119228,
            "samples_per_second": 4406119.41258837,
            "samples_per_second_per_gpu": 550764.9265735463,
            "loss_sequences_lower_95": 5.144660949403132,
            "loss_sequences_upper_95": 5.1714565621265685,
            "loss_tokens_lower_95": 5.151780697114477,
            "loss_tokens_upper_95": 5.174265341687762,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.392849873753161,
            "data_time": 0.05000321898195478,
            "batch_time": 0.09220872322718303,
            "samples_per_second": 4369545.198847481,
            "samples_per_second_per_gpu": 546193.1498559351,
            "loss_sequences_lower_95": 7.925981429167159,
            "loss_sequences_upper_95": 8.204003398824987,
            "loss_tokens_lower_95": 7.25269166638745,
            "loss_tokens_upper_95": 7.451791690865499,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.99482235399882,
            "data_time": 0.046062844494978585,
            "batch_time": 0.08988005667924881,
            "samples_per_second": 4426323.00477532,
            "samples_per_second_per_gpu": 553290.375596915,
            "loss_sequences_lower_95": 7.429661458333333,
            "loss_sequences_upper_95": 7.607066813151042,
            "loss_tokens_lower_95": 6.89310341735456,
            "loss_tokens_upper_95": 7.015028621265723,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.231501613266998,
            "data_time": 0.063910111784935,
            "batch_time": 0.10474229604005814,
            "samples_per_second": 4007335.4652398513,
            "samples_per_second_per_gpu": 500916.9331549814,
            "loss_sequences_lower_95": 6.292638280953848,
            "loss_sequences_upper_95": 6.360984477172574,
            "loss_tokens_lower_95": 6.209580036217856,
            "loss_tokens_upper_95": 6.2465166529538925,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.57174194726077,
            "data_time": 0.3682747185230255,
            "batch_time": 0.4110116511583328,
            "samples_per_second": 2670576.284945124,
            "samples_per_second_per_gpu": 333822.0356181405,
            "loss_sequences_lower_95": 4.556679104891691,
            "loss_sequences_upper_95": 4.674111300381748,
            "loss_tokens_lower_95": 4.545021973886426,
            "loss_tokens_upper_95": 4.596210366698579,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.6111273473622845,
            "data_time": 0.32812850177288055,
            "batch_time": 0.3746677488088608,
            "samples_per_second": 2750555.897016667,
            "samples_per_second_per_gpu": 343819.48712708335,
            "loss_sequences_lower_95": 5.601126310387436,
            "loss_sequences_upper_95": 5.799910004284917,
            "loss_tokens_lower_95": 5.563881495429926,
            "loss_tokens_upper_95": 5.663387065208193,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.393360428810119,
            "data_time": 0.18186084181070328,
            "batch_time": 0.21267760545015335,
            "samples_per_second": 2709153.777533716,
            "samples_per_second_per_gpu": 338644.2221917145,
            "loss_sequences_lower_95": 5.3292933959960935,
            "loss_sequences_upper_95": 5.465613932291667,
            "loss_tokens_lower_95": 5.2958627217484935,
            "loss_tokens_upper_95": 5.495873148273331,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.619413715443457,
            "data_time": 0.028070846013724803,
            "batch_time": 0.07222293168306351,
            "samples_per_second": 4449646.5540955495,
            "samples_per_second_per_gpu": 556205.8192619437,
            "loss_sequences_lower_95": 9.706364375215294,
            "loss_sequences_upper_95": 9.780386169172285,
            "loss_tokens_lower_95": 9.562899546532323,
            "loss_tokens_upper_95": 9.639988286289359,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.404824783705702,
            "data_time": 0.048132552206516264,
            "batch_time": 0.09050335884094238,
            "samples_per_second": 4363845.262201612,
            "samples_per_second_per_gpu": 545480.6577752015,
            "loss_sequences_lower_95": 7.718683210687605,
            "loss_sequences_upper_95": 8.020734875611584,
            "loss_tokens_lower_95": 6.248398339932448,
            "loss_tokens_upper_95": 6.402743610655796,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.033071237212562,
            "data_time": 0.07859741747379304,
            "batch_time": 0.12084693014621735,
            "samples_per_second": 4369912.454358813,
            "samples_per_second_per_gpu": 546239.0567948517,
            "loss_sequences_lower_95": 6.846381198505493,
            "loss_sequences_upper_95": 7.184625035829511,
            "loss_tokens_lower_95": 5.921775468775209,
            "loss_tokens_upper_95": 6.093493805965633,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.280899725003874,
            "data_time": 0.3287169635295868,
            "batch_time": 0.3709720969200134,
            "samples_per_second": 2621873.4953073105,
            "samples_per_second_per_gpu": 327734.1869134138,
            "loss_sequences_lower_95": 6.249749393550228,
            "loss_sequences_upper_95": 6.312051544886201,
            "loss_tokens_lower_95": 6.249795504356628,
            "loss_tokens_upper_95": 6.312687076934396,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.222611727714539,
            "data_time": 0.3370330184698105,
            "batch_time": 0.36213114857673645,
            "samples_per_second": 1562455.5123032094,
            "samples_per_second_per_gpu": 195306.93903790117,
            "loss_sequences_lower_95": 5.149515197753907,
            "loss_sequences_upper_95": 5.599646911621094,
            "loss_tokens_lower_95": 4.959695221656977,
            "loss_tokens_upper_95": 5.470871097243963,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.838044721248084,
            "data_time": 0.05510447919368744,
            "batch_time": 0.09830311685800552,
            "samples_per_second": 4433737.757195982,
            "samples_per_second_per_gpu": 554217.2196494978,
            "loss_sequences_lower_95": 4.788994867799226,
            "loss_sequences_upper_95": 4.888194426396974,
            "loss_tokens_lower_95": 4.787620645950614,
            "loss_tokens_upper_95": 4.88821502966671,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.291765532075724,
            "data_time": 0.08034054040908814,
            "batch_time": 0.12404323518276214,
            "samples_per_second": 4390540.53071157,
            "samples_per_second_per_gpu": 548817.5663389462,
            "loss_sequences_lower_95": 5.238482910956056,
            "loss_sequences_upper_95": 5.3438570739596125,
            "loss_tokens_lower_95": 5.23709375879786,
            "loss_tokens_upper_95": 5.344603102525466,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.718854499719348,
            "data_time": 0.05132126063108444,
            "batch_time": 0.09266661666333675,
            "samples_per_second": 4247458.471986648,
            "samples_per_second_per_gpu": 530932.308998331,
            "loss_sequences_lower_95": 5.8796609075336645,
            "loss_sequences_upper_95": 5.985656578885847,
            "loss_tokens_lower_95": 5.688988558067786,
            "loss_tokens_upper_95": 5.748408214614257,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.052356024742126,
            "data_time": 0.20895662158727646,
            "batch_time": 0.25456026941537857,
            "samples_per_second": 3676109.996481612,
            "samples_per_second_per_gpu": 459513.7495602015,
            "loss_sequences_lower_95": 7.7654099243164065,
            "loss_sequences_upper_95": 8.27679375,
            "loss_tokens_lower_95": 6.810517807562252,
            "loss_tokens_upper_95": 7.151925463269358,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.5861122608184814,
            "data_time": 0.15693996846675873,
            "batch_time": 0.1760593056678772,
            "samples_per_second": 756857.1124871632,
            "samples_per_second_per_gpu": 94607.1390608954,
            "loss_sequences_lower_95": 5.252745580673218,
            "loss_sequences_upper_95": 6.10882248878479,
            "loss_tokens_lower_95": 4.969848913433909,
            "loss_tokens_upper_95": 5.987720287805315,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.212660515445402,
            "data_time": 0.34329164028167725,
            "batch_time": 0.37894177436828613,
            "samples_per_second": 1970448.6430559643,
            "samples_per_second_per_gpu": 246306.08038199553,
            "loss_sequences_lower_95": 7.4188007924748565,
            "loss_sequences_upper_95": 8.093999069038478,
            "loss_tokens_lower_95": 5.894566178348717,
            "loss_tokens_upper_95": 6.349206143866263,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.867126683993025,
            "data_time": 0.05403149790234036,
            "batch_time": 0.09868152770731184,
            "samples_per_second": 4447191.552741299,
            "samples_per_second_per_gpu": 555898.9440926624,
            "loss_sequences_lower_95": 4.837417019918632,
            "loss_sequences_upper_95": 4.897086695199869,
            "loss_tokens_lower_95": 4.837752246648373,
            "loss_tokens_upper_95": 4.896558758430264,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.194166094544743,
            "data_time": 0.032075070909091404,
            "batch_time": 0.07522170387563251,
            "samples_per_second": 4399309.827054559,
            "samples_per_second_per_gpu": 549913.7283818199,
            "loss_sequences_lower_95": 8.229973816830002,
            "loss_sequences_upper_95": 8.389126214401804,
            "loss_tokens_lower_95": 8.104111370847384,
            "loss_tokens_upper_95": 8.258860942911502,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.625594929461077,
            "data_time": 0.1898769736289978,
            "batch_time": 0.22026846557855606,
            "samples_per_second": 1729099.0800569307,
            "samples_per_second_per_gpu": 216137.38500711633,
            "loss_sequences_lower_95": 4.537201754950779,
            "loss_sequences_upper_95": 4.916915681161287,
            "loss_tokens_lower_95": 4.4089516949366905,
            "loss_tokens_upper_95": 4.749335975893993,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.0494056899620965,
            "data_time": 0.0861221730709076,
            "batch_time": 0.13071830272674562,
            "samples_per_second": 4465388.803500531,
            "samples_per_second_per_gpu": 558173.6004375664,
            "loss_sequences_lower_95": 5.08961360844083,
            "loss_sequences_upper_95": 5.229735856029869,
            "loss_tokens_lower_95": 4.96505711963145,
            "loss_tokens_upper_95": 5.1265057185293745,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.712005748981383,
            "data_time": 0.3348459154367447,
            "batch_time": 0.3691922277212143,
            "samples_per_second": 2080017.796617493,
            "samples_per_second_per_gpu": 260002.22457718663,
            "loss_sequences_lower_95": 6.529362264493616,
            "loss_sequences_upper_95": 7.080410171136623,
            "loss_tokens_lower_95": 6.546105767852136,
            "loss_tokens_upper_95": 6.912974421390701,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.727798064748486,
            "data_time": 0.028854577145232676,
            "batch_time": 0.07293369329087787,
            "samples_per_second": 4440615.772664149,
            "samples_per_second_per_gpu": 555076.9715830187,
            "loss_sequences_lower_95": 4.718555771701215,
            "loss_sequences_upper_95": 4.737031439491023,
            "loss_tokens_lower_95": 4.718630327628025,
            "loss_tokens_upper_95": 4.73723098135252,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.14038978965537,
            "data_time": 0.3144485056400299,
            "batch_time": 0.3409152328968048,
            "samples_per_second": 1818602.8881401396,
            "samples_per_second_per_gpu": 227325.36101751745,
            "loss_sequences_lower_95": 6.004096718204831,
            "loss_sequences_upper_95": 6.345790485270973,
            "loss_tokens_lower_95": 5.892533332567011,
            "loss_tokens_upper_95": 6.3042424118994695,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.383085228949473,
            "data_time": 0.02266588111718496,
            "batch_time": 0.06690596203009287,
            "samples_per_second": 4507656.016366794,
            "samples_per_second_per_gpu": 563457.0020458492,
            "loss_sequences_lower_95": 6.86591645374738,
            "loss_sequences_upper_95": 6.90936853052935,
            "loss_tokens_lower_95": 6.317369426982592,
            "loss_tokens_upper_95": 6.356678433268859,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.599784519672394,
            "data_time": 0.10153466463088989,
            "batch_time": 0.1459346003830433,
            "samples_per_second": 4419028.744987989,
            "samples_per_second_per_gpu": 552378.5931234986,
            "loss_sequences_lower_95": 6.701219177246093,
            "loss_sequences_upper_95": 6.931799389648438,
            "loss_tokens_lower_95": 6.478587130069364,
            "loss_tokens_upper_95": 6.681602767153664,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.9918612563091775,
            "data_time": 0.3415544331073761,
            "batch_time": 0.38409194350242615,
            "samples_per_second": 2674341.350758572,
            "samples_per_second_per_gpu": 334292.6688448215,
            "loss_sequences_lower_95": 4.866518966011379,
            "loss_sequences_upper_95": 5.117156902810802,
            "loss_tokens_lower_95": 4.867082466457201,
            "loss_tokens_upper_95": 5.115034564474354,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.919808635205934,
            "data_time": 0.06689732770125072,
            "batch_time": 0.10662927230199178,
            "samples_per_second": 3959258.1605496085,
            "samples_per_second_per_gpu": 494907.27006870107,
            "loss_sequences_lower_95": 8.794316757664536,
            "loss_sequences_upper_95": 9.044648955374052,
            "loss_tokens_lower_95": 8.793618644945548,
            "loss_tokens_upper_95": 9.047444938890862,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.6085108372370405,
            "data_time": 0.0685797706246376,
            "batch_time": 0.1125302364428838,
            "samples_per_second": 4510532.713124464,
            "samples_per_second_per_gpu": 563816.589140558,
            "loss_sequences_lower_95": 4.69789013671875,
            "loss_sequences_upper_95": 4.779991959635416,
            "loss_tokens_lower_95": 4.554159804546819,
            "loss_tokens_upper_95": 4.64966165372399,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.212532079787481,
            "data_time": 0.3496052473783493,
            "batch_time": 0.3903281092643738,
            "samples_per_second": 2000084.6802881148,
            "samples_per_second_per_gpu": 250010.58503601435,
            "loss_sequences_lower_95": 5.847065109979538,
            "loss_sequences_upper_95": 6.579789457775298,
            "loss_tokens_lower_95": 5.842375313895089,
            "loss_tokens_upper_95": 6.576990719749814,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.648152083158493,
            "data_time": 0.162715345621109,
            "batch_time": 0.18068131804466248,
            "samples_per_second": 972056.8887944989,
            "samples_per_second_per_gpu": 121507.11109931236,
            "loss_sequences_lower_95": 6.4727172493934635,
            "loss_sequences_upper_95": 7.772854018211365,
            "loss_tokens_lower_95": 6.239333030464723,
            "loss_tokens_upper_95": 6.823121287552351,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.148992574691772,
            "data_time": 0.10700813680887222,
            "batch_time": 0.15185973420739174,
            "samples_per_second": 4068230.1650272366,
            "samples_per_second_per_gpu": 508528.77062840457,
            "loss_sequences_lower_95": 7.250379736328125,
            "loss_sequences_upper_95": 7.570439660644531,
            "loss_tokens_lower_95": 6.991253569162437,
            "loss_tokens_upper_95": 7.274368931597663,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.175158227920532,
            "data_time": 0.09930837899446487,
            "batch_time": 0.14373169839382172,
            "samples_per_second": 4301846.149928129,
            "samples_per_second_per_gpu": 537730.7687410162,
            "loss_sequences_lower_95": 7.5062188720703125,
            "loss_sequences_upper_95": 7.760717846679687,
            "loss_tokens_lower_95": 7.0534385562719075,
            "loss_tokens_upper_95": 7.257211415267432,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.452480438608393,
            "data_time": 0.04099853212634722,
            "batch_time": 0.08508099367221196,
            "samples_per_second": 4450115.327289506,
            "samples_per_second_per_gpu": 556264.4159111882,
            "loss_sequences_lower_95": 4.434189926183268,
            "loss_sequences_upper_95": 4.471063885127703,
            "loss_tokens_lower_95": 4.433679710743694,
            "loss_tokens_upper_95": 4.470983407931403,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.039156556312573,
            "data_time": 0.12561969955762228,
            "batch_time": 0.1657224694887797,
            "samples_per_second": 3924263.3938427926,
            "samples_per_second_per_gpu": 490532.9242303491,
            "loss_sequences_lower_95": 4.965105460499472,
            "loss_sequences_upper_95": 5.111631603797643,
            "loss_tokens_lower_95": 4.965355292113696,
            "loss_tokens_upper_95": 5.111832626038066,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 10.015592454910278,
            "data_time": 0.09440452232956886,
            "batch_time": 0.13940979912877083,
            "samples_per_second": 4262530.561109225,
            "samples_per_second_per_gpu": 532816.3201386532,
            "loss_sequences_lower_95": 9.93249365234375,
            "loss_sequences_upper_95": 10.1021736328125,
            "loss_tokens_lower_95": 9.930126635742187,
            "loss_tokens_upper_95": 10.099958935546875,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.029413302619162,
            "data_time": 0.027845104535420735,
            "batch_time": 0.0719905939130556,
            "samples_per_second": 4423261.106548535,
            "samples_per_second_per_gpu": 552907.6383185668,
            "loss_sequences_lower_95": 7.693980383751183,
            "loss_sequences_upper_95": 7.771866222948202,
            "loss_tokens_lower_95": 6.933717654257134,
            "loss_tokens_upper_95": 6.992850094511155,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.147097737931493,
            "data_time": 0.2060762814113072,
            "batch_time": 0.2384938257081168,
            "samples_per_second": 1968873.7444238286,
            "samples_per_second_per_gpu": 246109.21805297857,
            "loss_sequences_lower_95": 5.01260185811057,
            "loss_sequences_upper_95": 5.278729714920272,
            "loss_tokens_lower_95": 5.009767868269735,
            "loss_tokens_upper_95": 5.278939557431349,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.110869057973226,
            "data_time": 0.18513952940702438,
            "batch_time": 0.23033074289560318,
            "samples_per_second": 3801049.360755273,
            "samples_per_second_per_gpu": 475131.17009440914,
            "loss_sequences_lower_95": 5.019973312078738,
            "loss_sequences_upper_95": 5.200234351064645,
            "loss_tokens_lower_95": 5.019418729894301,
            "loss_tokens_upper_95": 5.201511398016238,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.964946852013474,
            "data_time": 0.029234659392386675,
            "batch_time": 0.07289204746484756,
            "samples_per_second": 4458661.751099716,
            "samples_per_second_per_gpu": 557332.7188874645,
            "loss_sequences_lower_95": 7.578810232975542,
            "loss_sequences_upper_95": 7.673741050317863,
            "loss_tokens_lower_95": 6.869725438327162,
            "loss_tokens_upper_95": 6.944948497131282,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.833997173914834,
            "data_time": 0.3728807270526886,
            "batch_time": 0.4106895476579666,
            "samples_per_second": 1959964.1024343641,
            "samples_per_second_per_gpu": 244995.51280429552,
            "loss_sequences_lower_95": 4.743078694015584,
            "loss_sequences_upper_95": 4.924929906451513,
            "loss_tokens_lower_95": 4.74079685917607,
            "loss_tokens_upper_95": 4.923987575308987,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.078359783242602,
            "data_time": 0.04437941083541283,
            "batch_time": 0.08901340686357938,
            "samples_per_second": 4414166.898247767,
            "samples_per_second_per_gpu": 551770.8622809709,
            "loss_sequences_lower_95": 9.05765641425363,
            "loss_sequences_upper_95": 9.099126200544724,
            "loss_tokens_lower_95": 9.057142987265864,
            "loss_tokens_upper_95": 9.09912109375,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.0823663947651685,
            "data_time": 0.34006059169769287,
            "batch_time": 0.38045696914196014,
            "samples_per_second": 2639682.2010575873,
            "samples_per_second_per_gpu": 329960.2751321984,
            "loss_sequences_lower_95": 4.932848143346101,
            "loss_sequences_upper_95": 5.229489743130878,
            "loss_tokens_lower_95": 4.932713947481322,
            "loss_tokens_upper_95": 5.229600228615178,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.599762066205343,
            "data_time": 0.2833528071641922,
            "batch_time": 0.3039790391921997,
            "samples_per_second": 1164631.954318553,
            "samples_per_second_per_gpu": 145578.99428981912,
            "loss_sequences_lower_95": 7.299551035563152,
            "loss_sequences_upper_95": 8.136831067403158,
            "loss_tokens_lower_95": 6.95366784201728,
            "loss_tokens_upper_95": 8.15304684109158,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.448017485936483,
            "data_time": 0.28679201006889343,
            "batch_time": 0.3070719987154007,
            "samples_per_second": 1298694.237052331,
            "samples_per_second_per_gpu": 162336.77963154137,
            "loss_sequences_lower_95": 7.237273457845052,
            "loss_sequences_upper_95": 8.294669303894043,
            "loss_tokens_lower_95": 6.777287086743987,
            "loss_tokens_upper_95": 8.060158153062456,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.19468466880922,
            "data_time": 0.04339348631245749,
            "batch_time": 0.08622547771249499,
            "samples_per_second": 4366595.048222655,
            "samples_per_second_per_gpu": 545824.3810278319,
            "loss_sequences_lower_95": 8.158290052006627,
            "loss_sequences_upper_95": 8.229813618947901,
            "loss_tokens_lower_95": 8.159389252922496,
            "loss_tokens_upper_95": 8.230473007179675,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.377104924689204,
            "data_time": 0.02288384514344311,
            "batch_time": 0.06739296767388889,
            "samples_per_second": 4467518.127894876,
            "samples_per_second_per_gpu": 558439.7659868594,
            "loss_sequences_lower_95": 6.906213149340935,
            "loss_sequences_upper_95": 6.937235447262019,
            "loss_tokens_lower_95": 6.313510803833504,
            "loss_tokens_upper_95": 6.343947620857941,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.560615592115507,
            "data_time": 0.3195270597934723,
            "batch_time": 0.410891130566597,
            "samples_per_second": 2357330.6737725907,
            "samples_per_second_per_gpu": 294666.33422157384,
            "loss_sequences_lower_95": 4.380627104992002,
            "loss_sequences_upper_95": 4.701781109186608,
            "loss_tokens_lower_95": 4.4357088198154635,
            "loss_tokens_upper_95": 4.624164656449583,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.584145494409509,
            "data_time": 0.2291857898235321,
            "batch_time": 0.27696533501148224,
            "samples_per_second": 1154855.7053836258,
            "samples_per_second_per_gpu": 144356.96317295323,
            "loss_sequences_lower_95": 5.269303440403293,
            "loss_sequences_upper_95": 5.914057695543444,
            "loss_tokens_lower_95": 5.116317240397135,
            "loss_tokens_upper_95": 6.063991782105999,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.540476490811604,
            "data_time": 0.33690422773361206,
            "batch_time": 0.3709537386894226,
            "samples_per_second": 1914276.6738094203,
            "samples_per_second_per_gpu": 239284.58422617754,
            "loss_sequences_lower_95": 4.418698678365568,
            "loss_sequences_upper_95": 4.663629410906536,
            "loss_tokens_lower_95": 4.435552377917367,
            "loss_tokens_upper_95": 4.598103382569386,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.514832179720809,
            "data_time": 0.3275417983531952,
            "batch_time": 0.3648810535669327,
            "samples_per_second": 2466674.8169400888,
            "samples_per_second_per_gpu": 308334.3521175111,
            "loss_sequences_lower_95": 4.492481641071598,
            "loss_sequences_upper_95": 4.708955922940882,
            "loss_tokens_lower_95": 4.427355248402135,
            "loss_tokens_upper_95": 4.557615286369766,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.624112556620342,
            "data_time": 0.3542516529560089,
            "batch_time": 0.38980770111083984,
            "samples_per_second": 2184807.875570381,
            "samples_per_second_per_gpu": 273100.98444629763,
            "loss_sequences_lower_95": 4.258179436660395,
            "loss_sequences_upper_95": 4.556086917039825,
            "loss_tokens_lower_95": 4.50060972682082,
            "loss_tokens_upper_95": 4.715645918316921,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.532326358120616,
            "data_time": 0.3127514272928238,
            "batch_time": 0.3470347970724106,
            "samples_per_second": 2431504.0361147914,
            "samples_per_second_per_gpu": 303938.0045143489,
            "loss_sequences_lower_95": 4.5303794302591465,
            "loss_sequences_upper_95": 4.742085666191287,
            "loss_tokens_lower_95": 4.452058481947284,
            "loss_tokens_upper_95": 4.572209723641939,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.314204390744985,
            "data_time": 0.30672362446784973,
            "batch_time": 0.34114621579647064,
            "samples_per_second": 2279234.934187075,
            "samples_per_second_per_gpu": 284904.3667733844,
            "loss_sequences_lower_95": 4.133470087466033,
            "loss_sequences_upper_95": 4.285607587920953,
            "loss_tokens_lower_95": 4.276586572736912,
            "loss_tokens_upper_95": 4.375353619022375,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.4613343157419343,
            "data_time": 0.32701556384563446,
            "batch_time": 0.3627510964870453,
            "samples_per_second": 2084179.343481958,
            "samples_per_second_per_gpu": 260522.41793524474,
            "loss_sequences_lower_95": 3.4239691106284535,
            "loss_sequences_upper_95": 3.6132604552478327,
            "loss_tokens_lower_95": 3.400339854345615,
            "loss_tokens_upper_95": 3.4868892864206598,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-1.0/params.txt",
    "uuid": "24419526-0d05-4b07-a0ec-466d61aa4d28",
    "creation_date": "2023_12_13-16_17_40"
}