{
    "name": "rpj-d=96_l=8_h=4-2.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 422772480,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 2.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "84554496",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=96_l=8_h=4-2.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 5.07867206732432,
            "data_time": 0.13472303748130798,
            "batch_time": 1.2518858909606934,
            "samples_per_second": 373023.4388918712,
            "samples_per_second_per_gpu": 46627.9298614839,
            "loss_sequences_lower_95": 4.99710075378418,
            "loss_sequences_upper_95": 5.160491154988606,
            "loss_tokens_lower_95": 5.06503849029541,
            "loss_tokens_upper_95": 5.092397715250651,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.107841900656407,
            "data_time": 0.01893997636139897,
            "batch_time": 0.06405745285035615,
            "samples_per_second": 4678719.8804568425,
            "samples_per_second_per_gpu": 584839.9850571053,
            "loss_sequences_lower_95": 5.105598264391346,
            "loss_sequences_upper_95": 5.110055357268909,
            "loss_tokens_lower_95": 5.096352447916667,
            "loss_tokens_upper_95": 5.119229541666667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.662088379567983,
            "data_time": 0.09795093536376953,
            "batch_time": 0.14270929247140884,
            "samples_per_second": 4129737.607457373,
            "samples_per_second_per_gpu": 516217.2009321716,
            "loss_sequences_lower_95": 4.64136569276148,
            "loss_sequences_upper_95": 4.6827734250438455,
            "loss_tokens_lower_95": 4.64999053125,
            "loss_tokens_upper_95": 4.67458253125,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.0012529056588395,
            "data_time": 0.013427185384850753,
            "batch_time": 0.05757577952585722,
            "samples_per_second": 5345473.361457412,
            "samples_per_second_per_gpu": 668184.1701821765,
            "loss_sequences_lower_95": 4.992455702319588,
            "loss_sequences_upper_95": 5.010111579816366,
            "loss_tokens_lower_95": 4.989947479166667,
            "loss_tokens_upper_95": 5.012540989583333,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.08328277953045,
            "data_time": 0.09468542039394379,
            "batch_time": 0.1393745094537735,
            "samples_per_second": 4096213.0757643804,
            "samples_per_second_per_gpu": 512026.63447054755,
            "loss_sequences_lower_95": 5.053609763337736,
            "loss_sequences_upper_95": 5.113209201167897,
            "loss_tokens_lower_95": 5.071801614583333,
            "loss_tokens_upper_95": 5.0947554791666665,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.071878214744708,
            "data_time": 0.03659649193286896,
            "batch_time": 0.07953422019879024,
            "samples_per_second": 4876577.92049164,
            "samples_per_second_per_gpu": 609572.240061455,
            "loss_sequences_lower_95": 5.042726601004844,
            "loss_sequences_upper_95": 5.099579276493987,
            "loss_tokens_lower_95": 5.059773302083333,
            "loss_tokens_upper_95": 5.083793854166666,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.4835668534648665,
            "data_time": 0.012563315778970718,
            "batch_time": 0.055142080038785936,
            "samples_per_second": 5228466.582253559,
            "samples_per_second_per_gpu": 653558.3227816948,
            "loss_sequences_lower_95": 4.457033143335459,
            "loss_sequences_upper_95": 4.5104650231186225,
            "loss_tokens_lower_95": 4.470558041666666,
            "loss_tokens_upper_95": 4.497033770833333,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.027898137856528,
            "data_time": 0.013284409516736082,
            "batch_time": 0.05663956937037016,
            "samples_per_second": 5353259.943111961,
            "samples_per_second_per_gpu": 669157.4928889951,
            "loss_sequences_lower_95": 5.020451263907068,
            "loss_sequences_upper_95": 5.035377934800393,
            "loss_tokens_lower_95": 5.016901739583333,
            "loss_tokens_upper_95": 5.039330010416666,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.220437298945295,
            "data_time": 0.09277985990047455,
            "batch_time": 0.13711097091436386,
            "samples_per_second": 4175899.6770508075,
            "samples_per_second_per_gpu": 521987.45963135094,
            "loss_sequences_lower_95": 5.180116855032075,
            "loss_sequences_upper_95": 5.2603634842043006,
            "loss_tokens_lower_95": 5.208647708333333,
            "loss_tokens_upper_95": 5.232506645833333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.596667935254546,
            "data_time": 0.09757713973522186,
            "batch_time": 0.14286987483501434,
            "samples_per_second": 4134845.5896172393,
            "samples_per_second_per_gpu": 516855.6987021549,
            "loss_sequences_lower_95": 5.576456170967917,
            "loss_sequences_upper_95": 5.615263047048696,
            "loss_tokens_lower_95": 5.58464825,
            "loss_tokens_upper_95": 5.6082946145833334,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.307105106802302,
            "data_time": 0.010152722227162328,
            "batch_time": 0.05352672728998908,
            "samples_per_second": 5405059.69995559,
            "samples_per_second_per_gpu": 675632.4624944488,
            "loss_sequences_lower_95": 5.301001040132075,
            "loss_sequences_upper_95": 5.313324410395539,
            "loss_tokens_lower_95": 5.295247489583334,
            "loss_tokens_upper_95": 5.31898153125,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.199069978594035,
            "data_time": 0.022897025942802428,
            "batch_time": 0.07414651513099671,
            "samples_per_second": 5081010.51239865,
            "samples_per_second_per_gpu": 635126.3140498312,
            "loss_sequences_lower_95": 5.189767864057294,
            "loss_sequences_upper_95": 5.208390882461083,
            "loss_tokens_lower_95": 5.187376125,
            "loss_tokens_upper_95": 5.2106758541666665,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.166569686564664,
            "data_time": 0.0959634929895401,
            "batch_time": 0.14656062424182892,
            "samples_per_second": 4146568.692364414,
            "samples_per_second_per_gpu": 518321.0865455518,
            "loss_sequences_lower_95": 5.129928533155585,
            "loss_sequences_upper_95": 5.205481217018731,
            "loss_tokens_lower_95": 5.155029083333334,
            "loss_tokens_upper_95": 5.177859270833333,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.326212673226102,
            "data_time": 0.09754149615764618,
            "batch_time": 0.14169534295797348,
            "samples_per_second": 4113384.803783149,
            "samples_per_second_per_gpu": 514173.1004728936,
            "loss_sequences_lower_95": 5.276638340221646,
            "loss_sequences_upper_95": 5.376084225532237,
            "loss_tokens_lower_95": 5.314632125,
            "loss_tokens_upper_95": 5.3384729375,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.373661052096974,
            "data_time": 0.14047887921333313,
            "batch_time": 0.1601858288049698,
            "samples_per_second": 1127060.9074618025,
            "samples_per_second_per_gpu": 140882.6134327253,
            "loss_sequences_lower_95": 6.325490379333496,
            "loss_sequences_upper_95": 6.422142132845792,
            "loss_tokens_lower_95": 6.351201473582875,
            "loss_tokens_upper_95": 6.396297229420055,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.854997056566244,
            "data_time": 0.09759952872991562,
            "batch_time": 0.13269168138504028,
            "samples_per_second": 3360011.911104383,
            "samples_per_second_per_gpu": 420001.48888804787,
            "loss_sequences_lower_95": 4.758992622753622,
            "loss_sequences_upper_95": 4.9529676609762205,
            "loss_tokens_lower_95": 4.843175135416667,
            "loss_tokens_upper_95": 4.866833447916666,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.589487396632776,
            "data_time": 0.0971624106168747,
            "batch_time": 0.1336444765329361,
            "samples_per_second": 3716507.104483837,
            "samples_per_second_per_gpu": 464563.3880604796,
            "loss_sequences_lower_95": 6.539909101098697,
            "loss_sequences_upper_95": 6.637987314994229,
            "loss_tokens_lower_95": 6.5783233125,
            "loss_tokens_upper_95": 6.600385875,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.994872183096214,
            "data_time": 0.17961528897285461,
            "batch_time": 0.21638651192188263,
            "samples_per_second": 2157051.053710444,
            "samples_per_second_per_gpu": 269631.3817138055,
            "loss_sequences_lower_95": 5.944927203068968,
            "loss_sequences_upper_95": 6.040755487660893,
            "loss_tokens_lower_95": 5.981528723044473,
            "loss_tokens_upper_95": 6.007647605020492,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.078048946952195,
            "data_time": 0.028097476200623944,
            "batch_time": 0.07254488603635267,
            "samples_per_second": 4508242.190824243,
            "samples_per_second_per_gpu": 563530.2738530304,
            "loss_sequences_lower_95": 5.06283821526492,
            "loss_sequences_upper_95": 5.093115060510433,
            "loss_tokens_lower_95": 5.062806808369534,
            "loss_tokens_upper_95": 5.093068701262284,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.908302531444654,
            "data_time": 0.029766809567809104,
            "batch_time": 0.07365599572658539,
            "samples_per_second": 4439886.38405208,
            "samples_per_second_per_gpu": 554985.79800651,
            "loss_sequences_lower_95": 4.888567187733394,
            "loss_sequences_upper_95": 4.914666914583749,
            "loss_tokens_lower_95": 4.896707170101977,
            "loss_tokens_upper_95": 4.918660288930183,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.971732132604902,
            "data_time": 0.05297946433226267,
            "batch_time": 0.0949533846643236,
            "samples_per_second": 4314417.3979104385,
            "samples_per_second_per_gpu": 539302.1747388048,
            "loss_sequences_lower_95": 7.415366513086029,
            "loss_sequences_upper_95": 7.676437488928909,
            "loss_tokens_lower_95": 6.841287850231564,
            "loss_tokens_upper_95": 7.038278190243775,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.798502702713012,
            "data_time": 0.03853068997462591,
            "batch_time": 0.08224350959062576,
            "samples_per_second": 4649170.491568663,
            "samples_per_second_per_gpu": 581146.3114460829,
            "loss_sequences_lower_95": 7.1388435546875,
            "loss_sequences_upper_95": 7.3120813639322915,
            "loss_tokens_lower_95": 6.701397491647012,
            "loss_tokens_upper_95": 6.826460495283019,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.392708022455269,
            "data_time": 0.06543729702631633,
            "batch_time": 0.10509398331244786,
            "samples_per_second": 4037802.1251397473,
            "samples_per_second_per_gpu": 504725.2656424684,
            "loss_sequences_lower_95": 5.484513423477185,
            "loss_sequences_upper_95": 5.551763985426342,
            "loss_tokens_lower_95": 5.370189639596037,
            "loss_tokens_upper_95": 5.40419124378892,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.021911699121649,
            "data_time": 0.33895885944366455,
            "batch_time": 0.3807879537343979,
            "samples_per_second": 2483619.2610572926,
            "samples_per_second_per_gpu": 310452.4076321616,
            "loss_sequences_lower_95": 4.012577313509854,
            "loss_sequences_upper_95": 4.147241051413796,
            "loss_tokens_lower_95": 3.992457343419329,
            "loss_tokens_upper_95": 4.044963689156505,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.22445332274145,
            "data_time": 0.35018837451934814,
            "batch_time": 0.3946259170770645,
            "samples_per_second": 2620960.759100827,
            "samples_per_second_per_gpu": 327620.09488760337,
            "loss_sequences_lower_95": 5.275315414740115,
            "loss_sequences_upper_95": 5.4930890889070465,
            "loss_tokens_lower_95": 5.169481978884564,
            "loss_tokens_upper_95": 5.275642084884902,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.12878581682841,
            "data_time": 0.18198208510875702,
            "batch_time": 0.2136412113904953,
            "samples_per_second": 2239448.504723411,
            "samples_per_second_per_gpu": 279931.0630904264,
            "loss_sequences_lower_95": 5.127036112467448,
            "loss_sequences_upper_95": 5.234296142578125,
            "loss_tokens_lower_95": 5.008782445957103,
            "loss_tokens_upper_95": 5.23730017587782,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.747130525528073,
            "data_time": 0.024271064065396787,
            "batch_time": 0.068505197763443,
            "samples_per_second": 4499974.797019771,
            "samples_per_second_per_gpu": 562496.8496274714,
            "loss_sequences_lower_95": 9.847844382812115,
            "loss_sequences_upper_95": 9.927553950500714,
            "loss_tokens_lower_95": 9.685408815220507,
            "loss_tokens_upper_95": 9.769264033309044,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.910687909383164,
            "data_time": 0.04425541907548904,
            "batch_time": 0.08641860038042068,
            "samples_per_second": 4425533.278483076,
            "samples_per_second_per_gpu": 553191.6598103845,
            "loss_sequences_lower_95": 7.024957090435606,
            "loss_sequences_upper_95": 7.298111074942129,
            "loss_tokens_lower_95": 5.76703169127143,
            "loss_tokens_upper_95": 5.91128465200315,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.621981795330504,
            "data_time": 0.08028227984905242,
            "batch_time": 0.12236404716968537,
            "samples_per_second": 4349995.259678958,
            "samples_per_second_per_gpu": 543749.4074598697,
            "loss_sequences_lower_95": 6.340762839463791,
            "loss_sequences_upper_95": 6.64637869877213,
            "loss_tokens_lower_95": 5.5164200838476125,
            "loss_tokens_upper_95": 5.6820721370553,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.632184936575694,
            "data_time": 0.404187336564064,
            "batch_time": 0.445582777261734,
            "samples_per_second": 1979687.381928833,
            "samples_per_second_per_gpu": 247460.92274110412,
            "loss_sequences_lower_95": 5.588325242691388,
            "loss_sequences_upper_95": 5.675318525479809,
            "loss_tokens_lower_95": 5.5888828643380775,
            "loss_tokens_upper_95": 5.67636501364512,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.119220275878906,
            "data_time": 0.2964302748441696,
            "batch_time": 0.3230549693107605,
            "samples_per_second": 1684726.9799247766,
            "samples_per_second_per_gpu": 210590.87249059707,
            "loss_sequences_lower_95": 5.056304718017578,
            "loss_sequences_upper_95": 5.520399108886719,
            "loss_tokens_lower_95": 4.8642987464536285,
            "loss_tokens_upper_95": 5.36433164429366,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.831065150830006,
            "data_time": 0.06299864314496517,
            "batch_time": 0.10614166595041752,
            "samples_per_second": 4311113.503068577,
            "samples_per_second_per_gpu": 538889.1878835721,
            "loss_sequences_lower_95": 4.791762890225179,
            "loss_sequences_upper_95": 4.8719380092718465,
            "loss_tokens_lower_95": 4.790747957415078,
            "loss_tokens_upper_95": 4.871375661203781,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.364190042653502,
            "data_time": 0.07757512629032134,
            "batch_time": 0.12093704640865326,
            "samples_per_second": 4386930.5327820415,
            "samples_per_second_per_gpu": 548366.3165977552,
            "loss_sequences_lower_95": 5.3172499148207155,
            "loss_sequences_upper_95": 5.411199386309249,
            "loss_tokens_lower_95": 5.315806745799422,
            "loss_tokens_upper_95": 5.411152747651771,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.4472285295077585,
            "data_time": 0.05139266140758991,
            "batch_time": 0.0929221548140049,
            "samples_per_second": 4222294.299334417,
            "samples_per_second_per_gpu": 527786.7874168021,
            "loss_sequences_lower_95": 5.673304869742077,
            "loss_sequences_upper_95": 5.785140536269893,
            "loss_tokens_lower_95": 5.408602501392401,
            "loss_tokens_upper_95": 5.468067371836592,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.769413605690002,
            "data_time": 0.1762823536992073,
            "batch_time": 0.22260957211256027,
            "samples_per_second": 3579086.3897471908,
            "samples_per_second_per_gpu": 447385.79871839884,
            "loss_sequences_lower_95": 7.450899609375,
            "loss_sequences_upper_95": 7.945345666503907,
            "loss_tokens_lower_95": 6.533760415158628,
            "loss_tokens_upper_95": 6.875219807350265,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.5753931403160095,
            "data_time": 0.1526220440864563,
            "batch_time": 0.1694963425397873,
            "samples_per_second": 944854.4061857035,
            "samples_per_second_per_gpu": 118106.80077321293,
            "loss_sequences_lower_95": 5.243146753311157,
            "loss_sequences_upper_95": 6.070462787151336,
            "loss_tokens_lower_95": 4.959048584685928,
            "loss_tokens_upper_95": 5.95068552302218,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.926891757153917,
            "data_time": 0.3576412945985794,
            "batch_time": 0.3933091312646866,
            "samples_per_second": 2424674.4320934885,
            "samples_per_second_per_gpu": 303084.30401168606,
            "loss_sequences_lower_95": 7.306771622581043,
            "loss_sequences_upper_95": 8.034437824117726,
            "loss_tokens_lower_95": 5.60454258451064,
            "loss_tokens_upper_95": 6.054143234304537,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.198810329912218,
            "data_time": 0.05353830258051554,
            "batch_time": 0.09881378379133013,
            "samples_per_second": 4356958.305116553,
            "samples_per_second_per_gpu": 544619.7881395691,
            "loss_sequences_lower_95": 5.163696427718163,
            "loss_sequences_upper_95": 5.234045063517093,
            "loss_tokens_lower_95": 5.162991406335326,
            "loss_tokens_upper_95": 5.234506317578637,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.407924950527677,
            "data_time": 0.03371626138687134,
            "batch_time": 0.07676144795758384,
            "samples_per_second": 4432751.156595935,
            "samples_per_second_per_gpu": 554093.8945744919,
            "loss_sequences_lower_95": 7.470718437681933,
            "loss_sequences_upper_95": 7.6519236746191535,
            "loss_tokens_lower_95": 7.301955652741069,
            "loss_tokens_upper_95": 7.481381762953427,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.515040761821872,
            "data_time": 0.180754654109478,
            "batch_time": 0.2105197310447693,
            "samples_per_second": 1813552.9599068323,
            "samples_per_second_per_gpu": 226694.11998835404,
            "loss_sequences_lower_95": 4.452459515582074,
            "loss_sequences_upper_95": 4.86513185605898,
            "loss_tokens_lower_95": 4.299034504003816,
            "loss_tokens_upper_95": 4.634583897198051,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.89147136902,
            "data_time": 0.08153316676616669,
            "batch_time": 0.12668922841548919,
            "samples_per_second": 4403512.645764376,
            "samples_per_second_per_gpu": 550439.080720547,
            "loss_sequences_lower_95": 4.953354977771063,
            "loss_sequences_upper_95": 5.098265589930076,
            "loss_tokens_lower_95": 4.809085839663179,
            "loss_tokens_upper_95": 4.967334583543453,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.962948915435047,
            "data_time": 0.32647043466567993,
            "batch_time": 0.36165933310985565,
            "samples_per_second": 2102950.074001042,
            "samples_per_second_per_gpu": 262868.7592501303,
            "loss_sequences_lower_95": 5.7026792200600225,
            "loss_sequences_upper_95": 6.213940224996427,
            "loss_tokens_lower_95": 5.800254774482566,
            "loss_tokens_upper_95": 6.174786217644398,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.987694567765062,
            "data_time": 0.0344818101781289,
            "batch_time": 0.07856355789686818,
            "samples_per_second": 4322466.788400169,
            "samples_per_second_per_gpu": 540308.3485500212,
            "loss_sequences_lower_95": 4.972469220406581,
            "loss_sequences_upper_95": 5.003340160219544,
            "loss_tokens_lower_95": 4.972174073486572,
            "loss_tokens_upper_95": 5.00300642159682,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.369123597746914,
            "data_time": 0.3129767179489136,
            "batch_time": 0.33994945883750916,
            "samples_per_second": 1695091.4015379674,
            "samples_per_second_per_gpu": 211886.42519224592,
            "loss_sequences_lower_95": 5.223020046197095,
            "loss_sequences_upper_95": 5.585272468640966,
            "loss_tokens_lower_95": 5.120281363948951,
            "loss_tokens_upper_95": 5.525047878550409,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.189927736780678,
            "data_time": 0.022754727800687154,
            "batch_time": 0.06688787947098414,
            "samples_per_second": 4512022.571557621,
            "samples_per_second_per_gpu": 564002.8214447026,
            "loss_sequences_lower_95": 6.913541789504717,
            "loss_sequences_upper_95": 6.960442298873166,
            "loss_tokens_lower_95": 6.100385795454546,
            "loss_tokens_upper_95": 6.147638733075436,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.706609930515289,
            "data_time": 0.09910557046532631,
            "batch_time": 0.14372752234339714,
            "samples_per_second": 4235628.441192307,
            "samples_per_second_per_gpu": 529453.5551490383,
            "loss_sequences_lower_95": 5.848503430175781,
            "loss_sequences_upper_95": 6.157763903808593,
            "loss_tokens_lower_95": 5.540223442484774,
            "loss_tokens_upper_95": 5.819852947285262,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.894972318151723,
            "data_time": 0.32701557874679565,
            "batch_time": 0.3696899861097336,
            "samples_per_second": 2781013.208058904,
            "samples_per_second_per_gpu": 347626.651007363,
            "loss_sequences_lower_95": 4.771219588569973,
            "loss_sequences_upper_95": 5.0198022062882135,
            "loss_tokens_lower_95": 4.771997561247452,
            "loss_tokens_upper_95": 5.01655345087466,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.205396535902313,
            "data_time": 0.06851734469334285,
            "batch_time": 0.10893479486306508,
            "samples_per_second": 3986355.3935621013,
            "samples_per_second_per_gpu": 498294.42419526266,
            "loss_sequences_lower_95": 9.102421468098958,
            "loss_sequences_upper_95": 9.306687474106297,
            "loss_tokens_lower_95": 9.102675503817473,
            "loss_tokens_upper_95": 9.310212106415719,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.882175208091736,
            "data_time": 0.07163818428913753,
            "batch_time": 0.11646205186843872,
            "samples_per_second": 4222372.803884816,
            "samples_per_second_per_gpu": 527796.600485602,
            "loss_sequences_lower_95": 4.017396891276041,
            "loss_sequences_upper_95": 4.123518782552083,
            "loss_tokens_lower_95": 3.822976198291817,
            "loss_tokens_upper_95": 3.9208719112645056,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.359214984802973,
            "data_time": 0.35888728499412537,
            "batch_time": 0.39959716796875,
            "samples_per_second": 2139588.248972927,
            "samples_per_second_per_gpu": 267448.5311216159,
            "loss_sequences_lower_95": 5.98959710984003,
            "loss_sequences_upper_95": 6.733089890252976,
            "loss_tokens_lower_95": 5.986694001697359,
            "loss_tokens_upper_95": 6.730106608072917,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.999627009034157,
            "data_time": 0.1490013152360916,
            "batch_time": 0.16624397039413452,
            "samples_per_second": 908506.7743560248,
            "samples_per_second_per_gpu": 113563.3467945031,
            "loss_sequences_lower_95": 5.754295885562897,
            "loss_sequences_upper_95": 7.113157868385315,
            "loss_tokens_lower_95": 5.497508670767559,
            "loss_tokens_upper_95": 6.086887735583119,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.158207009315491,
            "data_time": 0.09893298149108887,
            "batch_time": 0.14315508678555489,
            "samples_per_second": 4287198.381724663,
            "samples_per_second_per_gpu": 535899.7977155829,
            "loss_sequences_lower_95": 8.214719946289064,
            "loss_sequences_upper_95": 8.58574970703125,
            "loss_tokens_lower_95": 7.9779981997210765,
            "loss_tokens_upper_95": 8.31226517472372,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.613011263370514,
            "data_time": 0.09246483072638512,
            "batch_time": 0.13717442750930786,
            "samples_per_second": 4330913.693878345,
            "samples_per_second_per_gpu": 541364.2117347931,
            "loss_sequences_lower_95": 7.851386718750001,
            "loss_sequences_upper_95": 8.092274487304687,
            "loss_tokens_lower_95": 7.503018370328931,
            "loss_tokens_upper_95": 7.694204086892056,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.336123056392561,
            "data_time": 0.04122645904620489,
            "batch_time": 0.08472469200690587,
            "samples_per_second": 4547456.481618304,
            "samples_per_second_per_gpu": 568432.060202288,
            "loss_sequences_lower_95": 4.309704127425201,
            "loss_sequences_upper_95": 4.3621105208158735,
            "loss_tokens_lower_95": 4.309660701003604,
            "loss_tokens_upper_95": 4.362898465654332,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.181889761061895,
            "data_time": 0.12632198135058084,
            "batch_time": 0.1665504425764084,
            "samples_per_second": 3879213.9840046237,
            "samples_per_second_per_gpu": 484901.74800057797,
            "loss_sequences_lower_95": 5.09760009015577,
            "loss_sequences_upper_95": 5.265849161221318,
            "loss_tokens_lower_95": 5.097287995181691,
            "loss_tokens_upper_95": 5.264091283092118,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 10.568883788108826,
            "data_time": 0.10563449189066887,
            "batch_time": 0.1499323658645153,
            "samples_per_second": 4094885.1229997813,
            "samples_per_second_per_gpu": 511860.64037497266,
            "loss_sequences_lower_95": 10.502223388671876,
            "loss_sequences_upper_95": 10.638804907226563,
            "loss_tokens_lower_95": 10.501409375,
            "loss_tokens_upper_95": 10.634917944335937,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.3959402579683795,
            "data_time": 0.027226421449865614,
            "batch_time": 0.07121691107749939,
            "samples_per_second": 4494676.439307703,
            "samples_per_second_per_gpu": 561834.5549134628,
            "loss_sequences_lower_95": 7.130684074178099,
            "loss_sequences_upper_95": 7.214275163345553,
            "loss_tokens_lower_95": 6.298156265996246,
            "loss_tokens_upper_95": 6.359114744406646,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.238456718957246,
            "data_time": 0.20975840943200247,
            "batch_time": 0.24177557229995728,
            "samples_per_second": 2064247.846042551,
            "samples_per_second_per_gpu": 258030.9807553189,
            "loss_sequences_lower_95": 5.106441919127507,
            "loss_sequences_upper_95": 5.369729181545884,
            "loss_tokens_lower_95": 5.104762643842555,
            "loss_tokens_upper_95": 5.367431139590135,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.043831978592218,
            "data_time": 0.18693604320287704,
            "batch_time": 0.232488751411438,
            "samples_per_second": 3659889.259704099,
            "samples_per_second_per_gpu": 457486.1574630124,
            "loss_sequences_lower_95": 4.95625150792739,
            "loss_sequences_upper_95": 5.130587672813267,
            "loss_tokens_lower_95": 4.959179160922181,
            "loss_tokens_upper_95": 5.130646230660233,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.624104092567021,
            "data_time": 0.028708246536552906,
            "batch_time": 0.07232933212071657,
            "samples_per_second": 4492481.906741913,
            "samples_per_second_per_gpu": 561560.2383427392,
            "loss_sequences_lower_95": 7.34873653633111,
            "loss_sequences_upper_95": 7.446050261591037,
            "loss_tokens_lower_95": 6.5216619370257,
            "loss_tokens_upper_95": 6.597939484272464,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.865111959043634,
            "data_time": 0.31180785596370697,
            "batch_time": 0.35007649660110474,
            "samples_per_second": 2422382.003656379,
            "samples_per_second_per_gpu": 302797.75045704737,
            "loss_sequences_lower_95": 4.771302052654287,
            "loss_sequences_upper_95": 4.958578781854539,
            "loss_tokens_lower_95": 4.771161502252811,
            "loss_tokens_upper_95": 4.956347664323434,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.039874257808068,
            "data_time": 0.045414125690093406,
            "batch_time": 0.08952950055782612,
            "samples_per_second": 4453343.818901944,
            "samples_per_second_per_gpu": 556667.977362743,
            "loss_sequences_lower_95": 8.009880692134939,
            "loss_sequences_upper_95": 8.069624799909212,
            "loss_tokens_lower_95": 8.009821665352638,
            "loss_tokens_upper_95": 8.06934295560971,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.258905813531968,
            "data_time": 0.3330673724412918,
            "batch_time": 0.37403246760368347,
            "samples_per_second": 2538182.988954412,
            "samples_per_second_per_gpu": 317272.8736193015,
            "loss_sequences_lower_95": 5.112034088431052,
            "loss_sequences_upper_95": 5.400813619372914,
            "loss_tokens_lower_95": 5.111828257736651,
            "loss_tokens_upper_95": 5.400074012534132,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.859053985277812,
            "data_time": 0.3694997876882553,
            "batch_time": 0.39026764035224915,
            "samples_per_second": 1190162.6188269847,
            "samples_per_second_per_gpu": 148770.32735337308,
            "loss_sequences_lower_95": 6.669665590922038,
            "loss_sequences_upper_95": 7.383940722147624,
            "loss_tokens_lower_95": 6.161311340332031,
            "loss_tokens_upper_95": 7.410704591539171,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.649307878812154,
            "data_time": 0.30768507719039917,
            "batch_time": 0.3284245878458023,
            "samples_per_second": 1140988.4027136397,
            "samples_per_second_per_gpu": 142623.55033920496,
            "loss_sequences_lower_95": 6.487961260477702,
            "loss_sequences_upper_95": 7.4160902786254885,
            "loss_tokens_lower_95": 5.902900850103142,
            "loss_tokens_upper_95": 7.2680188982674245,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.433415397031669,
            "data_time": 0.04338059148618153,
            "batch_time": 0.08640097720282418,
            "samples_per_second": 4319074.640488607,
            "samples_per_second_per_gpu": 539884.3300610759,
            "loss_sequences_lower_95": 6.39457149185383,
            "loss_sequences_upper_95": 6.471493852977725,
            "loss_tokens_lower_95": 6.395345406848307,
            "loss_tokens_upper_95": 6.471714435290869,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.325767708437751,
            "data_time": 0.023113004440303453,
            "batch_time": 0.06746772146381146,
            "samples_per_second": 4501148.049590892,
            "samples_per_second_per_gpu": 562643.5061988615,
            "loss_sequences_lower_95": 5.893478395015557,
            "loss_sequences_upper_95": 5.926113667253641,
            "loss_tokens_lower_95": 5.256976960073558,
            "loss_tokens_upper_95": 5.288402356597235,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.115912754704633,
            "data_time": 0.335343673825264,
            "batch_time": 0.3645806610584259,
            "samples_per_second": 1946696.5108047735,
            "samples_per_second_per_gpu": 243337.0638505967,
            "loss_sequences_lower_95": 4.035960556390717,
            "loss_sequences_upper_95": 4.397337653693252,
            "loss_tokens_lower_95": 3.9561453151866863,
            "loss_tokens_upper_95": 4.152499086046887,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.764802262589738,
            "data_time": 0.2078694850206375,
            "batch_time": 0.22557741403579712,
            "samples_per_second": 1059708.2825538116,
            "samples_per_second_per_gpu": 132463.53531922644,
            "loss_sequences_lower_95": 5.402098062876108,
            "loss_sequences_upper_95": 6.157499045294684,
            "loss_tokens_lower_95": 5.247888108241705,
            "loss_tokens_upper_95": 6.20792775095245,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.005639727522687,
            "data_time": 0.3105171322822571,
            "batch_time": 0.34437669813632965,
            "samples_per_second": 2062804.5412610564,
            "samples_per_second_per_gpu": 257850.56765763205,
            "loss_sequences_lower_95": 3.9787289130978465,
            "loss_sequences_upper_95": 4.2739333455155535,
            "loss_tokens_lower_95": 3.8795363093986546,
            "loss_tokens_upper_95": 4.043429927328638,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.015273259907234,
            "data_time": 0.348502978682518,
            "batch_time": 0.38571134209632874,
            "samples_per_second": 2260643.360575964,
            "samples_per_second_per_gpu": 282580.4200719955,
            "loss_sequences_lower_95": 4.096100076814977,
            "loss_sequences_upper_95": 4.35539069757229,
            "loss_tokens_lower_95": 3.905821088985722,
            "loss_tokens_upper_95": 4.040044090187785,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.112422998358563,
            "data_time": 0.3321070969104767,
            "batch_time": 0.3672453314065933,
            "samples_per_second": 1819713.0297363275,
            "samples_per_second_per_gpu": 227464.12871704093,
            "loss_sequences_lower_95": 3.8000564203029725,
            "loss_sequences_upper_95": 4.148152616547375,
            "loss_tokens_lower_95": 3.9752879146600058,
            "loss_tokens_upper_95": 4.195844077954284,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.073821015474273,
            "data_time": 0.3565509021282196,
            "batch_time": 0.3912186324596405,
            "samples_per_second": 2434721.67514402,
            "samples_per_second_per_gpu": 304340.2093930025,
            "loss_sequences_lower_95": 4.174211511379335,
            "loss_sequences_upper_95": 4.420297110952982,
            "loss_tokens_lower_95": 3.972696902076032,
            "loss_tokens_upper_95": 4.094961671443,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.745041166033064,
            "data_time": 0.33860209584236145,
            "batch_time": 0.3727215826511383,
            "samples_per_second": 2092515.0985065093,
            "samples_per_second_per_gpu": 261564.38731331367,
            "loss_sequences_lower_95": 3.6784168693589865,
            "loss_sequences_upper_95": 3.8288211230165468,
            "loss_tokens_lower_95": 3.6801608714627814,
            "loss_tokens_upper_95": 3.778128747440772,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.0417433482844656,
            "data_time": 0.33231930434703827,
            "batch_time": 0.36738520860671997,
            "samples_per_second": 2413737.717650523,
            "samples_per_second_per_gpu": 301717.21470631537,
            "loss_sequences_lower_95": 3.117505222413598,
            "loss_sequences_upper_95": 3.3171857508217415,
            "loss_tokens_lower_95": 2.964373958097875,
            "loss_tokens_upper_95": 3.049949205799804,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-2.0/params.txt",
    "uuid": "cf386712-f2a5-4b28-8d13-97ce6d3357c3",
    "creation_date": "2023_12_14-05_56_42"
}