{
    "name": "rpj-d=1024_l=24_h=8-4.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 32929300480,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 4.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "6585860096",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=1024_l=24_h=8-4.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 2.4663205524285634,
            "data_time": 0.04317959398031235,
            "batch_time": 0.41518186777830124,
            "samples_per_second": 690541.2050494377,
            "samples_per_second_per_gpu": 86317.65063117971,
            "loss_sequences_lower_95": 2.4021301396687824,
            "loss_sequences_upper_95": 2.5276373354593913,
            "loss_tokens_lower_95": 2.455122820536295,
            "loss_tokens_upper_95": 2.4773368390401207,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9739398394679726,
            "data_time": 0.001098449119743963,
            "batch_time": 0.03678090535951828,
            "samples_per_second": 897429.9506038261,
            "samples_per_second_per_gpu": 112178.74382547826,
            "loss_sequences_lower_95": 2.971251127770621,
            "loss_sequences_upper_95": 2.976564126857375,
            "loss_tokens_lower_95": 2.9635940989583336,
            "loss_tokens_upper_95": 2.9841723541666667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.4838982533435434,
            "data_time": 0.009401931762695312,
            "batch_time": 0.044778246879577634,
            "samples_per_second": 869179.9412820362,
            "samples_per_second_per_gpu": 108647.49266025453,
            "loss_sequences_lower_95": 2.4600991572165976,
            "loss_sequences_upper_95": 2.507660485092474,
            "loss_tokens_lower_95": 2.47299153125,
            "loss_tokens_upper_95": 2.4948424322916667,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8189652222210597,
            "data_time": 0.001605107988181867,
            "batch_time": 0.036927184678222,
            "samples_per_second": 905303.1814282428,
            "samples_per_second_per_gpu": 113162.89767853035,
            "loss_sequences_lower_95": 2.807046890101482,
            "loss_sequences_upper_95": 2.8304193480186854,
            "loss_tokens_lower_95": 2.8087387500000003,
            "loss_tokens_upper_95": 2.8289354635416664,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9753107233105753,
            "data_time": 0.009672442280438792,
            "batch_time": 0.045054450927977545,
            "samples_per_second": 865416.4276115606,
            "samples_per_second_per_gpu": 108177.05345144507,
            "loss_sequences_lower_95": 2.9416322416783354,
            "loss_sequences_upper_95": 3.0086757310296273,
            "loss_tokens_lower_95": 2.9649436875000004,
            "loss_tokens_upper_95": 2.9853747447916668,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7738395691890445,
            "data_time": 0.003986976392891096,
            "batch_time": 0.039519418192946396,
            "samples_per_second": 898545.0188407212,
            "samples_per_second_per_gpu": 112318.12735509015,
            "loss_sequences_lower_95": 2.733749715362902,
            "loss_sequences_upper_95": 2.8136333784548135,
            "loss_tokens_lower_95": 2.7633644114583333,
            "loss_tokens_upper_95": 2.78429534375,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.5127020352592273,
            "data_time": 0.0017379985159022967,
            "batch_time": 0.03725217168521726,
            "samples_per_second": 905961.6121168656,
            "samples_per_second_per_gpu": 113245.2015146082,
            "loss_sequences_lower_95": 1.4921786611128827,
            "loss_sequences_upper_95": 1.533647142059949,
            "loss_tokens_lower_95": 1.5035891666666668,
            "loss_tokens_upper_95": 1.5222090442708334,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.406850768234093,
            "data_time": 0.001650995412465171,
            "batch_time": 0.03750977382556209,
            "samples_per_second": 906530.0646445272,
            "samples_per_second_per_gpu": 113316.2580805659,
            "loss_sequences_lower_95": 3.398575261780105,
            "loss_sequences_upper_95": 3.414990229262107,
            "loss_tokens_lower_95": 3.3965195208333334,
            "loss_tokens_upper_95": 3.417243802083333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1611764343773445,
            "data_time": 0.011271131417107961,
            "batch_time": 0.0476113860569303,
            "samples_per_second": 864766.0613840659,
            "samples_per_second_per_gpu": 108095.75767300824,
            "loss_sequences_lower_95": 3.1200075227070627,
            "loss_sequences_upper_95": 3.2068437777883636,
            "loss_tokens_lower_95": 3.1504901979166666,
            "loss_tokens_upper_95": 3.1720277083333333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.831607435531767,
            "data_time": 0.009607333689928055,
            "batch_time": 0.04527392145246267,
            "samples_per_second": 870067.54565818,
            "samples_per_second_per_gpu": 108758.4432072725,
            "loss_sequences_lower_95": 3.804834690018605,
            "loss_sequences_upper_95": 3.8557999923766366,
            "loss_tokens_lower_95": 3.81981103125,
            "loss_tokens_upper_95": 3.8439574375,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8995223453767958,
            "data_time": 0.00128562236753481,
            "batch_time": 0.036633012130770755,
            "samples_per_second": 908542.0149263461,
            "samples_per_second_per_gpu": 113567.75186579327,
            "loss_sequences_lower_95": 2.891475038971495,
            "loss_sequences_upper_95": 2.907490904866726,
            "loss_tokens_lower_95": 2.889384088541667,
            "loss_tokens_upper_95": 2.9096609635416666,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.805327474847529,
            "data_time": 0.0025901955232135857,
            "batch_time": 0.037938132472677494,
            "samples_per_second": 904818.8542678826,
            "samples_per_second_per_gpu": 113102.35678348533,
            "loss_sequences_lower_95": 2.795307304004191,
            "loss_sequences_upper_95": 2.815076883451882,
            "loss_tokens_lower_95": 2.795486703125,
            "loss_tokens_upper_95": 2.8154817083333334,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3518311629188955,
            "data_time": 0.009678224329891884,
            "batch_time": 0.045026087478215515,
            "samples_per_second": 863374.1007874332,
            "samples_per_second_per_gpu": 107921.76259842915,
            "loss_sequences_lower_95": 3.3194286416075047,
            "loss_sequences_upper_95": 3.3831533312072133,
            "loss_tokens_lower_95": 3.3411687447916667,
            "loss_tokens_upper_95": 3.3626101614583335,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.717031147717944,
            "data_time": 0.010062453281356994,
            "batch_time": 0.045737495460358275,
            "samples_per_second": 863278.8854948257,
            "samples_per_second_per_gpu": 107909.86068685321,
            "loss_sequences_lower_95": 2.657274930685941,
            "loss_sequences_upper_95": 2.77530501418104,
            "loss_tokens_lower_95": 2.706344421875,
            "loss_tokens_upper_95": 2.72753665625,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.37217777967453,
            "data_time": 0.08585516044071742,
            "batch_time": 0.12034510714667183,
            "samples_per_second": 521630.5436160136,
            "samples_per_second_per_gpu": 65203.8179520017,
            "loss_sequences_lower_95": 3.3123929717323994,
            "loss_sequences_upper_95": 3.4312407406893644,
            "loss_tokens_lower_95": 3.35248198075728,
            "loss_tokens_upper_95": 3.3920716285705566,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.396187470188641,
            "data_time": 0.013493216850540855,
            "batch_time": 0.04874043031172319,
            "samples_per_second": 855490.2214227166,
            "samples_per_second_per_gpu": 106936.27767783958,
            "loss_sequences_lower_95": 2.3081474281956087,
            "loss_sequences_upper_95": 2.4830609557927525,
            "loss_tokens_lower_95": 2.3859083489583335,
            "loss_tokens_upper_95": 2.406092473958333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.443522225583764,
            "data_time": 0.013002720971902212,
            "batch_time": 0.0484124260644118,
            "samples_per_second": 867196.7926746687,
            "samples_per_second_per_gpu": 108399.59908433359,
            "loss_sequences_lower_95": 5.393463448798751,
            "loss_sequences_upper_95": 5.491516870181605,
            "loss_tokens_lower_95": 5.431934125,
            "loss_tokens_upper_95": 5.455036677083333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.984968210830063,
            "data_time": 0.03557091951370239,
            "batch_time": 0.07102682814002037,
            "samples_per_second": 773520.7586333457,
            "samples_per_second_per_gpu": 96690.09482916821,
            "loss_sequences_lower_95": 2.944316701420018,
            "loss_sequences_upper_95": 3.027976076720191,
            "loss_tokens_lower_95": 2.9735735220987287,
            "loss_tokens_upper_95": 2.996452287767754,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.838714822364862,
            "data_time": 0.001642917140741283,
            "batch_time": 0.037075082234749764,
            "samples_per_second": 900908.7661149923,
            "samples_per_second_per_gpu": 112613.59576437404,
            "loss_sequences_lower_95": 2.8196428223699437,
            "loss_sequences_upper_95": 2.8580314024444524,
            "loss_tokens_lower_95": 2.8194693820324743,
            "loss_tokens_upper_95": 2.858023265582716,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.750626546717858,
            "data_time": 0.0017724567253118867,
            "batch_time": 0.037209923337599274,
            "samples_per_second": 899537.7851000738,
            "samples_per_second_per_gpu": 112442.22313750922,
            "loss_sequences_lower_95": 2.7457383627436642,
            "loss_sequences_upper_95": 2.770186919041588,
            "loss_tokens_lower_95": 2.7307637591321106,
            "loss_tokens_upper_95": 2.748843475328275,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0305007941656634,
            "data_time": 0.0031895259997397523,
            "batch_time": 0.038924549122557786,
            "samples_per_second": 897362.8273435906,
            "samples_per_second_per_gpu": 112170.35341794882,
            "loss_sequences_lower_95": 3.259658639049215,
            "loss_sequences_upper_95": 3.5282369218122636,
            "loss_tokens_lower_95": 2.528710571027296,
            "loss_tokens_upper_95": 2.717085940524854,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2657480156049132,
            "data_time": 0.0038901461248702193,
            "batch_time": 0.039382791424051246,
            "samples_per_second": 891447.0969134336,
            "samples_per_second_per_gpu": 111430.8871141792,
            "loss_sequences_lower_95": 3.334391259765625,
            "loss_sequences_upper_95": 3.531656380208333,
            "loss_tokens_lower_95": 3.0407439809355346,
            "loss_tokens_upper_95": 3.1765236954599056,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.412432441845186,
            "data_time": 0.004570193599971351,
            "batch_time": 0.04008572673366081,
            "samples_per_second": 890117.1740047866,
            "samples_per_second_per_gpu": 111264.64675059833,
            "loss_sequences_lower_95": 2.45642428568044,
            "loss_sequences_upper_95": 2.5103070933679636,
            "loss_tokens_lower_95": 2.328032457681976,
            "loss_tokens_upper_95": 2.357920406170345,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.8114234989339655,
            "data_time": 0.022648334503173828,
            "batch_time": 0.05812858045101166,
            "samples_per_second": 836010.1012509707,
            "samples_per_second_per_gpu": 104501.26265637134,
            "loss_sequences_lower_95": 1.7954335715553977,
            "loss_sequences_upper_95": 1.8904515006325462,
            "loss_tokens_lower_95": 1.752480502210773,
            "loss_tokens_upper_95": 1.7934808730348684,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.806892142003896,
            "data_time": 0.019665982574224472,
            "batch_time": 0.05492866225540638,
            "samples_per_second": 822856.4041176528,
            "samples_per_second_per_gpu": 102857.0505147066,
            "loss_sequences_lower_95": 2.787164649184869,
            "loss_sequences_upper_95": 2.959522742446588,
            "loss_tokens_lower_95": 2.70677259647935,
            "loss_tokens_upper_95": 2.792094366219533,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6088113005956015,
            "data_time": 0.017150169763809595,
            "batch_time": 0.052831001770802036,
            "samples_per_second": 829516.2864223515,
            "samples_per_second_per_gpu": 103689.53580279394,
            "loss_sequences_lower_95": 2.5921999664306643,
            "loss_sequences_upper_95": 2.6767119038899736,
            "loss_tokens_lower_95": 2.4972089203974814,
            "loss_tokens_upper_95": 2.656989098774046,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.925903255823315,
            "data_time": 0.001421390092697872,
            "batch_time": 0.03689309255706784,
            "samples_per_second": 900453.698000129,
            "samples_per_second_per_gpu": 112556.71225001612,
            "loss_sequences_lower_95": 4.929075784825427,
            "loss_sequences_upper_95": 5.014255275100266,
            "loss_tokens_lower_95": 4.788234495944622,
            "loss_tokens_upper_95": 4.875606384233599,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6798460349841955,
            "data_time": 0.002825963817186804,
            "batch_time": 0.038189869399038734,
            "samples_per_second": 898793.9410336638,
            "samples_per_second_per_gpu": 112349.24262920798,
            "loss_sequences_lower_95": 4.12591885653409,
            "loss_sequences_upper_95": 4.40511316370081,
            "loss_tokens_lower_95": 3.0482198079427083,
            "loss_tokens_upper_95": 3.1715278479968494,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.431637547986499,
            "data_time": 0.004874711503853669,
            "batch_time": 0.04029844298556044,
            "samples_per_second": 885634.5812834351,
            "samples_per_second_per_gpu": 110704.32266042939,
            "loss_sequences_lower_95": 3.7707423291515574,
            "loss_sequences_upper_95": 4.08150109821619,
            "loss_tokens_lower_95": 3.07130962419833,
            "loss_tokens_upper_95": 3.2116153112647225,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.585005760192871,
            "data_time": 0.022140117628233775,
            "batch_time": 0.057856932282447815,
            "samples_per_second": 830805.542100147,
            "samples_per_second_per_gpu": 103850.69276251837,
            "loss_sequences_lower_95": 5.476172125829409,
            "loss_sequences_upper_95": 5.693326056493471,
            "loss_tokens_lower_95": 5.475647585794806,
            "loss_tokens_upper_95": 5.692517326738192,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.018895721435547,
            "data_time": 0.04785420344426082,
            "batch_time": 0.08458863313381489,
            "samples_per_second": 743497.0222829586,
            "samples_per_second_per_gpu": 92937.12778536983,
            "loss_sequences_lower_95": 2.8810314025878907,
            "loss_sequences_upper_95": 3.2296753311157227,
            "loss_tokens_lower_95": 2.7317543370992423,
            "loss_tokens_upper_95": 3.1591676824634534,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7305495425402935,
            "data_time": 0.003397695125977686,
            "batch_time": 0.03903199192936435,
            "samples_per_second": 893871.3624432264,
            "samples_per_second_per_gpu": 111733.9203054033,
            "loss_sequences_lower_95": 2.6910554071775845,
            "loss_sequences_upper_95": 2.770407676111102,
            "loss_tokens_lower_95": 2.6903396840715357,
            "loss_tokens_upper_95": 2.77093520852658,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.190073449722965,
            "data_time": 0.004827663910136137,
            "batch_time": 0.0402805455162708,
            "samples_per_second": 890774.5814098871,
            "samples_per_second_per_gpu": 111346.82267623588,
            "loss_sequences_lower_95": 3.1267607767790873,
            "loss_sequences_upper_95": 3.254553882495777,
            "loss_tokens_lower_95": 3.1244114231418916,
            "loss_tokens_upper_95": 3.2528934813053465,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1365756699258016,
            "data_time": 0.003602988004169505,
            "batch_time": 0.03896850231656761,
            "samples_per_second": 892398.0720854829,
            "samples_per_second_per_gpu": 111549.75901068536,
            "loss_sequences_lower_95": 3.2902169610522645,
            "loss_sequences_upper_95": 3.420167197807569,
            "loss_tokens_lower_95": 2.954938420509587,
            "loss_tokens_upper_95": 3.0077700826366116,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.01738542675972,
            "data_time": 0.010193979367613792,
            "batch_time": 0.045460909605026245,
            "samples_per_second": 862757.503891833,
            "samples_per_second_per_gpu": 107844.68798647913,
            "loss_sequences_lower_95": 5.196708862304687,
            "loss_sequences_upper_95": 5.769493542480469,
            "loss_tokens_lower_95": 4.4338826955193165,
            "loss_tokens_upper_95": 4.796612552867513,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.450478121638298,
            "data_time": 0.15545807778835297,
            "batch_time": 0.19387005269527435,
            "samples_per_second": 491751.04185806896,
            "samples_per_second_per_gpu": 61468.88023225862,
            "loss_sequences_lower_95": 3.2355604350566862,
            "loss_sequences_upper_95": 3.678495264053345,
            "loss_tokens_lower_95": 2.9815708555024245,
            "loss_tokens_upper_95": 3.8026128834691537,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.523621518721526,
            "data_time": 0.026690077274403673,
            "batch_time": 0.06149974782416161,
            "samples_per_second": 790189.5838438255,
            "samples_per_second_per_gpu": 98773.69798047819,
            "loss_sequences_lower_95": 5.0191261993057426,
            "loss_sequences_upper_95": 5.872045600277254,
            "loss_tokens_lower_95": 3.0905184258737317,
            "loss_tokens_upper_95": 3.551749789163789,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.12968444024803,
            "data_time": 0.002943161668048965,
            "batch_time": 0.038297222306331,
            "samples_per_second": 895132.9969378022,
            "samples_per_second_per_gpu": 111891.62461722527,
            "loss_sequences_lower_95": 2.1031253999682584,
            "loss_sequences_upper_95": 2.1572065534212537,
            "loss_tokens_lower_95": 2.101496579898086,
            "loss_tokens_upper_95": 2.1569339030587864,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0472148336488902,
            "data_time": 0.0023168343465181113,
            "batch_time": 0.03780184413834418,
            "samples_per_second": 898774.6071896744,
            "samples_per_second_per_gpu": 112346.8258987093,
            "loss_sequences_lower_95": 2.0219853346958083,
            "loss_sequences_upper_95": 2.143584758537199,
            "loss_tokens_lower_95": 1.9259165019481408,
            "loss_tokens_upper_95": 2.0436838248068945,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.962706144912776,
            "data_time": 0.01886329385969374,
            "batch_time": 0.054089835948414274,
            "samples_per_second": 820662.3270405359,
            "samples_per_second_per_gpu": 102582.79088006698,
            "loss_sequences_lower_95": 2.8403039394280847,
            "loss_sequences_upper_95": 3.270845724811484,
            "loss_tokens_lower_95": 2.7041094106838286,
            "loss_tokens_upper_95": 2.9877499392471525,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.413271094554037,
            "data_time": 0.004864783212542534,
            "batch_time": 0.040162093192338946,
            "samples_per_second": 888350.9525111532,
            "samples_per_second_per_gpu": 111043.86906389415,
            "loss_sequences_lower_95": 3.4637780675007397,
            "loss_sequences_upper_95": 3.622608578026342,
            "loss_tokens_lower_95": 3.2683103745114725,
            "loss_tokens_upper_95": 3.409239883410027,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.4683738292717354,
            "data_time": 0.030807006926763626,
            "batch_time": 0.06786295913514637,
            "samples_per_second": 791968.048300613,
            "samples_per_second_per_gpu": 98996.00603757662,
            "loss_sequences_lower_95": 2.348748667647199,
            "loss_sequences_upper_95": 2.7742100738897553,
            "loss_tokens_lower_95": 2.1851180159053833,
            "loss_tokens_upper_95": 2.5141193632589465,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0780253265828796,
            "data_time": 0.0018121222710542758,
            "batch_time": 0.03722242103116397,
            "samples_per_second": 899102.299631757,
            "samples_per_second_per_gpu": 112387.78745396962,
            "loss_sequences_lower_95": 3.06013723252463,
            "loss_sequences_upper_95": 3.095120825922997,
            "loss_tokens_lower_95": 3.0604669517887952,
            "loss_tokens_upper_95": 3.095324133186012,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.0079086848254342,
            "data_time": 0.04540332013910467,
            "batch_time": 0.08065385818481445,
            "samples_per_second": 746646.3058137131,
            "samples_per_second_per_gpu": 93330.78822671414,
            "loss_sequences_lower_95": 0.9511067566362399,
            "loss_sequences_upper_95": 1.123797344467015,
            "loss_tokens_lower_95": 0.832338104716587,
            "loss_tokens_upper_95": 1.0903424078512824,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4352549328228585,
            "data_time": 0.0011820726077894353,
            "batch_time": 0.03661316939241628,
            "samples_per_second": 901231.6075533162,
            "samples_per_second_per_gpu": 112653.95094416452,
            "loss_sequences_lower_95": 3.7076006760187368,
            "loss_sequences_upper_95": 3.7479158477463312,
            "loss_tokens_lower_95": 3.0094990570599616,
            "loss_tokens_upper_95": 3.0465386907640233,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.454963767051697,
            "data_time": 0.005664980600750636,
            "batch_time": 0.041081829676552425,
            "samples_per_second": 885881.335073772,
            "samples_per_second_per_gpu": 110735.1668842215,
            "loss_sequences_lower_95": 4.4773050292968755,
            "loss_sequences_upper_95": 4.777156774902344,
            "loss_tokens_lower_95": 4.093435402241034,
            "loss_tokens_upper_95": 4.366015332333357,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6142446761545926,
            "data_time": 0.022682159633959753,
            "batch_time": 0.05786198276584431,
            "samples_per_second": 832992.7011424054,
            "samples_per_second_per_gpu": 104124.08764280067,
            "loss_sequences_lower_95": 2.4920064445163894,
            "loss_sequences_upper_95": 2.7342286483101224,
            "loss_tokens_lower_95": 2.4931755530315898,
            "loss_tokens_upper_95": 2.7366379381262735,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.235192008451982,
            "data_time": 0.004533194274787444,
            "batch_time": 0.03993156839566058,
            "samples_per_second": 891033.5573402367,
            "samples_per_second_per_gpu": 111379.19466752959,
            "loss_sequences_lower_95": 6.154524850556345,
            "loss_sequences_upper_95": 6.314354580965909,
            "loss_tokens_lower_95": 6.156773265491832,
            "loss_tokens_upper_95": 6.314484770803741,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.2666501092910767,
            "data_time": 0.004225441433013754,
            "batch_time": 0.039723821776978516,
            "samples_per_second": 893983.3966534417,
            "samples_per_second_per_gpu": 111747.92458168021,
            "loss_sequences_lower_95": 1.3197183532714845,
            "loss_sequences_upper_95": 1.3865922281901042,
            "loss_tokens_lower_95": 1.1714049994998,
            "loss_tokens_upper_95": 1.2473156567314425,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.768684659685407,
            "data_time": 0.024387065853391374,
            "batch_time": 0.05928573863846915,
            "samples_per_second": 806989.0472440963,
            "samples_per_second_per_gpu": 100873.63090551204,
            "loss_sequences_lower_95": 5.409151073637463,
            "loss_sequences_upper_95": 6.134951433454241,
            "loss_tokens_lower_95": 5.404711594354539,
            "loss_tokens_upper_95": 6.140144609723772,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.8187308795750141,
            "data_time": 0.14937882125377655,
            "batch_time": 0.19098715484142303,
            "samples_per_second": 472580.42867226043,
            "samples_per_second_per_gpu": 59072.553584032554,
            "loss_sequences_lower_95": 1.6388055682182312,
            "loss_sequences_upper_95": 2.3558848977088926,
            "loss_tokens_lower_95": 1.426271082298043,
            "loss_tokens_upper_95": 1.8365586428298162,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.486985253334045,
            "data_time": 0.005718207075482323,
            "batch_time": 0.041133141706860256,
            "samples_per_second": 885762.6999947366,
            "samples_per_second_per_gpu": 110720.33749934207,
            "loss_sequences_lower_95": 7.409422412109375,
            "loss_sequences_upper_95": 7.771720947265625,
            "loss_tokens_lower_95": 7.197970637855595,
            "loss_tokens_upper_95": 7.512725923025063,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.280650176525116,
            "data_time": 0.005767280147189186,
            "batch_time": 0.04110550643905761,
            "samples_per_second": 887562.0887719062,
            "samples_per_second_per_gpu": 110945.26109648828,
            "loss_sequences_lower_95": 6.370747351074218,
            "loss_sequences_upper_95": 6.585000109863281,
            "loss_tokens_lower_95": 6.048067486972647,
            "loss_tokens_upper_95": 6.228477824158111,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8307146848227167,
            "data_time": 0.0035142329225572055,
            "batch_time": 0.03900125800167836,
            "samples_per_second": 891532.3521208466,
            "samples_per_second_per_gpu": 111441.54401510583,
            "loss_sequences_lower_95": 2.8009856635517933,
            "loss_sequences_upper_95": 2.8597979042700303,
            "loss_tokens_lower_95": 2.801834148390358,
            "loss_tokens_upper_95": 2.8603196106059334,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.848159289579787,
            "data_time": 0.00834986306389293,
            "batch_time": 0.043647050857543945,
            "samples_per_second": 873613.7264310161,
            "samples_per_second_per_gpu": 109201.71580387701,
            "loss_sequences_lower_95": 2.750045841996388,
            "loss_sequences_upper_95": 2.946493141186036,
            "loss_tokens_lower_95": 2.7478492748535905,
            "loss_tokens_upper_95": 2.948373577158938,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.15079612326622,
            "data_time": 0.0054034169704195055,
            "batch_time": 0.040792416485529094,
            "samples_per_second": 886502.3583000151,
            "samples_per_second_per_gpu": 110812.79478750189,
            "loss_sequences_lower_95": 5.033514501953125,
            "loss_sequences_upper_95": 5.270792578125,
            "loss_tokens_lower_95": 5.032000268554688,
            "loss_tokens_upper_95": 5.271524279785156,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7560478380635423,
            "data_time": 0.001618910647069574,
            "batch_time": 0.037077897798361535,
            "samples_per_second": 899103.3120939477,
            "samples_per_second_per_gpu": 112387.91401174346,
            "loss_sequences_lower_95": 3.2345363776460503,
            "loss_sequences_upper_95": 3.331697417144631,
            "loss_tokens_lower_95": 2.1697966210595916,
            "loss_tokens_upper_95": 2.233582069407712,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6276285986402144,
            "data_time": 0.017305311134883334,
            "batch_time": 0.052957592691693985,
            "samples_per_second": 820014.5834794695,
            "samples_per_second_per_gpu": 102501.82293493369,
            "loss_sequences_lower_95": 2.490856238977233,
            "loss_sequences_upper_95": 2.765192686621823,
            "loss_tokens_lower_95": 2.4928151998946912,
            "loss_tokens_upper_95": 2.7651183512673447,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.4710937841265808,
            "data_time": 0.010709188878536224,
            "batch_time": 0.046862813644111156,
            "samples_per_second": 866023.9950699864,
            "samples_per_second_per_gpu": 108252.9993837483,
            "loss_sequences_lower_95": 2.3749294146369486,
            "loss_sequences_upper_95": 2.5678734992532166,
            "loss_tokens_lower_95": 2.376235231885723,
            "loss_tokens_upper_95": 2.5663564225739126,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.311429167624839,
            "data_time": 0.0020293914010805557,
            "batch_time": 0.03744376525487802,
            "samples_per_second": 898842.8081433435,
            "samples_per_second_per_gpu": 112355.35101791794,
            "loss_sequences_lower_95": 2.5069526772712325,
            "loss_sequences_upper_95": 2.5837041652965675,
            "loss_tokens_lower_95": 1.9634688615580433,
            "loss_tokens_upper_95": 2.021868247802636,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.4726569942696384,
            "data_time": 0.02684384087721507,
            "batch_time": 0.06295563280582428,
            "samples_per_second": 823703.1141167755,
            "samples_per_second_per_gpu": 102962.88926459693,
            "loss_sequences_lower_95": 2.362619543327856,
            "loss_sequences_upper_95": 2.5818525606993012,
            "loss_tokens_lower_95": 2.3628251817491317,
            "loss_tokens_upper_95": 2.5847060511351896,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.249442292134696,
            "data_time": 0.0034907982026264345,
            "batch_time": 0.03908354444235963,
            "samples_per_second": 889709.7455356244,
            "samples_per_second_per_gpu": 111213.71819195306,
            "loss_sequences_lower_95": 3.218986211654243,
            "loss_sequences_upper_95": 3.281599128559824,
            "loss_tokens_lower_95": 3.2181345565749235,
            "loss_tokens_upper_95": 3.280315008600917,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.986209913364892,
            "data_time": 0.02202594063498757,
            "batch_time": 0.05793693065643311,
            "samples_per_second": 787598.7396297322,
            "samples_per_second_per_gpu": 98449.84245371653,
            "loss_sequences_lower_95": 2.828929308548714,
            "loss_sequences_upper_95": 3.1445997886287356,
            "loss_tokens_lower_95": 2.8300557034686933,
            "loss_tokens_upper_95": 3.148163538071716,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.5729056666294734,
            "data_time": 0.07801085710525513,
            "batch_time": 0.11354980617761612,
            "samples_per_second": 650622.9930282115,
            "samples_per_second_per_gpu": 81327.87412852644,
            "loss_sequences_lower_95": 1.4286706447601318,
            "loss_sequences_upper_95": 1.8257529576619467,
            "loss_tokens_lower_95": 1.2967520660824245,
            "loss_tokens_upper_95": 1.789585256576538,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.5707228104273478,
            "data_time": 0.0752488374710083,
            "batch_time": 0.11135882884263992,
            "samples_per_second": 647005.6392276709,
            "samples_per_second_per_gpu": 80875.70490345886,
            "loss_sequences_lower_95": 1.4588766384124756,
            "loss_sequences_upper_95": 1.8812184842427573,
            "loss_tokens_lower_95": 1.2274801018532744,
            "loss_tokens_upper_95": 1.7931766638595066,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5984967591660713,
            "data_time": 0.003118907935202578,
            "batch_time": 0.038676467202845004,
            "samples_per_second": 892968.8026434595,
            "samples_per_second_per_gpu": 111621.10033043244,
            "loss_sequences_lower_95": 2.5837369480048786,
            "loss_sequences_upper_95": 2.6131458616186487,
            "loss_tokens_lower_95": 2.5843268190813697,
            "loss_tokens_upper_95": 2.6129601853023745,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.4659592517723777,
            "data_time": 0.0011345892003380756,
            "batch_time": 0.03659078825257017,
            "samples_per_second": 900811.1822162827,
            "samples_per_second_per_gpu": 112601.39777703534,
            "loss_sequences_lower_95": 0.5310275244972069,
            "loss_sequences_upper_95": 0.5425209448714675,
            "loss_tokens_lower_95": 0.39628225622856034,
            "loss_tokens_upper_95": 0.4029393830177795,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.2422106190929263,
            "data_time": 0.04012953117489815,
            "batch_time": 0.09550290554761887,
            "samples_per_second": 792118.6286196373,
            "samples_per_second_per_gpu": 99014.82857745467,
            "loss_sequences_lower_95": 1.1679069128562145,
            "loss_sequences_upper_95": 1.3621095762478084,
            "loss_tokens_lower_95": 1.1023330422356683,
            "loss_tokens_upper_95": 1.2080295098908438,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4855265194499814,
            "data_time": 0.11493649936857678,
            "batch_time": 0.15193579310462588,
            "samples_per_second": 506231.04380955745,
            "samples_per_second_per_gpu": 63278.88047619468,
            "loss_sequences_lower_95": 3.0853057552028345,
            "loss_sequences_upper_95": 3.9406237834208717,
            "loss_tokens_lower_95": 2.9062491569989994,
            "loss_tokens_upper_95": 3.9548267129026815,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.103090219017936,
            "data_time": 0.028434466748010544,
            "batch_time": 0.06589033773967198,
            "samples_per_second": 781310.2214101695,
            "samples_per_second_per_gpu": 97663.77767627119,
            "loss_sequences_lower_95": 1.0538165301811404,
            "loss_sequences_upper_95": 1.2124577592058878,
            "loss_tokens_lower_95": 0.9850226933741589,
            "loss_tokens_upper_95": 1.0680042356037112,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.1483327909940626,
            "data_time": 0.028949686459132602,
            "batch_time": 0.06481631596883138,
            "samples_per_second": 809783.9127330766,
            "samples_per_second_per_gpu": 101222.98909163458,
            "loss_sequences_lower_95": 1.1225052112486305,
            "loss_sequences_upper_95": 1.2672665293623762,
            "loss_tokens_lower_95": 1.0244747841415505,
            "loss_tokens_upper_95": 1.0957231269515217,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.0892512496651672,
            "data_time": 0.028957664966583252,
            "batch_time": 0.06519445067360288,
            "samples_per_second": 800775.2435864822,
            "samples_per_second_per_gpu": 100096.90544831028,
            "loss_sequences_lower_95": 0.982541984465064,
            "loss_sequences_upper_95": 1.1600174485183343,
            "loss_tokens_lower_95": 1.012036397801246,
            "loss_tokens_upper_95": 1.123723866844945,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.2085989019492778,
            "data_time": 0.029382177761622837,
            "batch_time": 0.06581148363295056,
            "samples_per_second": 797628.723688887,
            "samples_per_second_per_gpu": 99703.59046111087,
            "loss_sequences_lower_95": 1.1684450870606957,
            "loss_sequences_upper_95": 1.3021253329951588,
            "loss_tokens_lower_95": 1.0946015188627154,
            "loss_tokens_upper_95": 1.1643237580391477,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.0380093599698559,
            "data_time": 0.031236295346860534,
            "batch_time": 0.0683438601317229,
            "samples_per_second": 803803.2800253934,
            "samples_per_second_per_gpu": 100475.41000317417,
            "loss_sequences_lower_95": 1.0028410538383152,
            "loss_sequences_upper_95": 1.0955169440796657,
            "loss_tokens_lower_95": 0.9830095976756728,
            "loss_tokens_upper_95": 1.034820856907173,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.0249243981954528,
            "data_time": 0.030315251577468144,
            "batch_time": 0.06629899569920131,
            "samples_per_second": 810472.674013598,
            "samples_per_second_per_gpu": 101309.08425169975,
            "loss_sequences_lower_95": 1.017968519722543,
            "loss_sequences_upper_95": 1.1238512132225968,
            "loss_tokens_lower_95": 0.9103249527906495,
            "loss_tokens_upper_95": 0.9565394442364564,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-4.0/params.txt",
    "uuid": "691f1edd-24cd-49be-ab2d-a25668b8350f",
    "creation_date": "2023_12_14-07_47_55"
}