{
    "name": "rpj-d=1024_l=24_h=8-0.25",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 2058081280,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.25
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "411616256",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=1024_l=24_h=8-0.25",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.1948235809803007,
            "data_time": 0.039599742740392685,
            "batch_time": 0.4531300999224186,
            "samples_per_second": 691862.5019945037,
            "samples_per_second_per_gpu": 86482.81274931296,
            "loss_sequences_lower_95": 3.1236750729878744,
            "loss_sequences_upper_95": 3.2618838691711427,
            "loss_tokens_lower_95": 3.182459939320882,
            "loss_tokens_upper_95": 3.207026735941569,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.645167697049456,
            "data_time": 0.0009856164992518939,
            "batch_time": 0.03654048878863354,
            "samples_per_second": 902310.3089438442,
            "samples_per_second_per_gpu": 112788.78861798052,
            "loss_sequences_lower_95": 3.6426102899004693,
            "loss_sequences_upper_95": 3.6477061770579655,
            "loss_tokens_lower_95": 3.6341332604166667,
            "loss_tokens_upper_95": 3.6563570104166665,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9851145544830633,
            "data_time": 0.008558290481567383,
            "batch_time": 0.04385668849945069,
            "samples_per_second": 871319.9748697315,
            "samples_per_second_per_gpu": 108914.99685871643,
            "loss_sequences_lower_95": 2.9585575337312657,
            "loss_sequences_upper_95": 3.0117362633529976,
            "loss_tokens_lower_95": 2.973356125,
            "loss_tokens_upper_95": 2.9971871145833333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4821034988422985,
            "data_time": 0.0014229482529979002,
            "batch_time": 0.03673257835601505,
            "samples_per_second": 906826.4977997065,
            "samples_per_second_per_gpu": 113353.31222496332,
            "loss_sequences_lower_95": 3.469285015302835,
            "loss_sequences_upper_95": 3.494567322406572,
            "loss_tokens_lower_95": 3.4708394791666666,
            "loss_tokens_upper_95": 3.4930048385416668,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.633181870590645,
            "data_time": 0.008653858268403437,
            "batch_time": 0.04400852096983161,
            "samples_per_second": 865998.1286316912,
            "samples_per_second_per_gpu": 108249.7660789614,
            "loss_sequences_lower_95": 3.598468241332262,
            "loss_sequences_upper_95": 3.6668317744299737,
            "loss_tokens_lower_95": 3.622046083333333,
            "loss_tokens_upper_95": 3.6439524791666664,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4371572939576462,
            "data_time": 0.003315249862878219,
            "batch_time": 0.0388442232556965,
            "samples_per_second": 899188.8401800969,
            "samples_per_second_per_gpu": 112398.60502251211,
            "loss_sequences_lower_95": 3.394991814401343,
            "loss_sequences_upper_95": 3.479177642760027,
            "loss_tokens_lower_95": 3.4258382552083333,
            "loss_tokens_upper_95": 3.448411109375,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.1092373494712673,
            "data_time": 0.001453594904739355,
            "batch_time": 0.03675942806007423,
            "samples_per_second": 909536.5989062755,
            "samples_per_second_per_gpu": 113692.07486328443,
            "loss_sequences_lower_95": 2.083844866071429,
            "loss_sequences_upper_95": 2.134557941047513,
            "loss_tokens_lower_95": 2.0984823854166663,
            "loss_tokens_upper_95": 2.1202621614583332,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.9491167574278347,
            "data_time": 0.0016501966952679299,
            "batch_time": 0.037078268903910194,
            "samples_per_second": 907072.2553838755,
            "samples_per_second_per_gpu": 113384.03192298443,
            "loss_sequences_lower_95": 3.9402490285503924,
            "loss_sequences_upper_95": 3.957781434064136,
            "loss_tokens_lower_95": 3.9381755,
            "loss_tokens_upper_95": 3.9598271562500003,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7919556983118134,
            "data_time": 0.00851217338017055,
            "batch_time": 0.04867576701300485,
            "samples_per_second": 869096.0374173323,
            "samples_per_second_per_gpu": 108637.00467716654,
            "loss_sequences_lower_95": 3.749371865125206,
            "loss_sequences_upper_95": 3.8382609545699946,
            "loss_tokens_lower_95": 3.780819989583333,
            "loss_tokens_upper_95": 3.8031498333333333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.378658088299597,
            "data_time": 0.008955875411629677,
            "batch_time": 0.044444942846894264,
            "samples_per_second": 878330.9570893135,
            "samples_per_second_per_gpu": 109791.36963616419,
            "loss_sequences_lower_95": 4.342949522719553,
            "loss_sequences_upper_95": 4.410126046041255,
            "loss_tokens_lower_95": 4.366738135416666,
            "loss_tokens_upper_95": 4.390976833333333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6177014676580628,
            "data_time": 0.0012255514703668545,
            "batch_time": 0.03663795992066417,
            "samples_per_second": 907766.9034532944,
            "samples_per_second_per_gpu": 113470.8629316618,
            "loss_sequences_lower_95": 3.60913127425012,
            "loss_sequences_upper_95": 3.626078441108075,
            "loss_tokens_lower_95": 3.60654221875,
            "loss_tokens_upper_95": 3.628816,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5095048730247274,
            "data_time": 0.002418869838031702,
            "batch_time": 0.037765678418466786,
            "samples_per_second": 906796.7090084638,
            "samples_per_second_per_gpu": 113349.58862605797,
            "loss_sequences_lower_95": 3.499026620174537,
            "loss_sequences_upper_95": 3.51967214181591,
            "loss_tokens_lower_95": 3.4985683125,
            "loss_tokens_upper_95": 3.5203351666666665,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.9225485494122303,
            "data_time": 0.008590676567771217,
            "batch_time": 0.04390996623887375,
            "samples_per_second": 865598.1574211656,
            "samples_per_second_per_gpu": 108199.7696776457,
            "loss_sequences_lower_95": 3.8855919642574164,
            "loss_sequences_upper_95": 3.958540799216373,
            "loss_tokens_lower_95": 3.9113589166666665,
            "loss_tokens_upper_95": 3.9338483854166664,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.37544313273459,
            "data_time": 0.008613634869396924,
            "batch_time": 0.043915367696389734,
            "samples_per_second": 868253.7679097815,
            "samples_per_second_per_gpu": 108531.72098872269,
            "loss_sequences_lower_95": 3.3124377341959965,
            "loss_sequences_upper_95": 3.4377052446975243,
            "loss_tokens_lower_95": 3.3638421770833333,
            "loss_tokens_upper_95": 3.386876270833333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.1639720743352715,
            "data_time": 0.07258302824837821,
            "batch_time": 0.10663991315024239,
            "samples_per_second": 536236.3397882042,
            "samples_per_second_per_gpu": 67029.54247352552,
            "loss_sequences_lower_95": 4.0993650869889695,
            "loss_sequences_upper_95": 4.228750116174871,
            "loss_tokens_lower_95": 4.143551722439852,
            "loss_tokens_upper_95": 4.184847172823819,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0972902430737332,
            "data_time": 0.01208345727487044,
            "batch_time": 0.04756605489687486,
            "samples_per_second": 855125.0691312451,
            "samples_per_second_per_gpu": 106890.63364140564,
            "loss_sequences_lower_95": 2.994059161750638,
            "loss_sequences_upper_95": 3.199822001554528,
            "loss_tokens_lower_95": 3.0859984375,
            "loss_tokens_upper_95": 3.10843234375,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.8233351619388305,
            "data_time": 0.01149053250749906,
            "batch_time": 0.047196198254823685,
            "samples_per_second": 863143.7231454278,
            "samples_per_second_per_gpu": 107892.96539317847,
            "loss_sequences_lower_95": 5.765738535053183,
            "loss_sequences_upper_95": 5.8766532958340205,
            "loss_tokens_lower_95": 5.8116853229166665,
            "loss_tokens_upper_95": 5.8350005,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8776514569266896,
            "data_time": 0.032597094774246216,
            "batch_time": 0.06862180680036545,
            "samples_per_second": 770538.8235839175,
            "samples_per_second_per_gpu": 96317.35294798968,
            "loss_sequences_lower_95": 3.837018847856365,
            "loss_sequences_upper_95": 3.915506156546171,
            "loss_tokens_lower_95": 3.8653770259169278,
            "loss_tokens_upper_95": 3.8901045502209275,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.072086657618308,
            "data_time": 0.0015632982338208,
            "batch_time": 0.03695549430635361,
            "samples_per_second": 902506.1791722896,
            "samples_per_second_per_gpu": 112813.2723965362,
            "loss_sequences_lower_95": 5.048190487132352,
            "loss_sequences_upper_95": 5.0963691375649836,
            "loss_tokens_lower_95": 5.047985285215781,
            "loss_tokens_upper_95": 5.096461118875516,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3978214301191176,
            "data_time": 0.0016622332156084145,
            "batch_time": 0.03703844262536164,
            "samples_per_second": 902287.8581125528,
            "samples_per_second_per_gpu": 112785.9822640691,
            "loss_sequences_lower_95": 3.382901656791476,
            "loss_sequences_upper_95": 3.408672345679397,
            "loss_tokens_lower_95": 3.3841369404382036,
            "loss_tokens_upper_95": 3.4041435394290778,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.868051341205596,
            "data_time": 0.0029325462758372574,
            "batch_time": 0.03834975428675617,
            "samples_per_second": 899382.083297593,
            "samples_per_second_per_gpu": 112422.76041219913,
            "loss_sequences_lower_95": 5.116349210815717,
            "loss_sequences_upper_95": 5.412510275356489,
            "loss_tokens_lower_95": 4.324480004318561,
            "loss_tokens_upper_95": 4.53853034673204,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.045837835381429,
            "data_time": 0.003418230947027815,
            "batch_time": 0.038835819889890384,
            "samples_per_second": 895618.8592143251,
            "samples_per_second_per_gpu": 111952.35740179064,
            "loss_sequences_lower_95": 5.175435693359375,
            "loss_sequences_upper_95": 5.380008276367188,
            "loss_tokens_lower_95": 4.716928262578616,
            "loss_tokens_upper_95": 4.858585900648585,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.353203160732058,
            "data_time": 0.00417020957394423,
            "batch_time": 0.039466783112170646,
            "samples_per_second": 896425.7316027905,
            "samples_per_second_per_gpu": 112053.21645034882,
            "loss_sequences_lower_95": 3.3994494249461003,
            "loss_sequences_upper_95": 3.463255059023171,
            "loss_tokens_lower_95": 3.2539011077098055,
            "loss_tokens_upper_95": 3.2863618271666186,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.431122649799694,
            "data_time": 0.0209387583392007,
            "batch_time": 0.056947680456297736,
            "samples_per_second": 834923.8294126996,
            "samples_per_second_per_gpu": 104365.47867658745,
            "loss_sequences_lower_95": 2.4098096951571377,
            "loss_sequences_upper_95": 2.5226859214089132,
            "loss_tokens_lower_95": 2.3616850176043034,
            "loss_tokens_upper_95": 2.4094068152902346,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.567503769543706,
            "data_time": 0.018183765932917595,
            "batch_time": 0.05346992425620556,
            "samples_per_second": 824611.5877644081,
            "samples_per_second_per_gpu": 103076.44847055101,
            "loss_sequences_lower_95": 3.5572356368084344,
            "loss_sequences_upper_95": 3.751374187858737,
            "loss_tokens_lower_95": 3.4458278367256265,
            "loss_tokens_upper_95": 3.5398914342840215,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.103223966757456,
            "data_time": 0.015163128192608174,
            "batch_time": 0.05040740202634762,
            "samples_per_second": 833095.5407628285,
            "samples_per_second_per_gpu": 104136.94259535357,
            "loss_sequences_lower_95": 4.072885050455729,
            "loss_sequences_upper_95": 4.17281879679362,
            "loss_tokens_lower_95": 3.9660390658195883,
            "loss_tokens_upper_95": 4.1856050734172205,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.793963839058679,
            "data_time": 0.0012914477435520182,
            "batch_time": 0.03666541381868224,
            "samples_per_second": 904224.5623919702,
            "samples_per_second_per_gpu": 113028.07029899627,
            "loss_sequences_lower_95": 6.809507944367896,
            "loss_sequences_upper_95": 6.888796495927858,
            "loss_tokens_lower_95": 6.640269351886139,
            "loss_tokens_upper_95": 6.722698020688996,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.010212954284266,
            "data_time": 0.0028093367214970942,
            "batch_time": 0.038953080473330196,
            "samples_per_second": 899553.6535472074,
            "samples_per_second_per_gpu": 112444.20669340092,
            "loss_sequences_lower_95": 5.587737369216251,
            "loss_sequences_upper_95": 5.903006505484533,
            "loss_tokens_lower_95": 4.194352013971674,
            "loss_tokens_upper_95": 4.335991638904263,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.467923814424475,
            "data_time": 0.004538132129488765,
            "batch_time": 0.03977517421181138,
            "samples_per_second": 890976.4857511132,
            "samples_per_second_per_gpu": 111372.06071888914,
            "loss_sequences_lower_95": 4.946402842030183,
            "loss_sequences_upper_95": 5.293884600225976,
            "loss_tokens_lower_95": 4.017467887511597,
            "loss_tokens_upper_95": 4.176167360880021,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.831814606984456,
            "data_time": 0.020255063261304582,
            "batch_time": 0.05576522648334503,
            "samples_per_second": 834231.1867568835,
            "samples_per_second_per_gpu": 104278.89834461044,
            "loss_sequences_lower_95": 5.727270633227205,
            "loss_sequences_upper_95": 5.936529680365297,
            "loss_tokens_lower_95": 5.728215995335688,
            "loss_tokens_upper_95": 5.936905144121004,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7525454235076903,
            "data_time": 0.042170474162468545,
            "batch_time": 0.07782932886710534,
            "samples_per_second": 753037.3551320272,
            "samples_per_second_per_gpu": 94129.6693915034,
            "loss_sequences_lower_95": 3.621654037475586,
            "loss_sequences_upper_95": 4.002754783630371,
            "loss_tokens_lower_95": 3.4367284293678027,
            "loss_tokens_upper_95": 3.9093509353337432,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.183576559703206,
            "data_time": 0.002999372521304889,
            "batch_time": 0.03875377256202308,
            "samples_per_second": 890615.610670785,
            "samples_per_second_per_gpu": 111326.95133384812,
            "loss_sequences_lower_95": 5.128011188487558,
            "loss_sequences_upper_95": 5.238784499544204,
            "loss_tokens_lower_95": 5.127358930699047,
            "loss_tokens_upper_95": 5.238314060300985,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.217565598202767,
            "data_time": 0.0045883375513223025,
            "batch_time": 0.040389284613081916,
            "samples_per_second": 884294.2015144262,
            "samples_per_second_per_gpu": 110536.77518930327,
            "loss_sequences_lower_95": 5.155796960179285,
            "loss_sequences_upper_95": 5.278650672156531,
            "loss_tokens_lower_95": 5.153797866358901,
            "loss_tokens_upper_95": 5.2799414582373565,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.92479761393986,
            "data_time": 0.003317529130445443,
            "batch_time": 0.038868149722369134,
            "samples_per_second": 890412.300393257,
            "samples_per_second_per_gpu": 111301.53754915713,
            "loss_sequences_lower_95": 4.06852936355541,
            "loss_sequences_upper_95": 4.192949018177452,
            "loss_tokens_lower_95": 3.752184660442064,
            "loss_tokens_upper_95": 3.810070457020201,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.839029848575592,
            "data_time": 0.009193592704832554,
            "batch_time": 0.04439548961818218,
            "samples_per_second": 869255.3470463046,
            "samples_per_second_per_gpu": 108656.91838078808,
            "loss_sequences_lower_95": 6.036838989257813,
            "loss_sequences_upper_95": 6.5897935791015625,
            "loss_tokens_lower_95": 5.20546055004188,
            "loss_tokens_upper_95": 5.5727588504612005,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.173490807414055,
            "data_time": 0.13064712285995483,
            "batch_time": 0.17016328871250153,
            "samples_per_second": 480710.8603448114,
            "samples_per_second_per_gpu": 60088.857543101425,
            "loss_sequences_lower_95": 3.91245214343071,
            "loss_sequences_upper_95": 4.473959064483642,
            "loss_tokens_lower_95": 3.6726876094423493,
            "loss_tokens_upper_95": 4.544627231290971,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.049888890365074,
            "data_time": 0.02362237331715036,
            "batch_time": 0.058160718451154995,
            "samples_per_second": 793387.8241951743,
            "samples_per_second_per_gpu": 99173.47802439678,
            "loss_sequences_lower_95": 5.406852178463991,
            "loss_sequences_upper_95": 6.122002752896012,
            "loss_tokens_lower_95": 3.826212149945832,
            "loss_tokens_upper_95": 4.294873638647654,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.589876041604005,
            "data_time": 0.0027664868781963983,
            "batch_time": 0.038054039494858846,
            "samples_per_second": 898403.0609714634,
            "samples_per_second_per_gpu": 112300.38262143293,
            "loss_sequences_lower_95": 2.558193035134065,
            "loss_sequences_upper_95": 2.621205437136509,
            "loss_tokens_lower_95": 2.557636050003072,
            "loss_tokens_upper_95": 2.621681292705944,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.422278541595008,
            "data_time": 0.002451645203880159,
            "batch_time": 0.03782923535223751,
            "samples_per_second": 901691.8231540025,
            "samples_per_second_per_gpu": 112711.47789425032,
            "loss_sequences_lower_95": 3.3911655206160245,
            "loss_sequences_upper_95": 3.5631801825089755,
            "loss_tokens_lower_95": 3.233184834479242,
            "loss_tokens_upper_95": 3.401424880519732,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.482055744408688,
            "data_time": 0.01624340812365214,
            "batch_time": 0.05119204024473826,
            "samples_per_second": 829506.7034748055,
            "samples_per_second_per_gpu": 103688.33793435068,
            "loss_sequences_lower_95": 3.3405622880537433,
            "loss_sequences_upper_95": 3.7711337470309636,
            "loss_tokens_lower_95": 3.211624199146479,
            "loss_tokens_upper_95": 3.5185173945113752,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8901378641753284,
            "data_time": 0.004280350729823112,
            "batch_time": 0.03943008817732334,
            "samples_per_second": 893453.7903333759,
            "samples_per_second_per_gpu": 111681.72379167199,
            "loss_sequences_lower_95": 3.944670248295062,
            "loss_sequences_upper_95": 4.103078233678226,
            "loss_tokens_lower_95": 3.7356864320027525,
            "loss_tokens_upper_95": 3.885228626265969,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2312543901001534,
            "data_time": 0.028046031792958576,
            "batch_time": 0.0639439310346331,
            "samples_per_second": 811741.2476790947,
            "samples_per_second_per_gpu": 101467.65595988683,
            "loss_sequences_lower_95": 3.072295263336926,
            "loss_sequences_upper_95": 3.566431650301305,
            "loss_tokens_lower_95": 2.952697928150171,
            "loss_tokens_upper_95": 3.333356748397354,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.9151881648960485,
            "data_time": 0.001533049942729857,
            "batch_time": 0.0369319951967673,
            "samples_per_second": 902091.512159366,
            "samples_per_second_per_gpu": 112761.43901992076,
            "loss_sequences_lower_95": 4.898970350710767,
            "loss_sequences_upper_95": 4.930843180354821,
            "loss_tokens_lower_95": 4.899009313581467,
            "loss_tokens_upper_95": 4.930875149639303,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.1612122261408464,
            "data_time": 0.04131329709833319,
            "batch_time": 0.0774167624386874,
            "samples_per_second": 739110.71561611,
            "samples_per_second_per_gpu": 92388.83945201375,
            "loss_sequences_lower_95": 1.0962141481418055,
            "loss_sequences_upper_95": 1.287532124936002,
            "loss_tokens_lower_95": 0.9888198337144959,
            "loss_tokens_upper_95": 1.2174471899089208,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.6125552566108965,
            "data_time": 0.001074728602036047,
            "batch_time": 0.03642051858704128,
            "samples_per_second": 905156.5107056685,
            "samples_per_second_per_gpu": 113144.56383820856,
            "loss_sequences_lower_95": 4.946520786245414,
            "loss_sequences_upper_95": 4.990023216391509,
            "loss_tokens_lower_95": 4.09002582205029,
            "loss_tokens_upper_95": 4.133093568665377,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.4411561586856845,
            "data_time": 0.004966670558566139,
            "batch_time": 0.04036534069076417,
            "samples_per_second": 888868.0781676788,
            "samples_per_second_per_gpu": 111108.50977095985,
            "loss_sequences_lower_95": 6.444155480957031,
            "loss_sequences_upper_95": 6.675536389160156,
            "loss_tokens_lower_95": 6.188644439862481,
            "loss_tokens_upper_95": 6.403319708284996,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.565656483691671,
            "data_time": 0.019991820141420525,
            "batch_time": 0.05608748379400221,
            "samples_per_second": 825939.2358167814,
            "samples_per_second_per_gpu": 103242.40447709768,
            "loss_sequences_lower_95": 5.366401818316916,
            "loss_sequences_upper_95": 5.768783038595449,
            "loss_tokens_lower_95": 5.3649850994607675,
            "loss_tokens_upper_95": 5.763376876167628,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.600240523887403,
            "data_time": 0.004087335014917764,
            "batch_time": 0.03948260861707021,
            "samples_per_second": 892433.2370575422,
            "samples_per_second_per_gpu": 111554.15463219277,
            "loss_sequences_lower_95": 6.509750569661458,
            "loss_sequences_upper_95": 6.689568832859849,
            "loss_tokens_lower_95": 6.509041914506392,
            "loss_tokens_upper_95": 6.691021136659565,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.2493013211886088,
            "data_time": 0.003787235376682687,
            "batch_time": 0.03926599913455071,
            "samples_per_second": 894458.7480147132,
            "samples_per_second_per_gpu": 111807.34350183915,
            "loss_sequences_lower_95": 1.2961591634114584,
            "loss_sequences_upper_95": 1.3668001505533856,
            "loss_tokens_lower_95": 1.1575017487463737,
            "loss_tokens_upper_95": 1.2274875575230093,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.3823746386028475,
            "data_time": 0.021117966089929854,
            "batch_time": 0.05594033854348319,
            "samples_per_second": 810049.8662458257,
            "samples_per_second_per_gpu": 101256.23328072821,
            "loss_sequences_lower_95": 6.032000528971354,
            "loss_sequences_upper_95": 6.727766970679874,
            "loss_tokens_lower_95": 6.037513892764137,
            "loss_tokens_upper_95": 6.733839968726748,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.4490790963172913,
            "data_time": 0.13536547124385834,
            "batch_time": 0.17525337636470795,
            "samples_per_second": 490595.9083917419,
            "samples_per_second_per_gpu": 61324.48854896774,
            "loss_sequences_lower_95": 2.214672064781189,
            "loss_sequences_upper_95": 3.3524558663368222,
            "loss_tokens_lower_95": 1.8845819626641027,
            "loss_tokens_upper_95": 2.4489030346427993,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.780856457233429,
            "data_time": 0.005043832082597036,
            "batch_time": 0.04029834175866748,
            "samples_per_second": 891159.695439018,
            "samples_per_second_per_gpu": 111394.96192987724,
            "loss_sequences_lower_95": 7.727723510742187,
            "loss_sequences_upper_95": 8.081105725097656,
            "loss_tokens_lower_95": 7.454859224632509,
            "loss_tokens_upper_95": 7.7685175913441205,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.083531452655793,
            "data_time": 0.005217748501944164,
            "batch_time": 0.04089766362356761,
            "samples_per_second": 884295.8076122461,
            "samples_per_second_per_gpu": 110536.97595153077,
            "loss_sequences_lower_95": 7.169805322265625,
            "loss_sequences_upper_95": 7.4023947998046875,
            "loss_tokens_lower_95": 6.828556812269185,
            "loss_tokens_upper_95": 7.038292411762175,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.13770476236385,
            "data_time": 0.0033477550366251764,
            "batch_time": 0.03883278712779782,
            "samples_per_second": 894349.0645306502,
            "samples_per_second_per_gpu": 111793.63306633127,
            "loss_sequences_lower_95": 5.104012327996669,
            "loss_sequences_upper_95": 5.171549129965639,
            "loss_tokens_lower_95": 5.103903761942676,
            "loss_tokens_upper_95": 5.171720486729488,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.474293517077573,
            "data_time": 0.007179992076493462,
            "batch_time": 0.042534740309700864,
            "samples_per_second": 877436.9681157322,
            "samples_per_second_per_gpu": 109679.62101446652,
            "loss_sequences_lower_95": 4.3628899687079965,
            "loss_sequences_upper_95": 4.585602256669427,
            "loss_tokens_lower_95": 4.359278543826806,
            "loss_tokens_upper_95": 4.584129153015794,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.231983938694,
            "data_time": 0.00502268426002018,
            "batch_time": 0.04027418815900409,
            "samples_per_second": 890740.7182256164,
            "samples_per_second_per_gpu": 111342.58977820205,
            "loss_sequences_lower_95": 5.160230944824219,
            "loss_sequences_upper_95": 5.304886474609375,
            "loss_tokens_lower_95": 5.160762963867188,
            "loss_tokens_upper_95": 5.3055631591796875,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6111340376771386,
            "data_time": 0.0016123000525346551,
            "batch_time": 0.03704477176774427,
            "samples_per_second": 900804.7791160167,
            "samples_per_second_per_gpu": 112600.59738950209,
            "loss_sequences_lower_95": 4.15978002860395,
            "loss_sequences_upper_95": 4.260825624926088,
            "loss_tokens_lower_95": 2.9059180945538117,
            "loss_tokens_upper_95": 2.974906363639999,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.123893204020031,
            "data_time": 0.016718924045562744,
            "batch_time": 0.052138711724962504,
            "samples_per_second": 825305.3406198291,
            "samples_per_second_per_gpu": 103163.16757747864,
            "loss_sequences_lower_95": 5.914987535619024,
            "loss_sequences_upper_95": 6.330224575213532,
            "loss_tokens_lower_95": 5.914543299888497,
            "loss_tokens_upper_95": 6.330218744989651,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.039738950542375,
            "data_time": 0.00995323620736599,
            "batch_time": 0.045711043290793896,
            "samples_per_second": 873441.8308311161,
            "samples_per_second_per_gpu": 109180.22885388951,
            "loss_sequences_lower_95": 5.8809755332797184,
            "loss_sequences_upper_95": 6.1932247146905635,
            "loss_tokens_lower_95": 5.886480413698683,
            "loss_tokens_upper_95": 6.190560087316176,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6790421448464117,
            "data_time": 0.0017146761937867584,
            "batch_time": 0.0370489853446142,
            "samples_per_second": 902531.9714816982,
            "samples_per_second_per_gpu": 112816.49643521228,
            "loss_sequences_lower_95": 3.908132344709069,
            "loss_sequences_upper_95": 4.0033598190588595,
            "loss_tokens_lower_95": 3.1940076790313268,
            "loss_tokens_upper_95": 3.2701586172072625,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.3024712315312135,
            "data_time": 0.023134840031464893,
            "batch_time": 0.05930150051911672,
            "samples_per_second": 820422.4656587587,
            "samples_per_second_per_gpu": 102552.80820734483,
            "loss_sequences_lower_95": 5.151506672208271,
            "loss_sequences_upper_95": 5.44754871186756,
            "loss_tokens_lower_95": 5.154683099978815,
            "loss_tokens_upper_95": 5.44836114146722,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.058990696519158,
            "data_time": 0.002754674144134708,
            "batch_time": 0.03807577443501306,
            "samples_per_second": 899600.7376374301,
            "samples_per_second_per_gpu": 112450.09220467876,
            "loss_sequences_lower_95": 4.026275324923547,
            "loss_sequences_upper_95": 4.090423908758601,
            "loss_tokens_lower_95": 4.0277650978951645,
            "loss_tokens_upper_95": 4.090557282707378,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.227289265799291,
            "data_time": 0.020049116828224876,
            "batch_time": 0.055195552652532405,
            "samples_per_second": 801844.1682983353,
            "samples_per_second_per_gpu": 100230.52103729191,
            "loss_sequences_lower_95": 5.993259111422937,
            "loss_sequences_upper_95": 6.456205231009178,
            "loss_tokens_lower_95": 5.993059962004134,
            "loss_tokens_upper_95": 6.46313800996947,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.571681487560272,
            "data_time": 0.06769263744354248,
            "batch_time": 0.10426735132932663,
            "samples_per_second": 653999.3386591772,
            "samples_per_second_per_gpu": 81749.91733239715,
            "loss_sequences_lower_95": 3.2671556663513184,
            "loss_sequences_upper_95": 4.081516615549723,
            "loss_tokens_lower_95": 2.889035193125407,
            "loss_tokens_upper_95": 3.8345072004530163,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.318337776263555,
            "data_time": 0.0689840018749237,
            "batch_time": 0.10581284761428833,
            "samples_per_second": 645771.8097387103,
            "samples_per_second_per_gpu": 80721.47621733879,
            "loss_sequences_lower_95": 3.11327693939209,
            "loss_sequences_upper_95": 3.9659072176615395,
            "loss_tokens_lower_95": 2.5389832614512926,
            "loss_tokens_upper_95": 3.6403026023607574,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9355521843605437,
            "data_time": 0.0032784000012789005,
            "batch_time": 0.038601906458162576,
            "samples_per_second": 899456.1245416442,
            "samples_per_second_per_gpu": 112432.01556770553,
            "loss_sequences_lower_95": 2.905721613528627,
            "loss_sequences_upper_95": 2.964136065790685,
            "loss_tokens_lower_95": 2.905955032101436,
            "loss_tokens_upper_95": 2.964710166605302,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.8073880330275628,
            "data_time": 0.001034479270357011,
            "batch_time": 0.03639670190782464,
            "samples_per_second": 904817.9942793904,
            "samples_per_second_per_gpu": 113102.2492849238,
            "loss_sequences_lower_95": 0.9462649606463277,
            "loss_sequences_upper_95": 0.9682889810919655,
            "loss_tokens_lower_95": 0.6543223857397356,
            "loss_tokens_upper_95": 0.6664366606473459,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.1596330770357386,
            "data_time": 0.033846136182546616,
            "batch_time": 0.07001425325870514,
            "samples_per_second": 798643.4045119756,
            "samples_per_second_per_gpu": 99830.42556399695,
            "loss_sequences_lower_95": 2.0698581905815545,
            "loss_sequences_upper_95": 2.3469830130028915,
            "loss_tokens_lower_95": 1.9176577210807004,
            "loss_tokens_upper_95": 2.061123589009918,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4849442082482414,
            "data_time": 0.09550761041187104,
            "batch_time": 0.13436596734183176,
            "samples_per_second": 480541.2061416676,
            "samples_per_second_per_gpu": 60067.65076770845,
            "loss_sequences_lower_95": 3.1707545615531303,
            "loss_sequences_upper_95": 3.8599889188199428,
            "loss_tokens_lower_95": 3.011500831886574,
            "loss_tokens_upper_95": 3.8739249900535295,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.9964597174307195,
            "data_time": 0.027019384361448743,
            "batch_time": 0.06298527831122988,
            "samples_per_second": 812818.2494427687,
            "samples_per_second_per_gpu": 101602.2811803461,
            "loss_sequences_lower_95": 1.9470437352250263,
            "loss_sequences_upper_95": 2.182607678669255,
            "loss_tokens_lower_95": 1.8036879155934609,
            "loss_tokens_upper_95": 1.9197872506439233,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0454244926208403,
            "data_time": 0.027087180387406123,
            "batch_time": 0.06295003493626912,
            "samples_per_second": 808181.6611210647,
            "samples_per_second_per_gpu": 101022.70764013309,
            "loss_sequences_lower_95": 2.0338432684177308,
            "loss_sequences_upper_95": 2.2488549581388146,
            "loss_tokens_lower_95": 1.846345566724632,
            "loss_tokens_upper_95": 1.9427229201177143,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.032759132181726,
            "data_time": 0.02756231455575852,
            "batch_time": 0.06346458764303298,
            "samples_per_second": 806765.7493758003,
            "samples_per_second_per_gpu": 100845.71867197503,
            "loss_sequences_lower_95": 1.8955218105781368,
            "loss_sequences_upper_95": 2.1503548877995184,
            "loss_tokens_lower_95": 1.9106067308685848,
            "loss_tokens_upper_95": 2.0660419308364633,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.1344165889228264,
            "data_time": 0.02774129311243693,
            "batch_time": 0.06388504164559501,
            "samples_per_second": 806188.463384163,
            "samples_per_second_per_gpu": 100773.55792302037,
            "loss_sequences_lower_95": 2.125329962009337,
            "loss_sequences_upper_95": 2.3247525796657653,
            "loss_tokens_lower_95": 1.9360740542783172,
            "loss_tokens_upper_95": 2.0265927894093165,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.785482574693905,
            "data_time": 0.026869732656596618,
            "batch_time": 0.06289652541831688,
            "samples_per_second": 817751.3750693215,
            "samples_per_second_per_gpu": 102218.92188366519,
            "loss_sequences_lower_95": 1.7355253800101902,
            "loss_sequences_upper_95": 1.8576570025141934,
            "loss_tokens_lower_95": 1.7122849505977624,
            "loss_tokens_upper_95": 1.781502443834089,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.606676690462159,
            "data_time": 0.02695577201389131,
            "batch_time": 0.0625138453074864,
            "samples_per_second": 814648.7657325899,
            "samples_per_second_per_gpu": 101831.09571657373,
            "loss_sequences_lower_95": 1.5962229054148604,
            "loss_sequences_upper_95": 1.7324481871069932,
            "loss_tokens_lower_95": 1.4578581582702306,
            "loss_tokens_upper_95": 1.5178150252882008,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.25/params.txt",
    "uuid": "9a662cc4-ff6b-447b-a8e1-64238c40b3bb",
    "creation_date": "2023_12_14-07_44_07"
}