{
    "name": "rpj-d=576_l=24_h=8-4.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 12294190080,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 4.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "2458838016",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=576_l=24_h=8-4.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 2.8130369464556377,
            "data_time": 0.03856232389807701,
            "batch_time": 0.37322206795215607,
            "samples_per_second": 834372.263917215,
            "samples_per_second_per_gpu": 104296.53298965188,
            "loss_sequences_lower_95": 2.7453269068400066,
            "loss_sequences_upper_95": 2.8774213536580406,
            "loss_tokens_lower_95": 2.8013723119099936,
            "loss_tokens_upper_95": 2.82468656539917,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3138508520543835,
            "data_time": 0.0010996856395069034,
            "batch_time": 0.030436704291334334,
            "samples_per_second": 1088651.079182615,
            "samples_per_second_per_gpu": 136081.38489782688,
            "loss_sequences_lower_95": 3.3112377976505605,
            "loss_sequences_upper_95": 3.3164199358227746,
            "loss_tokens_lower_95": 3.3032355208333333,
            "loss_tokens_upper_95": 3.324557213541667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.726987174092507,
            "data_time": 0.010309277534484863,
            "batch_time": 0.039348427772521975,
            "samples_per_second": 1073100.897264236,
            "samples_per_second_per_gpu": 134137.6121580295,
            "loss_sequences_lower_95": 2.701376915756537,
            "loss_sequences_upper_95": 2.7525828334263394,
            "loss_tokens_lower_95": 2.7156363385416666,
            "loss_tokens_upper_95": 2.738469578125,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1477329122897277,
            "data_time": 0.001709666299192529,
            "batch_time": 0.030700984557992535,
            "samples_per_second": 1104709.9390600987,
            "samples_per_second_per_gpu": 138088.74238251234,
            "loss_sequences_lower_95": 3.1350481435244846,
            "loss_sequences_upper_95": 3.1600748429445877,
            "loss_tokens_lower_95": 3.136945125,
            "loss_tokens_upper_95": 3.1583577239583334,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3072772239716137,
            "data_time": 0.010155379534717575,
            "batch_time": 0.03881605972806771,
            "samples_per_second": 1072798.5191260537,
            "samples_per_second_per_gpu": 134099.8148907567,
            "loss_sequences_lower_95": 3.2724391339024312,
            "loss_sequences_upper_95": 3.341254407956498,
            "loss_tokens_lower_95": 3.2964735677083334,
            "loss_tokens_upper_95": 3.3177461197916664,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0834113786530124,
            "data_time": 0.003947167616823445,
            "batch_time": 0.03299173745124236,
            "samples_per_second": 1100820.204820127,
            "samples_per_second_per_gpu": 137602.52560251587,
            "loss_sequences_lower_95": 3.041741789837908,
            "loss_sequences_upper_95": 3.1249453131638765,
            "loss_tokens_lower_95": 3.0723968645833333,
            "loss_tokens_upper_95": 3.094362442708333,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.7600695417486891,
            "data_time": 0.0016210339781121757,
            "batch_time": 0.030625067468957448,
            "samples_per_second": 1106995.8016617533,
            "samples_per_second_per_gpu": 138374.47520771917,
            "loss_sequences_lower_95": 1.7378710538903062,
            "loss_sequences_upper_95": 1.7822054019850129,
            "loss_tokens_lower_95": 1.7502704010416668,
            "loss_tokens_upper_95": 1.770285859375,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6704812131751896,
            "data_time": 0.0017098658556521362,
            "batch_time": 0.030638010817958142,
            "samples_per_second": 1109076.9306347936,
            "samples_per_second_per_gpu": 138634.6163293492,
            "loss_sequences_lower_95": 3.6610756810373033,
            "loss_sequences_upper_95": 3.6797046997709426,
            "loss_tokens_lower_95": 3.659666875,
            "loss_tokens_upper_95": 3.6810653541666665,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4718113483452218,
            "data_time": 0.011705493170117575,
            "batch_time": 0.049578559777093315,
            "samples_per_second": 1075345.1474101602,
            "samples_per_second_per_gpu": 134418.14342627002,
            "loss_sequences_lower_95": 3.428439144971894,
            "loss_sequences_upper_95": 3.5185332631677144,
            "loss_tokens_lower_95": 3.4608603020833333,
            "loss_tokens_upper_95": 3.482902166666667,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.079023551092789,
            "data_time": 0.011008854024112225,
            "batch_time": 0.0408863490447402,
            "samples_per_second": 1056245.6778995506,
            "samples_per_second_per_gpu": 132030.70973744383,
            "loss_sequences_lower_95": 4.038635766553313,
            "loss_sequences_upper_95": 4.113797238315989,
            "loss_tokens_lower_95": 4.0671269375,
            "loss_tokens_upper_95": 4.091201427083333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.24229689818695,
            "data_time": 0.0013292461723522083,
            "batch_time": 0.03030619721897467,
            "samples_per_second": 1108460.0027087533,
            "samples_per_second_per_gpu": 138557.50033859417,
            "loss_sequences_lower_95": 3.234101322943076,
            "loss_sequences_upper_95": 3.2504780565857545,
            "loss_tokens_lower_95": 3.2315670625,
            "loss_tokens_upper_95": 3.25300628125,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1513188619109207,
            "data_time": 0.0026623402705895317,
            "batch_time": 0.031486304376047915,
            "samples_per_second": 1110142.140502664,
            "samples_per_second_per_gpu": 138767.767562833,
            "loss_sequences_lower_95": 3.1409979423449865,
            "loss_sequences_upper_95": 3.1614680152394183,
            "loss_tokens_lower_95": 3.1408845364583335,
            "loss_tokens_upper_95": 3.1618536458333337,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6367271193142594,
            "data_time": 0.010303500141550901,
            "batch_time": 0.03940507074589786,
            "samples_per_second": 1055980.5021298234,
            "samples_per_second_per_gpu": 131997.56276622793,
            "loss_sequences_lower_95": 3.5991539450252756,
            "loss_sequences_upper_95": 3.6724464896242712,
            "loss_tokens_lower_95": 3.625614104166667,
            "loss_tokens_upper_95": 3.647855020833333,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0320956541904365,
            "data_time": 0.010580347828656078,
            "batch_time": 0.040413866005095826,
            "samples_per_second": 1038731.5606751869,
            "samples_per_second_per_gpu": 129841.44508439836,
            "loss_sequences_lower_95": 2.968274597043661,
            "loss_sequences_upper_95": 3.0939870550773296,
            "loss_tokens_lower_95": 3.020901010416667,
            "loss_tokens_upper_95": 3.0431996197916664,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8919234546748074,
            "data_time": 0.08774650948388237,
            "batch_time": 0.12056887149810791,
            "samples_per_second": 557715.9389247695,
            "samples_per_second_per_gpu": 69714.49236559619,
            "loss_sequences_lower_95": 3.830742428519509,
            "loss_sequences_upper_95": 3.952768837321888,
            "loss_tokens_lower_95": 3.8720048124139956,
            "loss_tokens_upper_95": 3.9123568188060416,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.721441353722842,
            "data_time": 0.014705804261294279,
            "batch_time": 0.04421104084361683,
            "samples_per_second": 1041644.4665759011,
            "samples_per_second_per_gpu": 130205.55832198764,
            "loss_sequences_lower_95": 2.626140067320176,
            "loss_sequences_upper_95": 2.816776704927227,
            "loss_tokens_lower_95": 2.7106906041666665,
            "loss_tokens_upper_95": 2.7320825260416663,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.609401740625225,
            "data_time": 0.013297446072101593,
            "batch_time": 0.04290237153569857,
            "samples_per_second": 1055102.536569754,
            "samples_per_second_per_gpu": 131887.81707121924,
            "loss_sequences_lower_95": 5.547767401622279,
            "loss_sequences_upper_95": 5.66485949996908,
            "loss_tokens_lower_95": 5.5979527395833335,
            "loss_tokens_upper_95": 5.620811833333333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.38946644790837,
            "data_time": 0.038641903549432755,
            "batch_time": 0.06897606328129768,
            "samples_per_second": 932848.2870923494,
            "samples_per_second_per_gpu": 116606.03588654367,
            "loss_sequences_lower_95": 3.319574618730389,
            "loss_sequences_upper_95": 3.4422136775782852,
            "loss_tokens_lower_95": 3.377503854720319,
            "loss_tokens_upper_95": 3.401472810839043,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.097962851725384,
            "data_time": 0.0017107643339791439,
            "batch_time": 0.030911869604929705,
            "samples_per_second": 1093734.6444288148,
            "samples_per_second_per_gpu": 136716.83055360185,
            "loss_sequences_lower_95": 5.075055942663082,
            "loss_sequences_upper_95": 5.121356925317796,
            "loss_tokens_lower_95": 5.074709813083072,
            "loss_tokens_upper_95": 5.121134392581185,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.078866384960555,
            "data_time": 0.0018459495370555076,
            "batch_time": 0.030752260926043152,
            "samples_per_second": 1100379.5115459866,
            "samples_per_second_per_gpu": 137547.43894324833,
            "loss_sequences_lower_95": 3.0680163819769466,
            "loss_sequences_upper_95": 3.0933602988541873,
            "loss_tokens_lower_95": 3.0626267042741886,
            "loss_tokens_upper_95": 3.0817905902340086,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9438305200131705,
            "data_time": 0.0031574008377175866,
            "batch_time": 0.0318911309291685,
            "samples_per_second": 1107598.7206779544,
            "samples_per_second_per_gpu": 138449.8400847443,
            "loss_sequences_lower_95": 4.2087647406965925,
            "loss_sequences_upper_95": 4.492524303351589,
            "loss_tokens_lower_95": 3.381380196699279,
            "loss_tokens_upper_95": 3.5908677968917533,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.034505999825895,
            "data_time": 0.0034625612674875463,
            "batch_time": 0.03232176918932732,
            "samples_per_second": 1095997.108405451,
            "samples_per_second_per_gpu": 136999.63855068138,
            "loss_sequences_lower_95": 4.1119194498697915,
            "loss_sequences_upper_95": 4.311638199869792,
            "loss_tokens_lower_95": 3.7918619975923744,
            "loss_tokens_upper_95": 3.9324598933765724,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.908420735547902,
            "data_time": 0.0048387442598875115,
            "batch_time": 0.0343177045272666,
            "samples_per_second": 1075818.7305414681,
            "samples_per_second_per_gpu": 134477.34131768352,
            "loss_sequences_lower_95": 2.954134930544861,
            "loss_sequences_upper_95": 3.014417295188412,
            "loss_tokens_lower_95": 2.8135199883844737,
            "loss_tokens_upper_95": 2.8440767307839674,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.1177100159905176,
            "data_time": 0.025090143084526062,
            "batch_time": 0.05511644695486341,
            "samples_per_second": 999929.1024955784,
            "samples_per_second_per_gpu": 124991.1378119473,
            "loss_sequences_lower_95": 2.0989522379094905,
            "loss_sequences_upper_95": 2.2040301270918414,
            "loss_tokens_lower_95": 2.053236790474151,
            "loss_tokens_upper_95": 2.0972210276944976,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.251204912029967,
            "data_time": 0.02070266753435135,
            "batch_time": 0.05096142552793026,
            "samples_per_second": 985550.4354507289,
            "samples_per_second_per_gpu": 123193.80443134111,
            "loss_sequences_lower_95": 3.242457362583705,
            "loss_sequences_upper_95": 3.43598032425861,
            "loss_tokens_lower_95": 3.125202737548663,
            "loss_tokens_upper_95": 3.2150305795637273,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.361599574883779,
            "data_time": 0.01751092152717786,
            "batch_time": 0.04670894145965576,
            "samples_per_second": 1016101.5156408822,
            "samples_per_second_per_gpu": 127012.68945511027,
            "loss_sequences_lower_95": 3.3330503997802734,
            "loss_sequences_upper_95": 3.4347776896158853,
            "loss_tokens_lower_95": 3.2303511611316056,
            "loss_tokens_upper_95": 3.4261731500236596,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.425622245466134,
            "data_time": 0.0014305892580548747,
            "batch_time": 0.030759754148997048,
            "samples_per_second": 1090950.258900145,
            "samples_per_second_per_gpu": 136368.78236251813,
            "loss_sequences_lower_95": 5.433947659594755,
            "loss_sequences_upper_95": 5.511372577552163,
            "loss_tokens_lower_95": 5.28924485014812,
            "loss_tokens_upper_95": 5.368888526386832,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.349753544587479,
            "data_time": 0.0030081550146909366,
            "batch_time": 0.03160141918483197,
            "samples_per_second": 1110522.35714211,
            "samples_per_second_per_gpu": 138815.29464276374,
            "loss_sequences_lower_95": 4.853748606672191,
            "loss_sequences_upper_95": 5.15071678290062,
            "loss_tokens_lower_95": 3.6348774451759054,
            "loss_tokens_upper_95": 3.7689934088683583,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.974816068600062,
            "data_time": 0.005052019615431089,
            "batch_time": 0.0345630943775177,
            "samples_per_second": 1067575.9106735673,
            "samples_per_second_per_gpu": 133446.98883419592,
            "loss_sequences_lower_95": 4.380220255672729,
            "loss_sequences_upper_95": 4.718069260112254,
            "loss_tokens_lower_95": 3.5720093521851806,
            "loss_tokens_upper_95": 3.725438428979106,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.699124105444782,
            "data_time": 0.023130753210612705,
            "batch_time": 0.05305775787149157,
            "samples_per_second": 995328.8149557816,
            "samples_per_second_per_gpu": 124416.1018694727,
            "loss_sequences_lower_95": 5.622017721933861,
            "loss_sequences_upper_95": 5.773139584336651,
            "loss_tokens_lower_95": 5.62313795394549,
            "loss_tokens_upper_95": 5.77187791240814,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.430414733886719,
            "data_time": 0.04970572544978215,
            "batch_time": 0.07998492167546199,
            "samples_per_second": 903151.8741266902,
            "samples_per_second_per_gpu": 112893.98426583628,
            "loss_sequences_lower_95": 3.2942986068725584,
            "loss_sequences_upper_95": 3.6531332015991214,
            "loss_tokens_lower_95": 3.1180784440424447,
            "loss_tokens_upper_95": 3.586725595982642,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.890040588598534,
            "data_time": 0.0034645203432422476,
            "batch_time": 0.03272258284633145,
            "samples_per_second": 1088476.960533226,
            "samples_per_second_per_gpu": 136059.62006665324,
            "loss_sequences_lower_95": 4.841468921922979,
            "loss_sequences_upper_95": 4.9398367156513885,
            "loss_tokens_lower_95": 4.840551395474828,
            "loss_tokens_upper_95": 4.9392412199334705,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.0857639191679835,
            "data_time": 0.004881590847867931,
            "batch_time": 0.03436541323949618,
            "samples_per_second": 1078134.6981977907,
            "samples_per_second_per_gpu": 134766.83727472383,
            "loss_sequences_lower_95": 5.015448003045659,
            "loss_sequences_upper_95": 5.1542604638551905,
            "loss_tokens_lower_95": 5.012773573466933,
            "loss_tokens_upper_95": 5.155444815820792,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.544227139864186,
            "data_time": 0.0034966388994880165,
            "batch_time": 0.03288787822229043,
            "samples_per_second": 1081394.991848259,
            "samples_per_second_per_gpu": 135174.37398103238,
            "loss_sequences_lower_95": 3.696184093985905,
            "loss_sequences_upper_95": 3.8215270132701984,
            "loss_tokens_lower_95": 3.3612442387481543,
            "loss_tokens_upper_95": 3.4173244128451277,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.425385359764099,
            "data_time": 0.010804340243339539,
            "batch_time": 0.04045536648482084,
            "samples_per_second": 1042140.8664653454,
            "samples_per_second_per_gpu": 130267.60830816817,
            "loss_sequences_lower_95": 5.612168115234375,
            "loss_sequences_upper_95": 6.164488146972657,
            "loss_tokens_lower_95": 4.80724238882311,
            "loss_tokens_upper_95": 5.167734295827991,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6914127618074417,
            "data_time": 0.15824812650680542,
            "batch_time": 0.192834734916687,
            "samples_per_second": 537145.8088541892,
            "samples_per_second_per_gpu": 67143.22610677365,
            "loss_sequences_lower_95": 3.4513047873973846,
            "loss_sequences_upper_95": 3.938292849063873,
            "loss_tokens_lower_95": 3.2190102062006107,
            "loss_tokens_upper_95": 4.065029652913411,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.52071044773891,
            "data_time": 0.02767733817404889,
            "batch_time": 0.05720865472834161,
            "samples_per_second": 940952.7447225319,
            "samples_per_second_per_gpu": 117619.09309031648,
            "loss_sequences_lower_95": 4.912721401521529,
            "loss_sequences_upper_95": 5.6742206880416,
            "loss_tokens_lower_95": 3.2936191150273926,
            "loss_tokens_upper_95": 3.727085805933801,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.250900107135206,
            "data_time": 0.0029790523565477794,
            "batch_time": 0.032177373559938535,
            "samples_per_second": 1085863.4580358046,
            "samples_per_second_per_gpu": 135732.93225447557,
            "loss_sequences_lower_95": 2.216482590608276,
            "loss_sequences_upper_95": 2.2857819496778067,
            "loss_tokens_lower_95": 2.215287928083361,
            "loss_tokens_upper_95": 2.2860259996433347,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.763998031658819,
            "data_time": 0.0024616356547321975,
            "batch_time": 0.03145742027806736,
            "samples_per_second": 1098482.2561715404,
            "samples_per_second_per_gpu": 137310.28202144254,
            "loss_sequences_lower_95": 2.7362592889988235,
            "loss_sequences_upper_95": 2.88532880194183,
            "loss_tokens_lower_95": 2.606911293555471,
            "loss_tokens_upper_95": 2.7505363127805436,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1829105539199634,
            "data_time": 0.019042059779167175,
            "batch_time": 0.048372303446133934,
            "samples_per_second": 995776.6992488733,
            "samples_per_second_per_gpu": 124472.08740610916,
            "loss_sequences_lower_95": 3.052130417596726,
            "loss_sequences_upper_95": 3.450381648584163,
            "loss_tokens_lower_95": 2.9314782522873783,
            "loss_tokens_upper_95": 3.225916650932658,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6466887937160393,
            "data_time": 0.0049183454364538194,
            "batch_time": 0.034944978356361386,
            "samples_per_second": 1053903.1030411823,
            "samples_per_second_per_gpu": 131737.88788014778,
            "loss_sequences_lower_95": 3.6844281444233427,
            "loss_sequences_upper_95": 3.8398723840148974,
            "loss_tokens_lower_95": 3.5015405031307782,
            "loss_tokens_upper_95": 3.6464845719868886,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.838131219875522,
            "data_time": 0.02919251294363113,
            "batch_time": 0.05893737929207938,
            "samples_per_second": 983437.6204309847,
            "samples_per_second_per_gpu": 122929.70255387308,
            "loss_sequences_lower_95": 2.691728061582984,
            "loss_sequences_upper_95": 3.136555313482517,
            "loss_tokens_lower_95": 2.5594401373559847,
            "loss_tokens_upper_95": 2.9171881671052966,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.3076413338078,
            "data_time": 0.001847855672329557,
            "batch_time": 0.03106813072634372,
            "samples_per_second": 1093687.5159650457,
            "samples_per_second_per_gpu": 136710.9394956307,
            "loss_sequences_lower_95": 4.29127970125275,
            "loss_sequences_upper_95": 4.323598245039633,
            "loss_tokens_lower_95": 4.291344010989698,
            "loss_tokens_upper_95": 4.323449855986823,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.8157711671394052,
            "data_time": 0.04679299267855558,
            "batch_time": 0.07749451290477406,
            "samples_per_second": 870788.4567914974,
            "samples_per_second_per_gpu": 108848.55709893718,
            "loss_sequences_lower_95": 0.7618961315710567,
            "loss_sequences_upper_95": 0.905019961977468,
            "loss_tokens_lower_95": 0.6760703142492164,
            "loss_tokens_upper_95": 0.8707285636035056,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.578171954133725,
            "data_time": 0.0012681140359069557,
            "batch_time": 0.03077680132315125,
            "samples_per_second": 1083988.387884119,
            "samples_per_second_per_gpu": 135498.54848551488,
            "loss_sequences_lower_95": 4.952424004192872,
            "loss_sequences_upper_95": 4.999655388086347,
            "loss_tokens_lower_95": 3.9994554763056094,
            "loss_tokens_upper_95": 4.045851964458413,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.282045358896256,
            "data_time": 0.005797851180273389,
            "batch_time": 0.03533331978888739,
            "samples_per_second": 1067850.895044438,
            "samples_per_second_per_gpu": 133481.36188055476,
            "loss_sequences_lower_95": 5.300578588867187,
            "loss_sequences_upper_95": 5.591278564453125,
            "loss_tokens_lower_95": 4.957357110704273,
            "loss_tokens_upper_95": 5.2222193270253285,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.363614244046419,
            "data_time": 0.02408970614611092,
            "batch_time": 0.05482569993552515,
            "samples_per_second": 993846.657529356,
            "samples_per_second_per_gpu": 124230.8321911695,
            "loss_sequences_lower_95": 5.200785217285157,
            "loss_sequences_upper_95": 5.52646781589674,
            "loss_tokens_lower_95": 5.2011474874745245,
            "loss_tokens_upper_95": 5.5239312346085265,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.409635495417046,
            "data_time": 0.004711244120655289,
            "batch_time": 0.034097383180296564,
            "samples_per_second": 1081090.933201293,
            "samples_per_second_per_gpu": 135136.36665016162,
            "loss_sequences_lower_95": 7.298880374792851,
            "loss_sequences_upper_95": 7.518133156516336,
            "loss_tokens_lower_95": 7.3011011851917615,
            "loss_tokens_upper_95": 7.5161531760475855,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.2917565831343334,
            "data_time": 0.004140371654896026,
            "batch_time": 0.03287963822801063,
            "samples_per_second": 1101619.4777905443,
            "samples_per_second_per_gpu": 137702.43472381803,
            "loss_sequences_lower_95": 1.3387063212076824,
            "loss_sequences_upper_95": 1.4029956298828126,
            "loss_tokens_lower_95": 1.2013164777629803,
            "loss_tokens_upper_95": 1.2763707182091588,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.912227276393345,
            "data_time": 0.022366002202033997,
            "batch_time": 0.05200698971748352,
            "samples_per_second": 957321.005950797,
            "samples_per_second_per_gpu": 119665.12574384962,
            "loss_sequences_lower_95": 5.549786362420945,
            "loss_sequences_upper_95": 6.277994631812686,
            "loss_tokens_lower_95": 5.5541244361514135,
            "loss_tokens_upper_95": 6.278552289690291,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.135953899472952,
            "data_time": 0.16243816912174225,
            "batch_time": 0.19707627594470978,
            "samples_per_second": 525397.4178228495,
            "samples_per_second_per_gpu": 65674.67722785618,
            "loss_sequences_lower_95": 1.9287083268165588,
            "loss_sequences_upper_95": 2.7922259509563445,
            "loss_tokens_lower_95": 1.6631042354623067,
            "loss_tokens_upper_95": 2.142975237344958,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.693615876674652,
            "data_time": 0.006197901472212776,
            "batch_time": 0.03531933209252736,
            "samples_per_second": 1080798.59349887,
            "samples_per_second_per_gpu": 135099.82418735875,
            "loss_sequences_lower_95": 7.624725561523437,
            "loss_sequences_upper_95": 7.975919982910156,
            "loss_tokens_lower_95": 7.401637270648266,
            "loss_tokens_upper_95": 7.708181086248149,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.605311569690705,
            "data_time": 0.005897269835547795,
            "batch_time": 0.03462081012271699,
            "samples_per_second": 1093749.160257837,
            "samples_per_second_per_gpu": 136718.64503222963,
            "loss_sequences_lower_95": 6.6825044799804685,
            "loss_sequences_upper_95": 6.889283740234375,
            "loss_tokens_lower_95": 6.387496503544379,
            "loss_tokens_upper_95": 6.576295192300169,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.86375955259724,
            "data_time": 0.004066165235130284,
            "batch_time": 0.03336945473151064,
            "samples_per_second": 1079080.211405229,
            "samples_per_second_per_gpu": 134885.02642565363,
            "loss_sequences_lower_95": 4.822735073947683,
            "loss_sequences_upper_95": 4.904982883147733,
            "loss_tokens_lower_95": 4.823208721492102,
            "loss_tokens_upper_95": 4.905506455526631,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.582273772784641,
            "data_time": 0.00862119133018655,
            "batch_time": 0.03801040851097453,
            "samples_per_second": 1055280.3314060639,
            "samples_per_second_per_gpu": 131910.04142575798,
            "loss_sequences_lower_95": 4.470299964822749,
            "loss_sequences_upper_95": 4.690578318887409,
            "loss_tokens_lower_95": 4.4697326097620245,
            "loss_tokens_upper_95": 4.690853042719734,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.264798843383789,
            "data_time": 0.006028969609548175,
            "batch_time": 0.03529263867272271,
            "samples_per_second": 1074435.1145488354,
            "samples_per_second_per_gpu": 134304.38931860443,
            "loss_sequences_lower_95": 5.142256506347656,
            "loss_sequences_upper_95": 5.393620544433594,
            "loss_tokens_lower_95": 5.139639819335938,
            "loss_tokens_upper_95": 5.394431335449219,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.8907615384236753,
            "data_time": 0.0019527489837040748,
            "batch_time": 0.030982600853889337,
            "samples_per_second": 1096321.323071453,
            "samples_per_second_per_gpu": 137040.16538393163,
            "loss_sequences_lower_95": 3.3440016519335383,
            "loss_sequences_upper_95": 3.435719505750355,
            "loss_tokens_lower_95": 2.3208920723270277,
            "loss_tokens_upper_95": 2.3809716028305354,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.312996596542757,
            "data_time": 0.019334910597120013,
            "batch_time": 0.04861359766551426,
            "samples_per_second": 1007425.6517478332,
            "samples_per_second_per_gpu": 125928.20646847915,
            "loss_sequences_lower_95": 5.133623049152431,
            "loss_sequences_upper_95": 5.489406642629139,
            "loss_tokens_lower_95": 5.13672961334684,
            "loss_tokens_upper_95": 5.48803267977131,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.432720999624215,
            "data_time": 0.011194076389074326,
            "batch_time": 0.04086644295603037,
            "samples_per_second": 1057402.7856743606,
            "samples_per_second_per_gpu": 132175.34820929507,
            "loss_sequences_lower_95": 5.309345499674479,
            "loss_sequences_upper_95": 5.554093400543811,
            "loss_tokens_lower_95": 5.310266077378217,
            "loss_tokens_upper_95": 5.551933785232843,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0870508179544554,
            "data_time": 0.0019730590945275074,
            "batch_time": 0.031226434538322082,
            "samples_per_second": 1087755.665139452,
            "samples_per_second_per_gpu": 135969.4581424315,
            "loss_sequences_lower_95": 3.3692829195634477,
            "loss_sequences_upper_95": 3.4542465850283417,
            "loss_tokens_lower_95": 2.615115105274132,
            "loss_tokens_upper_95": 2.683263765890219,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.522395987990041,
            "data_time": 0.027001870175202686,
            "batch_time": 0.05728264649709066,
            "samples_per_second": 985123.2300628125,
            "samples_per_second_per_gpu": 123140.40375785156,
            "loss_sequences_lower_95": 4.394431276674624,
            "loss_sequences_upper_95": 4.642207748170883,
            "loss_tokens_lower_95": 4.39718581103774,
            "loss_tokens_upper_95": 4.645986825448495,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.917677326085736,
            "data_time": 0.003445167826791095,
            "batch_time": 0.032813843062218004,
            "samples_per_second": 1080607.7393768355,
            "samples_per_second_per_gpu": 135075.96742210444,
            "loss_sequences_lower_95": 4.888097883218655,
            "loss_sequences_upper_95": 4.94856973163943,
            "loss_tokens_lower_95": 4.887544064769687,
            "loss_tokens_upper_95": 4.948327218618597,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.456111958883341,
            "data_time": 0.024571553143587978,
            "batch_time": 0.054681472344831984,
            "samples_per_second": 956536.470928006,
            "samples_per_second_per_gpu": 119567.05886600075,
            "loss_sequences_lower_95": 5.2918033785033,
            "loss_sequences_upper_95": 5.619522953959344,
            "loss_tokens_lower_95": 5.291373702854786,
            "loss_tokens_upper_95": 5.622742217721291,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.8435631314913432,
            "data_time": 0.08382768929004669,
            "batch_time": 0.11705964803695679,
            "samples_per_second": 754199.322359848,
            "samples_per_second_per_gpu": 94274.915294981,
            "loss_sequences_lower_95": 1.6218564319610596,
            "loss_sequences_upper_95": 2.1537514368693036,
            "loss_tokens_lower_95": 1.488138747215271,
            "loss_tokens_upper_95": 2.1650306118859186,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.047356128692627,
            "data_time": 0.08156054466962814,
            "batch_time": 0.1122753769159317,
            "samples_per_second": 768968.5716536873,
            "samples_per_second_per_gpu": 96121.07145671091,
            "loss_sequences_lower_95": 1.8779988193511963,
            "loss_sequences_upper_95": 2.47764347076416,
            "loss_tokens_lower_95": 1.5777222879816977,
            "loss_tokens_upper_95": 2.3557309568598033,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6833757577483186,
            "data_time": 0.003223156353668159,
            "batch_time": 0.03237526847587886,
            "samples_per_second": 1088463.9274928793,
            "samples_per_second_per_gpu": 136057.99093660992,
            "loss_sequences_lower_95": 3.6567420422381263,
            "loss_sequences_upper_95": 3.7103329948683728,
            "loss_tokens_lower_95": 3.656347785691274,
            "loss_tokens_upper_95": 3.7110972952526695,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.5707527112404372,
            "data_time": 0.0011819222334698983,
            "batch_time": 0.030416700825975553,
            "samples_per_second": 1091462.087152962,
            "samples_per_second_per_gpu": 136432.76089412026,
            "loss_sequences_lower_95": 0.6483656034826344,
            "loss_sequences_upper_95": 0.6626053846706601,
            "loss_tokens_lower_95": 0.4753378408246985,
            "loss_tokens_upper_95": 0.48389946168706016,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.6442201231408307,
            "data_time": 0.041641976684331894,
            "batch_time": 0.07182028144598007,
            "samples_per_second": 974731.4556249578,
            "samples_per_second_per_gpu": 121841.43195311973,
            "loss_sequences_lower_95": 1.5547183810256597,
            "loss_sequences_upper_95": 1.7897558918149452,
            "loss_tokens_lower_95": 1.459381800674742,
            "loss_tokens_upper_95": 1.581011151787222,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8366015980372556,
            "data_time": 0.12420469238644555,
            "batch_time": 0.15746554874238514,
            "samples_per_second": 522146.05886315304,
            "samples_per_second_per_gpu": 65268.25735789413,
            "loss_sequences_lower_95": 3.387498257611249,
            "loss_sequences_upper_95": 4.349885796211861,
            "loss_tokens_lower_95": 3.2568985362111786,
            "loss_tokens_upper_95": 4.330838389455536,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.5017744884258364,
            "data_time": 0.03212795371101016,
            "batch_time": 0.0617709755897522,
            "samples_per_second": 981084.2235067191,
            "samples_per_second_per_gpu": 122635.52793833989,
            "loss_sequences_lower_95": 1.4492016792297362,
            "loss_sequences_upper_95": 1.6424424101666706,
            "loss_tokens_lower_95": 1.3494880130854445,
            "loss_tokens_upper_95": 1.4470127536269974,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.5475645523245742,
            "data_time": 0.03294344459261213,
            "batch_time": 0.06285398630868821,
            "samples_per_second": 974400.2834189566,
            "samples_per_second_per_gpu": 121800.03542736957,
            "loss_sequences_lower_95": 1.5246815518635075,
            "loss_sequences_upper_95": 1.7011888038821337,
            "loss_tokens_lower_95": 1.390140566820834,
            "loss_tokens_upper_95": 1.473821923421558,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.4972423729373188,
            "data_time": 0.03305540198371524,
            "batch_time": 0.06355998345783778,
            "samples_per_second": 970071.2261966547,
            "samples_per_second_per_gpu": 121258.90327458184,
            "loss_sequences_lower_95": 1.380172915574981,
            "loss_sequences_upper_95": 1.5949902371662419,
            "loss_tokens_lower_95": 1.404057181272238,
            "loss_tokens_upper_95": 1.5352597816296183,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.6134094486876231,
            "data_time": 0.03157961936224075,
            "batch_time": 0.06270178159077962,
            "samples_per_second": 950155.3518399328,
            "samples_per_second_per_gpu": 118769.4189799916,
            "loss_sequences_lower_95": 1.5777177252420567,
            "loss_sequences_upper_95": 1.7428924839671065,
            "loss_tokens_lower_95": 1.4615578933668285,
            "loss_tokens_upper_95": 1.5419752697335596,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.3070753027933726,
            "data_time": 0.03284600928977684,
            "batch_time": 0.062440062746589566,
            "samples_per_second": 1002543.243666199,
            "samples_per_second_per_gpu": 125317.90545827488,
            "loss_sequences_lower_95": 1.2626216343470984,
            "loss_sequences_upper_95": 1.3677270332478588,
            "loss_tokens_lower_95": 1.2454797009816465,
            "loss_tokens_upper_95": 1.305109401277786,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.2538366822934732,
            "data_time": 0.03207211267380487,
            "batch_time": 0.06182796330679031,
            "samples_per_second": 982282.4378903846,
            "samples_per_second_per_gpu": 122785.30473629807,
            "loss_sequences_lower_95": 1.2311465309887397,
            "loss_sequences_upper_95": 1.3437974883288872,
            "loss_tokens_lower_95": 1.1303949044511,
            "loss_tokens_upper_95": 1.1830214366947331,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-4.0/params.txt",
    "uuid": "d225305a-caeb-4c5e-8c5f-46ad4fcbb901",
    "creation_date": "2023_12_14-06_52_39"
}