{
    "name": "rpj-d=512_l=8_h=4-1.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 1578280960,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 1.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "315656192",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=512_l=8_h=4-1.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.5059158484141033,
            "data_time": 0.0349949412047863,
            "batch_time": 0.3471750169992447,
            "samples_per_second": 1691183.05397593,
            "samples_per_second_per_gpu": 211397.88174699125,
            "loss_sequences_lower_95": 3.431764500935872,
            "loss_sequences_upper_95": 3.5757889302571613,
            "loss_tokens_lower_95": 3.4931543604532878,
            "loss_tokens_upper_95": 3.5187810770670573,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.899942990231468,
            "data_time": 0.0014808057569137554,
            "batch_time": 0.015418765831141741,
            "samples_per_second": 2232620.21008936,
            "samples_per_second_per_gpu": 279077.52626117,
            "loss_sequences_lower_95": 3.8974413577659734,
            "loss_sequences_upper_95": 3.90243944965529,
            "loss_tokens_lower_95": 3.88870196875,
            "loss_tokens_upper_95": 3.91132171875,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.159310565676008,
            "data_time": 0.010016833305358887,
            "batch_time": 0.02437615203857422,
            "samples_per_second": 2108324.6216514143,
            "samples_per_second_per_gpu": 263540.5777064268,
            "loss_sequences_lower_95": 3.131204616001674,
            "loss_sequences_upper_95": 3.1873459858797037,
            "loss_tokens_lower_95": 3.147306265625,
            "loss_tokens_upper_95": 3.171650973958333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7411044474729556,
            "data_time": 0.001619065376488786,
            "batch_time": 0.015048680238817868,
            "samples_per_second": 2323352.408671532,
            "samples_per_second_per_gpu": 290419.0510839415,
            "loss_sequences_lower_95": 3.727568832554768,
            "loss_sequences_upper_95": 3.754251399403995,
            "loss_tokens_lower_95": 3.7298549270833337,
            "loss_tokens_upper_95": 3.752285947916667,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.886379865424939,
            "data_time": 0.010171245293788225,
            "batch_time": 0.024097770333765037,
            "samples_per_second": 2166695.4968120987,
            "samples_per_second_per_gpu": 270836.93710151233,
            "loss_sequences_lower_95": 3.8510892910772814,
            "loss_sequences_upper_95": 3.9201290589000446,
            "loss_tokens_lower_95": 3.8750620520833334,
            "loss_tokens_upper_95": 3.8975318229166667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.665307092796289,
            "data_time": 0.003962041891139486,
            "batch_time": 0.017817211215910705,
            "samples_per_second": 2261517.5668670232,
            "samples_per_second_per_gpu": 282689.6958583779,
            "loss_sequences_lower_95": 3.621623191600102,
            "loss_sequences_upper_95": 3.708524308506118,
            "loss_tokens_lower_95": 3.6537363125,
            "loss_tokens_upper_95": 3.6766706666666664,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.291327642251034,
            "data_time": 0.0016019583526287733,
            "batch_time": 0.015198361231880125,
            "samples_per_second": 2303566.2165432787,
            "samples_per_second_per_gpu": 287945.77706790983,
            "loss_sequences_lower_95": 2.2659295081313777,
            "loss_sequences_upper_95": 2.3159844547193877,
            "loss_tokens_lower_95": 2.28036825,
            "loss_tokens_upper_95": 2.302878828125,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.148119856699599,
            "data_time": 0.0016338210981393748,
            "batch_time": 0.014955563008760319,
            "samples_per_second": 2343402.7977500604,
            "samples_per_second_per_gpu": 292925.34971875756,
            "loss_sequences_lower_95": 4.138822347431282,
            "loss_sequences_upper_95": 4.157307949525523,
            "loss_tokens_lower_95": 4.137271489583334,
            "loss_tokens_upper_95": 4.158960770833334,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.034246090466414,
            "data_time": 0.011322578740498376,
            "batch_time": 0.025678309183272106,
            "samples_per_second": 2152476.4164493224,
            "samples_per_second_per_gpu": 269059.5520561653,
            "loss_sequences_lower_95": 3.9908263105687087,
            "loss_sequences_upper_95": 4.0805557499086955,
            "loss_tokens_lower_95": 4.022757520833333,
            "loss_tokens_upper_95": 4.0458267604166664,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.60789717704411,
            "data_time": 0.009800535626709461,
            "batch_time": 0.024400906637310982,
            "samples_per_second": 2160676.5802434064,
            "samples_per_second_per_gpu": 270084.5725304258,
            "loss_sequences_lower_95": 4.563577885684288,
            "loss_sequences_upper_95": 4.645434015447443,
            "loss_tokens_lower_95": 4.595935645833333,
            "loss_tokens_upper_95": 4.619975197916667,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.842261008175984,
            "data_time": 0.0012905235060537373,
            "batch_time": 0.01468966575020403,
            "samples_per_second": 2334793.3874991466,
            "samples_per_second_per_gpu": 291849.1734373933,
            "loss_sequences_lower_95": 3.8338886793695184,
            "loss_sequences_upper_95": 3.850580684645659,
            "loss_tokens_lower_95": 3.8309753645833333,
            "loss_tokens_upper_95": 3.8535698333333332,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.78789413397335,
            "data_time": 0.0029209875048050575,
            "batch_time": 0.016618422127881713,
            "samples_per_second": 2340333.3016969846,
            "samples_per_second_per_gpu": 292541.66271212307,
            "loss_sequences_lower_95": 3.777019076524105,
            "loss_sequences_upper_95": 3.7984954592487505,
            "loss_tokens_lower_95": 3.7768064062500004,
            "loss_tokens_upper_95": 3.7990672500000002,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.13148101880149,
            "data_time": 0.01025898371760553,
            "batch_time": 0.023976080031263027,
            "samples_per_second": 2176342.7806478515,
            "samples_per_second_per_gpu": 272042.84758098144,
            "loss_sequences_lower_95": 4.091518248892702,
            "loss_sequences_upper_95": 4.16961534975992,
            "loss_tokens_lower_95": 4.119935135416666,
            "loss_tokens_upper_95": 4.142991979166667,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6281481614666413,
            "data_time": 0.01037006929101222,
            "batch_time": 0.023975942239343406,
            "samples_per_second": 2220401.436390144,
            "samples_per_second_per_gpu": 277550.179548768,
            "loss_sequences_lower_95": 3.5642524361853205,
            "loss_sequences_upper_95": 3.690836994613989,
            "loss_tokens_lower_95": 3.616175645833333,
            "loss_tokens_upper_95": 3.63995890625,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.568619012832642,
            "data_time": 0.08305353777749198,
            "batch_time": 0.09847096034458705,
            "samples_per_second": 1079727.6578765092,
            "samples_per_second_per_gpu": 134965.95723456366,
            "loss_sequences_lower_95": 4.498834193836559,
            "loss_sequences_upper_95": 4.63832222331654,
            "loss_tokens_lower_95": 4.547408650138161,
            "loss_tokens_upper_95": 4.59066783731634,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.303777219950284,
            "data_time": 0.014501956376162443,
            "batch_time": 0.028238654136657715,
            "samples_per_second": 2161867.457748626,
            "samples_per_second_per_gpu": 270233.43221857824,
            "loss_sequences_lower_95": 3.199050921114819,
            "loss_sequences_upper_95": 3.407754009458136,
            "loss_tokens_lower_95": 3.2923650989583333,
            "loss_tokens_upper_95": 3.3150870885416666,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.984591993618766,
            "data_time": 0.012980939199527105,
            "batch_time": 0.026650339365005493,
            "samples_per_second": 2213537.7852163524,
            "samples_per_second_per_gpu": 276692.22315204405,
            "loss_sequences_lower_95": 5.923289904430863,
            "loss_sequences_upper_95": 6.039226795689726,
            "loss_tokens_lower_95": 5.973052614583334,
            "loss_tokens_upper_95": 5.9958658541666665,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.171365280620387,
            "data_time": 0.03952654451131821,
            "batch_time": 0.05449138954281807,
            "samples_per_second": 1846887.8668394233,
            "samples_per_second_per_gpu": 230860.9833549279,
            "loss_sequences_lower_95": 4.091498152936091,
            "loss_sequences_upper_95": 4.2306160723576784,
            "loss_tokens_lower_95": 4.158545497206391,
            "loss_tokens_upper_95": 4.1840795298091695,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.153856111122933,
            "data_time": 0.0020596432862357832,
            "batch_time": 0.01578152637954182,
            "samples_per_second": 2255702.088848869,
            "samples_per_second_per_gpu": 281962.7611061086,
            "loss_sequences_lower_95": 5.134950130078515,
            "loss_sequences_upper_95": 5.172988233958837,
            "loss_tokens_lower_95": 5.134600022477211,
            "loss_tokens_upper_95": 5.172758343273751,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6860844505614336,
            "data_time": 0.002227955705420986,
            "batch_time": 0.015853299361884975,
            "samples_per_second": 2266542.947303633,
            "samples_per_second_per_gpu": 283317.86841295415,
            "loss_sequences_lower_95": 3.6719219123431586,
            "loss_sequences_upper_95": 3.6980700394281514,
            "loss_tokens_lower_95": 3.6717686304496415,
            "loss_tokens_upper_95": 3.69239935358226,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.361273238441546,
            "data_time": 0.0030753893407305718,
            "batch_time": 0.016707886175430686,
            "samples_per_second": 2265640.4353676927,
            "samples_per_second_per_gpu": 283205.0544209616,
            "loss_sequences_lower_95": 5.5951786551207485,
            "loss_sequences_upper_95": 5.89513386333107,
            "loss_tokens_lower_95": 4.837405345335954,
            "loss_tokens_upper_95": 5.05163080353154,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.566173116207123,
            "data_time": 0.003941923379898071,
            "batch_time": 0.017828286803783254,
            "samples_per_second": 2212874.9285261426,
            "samples_per_second_per_gpu": 276609.3660657678,
            "loss_sequences_lower_95": 5.71557041015625,
            "loss_sequences_upper_95": 5.91386337890625,
            "loss_tokens_lower_95": 5.221131387578616,
            "loss_tokens_upper_95": 5.359018573113207,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.673742119519434,
            "data_time": 0.004553980417381045,
            "batch_time": 0.018417972604973104,
            "samples_per_second": 2218591.759600863,
            "samples_per_second_per_gpu": 277323.96995010786,
            "loss_sequences_lower_95": 3.7163002274450343,
            "loss_sequences_upper_95": 3.7835130392194607,
            "loss_tokens_lower_95": 3.574725284489643,
            "loss_tokens_upper_95": 3.6082538168901044,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.6233971303159542,
            "data_time": 0.02275364100933075,
            "batch_time": 0.03711700226579394,
            "samples_per_second": 2021204.101248381,
            "samples_per_second_per_gpu": 252650.51265604762,
            "loss_sequences_lower_95": 2.600516024502841,
            "loss_sequences_upper_95": 2.7187786795876243,
            "loss_tokens_lower_95": 2.5553789924606973,
            "loss_tokens_upper_95": 2.6040685032563236,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.910496506399038,
            "data_time": 0.019981006160378456,
            "batch_time": 0.0339462012052536,
            "samples_per_second": 2038036.2901454025,
            "samples_per_second_per_gpu": 254754.53626817532,
            "loss_sequences_lower_95": 3.9049670846121654,
            "loss_sequences_upper_95": 4.1091099517199465,
            "loss_tokens_lower_95": 3.768480840333023,
            "loss_tokens_upper_95": 3.8668325390360527,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.298536455631256,
            "data_time": 0.016847089315072086,
            "batch_time": 0.03065932102692433,
            "samples_per_second": 2064455.1856224202,
            "samples_per_second_per_gpu": 258056.89820280252,
            "loss_sequences_lower_95": 4.265638203938803,
            "loss_sequences_upper_95": 4.3627338867187495,
            "loss_tokens_lower_95": 4.1571553724215855,
            "loss_tokens_upper_95": 4.396540128993163,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.232702838968887,
            "data_time": 0.00168061634180478,
            "batch_time": 0.015190219752444325,
            "samples_per_second": 2292078.7259738296,
            "samples_per_second_per_gpu": 286509.8407467287,
            "loss_sequences_lower_95": 7.2515396826251175,
            "loss_sequences_upper_95": 7.326136908862752,
            "loss_tokens_lower_95": 7.081352093685795,
            "loss_tokens_upper_95": 7.159642757955469,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.252019844873987,
            "data_time": 0.002832741745366346,
            "batch_time": 0.016385533465635056,
            "samples_per_second": 2278505.5082601495,
            "samples_per_second_per_gpu": 284813.1885325187,
            "loss_sequences_lower_95": 5.8108228895399305,
            "loss_sequences_upper_95": 6.103884271178583,
            "loss_tokens_lower_95": 4.5023755592394705,
            "loss_tokens_upper_95": 4.643710618188011,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.751059453947145,
            "data_time": 0.0050786246319074885,
            "batch_time": 0.019012641262363742,
            "samples_per_second": 2190525.58865186,
            "samples_per_second_per_gpu": 273815.6985814825,
            "loss_sequences_lower_95": 5.1915760131419315,
            "loss_sequences_upper_95": 5.51712824590377,
            "loss_tokens_lower_95": 4.3042797638855275,
            "loss_tokens_upper_95": 4.463716228395248,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.24261506180785,
            "data_time": 0.02342533426625388,
            "batch_time": 0.03767619814191546,
            "samples_per_second": 1992645.9169668572,
            "samples_per_second_per_gpu": 249080.73962085714,
            "loss_sequences_lower_95": 5.177442994836258,
            "loss_sequences_upper_95": 5.306296236221105,
            "loss_tokens_lower_95": 5.178344740497467,
            "loss_tokens_upper_95": 5.304007724431008,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.075680437088013,
            "data_time": 0.048892511771275446,
            "batch_time": 0.06412937090947078,
            "samples_per_second": 1656858.3433295304,
            "samples_per_second_per_gpu": 207107.2929161913,
            "loss_sequences_lower_95": 3.9283868560791015,
            "loss_sequences_upper_95": 4.31269482421875,
            "loss_tokens_lower_95": 3.7439750439365773,
            "loss_tokens_upper_95": 4.223473269440406,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.291239695514847,
            "data_time": 0.003424218827230067,
            "batch_time": 0.017025215981684334,
            "samples_per_second": 2270889.4664164823,
            "samples_per_second_per_gpu": 283861.1833020603,
            "loss_sequences_lower_95": 5.247356471800633,
            "loss_sequences_upper_95": 5.336381663514746,
            "loss_tokens_lower_95": 5.2469562136163,
            "loss_tokens_upper_95": 5.335437074190763,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.523865224493028,
            "data_time": 0.005043818938984957,
            "batch_time": 0.018650037429461083,
            "samples_per_second": 2250175.1238102806,
            "samples_per_second_per_gpu": 281271.8904762851,
            "loss_sequences_lower_95": 5.478764104569769,
            "loss_sequences_upper_95": 5.567716071451422,
            "loss_tokens_lower_95": 5.477603676785806,
            "loss_tokens_upper_95": 5.569778429885852,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.283747504403464,
            "data_time": 0.003431344907690588,
            "batch_time": 0.017264666361633697,
            "samples_per_second": 2223946.6351303826,
            "samples_per_second_per_gpu": 277993.3293912978,
            "loss_sequences_lower_95": 4.444761091529091,
            "loss_sequences_upper_95": 4.568822910113744,
            "loss_tokens_lower_95": 4.101415134374453,
            "loss_tokens_upper_95": 4.1609891478823,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.080633313655853,
            "data_time": 0.011138021014630795,
            "batch_time": 0.025329058058559895,
            "samples_per_second": 2088618.8879097956,
            "samples_per_second_per_gpu": 261077.36098872445,
            "loss_sequences_lower_95": 6.267165832519531,
            "loss_sequences_upper_95": 6.827541516113281,
            "loss_tokens_lower_95": 5.427788777480766,
            "loss_tokens_upper_95": 5.797202755961491,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.69299778342247,
            "data_time": 0.15759071707725525,
            "batch_time": 0.1746966689825058,
            "samples_per_second": 962318.4568260858,
            "samples_per_second_per_gpu": 120289.80710326073,
            "loss_sequences_lower_95": 4.412581026554108,
            "loss_sequences_upper_95": 5.079561161994934,
            "loss_tokens_lower_95": 4.213963852805653,
            "loss_tokens_upper_95": 5.013483086947737,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.557004355836189,
            "data_time": 0.02844672000154536,
            "batch_time": 0.04314540548527494,
            "samples_per_second": 1806528.9975570447,
            "samples_per_second_per_gpu": 225816.1246946306,
            "loss_sequences_lower_95": 6.032942410173088,
            "loss_sequences_upper_95": 6.869841617277299,
            "loss_tokens_lower_95": 4.153241689309998,
            "loss_tokens_upper_95": 4.630034966882751,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.104164887120391,
            "data_time": 0.0030311751696798536,
            "batch_time": 0.01659445009297795,
            "samples_per_second": 2262826.6901022755,
            "samples_per_second_per_gpu": 282853.33626278443,
            "loss_sequences_lower_95": 4.083640466292595,
            "loss_sequences_upper_95": 4.1243272747208115,
            "loss_tokens_lower_95": 4.084017375047783,
            "loss_tokens_upper_95": 4.1242671301605505,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.046919269630494,
            "data_time": 0.0026243472219577442,
            "batch_time": 0.01704565989309658,
            "samples_per_second": 2225418.1164144166,
            "samples_per_second_per_gpu": 278177.2645518021,
            "loss_sequences_lower_95": 4.0199649513936055,
            "loss_sequences_upper_95": 4.211720569328547,
            "loss_tokens_lower_95": 3.825620293417335,
            "loss_tokens_upper_95": 4.012103269259251,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.659442806418562,
            "data_time": 0.018158384495311312,
            "batch_time": 0.03188151948981815,
            "samples_per_second": 2044306.2425960216,
            "samples_per_second_per_gpu": 255538.2803245027,
            "loss_sequences_lower_95": 3.5405315692608172,
            "loss_sequences_upper_95": 3.9105781750801283,
            "loss_tokens_lower_95": 3.4031294943556314,
            "loss_tokens_upper_95": 3.70920206893935,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.081287079000435,
            "data_time": 0.005089130625128746,
            "batch_time": 0.019262804090976714,
            "samples_per_second": 2155364.0121961767,
            "samples_per_second_per_gpu": 269420.5015245221,
            "loss_sequences_lower_95": 4.115687990978937,
            "loss_sequences_upper_95": 4.26155211968787,
            "loss_tokens_lower_95": 3.934923206303055,
            "loss_tokens_upper_95": 4.083535917932635,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.420542366620971,
            "data_time": 0.03189511242366973,
            "batch_time": 0.046362996101379395,
            "samples_per_second": 1957717.8490505624,
            "samples_per_second_per_gpu": 244714.7311313203,
            "loss_sequences_lower_95": 3.247850762343988,
            "loss_sequences_upper_95": 3.7887645628394147,
            "loss_tokens_lower_95": 3.136598098530855,
            "loss_tokens_upper_95": 3.538312295288285,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.793739609681122,
            "data_time": 0.0022369581749093473,
            "batch_time": 0.015828341376624297,
            "samples_per_second": 2266128.682096683,
            "samples_per_second_per_gpu": 283266.08526208537,
            "loss_sequences_lower_95": 4.785004022288833,
            "loss_sequences_upper_95": 4.802584774767453,
            "loss_tokens_lower_95": 4.784812812171809,
            "loss_tokens_upper_95": 4.802745451434037,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.3799848226667608,
            "data_time": 0.04759784611788663,
            "batch_time": 0.0639322584325617,
            "samples_per_second": 1545143.6186434329,
            "samples_per_second_per_gpu": 193142.9523304291,
            "loss_sequences_lower_95": 1.3078467211677034,
            "loss_sequences_upper_95": 1.5082096507248368,
            "loss_tokens_lower_95": 1.179238260634236,
            "loss_tokens_upper_95": 1.4461128008402233,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.6788509888156655,
            "data_time": 0.0016787340684988274,
            "batch_time": 0.01541363798262987,
            "samples_per_second": 2244918.335392695,
            "samples_per_second_per_gpu": 280614.79192408687,
            "loss_sequences_lower_95": 6.093875008189204,
            "loss_sequences_upper_95": 6.148601458087657,
            "loss_tokens_lower_95": 5.021276547388781,
            "loss_tokens_upper_95": 5.071989627659574,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.5806450610160825,
            "data_time": 0.005779133902655708,
            "batch_time": 0.019618399559505402,
            "samples_per_second": 2205278.8057394954,
            "samples_per_second_per_gpu": 275659.85071743693,
            "loss_sequences_lower_95": 6.573397827148438,
            "loss_sequences_upper_95": 6.877948852539062,
            "loss_tokens_lower_95": 6.281113985538114,
            "loss_tokens_upper_95": 6.5534955348511215,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.600056739475416,
            "data_time": 0.02504372798790366,
            "batch_time": 0.03985902818582826,
            "samples_per_second": 1955426.9395042653,
            "samples_per_second_per_gpu": 244428.36743803316,
            "loss_sequences_lower_95": 5.434795492421026,
            "loss_sequences_upper_95": 5.762628691300102,
            "loss_tokens_lower_95": 5.4365899260147765,
            "loss_tokens_upper_95": 5.760630387015965,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.665752640998725,
            "data_time": 0.004840501819748476,
            "batch_time": 0.018637146576341378,
            "samples_per_second": 2220341.5149431345,
            "samples_per_second_per_gpu": 277542.6893678918,
            "loss_sequences_lower_95": 5.614161395448627,
            "loss_sequences_upper_95": 5.71739863540187,
            "loss_tokens_lower_95": 5.613270911014442,
            "loss_tokens_upper_95": 5.718123437130091,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.0817898882627488,
            "data_time": 0.004567803537591975,
            "batch_time": 0.018328419074099114,
            "samples_per_second": 2240821.300248397,
            "samples_per_second_per_gpu": 280102.66253104963,
            "loss_sequences_lower_95": 1.1285492594401043,
            "loss_sequences_upper_95": 1.1938323262532553,
            "loss_tokens_lower_95": 0.9973198654461785,
            "loss_tokens_upper_95": 1.0567992763511653,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.300661232357934,
            "data_time": 0.024916389158793857,
            "batch_time": 0.03932596317359379,
            "samples_per_second": 1916282.7580850602,
            "samples_per_second_per_gpu": 239535.34476063252,
            "loss_sequences_lower_95": 5.97643301827567,
            "loss_sequences_upper_95": 6.627207932245163,
            "loss_tokens_lower_95": 5.975631481352306,
            "loss_tokens_upper_95": 6.632476835704985,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.062364686280489,
            "data_time": 0.15725670754909515,
            "batch_time": 0.1746424287557602,
            "samples_per_second": 752465.1223391083,
            "samples_per_second_per_gpu": 94058.14029238853,
            "loss_sequences_lower_95": 2.809593749046326,
            "loss_sequences_upper_95": 4.166753625869751,
            "loss_tokens_lower_95": 2.398183996456186,
            "loss_tokens_upper_95": 3.017547594837307,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.307038342952728,
            "data_time": 0.006026152580503433,
            "batch_time": 0.019818720836488026,
            "samples_per_second": 2214262.594353713,
            "samples_per_second_per_gpu": 276782.82429421414,
            "loss_sequences_lower_95": 7.240037463378906,
            "loss_sequences_upper_95": 7.6151459106445305,
            "loss_tokens_lower_95": 6.980944493740746,
            "loss_tokens_upper_95": 7.311824048553828,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.609787364006042,
            "data_time": 0.006048637250113109,
            "batch_time": 0.019992707740692867,
            "samples_per_second": 2189596.5118117603,
            "samples_per_second_per_gpu": 273699.56397647003,
            "loss_sequences_lower_95": 6.673952770996094,
            "loss_sequences_upper_95": 6.900548864746094,
            "loss_tokens_lower_95": 6.397853995348491,
            "loss_tokens_upper_95": 6.582993692198453,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.067954272821051,
            "data_time": 0.00404903147132899,
            "batch_time": 0.017835739225049482,
            "samples_per_second": 2226313.118567886,
            "samples_per_second_per_gpu": 278289.1398209857,
            "loss_sequences_lower_95": 5.048461840402908,
            "loss_sequences_upper_95": 5.087046117779396,
            "loss_tokens_lower_95": 5.048683694909173,
            "loss_tokens_upper_95": 5.0874014352817,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.432495204351282,
            "data_time": 0.008942076806935659,
            "batch_time": 0.022859133262288535,
            "samples_per_second": 2149913.007453691,
            "samples_per_second_per_gpu": 268739.1259317114,
            "loss_sequences_lower_95": 5.333408544396842,
            "loss_sequences_upper_95": 5.529408650903657,
            "loss_tokens_lower_95": 5.330270831083189,
            "loss_tokens_upper_95": 5.530482963409658,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.496337914943695,
            "data_time": 0.0059175680554102335,
            "batch_time": 0.01998589341602628,
            "samples_per_second": 2160463.77580394,
            "samples_per_second_per_gpu": 270057.9719754925,
            "loss_sequences_lower_95": 8.445401538085939,
            "loss_sequences_upper_95": 8.548902734375,
            "loss_tokens_lower_95": 8.445397705078125,
            "loss_tokens_upper_95": 8.549088745117187,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9995961670280065,
            "data_time": 0.002324845029185535,
            "batch_time": 0.015944384491961935,
            "samples_per_second": 2264986.4666572437,
            "samples_per_second_per_gpu": 283123.30833215546,
            "loss_sequences_lower_95": 4.6327592741101,
            "loss_sequences_upper_95": 4.743799253119087,
            "loss_tokens_lower_95": 3.2289868310903045,
            "loss_tokens_upper_95": 3.300178591423879,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.8275346791566305,
            "data_time": 0.02005237170628139,
            "batch_time": 0.03394185304641724,
            "samples_per_second": 2046160.0449878331,
            "samples_per_second_per_gpu": 255770.00562347914,
            "loss_sequences_lower_95": 5.645510522643132,
            "loss_sequences_upper_95": 6.010766123301948,
            "loss_tokens_lower_95": 5.646942047574627,
            "loss_tokens_upper_95": 6.006172043529909,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.814496442383411,
            "data_time": 0.010916910134255886,
            "batch_time": 0.02516783494502306,
            "samples_per_second": 2119324.018051642,
            "samples_per_second_per_gpu": 264915.50225645525,
            "loss_sequences_lower_95": 5.6924191942401965,
            "loss_sequences_upper_95": 5.935493929993872,
            "loss_tokens_lower_95": 5.694575626148897,
            "loss_tokens_upper_95": 5.93307066674326,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.680988785304566,
            "data_time": 0.002488340369222402,
            "batch_time": 0.016195718334090205,
            "samples_per_second": 2246727.6441783262,
            "samples_per_second_per_gpu": 280840.9555222908,
            "loss_sequences_lower_95": 5.192198008169704,
            "loss_sequences_upper_95": 5.306731577551515,
            "loss_tokens_lower_95": 3.8849227299441838,
            "loss_tokens_upper_95": 3.972744028565733,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.662438241262285,
            "data_time": 0.028269124527772267,
            "batch_time": 0.04321111490329107,
            "samples_per_second": 1946567.4619521878,
            "samples_per_second_per_gpu": 243320.93274402348,
            "loss_sequences_lower_95": 4.585577497532759,
            "loss_sequences_upper_95": 4.738512691114315,
            "loss_tokens_lower_95": 4.586077170397239,
            "loss_tokens_upper_95": 4.7369681302832545,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.753223553199652,
            "data_time": 0.004007927809588347,
            "batch_time": 0.017775940079974312,
            "samples_per_second": 2228126.7888223995,
            "samples_per_second_per_gpu": 278515.84860279993,
            "loss_sequences_lower_95": 5.7299009998566515,
            "loss_sequences_upper_95": 5.776153389000382,
            "loss_tokens_lower_95": 5.729239505686162,
            "loss_tokens_upper_95": 5.776833697677753,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.937265007241258,
            "data_time": 0.026360947435552423,
            "batch_time": 0.04126681631261652,
            "samples_per_second": 1841635.5542660141,
            "samples_per_second_per_gpu": 230204.44428325177,
            "loss_sequences_lower_95": 5.758332469162432,
            "loss_sequences_upper_95": 6.114245220295434,
            "loss_tokens_lower_95": 5.757768382840943,
            "loss_tokens_upper_95": 6.117446514240746,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.743806161483129,
            "data_time": 0.08424361050128937,
            "batch_time": 0.10006047040224075,
            "samples_per_second": 1444182.677348331,
            "samples_per_second_per_gpu": 180522.83466854139,
            "loss_sequences_lower_95": 3.4526187642415365,
            "loss_sequences_upper_95": 4.185862528483073,
            "loss_tokens_lower_95": 3.123574225107829,
            "loss_tokens_upper_95": 4.221038627624512,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4458754221598307,
            "data_time": 0.08044519275426865,
            "batch_time": 0.09563519060611725,
            "samples_per_second": 1413750.4988800373,
            "samples_per_second_per_gpu": 176718.81236000467,
            "loss_sequences_lower_95": 3.1992749150594073,
            "loss_sequences_upper_95": 3.912105127970378,
            "loss_tokens_lower_95": 2.693263214625669,
            "loss_tokens_upper_95": 3.8259053605326105,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.936409146164233,
            "data_time": 0.00355720997137347,
            "batch_time": 0.01732185646111015,
            "samples_per_second": 2240636.8869781173,
            "samples_per_second_per_gpu": 280079.61087226466,
            "loss_sequences_lower_95": 4.913964584867452,
            "loss_sequences_upper_95": 4.959585370834867,
            "loss_tokens_lower_95": 4.9128918906480115,
            "loss_tokens_upper_95": 4.959692576974411,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 0.9974790131821611,
            "data_time": 0.0017429509605698826,
            "batch_time": 0.015436828389144308,
            "samples_per_second": 2251176.851989829,
            "samples_per_second_per_gpu": 281397.1064987286,
            "loss_sequences_lower_95": 1.1744774692746445,
            "loss_sequences_upper_95": 1.2045338465089244,
            "loss_tokens_lower_95": 0.8069578688068041,
            "loss_tokens_upper_95": 0.8221673270789864,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.441835448497862,
            "data_time": 0.04130130261182785,
            "batch_time": 0.056938719004392624,
            "samples_per_second": 1809687.8493950723,
            "samples_per_second_per_gpu": 226210.98117438404,
            "loss_sequences_lower_95": 2.3454750301331044,
            "loss_sequences_upper_95": 2.649971771240234,
            "loss_tokens_lower_95": 2.196030043150101,
            "loss_tokens_upper_95": 2.3441953662337265,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.200891568854049,
            "data_time": 0.13022728193373906,
            "batch_time": 0.1456169400896345,
            "samples_per_second": 964618.2588840018,
            "samples_per_second_per_gpu": 120577.28236050022,
            "loss_sequences_lower_95": 3.8114840842582085,
            "loss_sequences_upper_95": 4.6790104634053,
            "loss_tokens_lower_95": 3.643369264955874,
            "loss_tokens_upper_95": 4.633618823392891,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.276403631378965,
            "data_time": 0.0313469199907212,
            "batch_time": 0.045491422925676615,
            "samples_per_second": 1940522.1660186215,
            "samples_per_second_per_gpu": 242565.2707523277,
            "loss_sequences_lower_95": 2.2139070092177975,
            "loss_sequences_upper_95": 2.475529787017078,
            "loss_tokens_lower_95": 2.0606658571081264,
            "loss_tokens_upper_95": 2.1858278974913268,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.3301082362489,
            "data_time": 0.031145362626938594,
            "batch_time": 0.04580376261756534,
            "samples_per_second": 1905578.9242986515,
            "samples_per_second_per_gpu": 238197.36553733144,
            "loss_sequences_lower_95": 2.3092005194687264,
            "loss_sequences_upper_95": 2.551116761928651,
            "loss_tokens_lower_95": 2.1039265268137237,
            "loss_tokens_upper_95": 2.206631199516975,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.318607607265798,
            "data_time": 0.030043721199035645,
            "batch_time": 0.04534111420313517,
            "samples_per_second": 1892954.5567270205,
            "samples_per_second_per_gpu": 236619.31959087757,
            "loss_sequences_lower_95": 2.146365626265363,
            "loss_sequences_upper_95": 2.4401731537609566,
            "loss_tokens_lower_95": 2.175730631884794,
            "loss_tokens_upper_95": 2.338960130109123,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.4065233425396246,
            "data_time": 0.03158202057793027,
            "batch_time": 0.04713712987445649,
            "samples_per_second": 1787452.0569731747,
            "samples_per_second_per_gpu": 223431.50712164684,
            "loss_sequences_lower_95": 2.380325210385206,
            "loss_sequences_upper_95": 2.6044269701329674,
            "loss_tokens_lower_95": 2.1868800748545802,
            "loss_tokens_upper_95": 2.2848316275813496,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.9855748570483664,
            "data_time": 0.03355932824405623,
            "batch_time": 0.048978832032945424,
            "samples_per_second": 1870332.2374517645,
            "samples_per_second_per_gpu": 233791.52968147057,
            "loss_sequences_lower_95": 1.9306553431919642,
            "loss_sequences_upper_95": 2.061349349880811,
            "loss_tokens_lower_95": 1.9109710812966145,
            "loss_tokens_upper_95": 1.9871980739525446,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.7910015990094441,
            "data_time": 0.03385137943994431,
            "batch_time": 0.048592944939931236,
            "samples_per_second": 1914566.346607096,
            "samples_per_second_per_gpu": 239320.793325887,
            "loss_sequences_lower_95": 1.770292710094917,
            "loss_sequences_upper_95": 1.9161338713110947,
            "loss_tokens_lower_95": 1.624810320835217,
            "loss_tokens_upper_95": 1.689293572700045,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/checkpoints/epoch_9.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-1.0/params.txt",
    "uuid": "fdecd6f8-1425-4783-b08a-54362a0b52a6",
    "creation_date": "2023_12_13-16_17_54"
}