{
    "name": "rpj-d=1024_l=24_h=8-32.0",
    "dataset_name": "rpj",
    "dataset_uuid": "67db6b77-c7c4-48ae-b431-57254587ed43",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 263434403840,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 2,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp",
            "--fsdp-limit-all-gathers"
        ],
        "chinchilla_multiplier": 32.0,
        "seed": 124
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--workers",
        "2",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--accum-freq",
        "2",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--logs",
        "logs/27127",
        "--train-num-samples",
        "52686880768",
        "--dataset-manifest",
        "<scrub>/openlm/scrub/datasets/rpj_original/manifest.jsonl",
        "--data-key",
        "json.gz",
        "--name",
        "rpj-d=1024_l=24_h=8-32.0",
        "--fsdp",
        "--fsdp-amp",
        "--fsdp-limit-all-gathers",
        "--val-data",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-data-key",
        "json",
        "txt",
        "json.gz",
        "--val-tok-ci",
        "--val-seq-ci",
        "--val-num-samples",
        "245760",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/openlm/scrub/experiments/411m_32x_rpj_original"
    ],
    "results": [
        {
            "loss": 2.2214248557885488,
            "data_time": 0.04005636274814606,
            "batch_time": 0.4678775668144226,
            "samples_per_second": 223946.81495279731,
            "samples_per_second_per_gpu": 111973.40747639866,
            "loss_sequences_lower_95": 2.1597918510437015,
            "loss_sequences_upper_95": 2.2832867749532064,
            "loss_tokens_lower_95": 2.2103179041544596,
            "loss_tokens_upper_95": 2.2324453035990395,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7681623882618447,
            "data_time": 0.002127407433441989,
            "batch_time": 0.11700638770877664,
            "samples_per_second": 280561.34102163557,
            "samples_per_second_per_gpu": 140280.67051081778,
            "loss_sequences_lower_95": 2.7654619282285675,
            "loss_sequences_upper_95": 2.7709198738735163,
            "loss_tokens_lower_95": 2.7586032708333335,
            "loss_tokens_upper_95": 2.778448770833333,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.3604738809624495,
            "data_time": 0.013138838112354279,
            "batch_time": 0.12299040332436562,
            "samples_per_second": 270688.88605339406,
            "samples_per_second_per_gpu": 135344.44302669703,
            "loss_sequences_lower_95": 2.337475648218272,
            "loss_sequences_upper_95": 2.383274548588967,
            "loss_tokens_lower_95": 2.349791265625,
            "loss_tokens_upper_95": 2.3711376822916668,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.622109214743388,
            "data_time": 0.003023358552079452,
            "batch_time": 0.11683799365633413,
            "samples_per_second": 281391.10019817465,
            "samples_per_second_per_gpu": 140695.55009908733,
            "loss_sequences_lower_95": 2.6105459336742913,
            "loss_sequences_upper_95": 2.6338851179929126,
            "loss_tokens_lower_95": 2.6121157760416667,
            "loss_tokens_upper_95": 2.6319886197916667,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7718551761021195,
            "data_time": 0.011492621153593063,
            "batch_time": 0.12134876474738121,
            "samples_per_second": 273239.44558407157,
            "samples_per_second_per_gpu": 136619.72279203578,
            "loss_sequences_lower_95": 2.7371266537916883,
            "loss_sequences_upper_95": 2.8057416492469924,
            "loss_tokens_lower_95": 2.7616183072916667,
            "loss_tokens_upper_95": 2.7819265208333337,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5725374560547394,
            "data_time": 0.005430274683496226,
            "batch_time": 0.11930520249449689,
            "samples_per_second": 280591.0383805814,
            "samples_per_second_per_gpu": 140295.5191902907,
            "loss_sequences_lower_95": 2.533092216113082,
            "loss_sequences_upper_95": 2.609972231700905,
            "loss_tokens_lower_95": 2.562349901041667,
            "loss_tokens_upper_95": 2.582679109375,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.3782357623504133,
            "data_time": 0.00318155853779285,
            "batch_time": 0.11681851783356109,
            "samples_per_second": 280149.201077867,
            "samples_per_second_per_gpu": 140074.6005389335,
            "loss_sequences_lower_95": 1.3587838657924107,
            "loss_sequences_upper_95": 1.3977120087292731,
            "loss_tokens_lower_95": 1.369657640625,
            "loss_tokens_upper_95": 1.3871370390625,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2236993097135533,
            "data_time": 0.0030487342675526935,
            "batch_time": 0.11658447504043579,
            "samples_per_second": 281086.5768564159,
            "samples_per_second_per_gpu": 140543.28842820795,
            "loss_sequences_lower_95": 3.2149396167375657,
            "loss_sequences_upper_95": 3.2320499171711385,
            "loss_tokens_lower_95": 3.2134227083333333,
            "loss_tokens_upper_95": 3.2340803333333334,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9709512252148573,
            "data_time": 0.012520704418420792,
            "batch_time": 0.12685725465416908,
            "samples_per_second": 268544.28157908877,
            "samples_per_second_per_gpu": 134272.14078954438,
            "loss_sequences_lower_95": 2.925017950786808,
            "loss_sequences_upper_95": 3.0168596593345085,
            "loss_tokens_lower_95": 2.9606251197916666,
            "loss_tokens_upper_95": 2.9814793020833332,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.694445431468044,
            "data_time": 0.012209061533212662,
            "batch_time": 0.1257706694304943,
            "samples_per_second": 274203.90342703264,
            "samples_per_second_per_gpu": 137101.95171351632,
            "loss_sequences_lower_95": 3.6577819341727396,
            "loss_sequences_upper_95": 3.7262093042667677,
            "loss_tokens_lower_95": 3.682600354166667,
            "loss_tokens_upper_95": 3.7064032916666667,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.708516143586058,
            "data_time": 0.003141483367468676,
            "batch_time": 0.11765962519233093,
            "samples_per_second": 279509.67998403404,
            "samples_per_second_per_gpu": 139754.83999201702,
            "loss_sequences_lower_95": 2.700331873329793,
            "loss_sequences_upper_95": 2.7165748231293683,
            "loss_tokens_lower_95": 2.69849634375,
            "loss_tokens_upper_95": 2.718356489583333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.573840635610292,
            "data_time": 0.004233224843427588,
            "batch_time": 0.117968672533764,
            "samples_per_second": 280712.5303839093,
            "samples_per_second_per_gpu": 140356.26519195465,
            "loss_sequences_lower_95": 2.5641590479064713,
            "loss_sequences_upper_95": 2.5833236497325336,
            "loss_tokens_lower_95": 2.5642541145833335,
            "loss_tokens_upper_95": 2.5835695833333334,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.163225329188256,
            "data_time": 0.011731680482625961,
            "batch_time": 0.12199938669800758,
            "samples_per_second": 273687.6702249964,
            "samples_per_second_per_gpu": 136843.8351124982,
            "loss_sequences_lower_95": 3.1296326983531153,
            "loss_sequences_upper_95": 3.1944987453746987,
            "loss_tokens_lower_95": 3.1528944635416667,
            "loss_tokens_upper_95": 3.1737220885416666,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.525334143590053,
            "data_time": 0.012580174952745438,
            "batch_time": 0.12251777574419975,
            "samples_per_second": 272169.1190594075,
            "samples_per_second_per_gpu": 136084.55952970375,
            "loss_sequences_lower_95": 2.467243736873091,
            "loss_sequences_upper_95": 2.5831998494880506,
            "loss_tokens_lower_95": 2.5148311927083333,
            "loss_tokens_upper_95": 2.5357138229166667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.132786371491172,
            "data_time": 0.08611363172531128,
            "batch_time": 0.16892653703689575,
            "samples_per_second": 207300.40438462934,
            "samples_per_second_per_gpu": 103650.20219231467,
            "loss_sequences_lower_95": 3.072685128992254,
            "loss_sequences_upper_95": 3.190711212158203,
            "loss_tokens_lower_95": 3.1130474437366833,
            "loss_tokens_upper_95": 3.152402010830966,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0699710595712024,
            "data_time": 0.01688374714417891,
            "batch_time": 0.12887561321258545,
            "samples_per_second": 269217.47282389866,
            "samples_per_second_per_gpu": 134608.73641194933,
            "loss_sequences_lower_95": 1.9932828230343476,
            "loss_sequences_upper_95": 2.144465183447123,
            "loss_tokens_lower_95": 2.0607331614583333,
            "loss_tokens_upper_95": 2.079232328125,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.281104620025151,
            "data_time": 0.016100207964579265,
            "batch_time": 0.12918300926685333,
            "samples_per_second": 272441.7789182623,
            "samples_per_second_per_gpu": 136220.88945913114,
            "loss_sequences_lower_95": 5.222346637494332,
            "loss_sequences_upper_95": 5.335074859055491,
            "loss_tokens_lower_95": 5.26972309375,
            "loss_tokens_upper_95": 5.293067260416667,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.667227258447741,
            "data_time": 0.04180239140987396,
            "batch_time": 0.15246327221393585,
            "samples_per_second": 252072.5527278024,
            "samples_per_second_per_gpu": 126036.2763639012,
            "loss_sequences_lower_95": 2.608116162409548,
            "loss_sequences_upper_95": 2.715020264172163,
            "loss_tokens_lower_95": 2.6562429271760535,
            "loss_tokens_upper_95": 2.6785074765564967,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.180820783018947,
            "data_time": 0.0026638722365429296,
            "batch_time": 0.1167083416549926,
            "samples_per_second": 281457.7182777288,
            "samples_per_second_per_gpu": 140728.8591388644,
            "loss_sequences_lower_95": 2.168305452643186,
            "loss_sequences_upper_95": 2.1936766778347994,
            "loss_tokens_lower_95": 2.167874758815073,
            "loss_tokens_upper_95": 2.1935092080065606,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.539100177543046,
            "data_time": 0.0027943728076424566,
            "batch_time": 0.11720532064984558,
            "samples_per_second": 280480.6930021172,
            "samples_per_second_per_gpu": 140240.3465010586,
            "loss_sequences_lower_95": 2.543244301667372,
            "loss_sequences_upper_95": 2.5676575570105555,
            "loss_tokens_lower_95": 2.528273445093992,
            "loss_tokens_upper_95": 2.546213566013952,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.2488085768613604,
            "data_time": 0.00474582412349644,
            "batch_time": 0.11878973067696415,
            "samples_per_second": 279575.97760855843,
            "samples_per_second_per_gpu": 139787.98880427922,
            "loss_sequences_lower_95": 2.765828598519795,
            "loss_sequences_upper_95": 3.033696699615464,
            "loss_tokens_lower_95": 2.0652585621984456,
            "loss_tokens_upper_95": 2.2479954175787764,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5384400056997936,
            "data_time": 0.004176085933725884,
            "batch_time": 0.11753646046557326,
            "samples_per_second": 281705.3427222604,
            "samples_per_second_per_gpu": 140852.6713611302,
            "loss_sequences_lower_95": 2.7412553548177083,
            "loss_sequences_upper_95": 2.940423933919271,
            "loss_tokens_lower_95": 2.4749005994496858,
            "loss_tokens_upper_95": 2.6095050302181604,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0396103340174956,
            "data_time": 0.006479254790714809,
            "batch_time": 0.11813482500257946,
            "samples_per_second": 279053.5701892601,
            "samples_per_second_per_gpu": 139526.78509463006,
            "loss_sequences_lower_95": 2.12460533529633,
            "loss_sequences_upper_95": 2.1754268018651675,
            "loss_tokens_lower_95": 2.0088940446229793,
            "loss_tokens_upper_95": 2.0380513751341027,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.6237260227853603,
            "data_time": 0.02702533347266061,
            "batch_time": 0.1398101704461234,
            "samples_per_second": 265179.4768562495,
            "samples_per_second_per_gpu": 132589.73842812475,
            "loss_sequences_lower_95": 1.6402961141412908,
            "loss_sequences_upper_95": 1.7295804942737925,
            "loss_tokens_lower_95": 1.5938001360475664,
            "loss_tokens_upper_95": 1.633566780617001,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.491849235612519,
            "data_time": 0.02412048727273941,
            "batch_time": 0.13918300718069077,
            "samples_per_second": 267382.7012408938,
            "samples_per_second_per_gpu": 133691.3506204469,
            "loss_sequences_lower_95": 2.517228007413903,
            "loss_sequences_upper_95": 2.675164309131856,
            "loss_tokens_lower_95": 2.4467271602064997,
            "loss_tokens_upper_95": 2.528304589645396,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.57440869251887,
            "data_time": 0.021352142095565796,
            "batch_time": 0.12833555340766906,
            "samples_per_second": 269062.2659171759,
            "samples_per_second_per_gpu": 134531.13295858796,
            "loss_sequences_lower_95": 2.576130615234375,
            "loss_sequences_upper_95": 2.671845621744792,
            "loss_tokens_lower_95": 2.473434689666786,
            "loss_tokens_upper_95": 2.641935543253631,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.720024531847684,
            "data_time": 0.002651335357071517,
            "batch_time": 0.11625615192745108,
            "samples_per_second": 282270.06461329473,
            "samples_per_second_per_gpu": 141135.03230664736,
            "loss_sequences_lower_95": 3.764876592026106,
            "loss_sequences_upper_95": 3.8425929361098126,
            "loss_tokens_lower_95": 3.6584974235809162,
            "loss_tokens_upper_95": 3.735095945667495,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.859332997590203,
            "data_time": 0.005178730487823486,
            "batch_time": 0.11802709897359212,
            "samples_per_second": 280367.0914454404,
            "samples_per_second_per_gpu": 140183.5457227202,
            "loss_sequences_lower_95": 3.8016372115523724,
            "loss_sequences_upper_95": 4.094100510311448,
            "loss_tokens_lower_95": 2.7103971476126105,
            "loss_tokens_upper_95": 2.8335360631847464,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.925937589830099,
            "data_time": 0.007739125071345149,
            "batch_time": 0.12087567432506664,
            "samples_per_second": 276973.74625232647,
            "samples_per_second_per_gpu": 138486.87312616323,
            "loss_sequences_lower_95": 3.5041344027470402,
            "loss_sequences_upper_95": 3.834899256579298,
            "loss_tokens_lower_95": 2.784664092751694,
            "loss_tokens_upper_95": 2.9284857829642625,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.4425998147764165,
            "data_time": 0.026798239776066372,
            "batch_time": 0.1394076177052089,
            "samples_per_second": 262848.5971386395,
            "samples_per_second_per_gpu": 131424.29856931974,
            "loss_sequences_lower_95": 5.352086719530359,
            "loss_sequences_upper_95": 5.532019572497503,
            "loss_tokens_lower_95": 5.34961960326591,
            "loss_tokens_upper_95": 5.5330562225759845,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7646666765213013,
            "data_time": 0.046108782291412354,
            "batch_time": 0.13873842358589172,
            "samples_per_second": 220491.3719328895,
            "samples_per_second_per_gpu": 110245.68596644475,
            "loss_sequences_lower_95": 2.6836229400634766,
            "loss_sequences_upper_95": 3.0289529113769533,
            "loss_tokens_lower_95": 2.547050986520293,
            "loss_tokens_upper_95": 2.9504783452943317,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.4076089051875267,
            "data_time": 0.004843051840619343,
            "batch_time": 0.11801704837054741,
            "samples_per_second": 279433.7502448744,
            "samples_per_second_per_gpu": 139716.8751224372,
            "loss_sequences_lower_95": 1.386230015827909,
            "loss_sequences_upper_95": 1.4295513422486725,
            "loss_tokens_lower_95": 1.386185342091743,
            "loss_tokens_upper_95": 1.4297131603791502,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0667290235522535,
            "data_time": 0.006541071793971918,
            "batch_time": 0.11825643594448383,
            "samples_per_second": 276319.62420987355,
            "samples_per_second_per_gpu": 138159.81210493678,
            "loss_sequences_lower_95": 2.038567580364455,
            "loss_sequences_upper_95": 2.09427916130797,
            "loss_tokens_lower_95": 2.039211588741618,
            "loss_tokens_upper_95": 2.0954809340368423,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7847860757103424,
            "data_time": 0.005689519232717054,
            "batch_time": 0.11881871470089617,
            "samples_per_second": 278603.2476372122,
            "samples_per_second_per_gpu": 139301.6238186061,
            "loss_sequences_lower_95": 3.0545500347481975,
            "loss_sequences_upper_95": 3.1843877383492587,
            "loss_tokens_lower_95": 2.7262277910194874,
            "loss_tokens_upper_95": 2.778304662663646,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.422322862148285,
            "data_time": 0.013006027787923813,
            "batch_time": 0.12478868290781975,
            "samples_per_second": 272642.4076060802,
            "samples_per_second_per_gpu": 136321.2038030401,
            "loss_sequences_lower_95": 4.847670544433594,
            "loss_sequences_upper_95": 5.407986938476562,
            "loss_tokens_lower_95": 4.150299302510755,
            "loss_tokens_upper_95": 4.504375505731407,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8489474654197693,
            "data_time": 0.18289291858673096,
            "batch_time": 0.2975921630859375,
            "samples_per_second": 175018.22998421505,
            "samples_per_second_per_gpu": 87509.11499210753,
            "loss_sequences_lower_95": 2.642562544345856,
            "loss_sequences_upper_95": 3.145746505260467,
            "loss_tokens_lower_95": 2.45696151996481,
            "loss_tokens_upper_95": 3.19116878290286,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.112591121388578,
            "data_time": 0.03241589665412903,
            "batch_time": 0.13689966996510824,
            "samples_per_second": 258236.2592675325,
            "samples_per_second_per_gpu": 129118.12963376625,
            "loss_sequences_lower_95": 4.278609203470165,
            "loss_sequences_upper_95": 5.007686553604301,
            "loss_tokens_lower_95": 2.739159447499031,
            "loss_tokens_upper_95": 3.159775513108177,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.119222505897765,
            "data_time": 0.005136015514532725,
            "batch_time": 0.11849923597441779,
            "samples_per_second": 278835.77695737634,
            "samples_per_second_per_gpu": 139417.88847868817,
            "loss_sequences_lower_95": 2.098762018246232,
            "loss_sequences_upper_95": 2.1396501440312363,
            "loss_tokens_lower_95": 2.0982554024512616,
            "loss_tokens_upper_95": 2.1401119580483425,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.6208626383177651,
            "data_time": 0.0032199493674344793,
            "batch_time": 0.11699349196382271,
            "samples_per_second": 281580.9541704062,
            "samples_per_second_per_gpu": 140790.4770852031,
            "loss_sequences_lower_95": 1.6266431189962158,
            "loss_sequences_upper_95": 1.7349540818531075,
            "loss_tokens_lower_95": 1.5499687681570133,
            "loss_tokens_upper_95": 1.6552251478550515,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6707305314339997,
            "data_time": 0.0208689636654324,
            "batch_time": 0.1297596428129408,
            "samples_per_second": 267110.4642525948,
            "samples_per_second_per_gpu": 133555.2321262974,
            "loss_sequences_lower_95": 2.655452585744334,
            "loss_sequences_upper_95": 3.0738372690948372,
            "loss_tokens_lower_95": 2.5193352204357224,
            "loss_tokens_upper_95": 2.7937225821721783,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2290059362891723,
            "data_time": 0.00711124986410141,
            "batch_time": 0.12531305253505706,
            "samples_per_second": 274198.39653045114,
            "samples_per_second_per_gpu": 137099.19826522557,
            "loss_sequences_lower_95": 3.3511900265760652,
            "loss_sequences_upper_95": 3.5115383903906867,
            "loss_tokens_lower_95": 3.148985271290343,
            "loss_tokens_upper_95": 3.291259355235649,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.033435979994332,
            "data_time": 0.03158402442932129,
            "batch_time": 0.13132908940315247,
            "samples_per_second": 241220.35536757795,
            "samples_per_second_per_gpu": 120610.17768378898,
            "loss_sequences_lower_95": 2.0371332122058403,
            "loss_sequences_upper_95": 2.438577707802377,
            "loss_tokens_lower_95": 1.888678844756831,
            "loss_tokens_upper_95": 2.189419351549755,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0838354821681118,
            "data_time": 0.0026518898650099293,
            "batch_time": 0.11630918575932804,
            "samples_per_second": 282183.88999903545,
            "samples_per_second_per_gpu": 141091.94499951773,
            "loss_sequences_lower_95": 2.0710978963370796,
            "loss_sequences_upper_95": 2.0968876001762853,
            "loss_tokens_lower_95": 2.071038709695064,
            "loss_tokens_upper_95": 2.0963148342559137,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.021904166462352,
            "data_time": 0.04605396091938019,
            "batch_time": 0.13997335731983185,
            "samples_per_second": 235381.63512617745,
            "samples_per_second_per_gpu": 117690.81756308873,
            "loss_sequences_lower_95": 1.000321447502062,
            "loss_sequences_upper_95": 1.168091899908862,
            "loss_tokens_lower_95": 0.8734120848234991,
            "loss_tokens_upper_95": 1.1459143397507916,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.385991359056917,
            "data_time": 0.0024051349001178598,
            "batch_time": 0.1165124995810388,
            "samples_per_second": 281441.0221029781,
            "samples_per_second_per_gpu": 140720.51105148904,
            "loss_sequences_lower_95": 3.998763420056997,
            "loss_sequences_upper_95": 4.038165546793108,
            "loss_tokens_lower_95": 3.2516661146034815,
            "loss_tokens_upper_95": 3.2900892891682783,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.595575357437133,
            "data_time": 0.00768456794321537,
            "batch_time": 0.11910475417971611,
            "samples_per_second": 276901.66455944267,
            "samples_per_second_per_gpu": 138450.83227972133,
            "loss_sequences_lower_95": 4.7420971313476565,
            "loss_sequences_upper_95": 5.0002192016601565,
            "loss_tokens_lower_95": 4.424319342357284,
            "loss_tokens_upper_95": 4.65646955662703,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.113642548478168,
            "data_time": 0.02494097501039505,
            "batch_time": 0.12861976772546768,
            "samples_per_second": 256353.91084714173,
            "samples_per_second_per_gpu": 128176.95542357086,
            "loss_sequences_lower_95": 2.0590194436778195,
            "loss_sequences_upper_95": 2.1689400349492614,
            "loss_tokens_lower_95": 2.0593893830672556,
            "loss_tokens_upper_95": 2.167836088097614,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.151991001042453,
            "data_time": 0.006360945247468494,
            "batch_time": 0.11813105997585115,
            "samples_per_second": 279270.90021436376,
            "samples_per_second_per_gpu": 139635.45010718188,
            "loss_sequences_lower_95": 7.035377770626184,
            "loss_sequences_upper_95": 7.270540364583334,
            "loss_tokens_lower_95": 7.030290841767282,
            "loss_tokens_upper_95": 7.268575772372159,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.1680725177526474,
            "data_time": 0.0063944978916898685,
            "batch_time": 0.12019549405321162,
            "samples_per_second": 278125.7052511544,
            "samples_per_second_per_gpu": 139062.8526255772,
            "loss_sequences_lower_95": 1.2584395914713542,
            "loss_sequences_upper_95": 1.324612664794922,
            "loss_tokens_lower_95": 1.119257558492147,
            "loss_tokens_upper_95": 1.190015363567302,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.439197639056615,
            "data_time": 0.030632376670837402,
            "batch_time": 0.13802472182682582,
            "samples_per_second": 263976.55780000205,
            "samples_per_second_per_gpu": 131988.27890000102,
            "loss_sequences_lower_95": 5.060650736490885,
            "loss_sequences_upper_95": 5.830468357631138,
            "loss_tokens_lower_95": 5.055254734584264,
            "loss_tokens_upper_95": 5.829098903111049,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.4054182469844818,
            "data_time": 0.1644863486289978,
            "batch_time": 0.2788604497909546,
            "samples_per_second": 178403.12520072248,
            "samples_per_second_per_gpu": 89201.56260036124,
            "loss_sequences_lower_95": 1.439814481139183,
            "loss_sequences_upper_95": 2.039811986684799,
            "loss_tokens_lower_95": 1.181695091011598,
            "loss_tokens_upper_95": 1.5762189947698533,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.290295672416687,
            "data_time": 0.007730446755886078,
            "batch_time": 0.11864662542939186,
            "samples_per_second": 278716.73017655406,
            "samples_per_second_per_gpu": 139358.36508827703,
            "loss_sequences_lower_95": 7.248262731933593,
            "loss_sequences_upper_95": 7.568702807617187,
            "loss_tokens_lower_95": 7.123730241546372,
            "loss_tokens_upper_95": 7.4078123554158735,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.870694519996643,
            "data_time": 0.00785224698483944,
            "batch_time": 0.11884315498173237,
            "samples_per_second": 278523.78120110853,
            "samples_per_second_per_gpu": 139261.89060055427,
            "loss_sequences_lower_95": 6.0427926269531245,
            "loss_sequences_upper_95": 6.241173962402344,
            "loss_tokens_lower_95": 5.75883963885281,
            "loss_tokens_upper_95": 5.931310173267715,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.855061635849582,
            "data_time": 0.004176714318863889,
            "batch_time": 0.11686664692899014,
            "samples_per_second": 281141.2597562617,
            "samples_per_second_per_gpu": 140570.62987813086,
            "loss_sequences_lower_95": 2.825241456142097,
            "loss_sequences_upper_95": 2.8850000752964715,
            "loss_tokens_lower_95": 2.8255004841890505,
            "loss_tokens_upper_95": 2.8843102777719576,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.2529766583406063,
            "data_time": 0.012753526369730631,
            "batch_time": 0.12344467639923096,
            "samples_per_second": 272182.1456909337,
            "samples_per_second_per_gpu": 136091.07284546684,
            "loss_sequences_lower_95": 2.207028624831989,
            "loss_sequences_upper_95": 2.3005890220724123,
            "loss_tokens_lower_95": 2.206134966075329,
            "loss_tokens_upper_95": 2.299504626108571,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.997205829143524,
            "data_time": 0.007722409442067146,
            "batch_time": 0.1189134269952774,
            "samples_per_second": 277629.2175584029,
            "samples_per_second_per_gpu": 138814.60877920146,
            "loss_sequences_lower_95": 2.9162175842285154,
            "loss_sequences_upper_95": 3.0783055603027343,
            "loss_tokens_lower_95": 2.915770916748047,
            "loss_tokens_upper_95": 3.0768255249023437,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.966793875493525,
            "data_time": 0.002542222374515591,
            "batch_time": 0.11591846723210776,
            "samples_per_second": 282828.59052384086,
            "samples_per_second_per_gpu": 141414.29526192043,
            "loss_sequences_lower_95": 2.7158890552935784,
            "loss_sequences_upper_95": 2.8011429569240773,
            "loss_tokens_lower_95": 1.8047668688747707,
            "loss_tokens_upper_95": 1.8605681616713945,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.9069427910135752,
            "data_time": 0.02051111062367757,
            "batch_time": 0.12708773877885607,
            "samples_per_second": 267011.6559451809,
            "samples_per_second_per_gpu": 133505.82797259046,
            "loss_sequences_lower_95": 1.8612586064125174,
            "loss_sequences_upper_95": 1.9538593747722568,
            "loss_tokens_lower_95": 1.8603270487998849,
            "loss_tokens_upper_95": 1.9515083996217641,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0065687880796546,
            "data_time": 0.012969225645065308,
            "batch_time": 0.12618359923362732,
            "samples_per_second": 276527.8215084787,
            "samples_per_second_per_gpu": 138263.91075423936,
            "loss_sequences_lower_95": 1.9767076110839845,
            "loss_sequences_upper_95": 2.037130270565257,
            "loss_tokens_lower_95": 1.9762586616067326,
            "loss_tokens_upper_95": 2.0366701552447153,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.616149722014127,
            "data_time": 0.00287873911857605,
            "batch_time": 0.11639795899391174,
            "samples_per_second": 282308.91599788,
            "samples_per_second_per_gpu": 141154.45799894,
            "loss_sequences_lower_95": 1.9302058836697906,
            "loss_sequences_upper_95": 2.0036612958787425,
            "loss_tokens_lower_95": 1.5317220774083935,
            "loss_tokens_upper_95": 1.58362083397299,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.029687011683429,
            "data_time": 0.03319379687309265,
            "batch_time": 0.14606699347496033,
            "samples_per_second": 262719.4497716988,
            "samples_per_second_per_gpu": 131359.7248858494,
            "loss_sequences_lower_95": 1.9618107023693265,
            "loss_sequences_upper_95": 2.0981369906632357,
            "loss_tokens_lower_95": 1.9617933969649057,
            "loss_tokens_upper_95": 2.0992368627477576,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.9290404967211803,
            "data_time": 0.00410599442361628,
            "batch_time": 0.11701869270176564,
            "samples_per_second": 280662.72049856867,
            "samples_per_second_per_gpu": 140331.36024928433,
            "loss_sequences_lower_95": 3.901009950783639,
            "loss_sequences_upper_95": 3.9573616536458336,
            "loss_tokens_lower_95": 3.900956493692661,
            "loss_tokens_upper_95": 3.9569970553803517,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.9602674384718959,
            "data_time": 0.027525646345955983,
            "batch_time": 0.1336858102253505,
            "samples_per_second": 261144.47562533006,
            "samples_per_second_per_gpu": 130572.23781266503,
            "loss_sequences_lower_95": 1.9038214933525013,
            "loss_sequences_upper_95": 2.016381491503669,
            "loss_tokens_lower_95": 1.9025436179151813,
            "loss_tokens_upper_95": 2.0171302276907612,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.258246753613154,
            "data_time": 0.09328576922416687,
            "batch_time": 0.2045610547065735,
            "samples_per_second": 217891.576358304,
            "samples_per_second_per_gpu": 108945.788179152,
            "loss_sequences_lower_95": 1.1821585909525554,
            "loss_sequences_upper_95": 1.5338848368326823,
            "loss_tokens_lower_95": 1.0542823791503906,
            "loss_tokens_upper_95": 1.4374110354317557,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.230962660908699,
            "data_time": 0.08853182196617126,
            "batch_time": 0.19972965121269226,
            "samples_per_second": 218570.2126002629,
            "samples_per_second_per_gpu": 109285.10630013145,
            "loss_sequences_lower_95": 1.21028910001119,
            "loss_sequences_upper_95": 1.5734670003255207,
            "loss_tokens_lower_95": 0.9933430875285287,
            "loss_tokens_upper_95": 1.4439416306742119,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.478827533946648,
            "data_time": 0.0038274266959911767,
            "batch_time": 0.11704327853557935,
            "samples_per_second": 280725.4396473912,
            "samples_per_second_per_gpu": 140362.7198236956,
            "loss_sequences_lower_95": 3.4596307040454715,
            "loss_sequences_upper_95": 3.4982495368878865,
            "loss_tokens_lower_95": 3.459292006569864,
            "loss_tokens_upper_95": 3.4979935955334134,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.2698804384885543,
            "data_time": 0.002421846046750342,
            "batch_time": 0.11712908898062466,
            "samples_per_second": 280035.14701774344,
            "samples_per_second_per_gpu": 140017.57350887172,
            "loss_sequences_lower_95": 0.35299856544217584,
            "loss_sequences_upper_95": 0.3631907069356173,
            "loss_tokens_lower_95": 0.2584271496876879,
            "loss_tokens_upper_95": 0.26417586510238,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.9769008262889591,
            "data_time": 0.049779683351516724,
            "batch_time": 0.1633654534816742,
            "samples_per_second": 254448.65283154327,
            "samples_per_second_per_gpu": 127224.32641577163,
            "loss_sequences_lower_95": 0.9509072401392178,
            "loss_sequences_upper_95": 1.1225009197325218,
            "loss_tokens_lower_95": 0.9091421923336366,
            "loss_tokens_upper_95": 1.0036519923218343,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.663774941418622,
            "data_time": 0.08838319778442383,
            "batch_time": 0.1611696481704712,
            "samples_per_second": 166821.5741386807,
            "samples_per_second_per_gpu": 83410.78706934035,
            "loss_sequences_lower_95": 3.2682482487446554,
            "loss_sequences_upper_95": 4.181097133739574,
            "loss_tokens_lower_95": 3.07735659752363,
            "loss_tokens_upper_95": 4.217337149160879,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.8776121532044759,
            "data_time": 0.02925671140352885,
            "batch_time": 0.1287584900856018,
            "samples_per_second": 242530.79192918784,
            "samples_per_second_per_gpu": 121265.39596459392,
            "loss_sequences_lower_95": 0.8623833191104052,
            "loss_sequences_upper_95": 1.003085762116967,
            "loss_tokens_lower_95": 0.81772345129836,
            "loss_tokens_upper_95": 0.8934208208217252,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.9146042208119136,
            "data_time": 0.031515667835871376,
            "batch_time": 0.13096700112024942,
            "samples_per_second": 242263.3662330356,
            "samples_per_second_per_gpu": 121131.6831165178,
            "loss_sequences_lower_95": 0.9289788734622119,
            "loss_sequences_upper_95": 1.0614163980251405,
            "loss_tokens_lower_95": 0.8567136639039962,
            "loss_tokens_upper_95": 0.9215604846301637,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.8984681541418157,
            "data_time": 0.03890913724899292,
            "batch_time": 0.16287070512771606,
            "samples_per_second": 236860.50973712723,
            "samples_per_second_per_gpu": 118430.25486856361,
            "loss_sequences_lower_95": 0.8015817083963533,
            "loss_sequences_upper_95": 0.9562337084514339,
            "loss_tokens_lower_95": 0.8360776772918613,
            "loss_tokens_upper_95": 0.934342116826975,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.9777760451159826,
            "data_time": 0.033909072478612266,
            "batch_time": 0.13431871930758157,
            "samples_per_second": 237266.4115441305,
            "samples_per_second_per_gpu": 118633.20577206525,
            "loss_sequences_lower_95": 0.97507196519433,
            "loss_sequences_upper_95": 1.0923969036195336,
            "loss_tokens_lower_95": 0.9213995080870632,
            "loss_tokens_upper_95": 0.9843560168304918,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.872034634122197,
            "data_time": 0.03639990942818778,
            "batch_time": 0.14622183073134648,
            "samples_per_second": 255616.7137916498,
            "samples_per_second_per_gpu": 127808.3568958249,
            "loss_sequences_lower_95": 0.855228724983168,
            "loss_sequences_upper_95": 0.9399332555924883,
            "loss_tokens_lower_95": 0.8449234002540948,
            "loss_tokens_upper_95": 0.8930306513234864,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.8234014038632556,
            "data_time": 0.032611568768819175,
            "batch_time": 0.13204342126846313,
            "samples_per_second": 242234.60902385725,
            "samples_per_second_per_gpu": 121117.30451192863,
            "loss_sequences_lower_95": 0.8704882342640946,
            "loss_sequences_upper_95": 0.9642515787264195,
            "loss_tokens_lower_95": 0.777485921076797,
            "loss_tokens_upper_95": 0.8206145721490733,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/params.txt",
    "uuid": "f17f8e15-c808-4630-862c-ffd777d49feb",
    "creation_date": "2024_01_26-08_20_56"
}