{
    "name": "rpj-d=1024_l=24_h=8-16.0",
    "dataset_name": "rpj",
    "dataset_uuid": "67db6b77-c7c4-48ae-b431-57254587ed43",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 131717201920,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 2,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp",
            "--fsdp-limit-all-gathers"
        ],
        "chinchilla_multiplier": 16.0,
        "seed": 124
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--workers",
        "2",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--accum-freq",
        "2",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--logs",
        "logs/17288",
        "--train-num-samples",
        "26343440384",
        "--dataset-manifest",
        "<scrub>/openlm/scrub/datasets/rpj_original/manifest.jsonl",
        "--data-key",
        "json.gz",
        "--name",
        "rpj-d=1024_l=24_h=8-16.0",
        "--fsdp",
        "--fsdp-amp",
        "--fsdp-limit-all-gathers",
        "--val-data",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-data-key",
        "json",
        "txt",
        "json.gz",
        "--val-tok-ci",
        "--val-seq-ci",
        "--val-num-samples",
        "245760",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/openlm/scrub/experiments/411m_16x_rpj_original/"
    ],
    "results": [
        {
            "loss": 2.2583877523740132,
            "data_time": 0.041414543986320496,
            "batch_time": 0.4727131575345993,
            "samples_per_second": 226182.3078944849,
            "samples_per_second_per_gpu": 113091.15394724245,
            "loss_sequences_lower_95": 2.1965691884358725,
            "loss_sequences_upper_95": 2.3205279223124187,
            "loss_tokens_lower_95": 2.24737979888916,
            "loss_tokens_upper_95": 2.2693579101562498,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8035228460174086,
            "data_time": 0.0021558568178822236,
            "batch_time": 0.11742542722775248,
            "samples_per_second": 279658.08509697753,
            "samples_per_second_per_gpu": 139829.04254848877,
            "loss_sequences_lower_95": 2.8008411082409794,
            "loss_sequences_upper_95": 2.8062503400775305,
            "loss_tokens_lower_95": 2.793720807291667,
            "loss_tokens_upper_95": 2.8138525364583336,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.380841959739218,
            "data_time": 0.012052610516548157,
            "batch_time": 0.12160580232739449,
            "samples_per_second": 272807.7420437945,
            "samples_per_second_per_gpu": 136403.87102189724,
            "loss_sequences_lower_95": 2.357509877730389,
            "loss_sequences_upper_95": 2.403952867157605,
            "loss_tokens_lower_95": 2.370069140625,
            "loss_tokens_upper_95": 2.3913379322916666,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6553784390577335,
            "data_time": 0.0033459294783441643,
            "batch_time": 0.1172061637043953,
            "samples_per_second": 280767.83819690486,
            "samples_per_second_per_gpu": 140383.91909845243,
            "loss_sequences_lower_95": 2.6438155656008377,
            "loss_sequences_upper_95": 2.6672115113563146,
            "loss_tokens_lower_95": 2.645499098958333,
            "loss_tokens_upper_95": 2.665178135416667,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8075133753891146,
            "data_time": 0.012903187423944473,
            "batch_time": 0.12290431931614876,
            "samples_per_second": 272261.101066086,
            "samples_per_second_per_gpu": 136130.550533043,
            "loss_sequences_lower_95": 2.772775622832071,
            "loss_sequences_upper_95": 2.8412959267796905,
            "loss_tokens_lower_95": 2.7974552708333333,
            "loss_tokens_upper_95": 2.817549630208333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6038896102308824,
            "data_time": 0.00567175901454428,
            "batch_time": 0.11939767521360646,
            "samples_per_second": 280942.6606266254,
            "samples_per_second_per_gpu": 140471.3303133127,
            "loss_sequences_lower_95": 2.564186528748513,
            "loss_sequences_upper_95": 2.6417233465157097,
            "loss_tokens_lower_95": 2.5936417656249997,
            "loss_tokens_upper_95": 2.614163,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.4025721284078092,
            "data_time": 0.0029739292411061075,
            "batch_time": 0.11645621332255277,
            "samples_per_second": 281068.7259495215,
            "samples_per_second_per_gpu": 140534.36297476076,
            "loss_sequences_lower_95": 1.3830269949776786,
            "loss_sequences_upper_95": 1.4222334208585776,
            "loss_tokens_lower_95": 1.393932546875,
            "loss_tokens_upper_95": 1.4115510807291665,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2530301186426773,
            "data_time": 0.002917998234430949,
            "batch_time": 0.11659529685974121,
            "samples_per_second": 280820.68755695753,
            "samples_per_second_per_gpu": 140410.34377847877,
            "loss_sequences_lower_95": 3.2442622760553013,
            "loss_sequences_upper_95": 3.261341689504254,
            "loss_tokens_lower_95": 3.2428530677083334,
            "loss_tokens_upper_95": 3.2634624010416666,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.001967067156381,
            "data_time": 0.012949943542480469,
            "batch_time": 0.13110559806227684,
            "samples_per_second": 272111.7653939937,
            "samples_per_second_per_gpu": 136055.88269699685,
            "loss_sequences_lower_95": 2.955792323166762,
            "loss_sequences_upper_95": 3.047915463331269,
            "loss_tokens_lower_95": 2.991431927083333,
            "loss_tokens_upper_95": 3.0124827604166664,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.612528101728839,
            "data_time": 0.012554757297039032,
            "batch_time": 0.12605786696076393,
            "samples_per_second": 274148.25788989384,
            "samples_per_second_per_gpu": 137074.12894494692,
            "loss_sequences_lower_95": 3.5767600802093624,
            "loss_sequences_upper_95": 3.6433777337960103,
            "loss_tokens_lower_95": 3.60099290625,
            "loss_tokens_upper_95": 3.6241200104166666,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.739563275448701,
            "data_time": 0.002826813930749632,
            "batch_time": 0.11705864089472792,
            "samples_per_second": 280886.1053710406,
            "samples_per_second_per_gpu": 140443.0526855203,
            "loss_sequences_lower_95": 2.731532342593703,
            "loss_sequences_upper_95": 2.7474297148014593,
            "loss_tokens_lower_95": 2.7295272083333337,
            "loss_tokens_upper_95": 2.74928765625,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.610188081829511,
            "data_time": 0.0041963111522585845,
            "batch_time": 0.1184157889546746,
            "samples_per_second": 279071.3839325791,
            "samples_per_second_per_gpu": 139535.69196628954,
            "loss_sequences_lower_95": 2.60051869460316,
            "loss_sequences_upper_95": 2.6197678050812163,
            "loss_tokens_lower_95": 2.6003936406249997,
            "loss_tokens_upper_95": 2.6200488020833337,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1916918991545393,
            "data_time": 0.013448342680931091,
            "batch_time": 0.12409758567810059,
            "samples_per_second": 272026.6848295779,
            "samples_per_second_per_gpu": 136013.34241478896,
            "loss_sequences_lower_95": 3.1579607795266544,
            "loss_sequences_upper_95": 3.223022733305337,
            "loss_tokens_lower_95": 3.1813419791666666,
            "loss_tokens_upper_95": 3.202150630208333,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5574321523460495,
            "data_time": 0.012249618768692017,
            "batch_time": 0.12223401665687561,
            "samples_per_second": 272242.6104309163,
            "samples_per_second_per_gpu": 136121.30521545815,
            "loss_sequences_lower_95": 2.4990345381913506,
            "loss_sequences_upper_95": 2.6157676432623154,
            "loss_tokens_lower_95": 2.547000541666667,
            "loss_tokens_upper_95": 2.567835489583333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1453917080705818,
            "data_time": 0.08301648497581482,
            "batch_time": 0.1663360595703125,
            "samples_per_second": 205738.84812224546,
            "samples_per_second_per_gpu": 102869.42406112273,
            "loss_sequences_lower_95": 3.0858462767167523,
            "loss_sequences_upper_95": 3.203367337313565,
            "loss_tokens_lower_95": 3.125640366294167,
            "loss_tokens_upper_95": 3.165026248585094,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.101514458308762,
            "data_time": 0.016889696771448307,
            "batch_time": 0.12879526073282416,
            "samples_per_second": 270015.03501131875,
            "samples_per_second_per_gpu": 135007.51750565937,
            "loss_sequences_lower_95": 2.023957513303173,
            "loss_sequences_upper_95": 2.176667950243713,
            "loss_tokens_lower_95": 2.0920929375,
            "loss_tokens_upper_95": 2.11079525,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.2506708859768265,
            "data_time": 0.016215076049168903,
            "batch_time": 0.12972077230612436,
            "samples_per_second": 270979.9089467518,
            "samples_per_second_per_gpu": 135489.9544733759,
            "loss_sequences_lower_95": 5.192197130349192,
            "loss_sequences_upper_95": 5.304573916634029,
            "loss_tokens_lower_95": 5.239202572916667,
            "loss_tokens_upper_95": 5.262443927083333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.706203890628502,
            "data_time": 0.04308424890041351,
            "batch_time": 0.1536850929260254,
            "samples_per_second": 251145.12193178493,
            "samples_per_second_per_gpu": 125572.56096589247,
            "loss_sequences_lower_95": 2.646666311045162,
            "loss_sequences_upper_95": 2.7545836339231395,
            "loss_tokens_lower_95": 2.6950911349937567,
            "loss_tokens_upper_95": 2.7175008429855594,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.226397165680285,
            "data_time": 0.0027254769633734146,
            "batch_time": 0.11674793882630681,
            "samples_per_second": 281432.9906504381,
            "samples_per_second_per_gpu": 140716.49532521906,
            "loss_sequences_lower_95": 2.2125634883852907,
            "loss_sequences_upper_95": 2.239785206322995,
            "loss_tokens_lower_95": 2.213025776882522,
            "loss_tokens_upper_95": 2.2398732374305657,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5718685247299327,
            "data_time": 0.002774838998818853,
            "batch_time": 0.1170982310346737,
            "samples_per_second": 280699.8414758051,
            "samples_per_second_per_gpu": 140349.92073790255,
            "loss_sequences_lower_95": 2.5748516192927204,
            "loss_sequences_upper_95": 2.5990412095371314,
            "loss_tokens_lower_95": 2.56120722892187,
            "loss_tokens_upper_95": 2.5790301340320734,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.28966217993789,
            "data_time": 0.00599926888053097,
            "batch_time": 0.12252585300758703,
            "samples_per_second": 277445.1924047818,
            "samples_per_second_per_gpu": 138722.5962023909,
            "loss_sequences_lower_95": 2.7838053923594708,
            "loss_sequences_upper_95": 3.0452108016484853,
            "loss_tokens_lower_95": 2.111089621074205,
            "loss_tokens_upper_95": 2.28834465559477,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.566040334781011,
            "data_time": 0.004199182099484383,
            "batch_time": 0.11778188639498771,
            "samples_per_second": 281011.6643613206,
            "samples_per_second_per_gpu": 140505.8321806603,
            "loss_sequences_lower_95": 2.7388171549479163,
            "loss_sequences_upper_95": 2.934379565429688,
            "loss_tokens_lower_95": 2.5074969351906446,
            "loss_tokens_upper_95": 2.6419433102397796,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0594502012326554,
            "data_time": 0.006390597139086042,
            "batch_time": 0.11843754422096979,
            "samples_per_second": 277579.3804110267,
            "samples_per_second_per_gpu": 138789.69020551335,
            "loss_sequences_lower_95": 2.142045823473927,
            "loss_sequences_upper_95": 2.192930222425251,
            "loss_tokens_lower_95": 2.02983295560187,
            "loss_tokens_upper_95": 2.057986036539501,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.6468338733369654,
            "data_time": 0.027469822338649204,
            "batch_time": 0.1404328261102949,
            "samples_per_second": 264607.44874290406,
            "samples_per_second_per_gpu": 132303.72437145203,
            "loss_sequences_lower_95": 1.6616585818204013,
            "loss_sequences_upper_95": 1.7481603240966797,
            "loss_tokens_lower_95": 1.6174577567582888,
            "loss_tokens_upper_95": 1.657206478163106,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.54045824226068,
            "data_time": 0.02530939131975174,
            "batch_time": 0.13508538901805878,
            "samples_per_second": 265972.6285805489,
            "samples_per_second_per_gpu": 132986.31429027446,
            "loss_sequences_lower_95": 2.565992045499841,
            "loss_sequences_upper_95": 2.7275256908183194,
            "loss_tokens_lower_95": 2.4922388604434667,
            "loss_tokens_upper_95": 2.5748666135959715,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6658393136660257,
            "data_time": 0.021719694137573242,
            "batch_time": 0.12941343188285828,
            "samples_per_second": 266083.9726287615,
            "samples_per_second_per_gpu": 133041.98631438075,
            "loss_sequences_lower_95": 2.6771057790120443,
            "loss_sequences_upper_95": 2.762413299560547,
            "loss_tokens_lower_95": 2.566082514702758,
            "loss_tokens_upper_95": 2.731421630255814,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.854287769210062,
            "data_time": 0.003091408201485626,
            "batch_time": 0.11764192684388264,
            "samples_per_second": 279201.11418897234,
            "samples_per_second_per_gpu": 139600.55709448617,
            "loss_sequences_lower_95": 3.900099564105482,
            "loss_sequences_upper_95": 3.982235212726367,
            "loss_tokens_lower_95": 3.790175265182283,
            "loss_tokens_upper_95": 3.870638425226361,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9070722365419472,
            "data_time": 0.005509702364603678,
            "batch_time": 0.11919297615687052,
            "samples_per_second": 276586.42861615703,
            "samples_per_second_per_gpu": 138293.21430807852,
            "loss_sequences_lower_95": 3.854524513527199,
            "loss_sequences_upper_95": 4.146885090323811,
            "loss_tokens_lower_95": 2.7515343329129056,
            "loss_tokens_upper_95": 2.8759135981671493,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.947339816915297,
            "data_time": 0.008420696129670014,
            "batch_time": 0.12172550929559244,
            "samples_per_second": 275983.5647719755,
            "samples_per_second_per_gpu": 137991.78238598775,
            "loss_sequences_lower_95": 3.513371688269918,
            "loss_sequences_upper_95": 3.841232164402464,
            "loss_tokens_lower_95": 2.8034915181812683,
            "loss_tokens_upper_95": 2.9468013245200066,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.426131801518131,
            "data_time": 0.03260352781840733,
            "batch_time": 0.14591509103775024,
            "samples_per_second": 261509.25112005576,
            "samples_per_second_per_gpu": 130754.62556002788,
            "loss_sequences_lower_95": 5.319321445134133,
            "loss_sequences_upper_95": 5.529468721015268,
            "loss_tokens_lower_95": 5.3218576370308925,
            "loss_tokens_upper_95": 5.530248967697632,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.773655159473419,
            "data_time": 0.04961113631725311,
            "batch_time": 0.14295344054698944,
            "samples_per_second": 217441.9171749981,
            "samples_per_second_per_gpu": 108720.95858749904,
            "loss_sequences_lower_95": 2.7102430725097655,
            "loss_sequences_upper_95": 3.039496253967285,
            "loss_tokens_lower_95": 2.561391245615077,
            "loss_tokens_upper_95": 2.962047635591947,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.13763248462882,
            "data_time": 0.006622301853769194,
            "batch_time": 0.12015717785532881,
            "samples_per_second": 276747.5791747271,
            "samples_per_second_per_gpu": 138373.78958736354,
            "loss_sequences_lower_95": 1.1262368146570336,
            "loss_sequences_upper_95": 1.1497074904192923,
            "loss_tokens_lower_95": 1.1259690471168915,
            "loss_tokens_upper_95": 1.14970110265601,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.9983722079883923,
            "data_time": 0.008152507818662204,
            "batch_time": 0.1203838831339127,
            "samples_per_second": 273604.99101087195,
            "samples_per_second_per_gpu": 136802.49550543597,
            "loss_sequences_lower_95": 1.9683392598919942,
            "loss_sequences_upper_95": 2.0284729103881936,
            "loss_tokens_lower_95": 1.9688507404999105,
            "loss_tokens_upper_95": 2.028864015271486,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8325737731630576,
            "data_time": 0.006069313863228108,
            "batch_time": 0.11932020351804536,
            "samples_per_second": 278226.73223898094,
            "samples_per_second_per_gpu": 139113.36611949047,
            "loss_sequences_lower_95": 3.1085313865487283,
            "loss_sequences_upper_95": 3.2439011865393774,
            "loss_tokens_lower_95": 2.7719977981692283,
            "loss_tokens_upper_95": 2.8245981497503068,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.442256257534027,
            "data_time": 0.014747075736522675,
            "batch_time": 0.12661965191364288,
            "samples_per_second": 272802.3471895156,
            "samples_per_second_per_gpu": 136401.1735947578,
            "loss_sequences_lower_95": 4.866221655273438,
            "loss_sequences_upper_95": 5.418268286132813,
            "loss_tokens_lower_95": 4.1829490636374915,
            "loss_tokens_upper_95": 4.535149391118774,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0287646651268005,
            "data_time": 0.19104492664337158,
            "batch_time": 0.30638647079467773,
            "samples_per_second": 172679.99971247005,
            "samples_per_second_per_gpu": 86339.99985623502,
            "loss_sequences_lower_95": 2.8127088606357575,
            "loss_sequences_upper_95": 3.3287383735179903,
            "loss_tokens_lower_95": 2.6010472352477327,
            "loss_tokens_upper_95": 3.3990079331672054,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0254527042652,
            "data_time": 0.034467349449793495,
            "batch_time": 0.1395344932874044,
            "samples_per_second": 255816.1811015423,
            "samples_per_second_per_gpu": 127908.09055077114,
            "loss_sequences_lower_95": 4.142666757517848,
            "loss_sequences_upper_95": 4.838355123585668,
            "loss_tokens_lower_95": 2.6806678746168613,
            "loss_tokens_upper_95": 3.086004838223119,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.1552099145724295,
            "data_time": 0.006427153117126889,
            "batch_time": 0.12007240785492791,
            "samples_per_second": 277735.2700252647,
            "samples_per_second_per_gpu": 138867.63501263235,
            "loss_sequences_lower_95": 2.1344254386638544,
            "loss_sequences_upper_95": 2.175551305581722,
            "loss_tokens_lower_95": 2.134938949911602,
            "loss_tokens_upper_95": 2.1755929449438893,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.6721349680343889,
            "data_time": 0.0038524124973504117,
            "batch_time": 0.11809899788494257,
            "samples_per_second": 279277.6761290359,
            "samples_per_second_per_gpu": 139638.83806451794,
            "loss_sequences_lower_95": 1.6793252025140089,
            "loss_sequences_upper_95": 1.7901380074410536,
            "loss_tokens_lower_95": 1.6003421973973522,
            "loss_tokens_upper_95": 1.706472512417973,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6719915010990243,
            "data_time": 0.022401094436645508,
            "batch_time": 0.1315287020471361,
            "samples_per_second": 265755.7846200925,
            "samples_per_second_per_gpu": 132877.89231004624,
            "loss_sequences_lower_95": 2.6491097642388537,
            "loss_sequences_upper_95": 3.0408512660435267,
            "loss_tokens_lower_95": 2.523817515836393,
            "loss_tokens_upper_95": 2.796011869164113,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.228160241502613,
            "data_time": 0.008450010418891906,
            "batch_time": 0.12271432876586914,
            "samples_per_second": 274601.97529198986,
            "samples_per_second_per_gpu": 137300.98764599493,
            "loss_sequences_lower_95": 3.350249979189276,
            "loss_sequences_upper_95": 3.5105256981612567,
            "loss_tokens_lower_95": 3.151229854807531,
            "loss_tokens_upper_95": 3.2924303191712894,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0933785213202967,
            "data_time": 0.03701956073443095,
            "batch_time": 0.13774287700653076,
            "samples_per_second": 236987.15085358522,
            "samples_per_second_per_gpu": 118493.57542679261,
            "loss_sequences_lower_95": 2.114953687714367,
            "loss_sequences_upper_95": 2.530829964614496,
            "loss_tokens_lower_95": 1.9522323434154518,
            "loss_tokens_upper_95": 2.249985751825678,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.297276691439915,
            "data_time": 0.0033077073935121773,
            "batch_time": 0.11773607696588047,
            "samples_per_second": 279035.1337479857,
            "samples_per_second_per_gpu": 139517.56687399285,
            "loss_sequences_lower_95": 2.2782712304179586,
            "loss_sequences_upper_95": 2.316499647585767,
            "loss_tokens_lower_95": 2.278047723802573,
            "loss_tokens_upper_95": 2.3162345652724294,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.9334662932794071,
            "data_time": 0.048115238547325134,
            "batch_time": 0.14346301555633545,
            "samples_per_second": 229765.17900954047,
            "samples_per_second_per_gpu": 114882.58950477024,
            "loss_sequences_lower_95": 0.9145080399744718,
            "loss_sequences_upper_95": 1.0663114677355128,
            "loss_tokens_lower_95": 0.8012192053584993,
            "loss_tokens_upper_95": 1.0420529227905897,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9800556084894785,
            "data_time": 0.0028767072644837245,
            "batch_time": 0.11791596743407014,
            "samples_per_second": 278313.3837840393,
            "samples_per_second_per_gpu": 139156.69189201965,
            "loss_sequences_lower_95": 3.537051467095781,
            "loss_sequences_upper_95": 3.574895127014544,
            "loss_tokens_lower_95": 2.8601178372823983,
            "loss_tokens_upper_95": 2.895445968326886,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.038919390678406,
            "data_time": 0.00930938683450222,
            "batch_time": 0.12140363082289696,
            "samples_per_second": 273857.55585893954,
            "samples_per_second_per_gpu": 136928.77792946977,
            "loss_sequences_lower_95": 5.202657104492188,
            "loss_sequences_upper_95": 5.472759765625,
            "loss_tokens_lower_95": 4.8506696118371755,
            "loss_tokens_upper_95": 5.100563864770639,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.6510354446328204,
            "data_time": 0.026360929012298584,
            "batch_time": 0.1313377395272255,
            "samples_per_second": 251179.92283506203,
            "samples_per_second_per_gpu": 125589.96141753101,
            "loss_sequences_lower_95": 1.6027726579749066,
            "loss_sequences_upper_95": 1.7000609057882556,
            "loss_tokens_lower_95": 1.6010271188487177,
            "loss_tokens_upper_95": 1.7008514271611752,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.4894217744018095,
            "data_time": 0.008866740124566215,
            "batch_time": 0.12149924181756519,
            "samples_per_second": 274611.15227876574,
            "samples_per_second_per_gpu": 137305.57613938287,
            "loss_sequences_lower_95": 6.390232174035274,
            "loss_sequences_upper_95": 6.589687444513494,
            "loss_tokens_lower_95": 6.3868764241536455,
            "loss_tokens_upper_95": 6.587768573183002,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.1477668804327648,
            "data_time": 0.007189044292936934,
            "batch_time": 0.12148095699066812,
            "samples_per_second": 277123.0974292838,
            "samples_per_second_per_gpu": 138561.5487146419,
            "loss_sequences_lower_95": 1.2064820841471355,
            "loss_sequences_upper_95": 1.2526147644042969,
            "loss_tokens_lower_95": 1.1002734218687475,
            "loss_tokens_upper_95": 1.1769240332851891,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.717935453142439,
            "data_time": 0.03227985756737845,
            "batch_time": 0.1402847170829773,
            "samples_per_second": 261274.96626072057,
            "samples_per_second_per_gpu": 130637.48313036028,
            "loss_sequences_lower_95": 5.37361092703683,
            "loss_sequences_upper_95": 6.06921139671689,
            "loss_tokens_lower_95": 5.374022856212798,
            "loss_tokens_upper_95": 6.063965962727864,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.4957780539989471,
            "data_time": 0.1912643313407898,
            "batch_time": 0.3109029531478882,
            "samples_per_second": 171997.8702625126,
            "samples_per_second_per_gpu": 85998.9351312563,
            "loss_sequences_lower_95": 1.5317017167806626,
            "loss_sequences_upper_95": 2.138684940338135,
            "loss_tokens_lower_95": 1.274344060838837,
            "loss_tokens_upper_95": 1.6800919774635552,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.146900667190552,
            "data_time": 0.010182248428463936,
            "batch_time": 0.1216642688959837,
            "samples_per_second": 274355.05949654843,
            "samples_per_second_per_gpu": 137177.52974827422,
            "loss_sequences_lower_95": 7.154089758300781,
            "loss_sequences_upper_95": 7.519247766113281,
            "loss_tokens_lower_95": 6.945815714724778,
            "loss_tokens_upper_95": 7.265835803656409,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.118062695503235,
            "data_time": 0.009317513555288315,
            "batch_time": 0.1207001842558384,
            "samples_per_second": 276177.47208459384,
            "samples_per_second_per_gpu": 138088.73604229692,
            "loss_sequences_lower_95": 6.259909155273437,
            "loss_sequences_upper_95": 6.465250537109375,
            "loss_tokens_lower_95": 6.013786842372934,
            "loss_tokens_upper_95": 6.181088674026665,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7246402345005793,
            "data_time": 0.004647088811752644,
            "batch_time": 0.11768561601638794,
            "samples_per_second": 279014.0424759195,
            "samples_per_second_per_gpu": 139507.02123795974,
            "loss_sequences_lower_95": 3.6773197042354595,
            "loss_sequences_upper_95": 3.771628993332006,
            "loss_tokens_lower_95": 3.6760383547131665,
            "loss_tokens_upper_95": 3.771868256055146,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0236197634959185,
            "data_time": 0.012779485611688523,
            "batch_time": 0.12338872466768537,
            "samples_per_second": 273466.03142485576,
            "samples_per_second_per_gpu": 136733.01571242788,
            "loss_sequences_lower_95": 1.982975616689468,
            "loss_sequences_upper_95": 2.0663452945363505,
            "loss_tokens_lower_95": 1.9824718750750048,
            "loss_tokens_upper_95": 2.0654010309784825,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1105040974617006,
            "data_time": 0.009606990963220596,
            "batch_time": 0.12108461186289787,
            "samples_per_second": 274333.2038141907,
            "samples_per_second_per_gpu": 137166.60190709535,
            "loss_sequences_lower_95": 3.0082770080566408,
            "loss_sequences_upper_95": 3.219123748779297,
            "loss_tokens_lower_95": 3.004231311035156,
            "loss_tokens_upper_95": 3.213582611083984,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.8852587371134149,
            "data_time": 0.003235245579319058,
            "batch_time": 0.11691993601012446,
            "samples_per_second": 280697.36202427285,
            "samples_per_second_per_gpu": 140348.68101213643,
            "loss_sequences_lower_95": 2.5895918578524126,
            "loss_sequences_upper_95": 2.671383480479837,
            "loss_tokens_lower_95": 1.7320236226232244,
            "loss_tokens_upper_95": 1.7862863514695217,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.621398285698535,
            "data_time": 0.02531049648920695,
            "batch_time": 0.13207881980472141,
            "samples_per_second": 265460.86221804086,
            "samples_per_second_per_gpu": 132730.43110902043,
            "loss_sequences_lower_95": 1.5726098274117084,
            "loss_sequences_upper_95": 1.670400756152708,
            "loss_tokens_lower_95": 1.5720685361036613,
            "loss_tokens_upper_95": 1.67027456368973,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.6367755726271984,
            "data_time": 0.014344040304422379,
            "batch_time": 0.12834826484322548,
            "samples_per_second": 274241.7872672131,
            "samples_per_second_per_gpu": 137120.89363360655,
            "loss_sequences_lower_95": 1.6003787111768537,
            "loss_sequences_upper_95": 1.672261517094631,
            "loss_tokens_lower_95": 1.6009938797296264,
            "loss_tokens_upper_95": 1.6729142821068859,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.6523859119797566,
            "data_time": 0.003493205547332764,
            "batch_time": 0.11755242490768433,
            "samples_per_second": 279705.40942390414,
            "samples_per_second_per_gpu": 139852.70471195207,
            "loss_sequences_lower_95": 1.9743183757672553,
            "loss_sequences_upper_95": 2.0495255420698673,
            "loss_tokens_lower_95": 1.5651524643300674,
            "loss_tokens_upper_95": 1.6176601215939712,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.7138815594728662,
            "data_time": 0.03850275278091431,
            "batch_time": 0.1525078316529592,
            "samples_per_second": 259891.55992047707,
            "samples_per_second_per_gpu": 129945.77996023853,
            "loss_sequences_lower_95": 1.6262483384874131,
            "loss_sequences_upper_95": 1.7996675077569548,
            "loss_tokens_lower_95": 1.626072789752294,
            "loss_tokens_upper_95": 1.8000164334736173,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2842758674504924,
            "data_time": 0.004696032954651175,
            "batch_time": 0.11782178195934852,
            "samples_per_second": 278935.6752794335,
            "samples_per_second_per_gpu": 139467.83763971674,
            "loss_sequences_lower_95": 3.253338671576357,
            "loss_sequences_upper_95": 3.315046834683199,
            "loss_tokens_lower_95": 3.253385065761181,
            "loss_tokens_upper_95": 3.315494873793482,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.622256112908854,
            "data_time": 0.032375105789729526,
            "batch_time": 0.13844152007784163,
            "samples_per_second": 259883.30173130814,
            "samples_per_second_per_gpu": 129941.65086565407,
            "loss_sequences_lower_95": 1.5817217965727872,
            "loss_sequences_upper_95": 1.6642423055704358,
            "loss_tokens_lower_95": 1.5797320652933955,
            "loss_tokens_upper_95": 1.6645892541385392,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.317288613319397,
            "data_time": 0.10497850179672241,
            "batch_time": 0.2165655791759491,
            "samples_per_second": 216057.6816081457,
            "samples_per_second_per_gpu": 108028.84080407285,
            "loss_sequences_lower_95": 1.2085845692952473,
            "loss_sequences_upper_95": 1.5826758448282878,
            "loss_tokens_lower_95": 1.0912597550286187,
            "loss_tokens_upper_95": 1.5138543075985376,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.2207245806852975,
            "data_time": 0.14083698391914368,
            "batch_time": 0.25245076417922974,
            "samples_per_second": 212731.1050475682,
            "samples_per_second_per_gpu": 106365.5525237841,
            "loss_sequences_lower_95": 1.1590706094106038,
            "loss_sequences_upper_95": 1.4761724789937334,
            "loss_tokens_lower_95": 0.983081048258235,
            "loss_tokens_upper_95": 1.4691352715652979,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6341554534277263,
            "data_time": 0.004719126140764223,
            "batch_time": 0.11783780984632863,
            "samples_per_second": 279468.547066602,
            "samples_per_second_per_gpu": 139734.273533301,
            "loss_sequences_lower_95": 3.615235000632824,
            "loss_sequences_upper_95": 3.654064628589838,
            "loss_tokens_lower_95": 3.614621341126657,
            "loss_tokens_upper_95": 3.65299766574236,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.2819903550999077,
            "data_time": 0.002865056598995134,
            "batch_time": 0.11728137796765083,
            "samples_per_second": 279675.7426747127,
            "samples_per_second_per_gpu": 139837.87133735634,
            "loss_sequences_lower_95": 0.3394657235444591,
            "loss_sequences_upper_95": 0.3475106429818928,
            "loss_tokens_lower_95": 0.27484667723547407,
            "loss_tokens_upper_95": 0.28009538933475087,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.0374281810024593,
            "data_time": 0.0545089989900589,
            "batch_time": 0.1689894050359726,
            "samples_per_second": 252023.14878332545,
            "samples_per_second_per_gpu": 126011.57439166273,
            "loss_sequences_lower_95": 1.028208313589021,
            "loss_sequences_upper_95": 1.218058512529989,
            "loss_tokens_lower_95": 0.9633556616156964,
            "loss_tokens_upper_95": 1.0591897745678274,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.472313597395613,
            "data_time": 0.0914759635925293,
            "batch_time": 0.16705158352851868,
            "samples_per_second": 158419.34630451116,
            "samples_per_second_per_gpu": 79209.67315225558,
            "loss_sequences_lower_95": 3.1068442989040066,
            "loss_sequences_upper_95": 3.95539108482567,
            "loss_tokens_lower_95": 2.9051106582453223,
            "loss_tokens_upper_95": 4.001463186005016,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.9215564516986289,
            "data_time": 0.032190303007761635,
            "batch_time": 0.13226536909739176,
            "samples_per_second": 239809.18529073018,
            "samples_per_second_per_gpu": 119904.59264536509,
            "loss_sequences_lower_95": 0.9170010869095966,
            "loss_sequences_upper_95": 1.0724738632760398,
            "loss_tokens_lower_95": 0.8546333065305666,
            "loss_tokens_upper_95": 0.931081220596352,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.9609454743018965,
            "data_time": 0.03539040684700012,
            "batch_time": 0.13534754514694214,
            "samples_per_second": 239772.7313830028,
            "samples_per_second_per_gpu": 119886.3656915014,
            "loss_sequences_lower_95": 0.9904920717565024,
            "loss_sequences_upper_95": 1.133062092850848,
            "loss_tokens_lower_95": 0.8952682844063343,
            "loss_tokens_upper_95": 0.9609792446969918,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.9472617710508952,
            "data_time": 0.03713493545850118,
            "batch_time": 0.13808407386144003,
            "samples_per_second": 235720.16279537618,
            "samples_per_second_per_gpu": 117860.08139768809,
            "loss_sequences_lower_95": 0.8563027405157322,
            "loss_sequences_upper_95": 1.0213129113360147,
            "loss_tokens_lower_95": 0.8773400419124452,
            "loss_tokens_upper_95": 0.9790388935389637,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.0179584008164522,
            "data_time": 0.03472978870073954,
            "batch_time": 0.13535274068514505,
            "samples_per_second": 237405.59558170606,
            "samples_per_second_per_gpu": 118702.79779085303,
            "loss_sequences_lower_95": 1.0265470667583187,
            "loss_sequences_upper_95": 1.14922555597817,
            "loss_tokens_lower_95": 0.9558925616778318,
            "loss_tokens_upper_95": 1.0192215868988512,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.8923687001933223,
            "data_time": 0.03867036955697196,
            "batch_time": 0.14965367317199707,
            "samples_per_second": 251601.9330490948,
            "samples_per_second_per_gpu": 125800.9665245474,
            "loss_sequences_lower_95": 0.8756523369261938,
            "loss_sequences_upper_95": 0.9581676222522807,
            "loss_tokens_lower_95": 0.8636830830469435,
            "loss_tokens_upper_95": 0.9122160499574985,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.8339598204304532,
            "data_time": 0.033937801917394005,
            "batch_time": 0.1339630981286367,
            "samples_per_second": 239792.6786800148,
            "samples_per_second_per_gpu": 119896.3393400074,
            "loss_sequences_lower_95": 0.8836964118771438,
            "loss_sequences_upper_95": 0.977296996698147,
            "loss_tokens_lower_95": 0.789953570604468,
            "loss_tokens_upper_95": 0.8329408555723515,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-16.0/params.txt",
    "uuid": "1d84fe6b-746b-463f-b510-dc191aa39813",
    "creation_date": "2024_01_26-08_19_57"
}