{
    "name": "rpj-d=512_l=8_h=4-8.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 12626247680,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 8.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "2525249536",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=512_l=8_h=4-8.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.073348774512609,
            "data_time": 0.034581951797008514,
            "batch_time": 0.3465514965355396,
            "samples_per_second": 1736078.3922356206,
            "samples_per_second_per_gpu": 217009.79902945258,
            "loss_sequences_lower_95": 3.0026929219563803,
            "loss_sequences_upper_95": 3.139836730957031,
            "loss_tokens_lower_95": 3.061223659515381,
            "loss_tokens_upper_95": 3.0855529085795084,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5525051864251345,
            "data_time": 0.0014595485385960922,
            "batch_time": 0.015253593390838843,
            "samples_per_second": 2256171.3759097406,
            "samples_per_second_per_gpu": 282021.4219887176,
            "loss_sequences_lower_95": 3.5498965429005045,
            "loss_sequences_upper_95": 3.555037494926411,
            "loss_tokens_lower_95": 3.54162809375,
            "loss_tokens_upper_95": 3.5634867916666666,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.892356330034684,
            "data_time": 0.010224129676818848,
            "batch_time": 0.023778528213500977,
            "samples_per_second": 2224976.2066804813,
            "samples_per_second_per_gpu": 278122.02583506017,
            "loss_sequences_lower_95": 2.8668155467753507,
            "loss_sequences_upper_95": 2.9184062973333864,
            "loss_tokens_lower_95": 2.8807096666666667,
            "loss_tokens_upper_95": 2.904082328125,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4011874367527124,
            "data_time": 0.0015922846566689642,
            "batch_time": 0.014865292824412646,
            "samples_per_second": 2352967.025547421,
            "samples_per_second_per_gpu": 294120.87819342763,
            "loss_sequences_lower_95": 3.3893131745328606,
            "loss_sequences_upper_95": 3.4126363261114694,
            "loss_tokens_lower_95": 3.3900974635416667,
            "loss_tokens_upper_95": 3.411995921875,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5461635647868914,
            "data_time": 0.009934416805130552,
            "batch_time": 0.023744332362931088,
            "samples_per_second": 2193954.936488006,
            "samples_per_second_per_gpu": 274244.3670610007,
            "loss_sequences_lower_95": 3.5127115843980716,
            "loss_sequences_upper_95": 3.5785922234995544,
            "loss_tokens_lower_95": 3.5350024791666668,
            "loss_tokens_upper_95": 3.5569968125,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.326910519907865,
            "data_time": 0.00407425573338633,
            "batch_time": 0.017745404463747273,
            "samples_per_second": 2290510.3001465183,
            "samples_per_second_per_gpu": 286313.7875183148,
            "loss_sequences_lower_95": 3.2834149514307867,
            "loss_sequences_upper_95": 3.3705796190543214,
            "loss_tokens_lower_95": 3.31566390625,
            "loss_tokens_upper_95": 3.3382463072916666,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.9327240556843426,
            "data_time": 0.0015844676467372777,
            "batch_time": 0.01483983063970071,
            "samples_per_second": 2360155.1920486535,
            "samples_per_second_per_gpu": 295019.3990060817,
            "loss_sequences_lower_95": 1.909616270727041,
            "loss_sequences_upper_95": 1.9555674027423469,
            "loss_tokens_lower_95": 1.92232678125,
            "loss_tokens_upper_95": 1.9433093333333333,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.892980764299163,
            "data_time": 0.001574577621115346,
            "batch_time": 0.014819730330291327,
            "samples_per_second": 2361372.3060971405,
            "samples_per_second_per_gpu": 295171.53826214257,
            "loss_sequences_lower_95": 3.8839886800556283,
            "loss_sequences_upper_95": 3.9023664716950264,
            "loss_tokens_lower_95": 3.88202090625,
            "loss_tokens_upper_95": 3.90394615625,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7201258670992967,
            "data_time": 0.01236745384004381,
            "batch_time": 0.026324403664422413,
            "samples_per_second": 2174643.740009441,
            "samples_per_second_per_gpu": 271830.46750118013,
            "loss_sequences_lower_95": 3.679509815743299,
            "loss_sequences_upper_95": 3.765849335213018,
            "loss_tokens_lower_95": 3.708941625,
            "loss_tokens_upper_95": 3.7313771979166668,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.370168323102205,
            "data_time": 0.01055743359029293,
            "batch_time": 0.02451184019446373,
            "samples_per_second": 2209971.490857928,
            "samples_per_second_per_gpu": 276246.436357241,
            "loss_sequences_lower_95": 4.345038613783041,
            "loss_sequences_upper_95": 4.396567821879632,
            "loss_tokens_lower_95": 4.357883322916667,
            "loss_tokens_upper_95": 4.382629177083333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.476893836053182,
            "data_time": 0.0012897424548905202,
            "batch_time": 0.014841090754046117,
            "samples_per_second": 2316641.7195825055,
            "samples_per_second_per_gpu": 289580.2149478132,
            "loss_sequences_lower_95": 3.4684384154018773,
            "loss_sequences_upper_95": 3.485172931994741,
            "loss_tokens_lower_95": 3.4660416875,
            "loss_tokens_upper_95": 3.487714484375,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.404168967851149,
            "data_time": 0.0026502414706545407,
            "batch_time": 0.016089072136160336,
            "samples_per_second": 2327108.3358626934,
            "samples_per_second_per_gpu": 290888.5419828367,
            "loss_sequences_lower_95": 3.3936975062067236,
            "loss_sequences_upper_95": 3.414408140488338,
            "loss_tokens_lower_95": 3.3934168281250003,
            "loss_tokens_upper_95": 3.4147844427083336,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.868612886684178,
            "data_time": 0.010500786332744854,
            "batch_time": 0.024322548402627937,
            "samples_per_second": 2163872.7288013743,
            "samples_per_second_per_gpu": 270484.0911001718,
            "loss_sequences_lower_95": 3.8310177754678785,
            "loss_sequences_upper_95": 3.905924854704139,
            "loss_tokens_lower_95": 3.8572362499999997,
            "loss_tokens_upper_95": 3.8798476458333333,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.2670138826681008,
            "data_time": 0.010957694148637384,
            "batch_time": 0.025134156899623186,
            "samples_per_second": 2136546.3798667905,
            "samples_per_second_per_gpu": 267068.2974833488,
            "loss_sequences_lower_95": 3.2031257396317305,
            "loss_sequences_upper_95": 3.3286386649137603,
            "loss_tokens_lower_95": 3.2556728645833335,
            "loss_tokens_upper_95": 3.278343390625,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.191853398626501,
            "data_time": 0.08368804625102452,
            "batch_time": 0.10038525717599052,
            "samples_per_second": 954624.3743267272,
            "samples_per_second_per_gpu": 119328.0467908409,
            "loss_sequences_lower_95": 4.12917177026922,
            "loss_sequences_upper_95": 4.2545879190618345,
            "loss_tokens_lower_95": 4.171643222462047,
            "loss_tokens_upper_95": 4.2126880038868295,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.9413131531056442,
            "data_time": 0.014352830973538485,
            "batch_time": 0.028121001341126182,
            "samples_per_second": 2167317.097929345,
            "samples_per_second_per_gpu": 270914.6372411681,
            "loss_sequences_lower_95": 2.8417315246759975,
            "loss_sequences_upper_95": 3.0405489907667866,
            "loss_tokens_lower_95": 2.9303573177083333,
            "loss_tokens_upper_95": 2.952195192708333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.815058684286153,
            "data_time": 0.012968766192595163,
            "batch_time": 0.026707264284292858,
            "samples_per_second": 2204963.9064634535,
            "samples_per_second_per_gpu": 275620.4883079317,
            "loss_sequences_lower_95": 5.769965195467101,
            "loss_sequences_upper_95": 5.858393928306398,
            "loss_tokens_lower_95": 5.803734666666666,
            "loss_tokens_upper_95": 5.826288666666667,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7721477571080944,
            "data_time": 0.0379178561270237,
            "batch_time": 0.05299755558371544,
            "samples_per_second": 1775924.0054174825,
            "samples_per_second_per_gpu": 221990.5006771853,
            "loss_sequences_lower_95": 3.7084311125708407,
            "loss_sequences_upper_95": 3.860975703255075,
            "loss_tokens_lower_95": 3.7597052589791717,
            "loss_tokens_upper_95": 3.7844463723604793,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.43208375324841,
            "data_time": 0.001880279129078285,
            "batch_time": 0.015399234000262475,
            "samples_per_second": 2291840.399369588,
            "samples_per_second_per_gpu": 286480.0499211985,
            "loss_sequences_lower_95": 5.411031414684518,
            "loss_sequences_upper_95": 5.453518782380537,
            "loss_tokens_lower_95": 5.410691405137267,
            "loss_tokens_upper_95": 5.453716779349452,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.320139528650422,
            "data_time": 0.002077999767983795,
            "batch_time": 0.015722047561293195,
            "samples_per_second": 2270708.403914146,
            "samples_per_second_per_gpu": 283838.55048926827,
            "loss_sequences_lower_95": 3.307210730556662,
            "loss_sequences_upper_95": 3.332908454413339,
            "loss_tokens_lower_95": 3.30627018982081,
            "loss_tokens_upper_95": 3.3259964390351135,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.459266863316368,
            "data_time": 0.0032105715515251773,
            "batch_time": 0.01673553980936092,
            "samples_per_second": 2288594.7610576977,
            "samples_per_second_per_gpu": 286074.3451322122,
            "loss_sequences_lower_95": 4.71937466325431,
            "loss_sequences_upper_95": 5.020824457239756,
            "loss_tokens_lower_95": 3.9073519659689957,
            "loss_tokens_upper_95": 4.123463816230344,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.849388543774684,
            "data_time": 0.004224483003007605,
            "batch_time": 0.017736130730902896,
            "samples_per_second": 2275600.9636547016,
            "samples_per_second_per_gpu": 284450.1204568377,
            "loss_sequences_lower_95": 5.0027014322916665,
            "loss_sequences_upper_95": 5.217465120442708,
            "loss_tokens_lower_95": 4.510430412244497,
            "loss_tokens_upper_95": 4.658988318101415,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.242007106429614,
            "data_time": 0.0046777480687850564,
            "batch_time": 0.01843424835895521,
            "samples_per_second": 2230189.7835198618,
            "samples_per_second_per_gpu": 278773.7229399827,
            "loss_sequences_lower_95": 3.286909916366566,
            "loss_sequences_upper_95": 3.3492232663239196,
            "loss_tokens_lower_95": 3.144278508957668,
            "loss_tokens_upper_95": 3.175638475825186,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.317223549972881,
            "data_time": 0.023742477808679854,
            "batch_time": 0.0390971302986145,
            "samples_per_second": 1921526.6169458993,
            "samples_per_second_per_gpu": 240190.82711823742,
            "loss_sequences_lower_95": 2.297226826060902,
            "loss_sequences_upper_95": 2.4092487473921342,
            "loss_tokens_lower_95": 2.2481577679381233,
            "loss_tokens_upper_95": 2.2943353706351406,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5265891493583212,
            "data_time": 0.021611470729112625,
            "batch_time": 0.03603355959057808,
            "samples_per_second": 1959321.665231361,
            "samples_per_second_per_gpu": 244915.20815392013,
            "loss_sequences_lower_95": 3.5213557340660873,
            "loss_sequences_upper_95": 3.7220346754424423,
            "loss_tokens_lower_95": 3.388517363902759,
            "loss_tokens_upper_95": 3.4834273310553487,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.034023535251618,
            "data_time": 0.018433858186770707,
            "batch_time": 0.03218655708508614,
            "samples_per_second": 2043331.4902264446,
            "samples_per_second_per_gpu": 255416.43627830557,
            "loss_sequences_lower_95": 4.0040037740071615,
            "loss_sequences_upper_95": 4.118205790201823,
            "loss_tokens_lower_95": 3.8827520306826524,
            "loss_tokens_upper_95": 4.109365825865265,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.557468085013784,
            "data_time": 0.001666479042202642,
            "batch_time": 0.0151048064900599,
            "samples_per_second": 2303653.8998682643,
            "samples_per_second_per_gpu": 287956.73748353304,
            "loss_sequences_lower_95": 6.572862163832981,
            "loss_sequences_upper_95": 6.651528783339894,
            "loss_tokens_lower_95": 6.406745854287114,
            "loss_tokens_upper_95": 6.488769302612404,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.84795402176051,
            "data_time": 0.0029821605890389255,
            "batch_time": 0.0167331749560849,
            "samples_per_second": 2252561.1050778655,
            "samples_per_second_per_gpu": 281570.1381347332,
            "loss_sequences_lower_95": 5.422616423019256,
            "loss_sequences_upper_95": 5.736291719687105,
            "loss_tokens_lower_95": 4.06301317206304,
            "loss_tokens_upper_95": 4.205965901027475,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.385423696590365,
            "data_time": 0.005178496241569519,
            "batch_time": 0.018706068799302384,
            "samples_per_second": 2248171.731391676,
            "samples_per_second_per_gpu": 281021.4664239595,
            "loss_sequences_lower_95": 4.8570904103562285,
            "loss_sequences_upper_95": 5.210918210390892,
            "loss_tokens_lower_95": 3.9310410269693854,
            "loss_tokens_upper_95": 4.092933653308022,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.830263414339388,
            "data_time": 0.023949940289769853,
            "batch_time": 0.03807672432490757,
            "samples_per_second": 2004910.1871200537,
            "samples_per_second_per_gpu": 250613.7733900067,
            "loss_sequences_lower_95": 5.731066810921447,
            "loss_sequences_upper_95": 5.9273062370683505,
            "loss_tokens_lower_95": 5.732178863107341,
            "loss_tokens_upper_95": 5.926247514001855,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6254799723625184,
            "data_time": 0.052737148908468395,
            "batch_time": 0.06720101833343506,
            "samples_per_second": 1784779.7087647172,
            "samples_per_second_per_gpu": 223097.46359558965,
            "loss_sequences_lower_95": 3.495806350708008,
            "loss_sequences_upper_95": 3.8582655258178713,
            "loss_tokens_lower_95": 3.3248266667074295,
            "loss_tokens_upper_95": 3.784067149153762,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.5509323401485275,
            "data_time": 0.0034615035437367443,
            "batch_time": 0.01697050111181653,
            "samples_per_second": 2283615.400100624,
            "samples_per_second_per_gpu": 285451.925012578,
            "loss_sequences_lower_95": 5.503185897717822,
            "loss_sequences_upper_95": 5.598900430107312,
            "loss_tokens_lower_95": 5.502213895674737,
            "loss_tokens_upper_95": 5.599692962552776,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.895018779089176,
            "data_time": 0.005053364627707451,
            "batch_time": 0.01900598353312805,
            "samples_per_second": 2213873.4492339552,
            "samples_per_second_per_gpu": 276734.1811542444,
            "loss_sequences_lower_95": 5.833947833886799,
            "loss_sequences_upper_95": 5.955910652529305,
            "loss_tokens_lower_95": 5.831794237641406,
            "loss_tokens_upper_95": 5.956798186680999,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.866731008326268,
            "data_time": 0.0036424197594496395,
            "batch_time": 0.017345543041600006,
            "samples_per_second": 2243489.3329600445,
            "samples_per_second_per_gpu": 280436.16662000556,
            "loss_sequences_lower_95": 4.040295370307399,
            "loss_sequences_upper_95": 4.1700678878302675,
            "loss_tokens_lower_95": 3.6779809784819397,
            "loss_tokens_upper_95": 3.7361900192276405,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.7512201695442196,
            "data_time": 0.010789843276143074,
            "batch_time": 0.024954120628535748,
            "samples_per_second": 2113719.255022923,
            "samples_per_second_per_gpu": 264214.9068778654,
            "loss_sequences_lower_95": 5.949518408203125,
            "loss_sequences_upper_95": 6.521870812988281,
            "loss_tokens_lower_95": 5.0956211738914625,
            "loss_tokens_upper_95": 5.466123234302614,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.072874307632446,
            "data_time": 0.15917140245437622,
            "batch_time": 0.1771625131368637,
            "samples_per_second": 804625.0145714759,
            "samples_per_second_per_gpu": 100578.12682143449,
            "loss_sequences_lower_95": 3.8621192276477814,
            "loss_sequences_upper_95": 4.3639488339424135,
            "loss_tokens_lower_95": 3.6478736789747215,
            "loss_tokens_upper_95": 4.407346756156834,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.548088684164244,
            "data_time": 0.029802464424295627,
            "batch_time": 0.04481781036295789,
            "samples_per_second": 1735014.3865158586,
            "samples_per_second_per_gpu": 216876.79831448232,
            "loss_sequences_lower_95": 6.077900415179373,
            "loss_sequences_upper_95": 6.987070035386359,
            "loss_tokens_lower_95": 3.952298534440618,
            "loss_tokens_upper_95": 4.461705132347889,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.946870160446567,
            "data_time": 0.0030451083762778174,
            "batch_time": 0.016596596067150433,
            "samples_per_second": 2268784.602553125,
            "samples_per_second_per_gpu": 283598.0753191406,
            "loss_sequences_lower_95": 2.922747005464313,
            "loss_sequences_upper_95": 2.9711874758952463,
            "loss_tokens_lower_95": 2.9222530286663115,
            "loss_tokens_upper_95": 2.9711524120112496,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.218798612849864,
            "data_time": 0.0026200088734861767,
            "batch_time": 0.016057853695029753,
            "samples_per_second": 2302033.806517847,
            "samples_per_second_per_gpu": 287754.22581473086,
            "loss_sequences_lower_95": 3.18996455531062,
            "loss_sequences_upper_95": 3.354425810435062,
            "loss_tokens_lower_95": 3.0374218180368207,
            "loss_tokens_upper_95": 3.1984909385823004,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.424188105177967,
            "data_time": 0.01974195573065016,
            "batch_time": 0.03383765286869473,
            "samples_per_second": 1995534.202185637,
            "samples_per_second_per_gpu": 249441.77527320464,
            "loss_sequences_lower_95": 3.2727653587257466,
            "loss_sequences_upper_95": 3.681256349444826,
            "loss_tokens_lower_95": 3.1527784095220714,
            "loss_tokens_upper_95": 3.4552365792668827,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8199817464402464,
            "data_time": 0.004865531250834465,
            "batch_time": 0.01876369081437588,
            "samples_per_second": 2199772.792400524,
            "samples_per_second_per_gpu": 274971.5990500655,
            "loss_sequences_lower_95": 3.8566235332293313,
            "loss_sequences_upper_95": 4.00598750547245,
            "loss_tokens_lower_95": 3.6767597495377373,
            "loss_tokens_upper_95": 3.823309934574088,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.9857087280692123,
            "data_time": 0.03189360527765183,
            "batch_time": 0.04761276642481486,
            "samples_per_second": 1799038.101602289,
            "samples_per_second_per_gpu": 224879.7627002861,
            "loss_sequences_lower_95": 2.804084191671232,
            "loss_sequences_upper_95": 3.2663607713652816,
            "loss_tokens_lower_95": 2.7178485216946626,
            "loss_tokens_upper_95": 3.091213904625051,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.4255547655131915,
            "data_time": 0.002196473118975979,
            "batch_time": 0.015620403417434113,
            "samples_per_second": 2300733.1574576986,
            "samples_per_second_per_gpu": 287591.6446822123,
            "loss_sequences_lower_95": 4.411500249268604,
            "loss_sequences_upper_95": 4.439645516994023,
            "loss_tokens_lower_95": 4.411177674988123,
            "loss_tokens_upper_95": 4.439823179479646,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.195521435112629,
            "data_time": 0.04746575789018111,
            "batch_time": 0.06205019517378373,
            "samples_per_second": 1703710.9138106192,
            "samples_per_second_per_gpu": 212963.8642263274,
            "loss_sequences_lower_95": 1.1422345355876442,
            "loss_sequences_upper_95": 1.3080427632748501,
            "loss_tokens_lower_95": 1.0214485722896143,
            "loss_tokens_upper_95": 1.2508050143535696,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.8458366951009015,
            "data_time": 0.0016610663749546137,
            "batch_time": 0.015257482831456136,
            "samples_per_second": 2276605.745413534,
            "samples_per_second_per_gpu": 284575.71817669173,
            "loss_sequences_lower_95": 5.226937985210299,
            "loss_sequences_upper_95": 5.274703622494104,
            "loss_tokens_lower_95": 4.254814277079304,
            "loss_tokens_upper_95": 4.30104541827853,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.414528577804566,
            "data_time": 0.005978372361924913,
            "batch_time": 0.019900460091848222,
            "samples_per_second": 2187748.1654549357,
            "samples_per_second_per_gpu": 273468.52068186697,
            "loss_sequences_lower_95": 6.4130382934570305,
            "loss_sequences_upper_95": 6.647783117675781,
            "loss_tokens_lower_95": 6.168700133380462,
            "loss_tokens_upper_95": 6.395711735063806,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.086936780680781,
            "data_time": 0.02378634072966495,
            "batch_time": 0.03783624455080194,
            "samples_per_second": 2011601.0176444498,
            "samples_per_second_per_gpu": 251450.12720555623,
            "loss_sequences_lower_95": 5.901269902768343,
            "loss_sequences_upper_95": 6.274627022121264,
            "loss_tokens_lower_95": 5.905572098441746,
            "loss_tokens_upper_95": 6.268265380859375,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.739828543952017,
            "data_time": 0.004740218441170382,
            "batch_time": 0.01830184567405517,
            "samples_per_second": 2252058.316564872,
            "samples_per_second_per_gpu": 281507.289570609,
            "loss_sequences_lower_95": 5.7070218912760415,
            "loss_sequences_upper_95": 5.772622218276515,
            "loss_tokens_lower_95": 5.707017082445549,
            "loss_tokens_upper_95": 5.7717694646661934,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.1717220415671667,
            "data_time": 0.004263104276454195,
            "batch_time": 0.01799324281672214,
            "samples_per_second": 2247563.896317111,
            "samples_per_second_per_gpu": 280945.48703963886,
            "loss_sequences_lower_95": 1.2198855346679687,
            "loss_sequences_upper_95": 1.2894433146158852,
            "loss_tokens_lower_95": 1.0870281960440427,
            "loss_tokens_upper_95": 1.1480621799501052,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.331492494401478,
            "data_time": 0.02519374872956957,
            "batch_time": 0.03963081751550947,
            "samples_per_second": 1880119.5707711647,
            "samples_per_second_per_gpu": 235014.9463463956,
            "loss_sequences_lower_95": 5.976860613141741,
            "loss_sequences_upper_95": 6.687649841308593,
            "loss_tokens_lower_95": 5.9799366978236606,
            "loss_tokens_upper_95": 6.690140264601935,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.356735300272703,
            "data_time": 0.15140798687934875,
            "batch_time": 0.16794440150260925,
            "samples_per_second": 1043295.446718597,
            "samples_per_second_per_gpu": 130411.93083982462,
            "loss_sequences_lower_95": 2.142123430967331,
            "loss_sequences_upper_95": 3.1320371508598326,
            "loss_tokens_lower_95": 1.836164009644813,
            "loss_tokens_upper_95": 2.3491418457031252,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.634473600387573,
            "data_time": 0.0061332592888483925,
            "batch_time": 0.019748666929820226,
            "samples_per_second": 2243424.137052976,
            "samples_per_second_per_gpu": 280428.017131622,
            "loss_sequences_lower_95": 7.572042456054687,
            "loss_sequences_upper_95": 7.929088159179687,
            "loss_tokens_lower_95": 7.323911384801978,
            "loss_tokens_upper_95": 7.636815642019617,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.707405972957611,
            "data_time": 0.006072252988815308,
            "batch_time": 0.019716730193486288,
            "samples_per_second": 2241061.190646788,
            "samples_per_second_per_gpu": 280132.6488308485,
            "loss_sequences_lower_95": 6.764918530273438,
            "loss_sequences_upper_95": 6.9881116333007816,
            "loss_tokens_lower_95": 6.490888860145687,
            "loss_tokens_upper_95": 6.695120586152666,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.9391936138497075,
            "data_time": 0.004352628108251055,
            "batch_time": 0.0180820425218563,
            "samples_per_second": 2249345.941416257,
            "samples_per_second_per_gpu": 281168.24267703213,
            "loss_sequences_lower_95": 4.904063996108155,
            "loss_sequences_upper_95": 4.974405100582991,
            "loss_tokens_lower_95": 4.904022746736716,
            "loss_tokens_upper_95": 4.975344276747926,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.07796281420507,
            "data_time": 0.008434821471706978,
            "batch_time": 0.022094990191502874,
            "samples_per_second": 2205342.3079522103,
            "samples_per_second_per_gpu": 275667.7884940263,
            "loss_sequences_lower_95": 4.983624520719326,
            "loss_sequences_upper_95": 5.168973383046515,
            "loss_tokens_lower_95": 4.980720400480631,
            "loss_tokens_upper_95": 5.16888917140697,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.783718052864074,
            "data_time": 0.006476425462298923,
            "batch_time": 0.02048438787460327,
            "samples_per_second": 2197128.6970256204,
            "samples_per_second_per_gpu": 274641.08712820255,
            "loss_sequences_lower_95": 5.685245678710937,
            "loss_sequences_upper_95": 5.883173315429687,
            "loss_tokens_lower_95": 5.686792163085937,
            "loss_tokens_upper_95": 5.886814624023438,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.680111283379675,
            "data_time": 0.0023972247633906978,
            "batch_time": 0.016125236251629142,
            "samples_per_second": 2255686.292263064,
            "samples_per_second_per_gpu": 281960.786532883,
            "loss_sequences_lower_95": 4.285911251995625,
            "loss_sequences_upper_95": 4.393680088472682,
            "loss_tokens_lower_95": 2.9346089117753698,
            "loss_tokens_upper_95": 3.0068447271040397,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.867095539818949,
            "data_time": 0.01991839919771467,
            "batch_time": 0.033748143059866766,
            "samples_per_second": 2046738.2177208206,
            "samples_per_second_per_gpu": 255842.27721510257,
            "loss_sequences_lower_95": 5.679011672290404,
            "loss_sequences_upper_95": 6.05573847756457,
            "loss_tokens_lower_95": 5.682205951747609,
            "loss_tokens_upper_95": 6.053339032984492,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.7486760560204,
            "data_time": 0.01177592296153307,
            "batch_time": 0.025891277939081192,
            "samples_per_second": 2147599.1180420243,
            "samples_per_second_per_gpu": 268449.88975525304,
            "loss_sequences_lower_95": 5.621517417758119,
            "loss_sequences_upper_95": 5.8713836909275425,
            "loss_tokens_lower_95": 5.622363089767156,
            "loss_tokens_upper_95": 5.870647511201747,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9275251933466024,
            "data_time": 0.002393198537957701,
            "batch_time": 0.015931842505380375,
            "samples_per_second": 2292527.453457093,
            "samples_per_second_per_gpu": 286565.9316821366,
            "loss_sequences_lower_95": 4.308202770242234,
            "loss_sequences_upper_95": 4.40712510177878,
            "loss_tokens_lower_95": 3.3008187128808153,
            "loss_tokens_upper_95": 3.3801426467926277,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.015814198387994,
            "data_time": 0.02727794150511424,
            "batch_time": 0.041737332940101624,
            "samples_per_second": 1999898.6618955128,
            "samples_per_second_per_gpu": 249987.3327369391,
            "loss_sequences_lower_95": 4.883860358990058,
            "loss_sequences_upper_95": 5.140325427181506,
            "loss_tokens_lower_95": 4.885100535236339,
            "loss_tokens_upper_95": 5.1393311434952675,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.021260129056575,
            "data_time": 0.003543366006005815,
            "batch_time": 0.017470514381324853,
            "samples_per_second": 2245601.427668219,
            "samples_per_second_per_gpu": 280700.17845852737,
            "loss_sequences_lower_95": 4.981880733944954,
            "loss_sequences_upper_95": 5.059118824063456,
            "loss_tokens_lower_95": 4.983552417813456,
            "loss_tokens_upper_95": 5.058498005064984,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.969198642425167,
            "data_time": 0.024093721129677514,
            "batch_time": 0.03788537979125976,
            "samples_per_second": 1993478.4608077689,
            "samples_per_second_per_gpu": 249184.8076009711,
            "loss_sequences_lower_95": 5.7786078591948575,
            "loss_sequences_upper_95": 6.16008231153766,
            "loss_tokens_lower_95": 5.778237396536521,
            "loss_tokens_upper_95": 6.1590809646162015,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.4100432097911835,
            "data_time": 0.08254054188728333,
            "batch_time": 0.09776118397712708,
            "samples_per_second": 1389833.7117610301,
            "samples_per_second_per_gpu": 173729.21397012877,
            "loss_sequences_lower_95": 2.1565923627217614,
            "loss_sequences_upper_95": 2.768088893890381,
            "loss_tokens_lower_95": 1.9777541796366374,
            "loss_tokens_upper_95": 2.72891944249471,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.284062510728836,
            "data_time": 0.08160639554262161,
            "batch_time": 0.09721699357032776,
            "samples_per_second": 1330428.8944221572,
            "samples_per_second_per_gpu": 166303.61180276965,
            "loss_sequences_lower_95": 2.1263311958312987,
            "loss_sequences_upper_95": 2.7348210906982424,
            "loss_tokens_lower_95": 1.7651403491416673,
            "loss_tokens_upper_95": 2.575644842426428,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.76104563599307,
            "data_time": 0.0037520710617882703,
            "batch_time": 0.017269727971849898,
            "samples_per_second": 2277782.376958959,
            "samples_per_second_per_gpu": 284722.79711986985,
            "loss_sequences_lower_95": 3.7395886787785346,
            "loss_sequences_upper_95": 3.783423549107143,
            "loss_tokens_lower_95": 3.738981067056333,
            "loss_tokens_upper_95": 3.782819144652062,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 0.6274512755037603,
            "data_time": 0.001479995190379917,
            "batch_time": 0.01496406043808361,
            "samples_per_second": 2294333.6770030553,
            "samples_per_second_per_gpu": 286791.7096253819,
            "loss_sequences_lower_95": 0.7217980764196386,
            "loss_sequences_upper_95": 0.738260482269477,
            "loss_tokens_lower_95": 0.5199760723710966,
            "loss_tokens_upper_95": 0.5289628275630371,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.9109607747220618,
            "data_time": 0.03992172330617905,
            "batch_time": 0.05498325079679489,
            "samples_per_second": 1896294.3281850233,
            "samples_per_second_per_gpu": 237036.7910231279,
            "loss_sequences_lower_95": 1.8242207594743862,
            "loss_sequences_upper_95": 2.074359226977731,
            "loss_tokens_lower_95": 1.6928060105730702,
            "loss_tokens_upper_95": 1.8235616093155091,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.98046403801119,
            "data_time": 0.11605690774463472,
            "batch_time": 0.13232826051257907,
            "samples_per_second": 1076240.4555508336,
            "samples_per_second_per_gpu": 134530.0569438542,
            "loss_sequences_lower_95": 3.530814114132443,
            "loss_sequences_upper_95": 4.483938010963233,
            "loss_tokens_lower_95": 3.4130238379961177,
            "loss_tokens_upper_95": 4.464585829369816,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.7538248758490493,
            "data_time": 0.033266774245670865,
            "batch_time": 0.04769038870221093,
            "samples_per_second": 1916643.5519435955,
            "samples_per_second_per_gpu": 239580.44399294944,
            "loss_sequences_lower_95": 1.700998818002096,
            "loss_sequences_upper_95": 1.9169672803180973,
            "loss_tokens_lower_95": 1.5812321401584841,
            "loss_tokens_upper_95": 1.6865786104065916,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.8103734501978246,
            "data_time": 0.03246511164165679,
            "batch_time": 0.047310681570143925,
            "samples_per_second": 1864418.5896109403,
            "samples_per_second_per_gpu": 233052.32370136754,
            "loss_sequences_lower_95": 1.788610411853325,
            "loss_sequences_upper_95": 1.984810247653868,
            "loss_tokens_lower_95": 1.631144266809656,
            "loss_tokens_upper_95": 1.7201365595589744,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.7365264067562616,
            "data_time": 0.032086528482891265,
            "batch_time": 0.04681710402170817,
            "samples_per_second": 1889545.9145785798,
            "samples_per_second_per_gpu": 236193.23932232248,
            "loss_sequences_lower_95": 1.5971561106239875,
            "loss_sequences_upper_95": 1.837134779953375,
            "loss_tokens_lower_95": 1.6385036790967331,
            "loss_tokens_upper_95": 1.7777788018549634,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.883800002132974,
            "data_time": 0.03261173339117141,
            "batch_time": 0.04757381337029593,
            "samples_per_second": 1871900.8180848467,
            "samples_per_second_per_gpu": 233987.60226060584,
            "loss_sequences_lower_95": 1.857345399623964,
            "loss_sequences_upper_95": 2.040591239929199,
            "loss_tokens_lower_95": 1.706359706415194,
            "loss_tokens_upper_95": 1.7922035098447235,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.5514067315166782,
            "data_time": 0.0359269748499364,
            "batch_time": 0.05051370020265932,
            "samples_per_second": 1936187.939096475,
            "samples_per_second_per_gpu": 242023.49238705938,
            "loss_sequences_lower_95": 1.4977140959745607,
            "loss_sequences_upper_95": 1.6108187942031007,
            "loss_tokens_lower_95": 1.490145327581164,
            "loss_tokens_upper_95": 1.5543331983264477,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.420774741143715,
            "data_time": 0.03166530245826358,
            "batch_time": 0.04654058955964588,
            "samples_per_second": 1936562.0746147207,
            "samples_per_second_per_gpu": 242070.25932684008,
            "loss_sequences_lower_95": 1.4032619383276963,
            "loss_sequences_upper_95": 1.5277031503072598,
            "loss_tokens_lower_95": 1.2835282955893987,
            "loss_tokens_upper_95": 1.3385931230927892,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-8.0/params.txt",
    "uuid": "ff15e93d-2857-40ff-a4e6-dc1c6aabead5",
    "creation_date": "2023_12_14-06_34_34"
}