{
    "name": "rpj-d=96_l=8_h=4-16.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 3382179840,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 16.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "676435968",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=96_l=8_h=4-16.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 4.312730276584626,
            "data_time": 0.13566026091575623,
            "batch_time": 1.261986866593361,
            "samples_per_second": 380991.6302432515,
            "samples_per_second_per_gpu": 47623.95378040644,
            "loss_sequences_lower_95": 4.238061987559001,
            "loss_sequences_upper_95": 4.386037851969401,
            "loss_tokens_lower_95": 4.298850949605306,
            "loss_tokens_upper_95": 4.326362838745117,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.572292613487238,
            "data_time": 0.018963151238963756,
            "batch_time": 0.0641771515725945,
            "samples_per_second": 4678838.848708637,
            "samples_per_second_per_gpu": 584854.8560885796,
            "loss_sequences_lower_95": 4.569915528417981,
            "loss_sequences_upper_95": 4.574638433246274,
            "loss_tokens_lower_95": 4.56055290625,
            "loss_tokens_upper_95": 4.583861260416667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.710560512542725,
            "data_time": 0.0960085541009903,
            "batch_time": 0.1410822793841362,
            "samples_per_second": 4096573.061175081,
            "samples_per_second_per_gpu": 512071.6326468851,
            "loss_sequences_lower_95": 3.680781922632334,
            "loss_sequences_upper_95": 3.739964979522082,
            "loss_tokens_lower_95": 3.69806846875,
            "loss_tokens_upper_95": 3.72343009375,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.451343895627051,
            "data_time": 0.013404738746191325,
            "batch_time": 0.057400749702202644,
            "samples_per_second": 5370020.225924344,
            "samples_per_second_per_gpu": 671252.528240543,
            "loss_sequences_lower_95": 4.440333078286082,
            "loss_sequences_upper_95": 4.46240861589884,
            "loss_tokens_lower_95": 4.439882072916666,
            "loss_tokens_upper_95": 4.462698395833333,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.563551307452915,
            "data_time": 0.09854934364557266,
            "batch_time": 0.14277087897062302,
            "samples_per_second": 4177992.651112881,
            "samples_per_second_per_gpu": 522249.0813891101,
            "loss_sequences_lower_95": 4.5321431767916245,
            "loss_sequences_upper_95": 4.594852834417961,
            "loss_tokens_lower_95": 4.5518670312500005,
            "loss_tokens_upper_95": 4.57509278125,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.38530002428025,
            "data_time": 0.03485046327114105,
            "batch_time": 0.0777046059568723,
            "samples_per_second": 4925388.948183343,
            "samples_per_second_per_gpu": 615673.6185229179,
            "loss_sequences_lower_95": 4.345023746866503,
            "loss_sequences_upper_95": 4.4243746531244685,
            "loss_tokens_lower_95": 4.373267354166666,
            "loss_tokens_upper_95": 4.397066677083333,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.079109897467555,
            "data_time": 0.012235285341739654,
            "batch_time": 0.05470888167619705,
            "samples_per_second": 5252443.098630493,
            "samples_per_second_per_gpu": 656555.3873288116,
            "loss_sequences_lower_95": 3.051502854950574,
            "loss_sequences_upper_95": 3.1061620545679207,
            "loss_tokens_lower_95": 3.0664053229166663,
            "loss_tokens_upper_95": 3.091848208333333,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.680745183934716,
            "data_time": 0.015483205255709197,
            "batch_time": 0.05936361457172193,
            "samples_per_second": 5154076.636419687,
            "samples_per_second_per_gpu": 644259.5795524609,
            "loss_sequences_lower_95": 4.6728501206642665,
            "loss_sequences_upper_95": 4.6885481020942406,
            "loss_tokens_lower_95": 4.669510697916667,
            "loss_tokens_upper_95": 4.6921579375,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.691980076030018,
            "data_time": 0.09700016677379608,
            "batch_time": 0.14164821058511734,
            "samples_per_second": 3953851.3823596328,
            "samples_per_second_per_gpu": 494231.4227949541,
            "loss_sequences_lower_95": 4.6523608944280355,
            "loss_sequences_upper_95": 4.7317131476673655,
            "loss_tokens_lower_95": 4.6798895937500005,
            "loss_tokens_upper_95": 4.704092229166666,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.222496483165756,
            "data_time": 0.10096810758113861,
            "batch_time": 0.14636173844337463,
            "samples_per_second": 4151382.3377638245,
            "samples_per_second_per_gpu": 518922.79222047806,
            "loss_sequences_lower_95": 5.192490347760469,
            "loss_sequences_upper_95": 5.248871006419066,
            "loss_tokens_lower_95": 5.21033615625,
            "loss_tokens_upper_95": 5.234415177083333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.587203117172996,
            "data_time": 0.00997058430622364,
            "batch_time": 0.05335044449773328,
            "samples_per_second": 5397510.583517334,
            "samples_per_second_per_gpu": 674688.8229396668,
            "loss_sequences_lower_95": 4.579741647894683,
            "loss_sequences_upper_95": 4.594752124160614,
            "loss_tokens_lower_95": 4.575118677083333,
            "loss_tokens_upper_95": 4.59926446875,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.51686011627782,
            "data_time": 0.02262158691883087,
            "batch_time": 0.07742997258901596,
            "samples_per_second": 5038979.666921673,
            "samples_per_second_per_gpu": 629872.4583652092,
            "loss_sequences_lower_95": 4.507024193207257,
            "loss_sequences_upper_95": 4.5266962333402745,
            "loss_tokens_lower_95": 4.504981854166666,
            "loss_tokens_upper_95": 4.52863546875,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.762784815705088,
            "data_time": 0.10759256035089493,
            "batch_time": 0.179549440741539,
            "samples_per_second": 4200462.646064116,
            "samples_per_second_per_gpu": 525057.8307580145,
            "loss_sequences_lower_95": 4.730083708540663,
            "loss_sequences_upper_95": 4.7956419412794755,
            "loss_tokens_lower_95": 4.75130375,
            "loss_tokens_upper_95": 4.774389229166666,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.383076626025975,
            "data_time": 0.09375002980232239,
            "batch_time": 0.1388331726193428,
            "samples_per_second": 4118130.951348621,
            "samples_per_second_per_gpu": 514766.3689185776,
            "loss_sequences_lower_95": 4.322164960499936,
            "loss_sequences_upper_95": 4.440115835030549,
            "loss_tokens_lower_95": 4.37075509375,
            "loss_tokens_upper_95": 4.395814822916667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.378125353292986,
            "data_time": 0.148211270570755,
            "batch_time": 0.17171461880207062,
            "samples_per_second": 978333.2198324755,
            "samples_per_second_per_gpu": 122291.65247905944,
            "loss_sequences_lower_95": 5.2996301997791635,
            "loss_sequences_upper_95": 5.451072753559459,
            "loss_tokens_lower_95": 5.354399897835471,
            "loss_tokens_upper_95": 5.401776400479403,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.038038764681135,
            "data_time": 0.09686515480279922,
            "batch_time": 0.13147588819265366,
            "samples_per_second": 3331583.312526517,
            "samples_per_second_per_gpu": 416447.91406581464,
            "loss_sequences_lower_95": 3.9274444491105593,
            "loss_sequences_upper_95": 4.147271328139236,
            "loss_tokens_lower_95": 4.02582546875,
            "loss_tokens_upper_95": 4.050235427083333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.310267308771138,
            "data_time": 0.0971641093492508,
            "batch_time": 0.1334345042705536,
            "samples_per_second": 3692985.9605580573,
            "samples_per_second_per_gpu": 461623.24506975716,
            "loss_sequences_lower_95": 6.259815097295515,
            "loss_sequences_upper_95": 6.358099445755689,
            "loss_tokens_lower_95": 6.298797197916667,
            "loss_tokens_upper_95": 6.321684708333334,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.174899726617531,
            "data_time": 0.1580691635608673,
            "batch_time": 0.18676072359085083,
            "samples_per_second": 2311100.795811101,
            "samples_per_second_per_gpu": 288887.5994763876,
            "loss_sequences_lower_95": 5.115211274193936,
            "loss_sequences_upper_95": 5.229830407314613,
            "loss_tokens_lower_95": 5.1615146949643,
            "loss_tokens_upper_95": 5.188160230292649,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.9076900230209946,
            "data_time": 0.026426239447160202,
            "batch_time": 0.07112112045288085,
            "samples_per_second": 4491754.493823503,
            "samples_per_second_per_gpu": 561469.3117279379,
            "loss_sequences_lower_95": 4.890735091043832,
            "loss_sequences_upper_95": 4.92426345405747,
            "loss_tokens_lower_95": 4.890768542586526,
            "loss_tokens_upper_95": 4.92461147528397,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.412443559593804,
            "data_time": 0.028490459173917772,
            "batch_time": 0.07232965715229511,
            "samples_per_second": 4487517.936126004,
            "samples_per_second_per_gpu": 560939.7420157505,
            "loss_sequences_lower_95": 4.3952878087812435,
            "loss_sequences_upper_95": 4.421369097446973,
            "loss_tokens_lower_95": 4.4010244106847844,
            "loss_tokens_upper_95": 4.422863292265313,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.33940250354383,
            "data_time": 0.05271728336811066,
            "batch_time": 0.09473999175760481,
            "samples_per_second": 4300717.1422383785,
            "samples_per_second_per_gpu": 537589.6427797973,
            "loss_sequences_lower_95": 6.776430892403756,
            "loss_sequences_upper_95": 7.044301547923063,
            "loss_tokens_lower_95": 6.207364809964707,
            "loss_tokens_upper_95": 6.409710956579849,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.356910024483999,
            "data_time": 0.04098154604434967,
            "batch_time": 0.08524792020519574,
            "samples_per_second": 4504033.985380621,
            "samples_per_second_per_gpu": 563004.2481725776,
            "loss_sequences_lower_95": 6.777936767578125,
            "loss_sequences_upper_95": 6.970000439453125,
            "loss_tokens_lower_95": 6.256813494988207,
            "loss_tokens_upper_95": 6.3899741794418246,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.396757085726422,
            "data_time": 0.07335039228200912,
            "batch_time": 0.1135505959391594,
            "samples_per_second": 4052885.5154778366,
            "samples_per_second_per_gpu": 506610.6894347296,
            "loss_sequences_lower_95": 4.512931114584321,
            "loss_sequences_upper_95": 4.584651254649593,
            "loss_tokens_lower_95": 4.369529409866744,
            "loss_tokens_upper_95": 4.405152486549382,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.1920964230190623,
            "data_time": 0.33442290127277374,
            "batch_time": 0.37732453644275665,
            "samples_per_second": 2209573.099093195,
            "samples_per_second_per_gpu": 276196.6373866494,
            "loss_sequences_lower_95": 3.205769840587269,
            "loss_sequences_upper_95": 3.335593969171698,
            "loss_tokens_lower_95": 3.161398424115691,
            "loss_tokens_upper_95": 3.21308707516628,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.489337882217096,
            "data_time": 0.37129607796669006,
            "batch_time": 0.4179414361715317,
            "samples_per_second": 2112678.6519345464,
            "samples_per_second_per_gpu": 264084.8314918183,
            "loss_sequences_lower_95": 4.564287159199617,
            "loss_sequences_upper_95": 4.782932016800861,
            "loss_tokens_lower_95": 4.434377651330823,
            "loss_tokens_upper_95": 4.540992113447867,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.655756980578104,
            "data_time": 0.18539442867040634,
            "batch_time": 0.2170657217502594,
            "samples_per_second": 2509723.100852273,
            "samples_per_second_per_gpu": 313715.3876065341,
            "loss_sequences_lower_95": 4.64198476155599,
            "loss_sequences_upper_95": 4.7371005452473955,
            "loss_tokens_lower_95": 4.544508405197389,
            "loss_tokens_upper_95": 4.762010775685163,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.426608699483024,
            "data_time": 0.023896032385528086,
            "batch_time": 0.0682944530621171,
            "samples_per_second": 4510351.119218575,
            "samples_per_second_per_gpu": 563793.8899023219,
            "loss_sequences_lower_95": 8.519996747514886,
            "loss_sequences_upper_95": 8.595787224423011,
            "loss_tokens_lower_95": 8.367488088447846,
            "loss_tokens_upper_95": 8.447396181733481,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.488123976622366,
            "data_time": 0.047039668262004855,
            "batch_time": 0.08924199789762496,
            "samples_per_second": 4486088.650115262,
            "samples_per_second_per_gpu": 560761.0812644077,
            "loss_sequences_lower_95": 6.8016359887941915,
            "loss_sequences_upper_95": 7.116887009745897,
            "loss_tokens_lower_95": 5.339234806514674,
            "loss_tokens_upper_95": 5.4892997111113475,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.1218327580865335,
            "data_time": 0.08334488570690154,
            "batch_time": 0.12568075358867645,
            "samples_per_second": 4296144.196827464,
            "samples_per_second_per_gpu": 537018.024603433,
            "loss_sequences_lower_95": 5.952705950948566,
            "loss_sequences_upper_95": 6.30463975509279,
            "loss_tokens_lower_95": 5.0127139834674495,
            "loss_tokens_upper_95": 5.183363033564254,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.51327532938082,
            "data_time": 0.34314869344234467,
            "batch_time": 0.3844226896762848,
            "samples_per_second": 2107650.542326986,
            "samples_per_second_per_gpu": 263456.31779087323,
            "loss_sequences_lower_95": 6.444397293160495,
            "loss_sequences_upper_95": 6.581582488316924,
            "loss_tokens_lower_95": 6.4440149037260985,
            "loss_tokens_upper_95": 6.582617744898688,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.6378408765792845,
            "data_time": 0.29058682918548584,
            "batch_time": 0.31641924381256104,
            "samples_per_second": 1865540.5628112063,
            "samples_per_second_per_gpu": 233192.5703514008,
            "loss_sequences_lower_95": 4.5539478607177735,
            "loss_sequences_upper_95": 4.9835488357543944,
            "loss_tokens_lower_95": 4.3788085282382045,
            "loss_tokens_upper_95": 4.870398986488847,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.252773889133605,
            "data_time": 0.05381308123469353,
            "batch_time": 0.09701908007264137,
            "samples_per_second": 4480640.131952935,
            "samples_per_second_per_gpu": 560080.0164941169,
            "loss_sequences_lower_95": 5.209193056510683,
            "loss_sequences_upper_95": 5.296911033856832,
            "loss_tokens_lower_95": 5.208146750255885,
            "loss_tokens_upper_95": 5.296957812899821,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.4488340728503575,
            "data_time": 0.07644525766372681,
            "batch_time": 0.11956737041473389,
            "samples_per_second": 4456263.089742045,
            "samples_per_second_per_gpu": 557032.8862177556,
            "loss_sequences_lower_95": 5.403458838810401,
            "loss_sequences_upper_95": 5.493285073070229,
            "loss_tokens_lower_95": 5.403388965803517,
            "loss_tokens_upper_95": 5.493207511933098,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.989299461294181,
            "data_time": 0.057680606842041016,
            "batch_time": 0.09891591779887676,
            "samples_per_second": 4142259.0193557544,
            "samples_per_second_per_gpu": 517782.3774194693,
            "loss_sequences_lower_95": 5.239776020236841,
            "loss_sequences_upper_95": 5.357228528376632,
            "loss_tokens_lower_95": 4.9492800977496305,
            "loss_tokens_upper_95": 5.009830639534155,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.341838296890259,
            "data_time": 0.17694950103759766,
            "batch_time": 0.22201170027256012,
            "samples_per_second": 3887922.4684866583,
            "samples_per_second_per_gpu": 485990.3085608323,
            "loss_sequences_lower_95": 6.957816271972656,
            "loss_sequences_upper_95": 7.485911889648437,
            "loss_tokens_lower_95": 6.108383233242679,
            "loss_tokens_upper_95": 6.462905134482544,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.15290504693985,
            "data_time": 0.15159140527248383,
            "batch_time": 0.16833120584487915,
            "samples_per_second": 990246.264493732,
            "samples_per_second_per_gpu": 123780.7830617165,
            "loss_sequences_lower_95": 4.829578804969787,
            "loss_sequences_upper_95": 5.633632516860962,
            "loss_tokens_lower_95": 4.610269594740593,
            "loss_tokens_upper_95": 5.482981776094984,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.285372956045743,
            "data_time": 0.36514878273010254,
            "batch_time": 0.4005420356988907,
            "samples_per_second": 2283632.670264814,
            "samples_per_second_per_gpu": 285454.08378310176,
            "loss_sequences_lower_95": 6.920537733757633,
            "loss_sequences_upper_95": 7.779619642235766,
            "loss_tokens_lower_95": 4.93811113600532,
            "loss_tokens_upper_95": 5.412464053251832,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.84811511983409,
            "data_time": 0.049755255381266274,
            "batch_time": 0.09463110731707679,
            "samples_per_second": 4513570.601921706,
            "samples_per_second_per_gpu": 564196.3252402133,
            "loss_sequences_lower_95": 4.826660316237303,
            "loss_sequences_upper_95": 4.8693659937814004,
            "loss_tokens_lower_95": 4.82620190994976,
            "loss_tokens_upper_95": 4.8695041801482635,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.4836840632122374,
            "data_time": 0.03868153691291809,
            "batch_time": 0.0821388406412942,
            "samples_per_second": 4252689.922296669,
            "samples_per_second_per_gpu": 531586.2402870837,
            "loss_sequences_lower_95": 5.598785692951921,
            "loss_sequences_upper_95": 5.816614250421478,
            "loss_tokens_lower_95": 5.346352256026704,
            "loss_tokens_upper_95": 5.560066212575191,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.097895692992997,
            "data_time": 0.17714476585388184,
            "batch_time": 0.20661118626594543,
            "samples_per_second": 1772819.5031627482,
            "samples_per_second_per_gpu": 221602.43789534352,
            "loss_sequences_lower_95": 4.036542061047676,
            "loss_sequences_upper_95": 4.417628518129006,
            "loss_tokens_lower_95": 3.893933591039836,
            "loss_tokens_upper_95": 4.227435782659286,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.528636326447376,
            "data_time": 0.07555430829524994,
            "batch_time": 0.12037351727485657,
            "samples_per_second": 4414325.250064651,
            "samples_per_second_per_gpu": 551790.6562580813,
            "loss_sequences_lower_95": 4.567313616842196,
            "loss_sequences_upper_95": 4.70370400059195,
            "loss_tokens_lower_95": 4.446729246118045,
            "loss_tokens_upper_95": 4.604316721429022,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.194059299259651,
            "data_time": 0.29996271431446075,
            "batch_time": 0.334992378950119,
            "samples_per_second": 2215095.17945165,
            "samples_per_second_per_gpu": 276886.89743145625,
            "loss_sequences_lower_95": 4.035279846191406,
            "loss_sequences_upper_95": 4.5760281074337845,
            "loss_tokens_lower_95": 3.9935519195108586,
            "loss_tokens_upper_95": 4.399779218017179,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.342287737265853,
            "data_time": 0.027539816022292947,
            "batch_time": 0.07131546119163776,
            "samples_per_second": 4464960.170077626,
            "samples_per_second_per_gpu": 558120.0212597032,
            "loss_sequences_lower_95": 4.33365462154931,
            "loss_sequences_upper_95": 4.350860845997325,
            "loss_tokens_lower_95": 4.333875349288608,
            "loss_tokens_upper_95": 4.350845969975245,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.6509917025427217,
            "data_time": 0.31070078909397125,
            "batch_time": 0.3371979147195816,
            "samples_per_second": 1663395.1417552084,
            "samples_per_second_per_gpu": 207924.39271940105,
            "loss_sequences_lower_95": 2.5632960347295963,
            "loss_sequences_upper_95": 2.8803766861702633,
            "loss_tokens_lower_95": 2.439297557194377,
            "loss_tokens_upper_95": 2.78952831006709,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.062911501907952,
            "data_time": 0.0232375560204188,
            "batch_time": 0.06722699910402298,
            "samples_per_second": 4485846.550998679,
            "samples_per_second_per_gpu": 560730.8188748349,
            "loss_sequences_lower_95": 6.000398005519522,
            "loss_sequences_upper_95": 6.0521338300085175,
            "loss_tokens_lower_95": 4.956358655705996,
            "loss_tokens_upper_95": 5.007723682301741,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.300436039447784,
            "data_time": 0.0968223363161087,
            "batch_time": 0.14182570576667786,
            "samples_per_second": 4182426.9866048032,
            "samples_per_second_per_gpu": 522803.3733256004,
            "loss_sequences_lower_95": 6.360030236816407,
            "loss_sequences_upper_95": 6.616115625,
            "loss_tokens_lower_95": 6.164630065587539,
            "loss_tokens_upper_95": 6.399145979855472,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.054417840294216,
            "data_time": 0.32759685814380646,
            "batch_time": 0.37031184136867523,
            "samples_per_second": 2334311.612955966,
            "samples_per_second_per_gpu": 291788.95161949575,
            "loss_sequences_lower_95": 4.92065649944803,
            "loss_sequences_upper_95": 5.1862908404806385,
            "loss_tokens_lower_95": 4.922477655825407,
            "loss_tokens_upper_95": 5.185059734842052,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.219561829711452,
            "data_time": 0.06573306769132614,
            "batch_time": 0.10582248618205388,
            "samples_per_second": 4053684.1346441265,
            "samples_per_second_per_gpu": 506710.5168305158,
            "loss_sequences_lower_95": 8.114850241921165,
            "loss_sequences_upper_95": 8.322965642755682,
            "loss_tokens_lower_95": 8.115811009262549,
            "loss_tokens_upper_95": 8.32404844341856,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 1.5431462752024332,
            "data_time": 0.07126267502705257,
            "batch_time": 0.11551978190739949,
            "samples_per_second": 4463471.63394748,
            "samples_per_second_per_gpu": 557933.954243435,
            "loss_sequences_lower_95": 1.6758853149414061,
            "loss_sequences_upper_95": 1.7650064127604168,
            "loss_tokens_lower_95": 1.500566687612545,
            "loss_tokens_upper_95": 1.5702698657588037,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.447639329092843,
            "data_time": 0.375363364815712,
            "batch_time": 0.41637901961803436,
            "samples_per_second": 1780969.0224219777,
            "samples_per_second_per_gpu": 222621.1278027472,
            "loss_sequences_lower_95": 6.093703497023809,
            "loss_sequences_upper_95": 6.802976481119792,
            "loss_tokens_lower_95": 6.0886340477353045,
            "loss_tokens_upper_95": 6.805289190383185,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.8424307331442833,
            "data_time": 0.15715546905994415,
            "batch_time": 0.17410631477832794,
            "samples_per_second": 828689.6627409118,
            "samples_per_second_per_gpu": 103586.20784261398,
            "loss_sequences_lower_95": 3.552400267124176,
            "loss_sequences_upper_95": 4.88515498638153,
            "loss_tokens_lower_95": 3.2487897986972456,
            "loss_tokens_upper_95": 3.855486739640383,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.186714428424835,
            "data_time": 0.09716656431555748,
            "batch_time": 0.14162317663431168,
            "samples_per_second": 4369404.861234888,
            "samples_per_second_per_gpu": 546175.607654361,
            "loss_sequences_lower_95": 7.285929821777343,
            "loss_sequences_upper_95": 7.642210852050781,
            "loss_tokens_lower_95": 7.0153573747818845,
            "loss_tokens_upper_95": 7.326785428068158,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.568691989421844,
            "data_time": 0.10054944455623627,
            "batch_time": 0.14452646300196648,
            "samples_per_second": 4085951.0716644553,
            "samples_per_second_per_gpu": 510743.8839580569,
            "loss_sequences_lower_95": 7.794561987304688,
            "loss_sequences_upper_95": 8.054280676269531,
            "loss_tokens_lower_95": 7.446100620657549,
            "loss_tokens_upper_95": 7.662036401770624,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.3697432170488115,
            "data_time": 0.038203476617733635,
            "batch_time": 0.08201813325285912,
            "samples_per_second": 4513773.459250151,
            "samples_per_second_per_gpu": 564221.6824062689,
            "loss_sequences_lower_95": 4.354292103495328,
            "loss_sequences_upper_95": 4.384832565200196,
            "loss_tokens_lower_95": 4.354412283211636,
            "loss_tokens_upper_95": 4.384726429912734,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.125267241224532,
            "data_time": 0.11628055572509766,
            "batch_time": 0.15643120805422464,
            "samples_per_second": 4053260.5441133915,
            "samples_per_second_per_gpu": 506657.56801417394,
            "loss_sequences_lower_95": 5.03696762530302,
            "loss_sequences_upper_95": 5.2111351282732095,
            "loss_tokens_lower_95": 5.037747225197412,
            "loss_tokens_upper_95": 5.211436469434044,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.928837351799011,
            "data_time": 0.09312814846634865,
            "batch_time": 0.13733221590518951,
            "samples_per_second": 4198612.774225815,
            "samples_per_second_per_gpu": 524826.5967782269,
            "loss_sequences_lower_95": 9.871984423828124,
            "loss_sequences_upper_95": 9.9842365234375,
            "loss_tokens_lower_95": 9.872503051757812,
            "loss_tokens_upper_95": 9.984381958007813,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.506590136426062,
            "data_time": 0.03248005821591332,
            "batch_time": 0.0765736848115921,
            "samples_per_second": 4390052.040930657,
            "samples_per_second_per_gpu": 548756.5051163321,
            "loss_sequences_lower_95": 5.584026440175616,
            "loss_sequences_upper_95": 5.689385846588221,
            "loss_tokens_lower_95": 4.390757485576718,
            "loss_tokens_upper_95": 4.460902266601438,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.291998637256338,
            "data_time": 0.1921057871409825,
            "batch_time": 0.22523051500320435,
            "samples_per_second": 2062437.6124690361,
            "samples_per_second_per_gpu": 257804.70155862952,
            "loss_sequences_lower_95": 5.152962858285477,
            "loss_sequences_upper_95": 5.4277378025339615,
            "loss_tokens_lower_95": 5.150503676684935,
            "loss_tokens_upper_95": 5.428952994275448,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.30976323707431,
            "data_time": 0.1879439651966095,
            "batch_time": 0.23453839123249054,
            "samples_per_second": 3509012.4014457604,
            "samples_per_second_per_gpu": 438626.55018072005,
            "loss_sequences_lower_95": 5.203106378293505,
            "loss_sequences_upper_95": 5.413739851409313,
            "loss_tokens_lower_95": 5.206866993623621,
            "loss_tokens_upper_95": 5.411402899050246,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.229997961619407,
            "data_time": 0.028189407661557198,
            "batch_time": 0.07214060099795461,
            "samples_per_second": 4431734.8147596,
            "samples_per_second_per_gpu": 553966.85184495,
            "loss_sequences_lower_95": 6.277947009465427,
            "loss_sequences_upper_95": 6.394680480689434,
            "loss_tokens_lower_95": 5.101171130324999,
            "loss_tokens_upper_95": 5.18963842018917,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.392665045601981,
            "data_time": 0.3728073239326477,
            "batch_time": 0.41123881936073303,
            "samples_per_second": 2111790.4687253106,
            "samples_per_second_per_gpu": 263973.80859066383,
            "loss_sequences_lower_95": 4.347089680666646,
            "loss_sequences_upper_95": 4.438870360359313,
            "loss_tokens_lower_95": 4.347388890180638,
            "loss_tokens_upper_95": 4.437320446841931,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.51708236105216,
            "data_time": 0.045189040211530834,
            "batch_time": 0.08961486357908982,
            "samples_per_second": 4450913.509301712,
            "samples_per_second_per_gpu": 556364.188662714,
            "loss_sequences_lower_95": 7.495814518826452,
            "loss_sequences_upper_95": 7.538238514191513,
            "loss_tokens_lower_95": 7.4960166403860855,
            "loss_tokens_upper_95": 7.538093328913417,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.322268907306264,
            "data_time": 0.35033902525901794,
            "batch_time": 0.39011096954345703,
            "samples_per_second": 2163360.8749448652,
            "samples_per_second_per_gpu": 270420.10936810815,
            "loss_sequences_lower_95": 5.155787732763198,
            "loss_sequences_upper_95": 5.48436990386074,
            "loss_tokens_lower_95": 5.156652728330742,
            "loss_tokens_upper_95": 5.484963159653747,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.987459746996562,
            "data_time": 0.31847549974918365,
            "batch_time": 0.33918987214565277,
            "samples_per_second": 1009553.9258186618,
            "samples_per_second_per_gpu": 126194.24072733273,
            "loss_sequences_lower_95": 5.787188771565756,
            "loss_sequences_upper_95": 6.570617815653483,
            "loss_tokens_lower_95": 5.167953279283312,
            "loss_tokens_upper_95": 6.6324417856004505,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.919617255528768,
            "data_time": 0.2912362664937973,
            "batch_time": 0.3119939714670181,
            "samples_per_second": 1250750.9183188537,
            "samples_per_second_per_gpu": 156343.8647898567,
            "loss_sequences_lower_95": 4.874250615437825,
            "loss_sequences_upper_95": 5.824456596374511,
            "loss_tokens_lower_95": 4.007620102100159,
            "loss_tokens_upper_95": 5.562008726998661,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.705476431923867,
            "data_time": 0.04436569660902023,
            "batch_time": 0.08732686298234123,
            "samples_per_second": 4321691.487013245,
            "samples_per_second_per_gpu": 540211.4358766556,
            "loss_sequences_lower_95": 7.674485657906849,
            "loss_sequences_upper_95": 7.7367210367958394,
            "loss_tokens_lower_95": 7.674589786220545,
            "loss_tokens_upper_95": 7.736400597731039,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.1947900224549657,
            "data_time": 0.02239897931767343,
            "batch_time": 0.06686614518863145,
            "samples_per_second": 4525033.890645727,
            "samples_per_second_per_gpu": 565629.2363307158,
            "loss_sequences_lower_95": 2.8971341900922347,
            "loss_sequences_upper_95": 2.936307068434572,
            "loss_tokens_lower_95": 2.1330044680747604,
            "loss_tokens_upper_95": 2.1575357180747603,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.3500373156990593,
            "data_time": 0.31691768765449524,
            "batch_time": 0.34698717296123505,
            "samples_per_second": 1964330.3897640794,
            "samples_per_second_per_gpu": 245541.29872050992,
            "loss_sequences_lower_95": 3.391680661899837,
            "loss_sequences_upper_95": 3.7764314065768025,
            "loss_tokens_lower_95": 3.183241104429578,
            "loss_tokens_upper_95": 3.371837986098317,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.6065998850642025,
            "data_time": 0.22572548687458038,
            "batch_time": 0.24327877163887024,
            "samples_per_second": 1024245.2003135057,
            "samples_per_second_per_gpu": 128030.65003918821,
            "loss_sequences_lower_95": 5.216804050754856,
            "loss_sequences_upper_95": 6.126462688961544,
            "loss_tokens_lower_95": 5.019439179220318,
            "loss_tokens_upper_95": 6.073587120903863,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.163323280287952,
            "data_time": 0.39334775507450104,
            "batch_time": 0.428169846534729,
            "samples_per_second": 1437382.5477117596,
            "samples_per_second_per_gpu": 179672.81846396995,
            "loss_sequences_lower_95": 3.243427313827887,
            "loss_sequences_upper_95": 3.5808821515339178,
            "loss_tokens_lower_95": 3.037464050344302,
            "loss_tokens_upper_95": 3.1968207264507993,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.20287959168597,
            "data_time": 0.33995895087718964,
            "batch_time": 0.37440231442451477,
            "samples_per_second": 2157599.0573970606,
            "samples_per_second_per_gpu": 269699.88217463257,
            "loss_sequences_lower_95": 3.3625655104474323,
            "loss_sequences_upper_95": 3.6687039631169016,
            "loss_tokens_lower_95": 3.0893760605099976,
            "loss_tokens_upper_95": 3.2191460134444263,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.316960177770475,
            "data_time": 0.3656058758497238,
            "batch_time": 0.4000637084245682,
            "samples_per_second": 2291308.2767405408,
            "samples_per_second_per_gpu": 286413.5345925676,
            "loss_sequences_lower_95": 3.1195489883422853,
            "loss_sequences_upper_95": 3.500108132711271,
            "loss_tokens_lower_95": 3.1798368273006217,
            "loss_tokens_upper_95": 3.3932419656816055,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.2928604280076375,
            "data_time": 0.34831462800502777,
            "batch_time": 0.38292625546455383,
            "samples_per_second": 1682126.9567532013,
            "samples_per_second_per_gpu": 210265.86959415016,
            "loss_sequences_lower_95": 3.4666848066376477,
            "loss_sequences_upper_95": 3.7545427089784202,
            "loss_tokens_lower_95": 3.185985971611237,
            "loss_tokens_upper_95": 3.304099337854118,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.9178216057534545,
            "data_time": 0.3319431245326996,
            "batch_time": 0.3660774230957031,
            "samples_per_second": 2149088.3877164554,
            "samples_per_second_per_gpu": 268636.0484645569,
            "loss_sequences_lower_95": 2.9297739775284475,
            "loss_sequences_upper_95": 3.0826257504291417,
            "loss_tokens_lower_95": 2.8428801652986984,
            "loss_tokens_upper_95": 2.935818558743419,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.41124695975606,
            "data_time": 0.33012402057647705,
            "batch_time": 0.3650137633085251,
            "samples_per_second": 2301774.9607466017,
            "samples_per_second_per_gpu": 287721.8700933252,
            "loss_sequences_lower_95": 2.551677057219715,
            "loss_sequences_upper_95": 2.7372937086151867,
            "loss_tokens_lower_95": 2.3341686798334838,
            "loss_tokens_upper_95": 2.414766543286618,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/params.txt",
    "uuid": "1084ea36-5383-4043-8c42-00f5a786b48e",
    "creation_date": "2023_12_14-06_02_11"
}