{
    "name": "rpj-d=96_l=8_h=4-4.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 845544960,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 4.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "169108992",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=96_l=8_h=4-4.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 4.631616214911143,
            "data_time": 0.13656799495220184,
            "batch_time": 1.3140188306570053,
            "samples_per_second": 375262.97349186626,
            "samples_per_second_per_gpu": 46907.87168648328,
            "loss_sequences_lower_95": 4.554263394673665,
            "loss_sequences_upper_95": 4.706871019999186,
            "loss_tokens_lower_95": 4.617542444864909,
            "loss_tokens_upper_95": 4.6454641596476245,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.8231177076673335,
            "data_time": 0.018907178051154806,
            "batch_time": 0.0640747505911626,
            "samples_per_second": 4678170.719599438,
            "samples_per_second_per_gpu": 584771.3399499297,
            "loss_sequences_lower_95": 4.8207941325045,
            "loss_sequences_upper_95": 4.825395491785197,
            "loss_tokens_lower_95": 4.8113335,
            "loss_tokens_upper_95": 4.834664249999999,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.053770372332359,
            "data_time": 0.10029911994934082,
            "batch_time": 0.14511356502771378,
            "samples_per_second": 4158442.8570847814,
            "samples_per_second_per_gpu": 519805.3571355977,
            "loss_sequences_lower_95": 4.023253398038904,
            "loss_sequences_upper_95": 4.083600208516024,
            "loss_tokens_lower_95": 4.040948125,
            "loss_tokens_upper_95": 4.066470145833333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.684637446747613,
            "data_time": 0.014333190102326242,
            "batch_time": 0.05795164720008248,
            "samples_per_second": 5400834.479573074,
            "samples_per_second_per_gpu": 675104.3099466342,
            "loss_sequences_lower_95": 4.671321278994845,
            "loss_sequences_upper_95": 4.6980979180090205,
            "loss_tokens_lower_95": 4.673059958333334,
            "loss_tokens_upper_95": 4.696067177083333,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.802410957041195,
            "data_time": 0.10136017203330994,
            "batch_time": 0.1450401172041893,
            "samples_per_second": 4186077.1452866453,
            "samples_per_second_per_gpu": 523259.64316083066,
            "loss_sequences_lower_95": 4.768159786309827,
            "loss_sequences_upper_95": 4.835526003711335,
            "loss_tokens_lower_95": 4.7908174895833335,
            "loss_tokens_upper_95": 4.814035041666666,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.6716133768586054,
            "data_time": 0.03670665125052134,
            "batch_time": 0.07990216463804245,
            "samples_per_second": 4948491.869678825,
            "samples_per_second_per_gpu": 618561.4837098531,
            "loss_sequences_lower_95": 4.633362214732219,
            "loss_sequences_upper_95": 4.7087748140481605,
            "loss_tokens_lower_95": 4.659270239583334,
            "loss_tokens_upper_95": 4.683497510416666,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.4997063361381997,
            "data_time": 0.013474134355783462,
            "batch_time": 0.05577628910541534,
            "samples_per_second": 5266573.324515579,
            "samples_per_second_per_gpu": 658321.6655644474,
            "loss_sequences_lower_95": 3.4719253627232143,
            "loss_sequences_upper_95": 3.5270178970025508,
            "loss_tokens_lower_95": 3.486762375,
            "loss_tokens_upper_95": 3.5124335416666668,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.853609823856054,
            "data_time": 0.014060370231929579,
            "batch_time": 0.05739729106426239,
            "samples_per_second": 5359273.969708748,
            "samples_per_second_per_gpu": 669909.2462135935,
            "loss_sequences_lower_95": 4.844261125654451,
            "loss_sequences_upper_95": 4.8629573277977745,
            "loss_tokens_lower_95": 4.842399458333333,
            "loss_tokens_upper_95": 4.865196343749999,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.925267924138201,
            "data_time": 0.09756992757320404,
            "batch_time": 0.14231617003679276,
            "samples_per_second": 4172000.5766583956,
            "samples_per_second_per_gpu": 521500.07208229945,
            "loss_sequences_lower_95": 4.881095526470402,
            "loss_sequences_upper_95": 4.968120543937372,
            "loss_tokens_lower_95": 4.913429385416666,
            "loss_tokens_upper_95": 4.937316354166667,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.4287552211595616,
            "data_time": 0.0984892025589943,
            "batch_time": 0.14424169808626175,
            "samples_per_second": 4098396.726875712,
            "samples_per_second_per_gpu": 512299.590859464,
            "loss_sequences_lower_95": 5.3838140073029885,
            "loss_sequences_upper_95": 5.466532385302155,
            "loss_tokens_lower_95": 5.416701083333334,
            "loss_tokens_upper_95": 5.44040103125,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.871188615370535,
            "data_time": 0.010383997497887447,
            "batch_time": 0.05367874790882242,
            "samples_per_second": 5364695.406934659,
            "samples_per_second_per_gpu": 670586.9258668324,
            "loss_sequences_lower_95": 4.86377309114619,
            "loss_sequences_upper_95": 4.8785634159372,
            "loss_tokens_lower_95": 4.85920771875,
            "loss_tokens_upper_95": 4.88330475,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.802335888879689,
            "data_time": 0.02472110241651535,
            "batch_time": 0.0671206384897232,
            "samples_per_second": 5065245.459565461,
            "samples_per_second_per_gpu": 633155.6824456826,
            "loss_sequences_lower_95": 4.791397749310184,
            "loss_sequences_upper_95": 4.813266607663474,
            "loss_tokens_lower_95": 4.79017940625,
            "loss_tokens_upper_95": 4.81422671875,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.9532947656349045,
            "data_time": 0.10221771150827408,
            "batch_time": 0.19572240859270096,
            "samples_per_second": 4157233.802780619,
            "samples_per_second_per_gpu": 519654.22534757736,
            "loss_sequences_lower_95": 4.912996749413666,
            "loss_sequences_upper_95": 4.992256557627092,
            "loss_tokens_lower_95": 4.941884666666667,
            "loss_tokens_upper_95": 4.964799729166666,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.7383017433150965,
            "data_time": 0.10389160364866257,
            "batch_time": 0.153023399412632,
            "samples_per_second": 4229091.996604597,
            "samples_per_second_per_gpu": 528636.4995755746,
            "loss_sequences_lower_95": 4.6784178693037175,
            "loss_sequences_upper_95": 4.794406519499427,
            "loss_tokens_lower_95": 4.726136010416666,
            "loss_tokens_upper_95": 4.751161291666667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.682776418599215,
            "data_time": 0.15151692926883698,
            "batch_time": 0.17899459600448608,
            "samples_per_second": 975270.1667165618,
            "samples_per_second_per_gpu": 121908.77083957022,
            "loss_sequences_lower_95": 5.606152127005837,
            "loss_sequences_upper_95": 5.75357477014715,
            "loss_tokens_lower_95": 5.6596473867242985,
            "loss_tokens_upper_95": 5.705858369307084,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.366697331559206,
            "data_time": 0.09976036846637726,
            "batch_time": 0.13470372557640076,
            "samples_per_second": 3314296.1292907037,
            "samples_per_second_per_gpu": 414287.01616133796,
            "loss_sequences_lower_95": 4.257134200115593,
            "loss_sequences_upper_95": 4.477925345640488,
            "loss_tokens_lower_95": 4.3546431875,
            "loss_tokens_upper_95": 4.3788285,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.524046091416895,
            "data_time": 0.10079346597194672,
            "batch_time": 0.1372881457209587,
            "samples_per_second": 3743457.933804022,
            "samples_per_second_per_gpu": 467932.24172550277,
            "loss_sequences_lower_95": 6.463658798338556,
            "loss_sequences_upper_95": 6.578875265398252,
            "loss_tokens_lower_95": 6.512536552083334,
            "loss_tokens_upper_95": 6.535305458333333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.509965134448692,
            "data_time": 0.177312970161438,
            "batch_time": 0.20803894102573395,
            "samples_per_second": 2258860.651035128,
            "samples_per_second_per_gpu": 282357.581379391,
            "loss_sequences_lower_95": 5.420465400570729,
            "loss_sequences_upper_95": 5.578825428446786,
            "loss_tokens_lower_95": 5.496191531322041,
            "loss_tokens_upper_95": 5.523431946801358,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.629719187129751,
            "data_time": 0.027641531283205206,
            "batch_time": 0.07212187918749723,
            "samples_per_second": 4530229.780470069,
            "samples_per_second_per_gpu": 566278.7225587586,
            "loss_sequences_lower_95": 4.612682070974577,
            "loss_sequences_upper_95": 4.646095933738962,
            "loss_tokens_lower_95": 4.612773892329707,
            "loss_tokens_upper_95": 4.6462655768742875,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.669518045684869,
            "data_time": 0.02722260355949402,
            "batch_time": 0.07135043554008007,
            "samples_per_second": 4494211.071855529,
            "samples_per_second_per_gpu": 561776.3839819412,
            "loss_sequences_lower_95": 4.657470129362926,
            "loss_sequences_upper_95": 4.683720016555466,
            "loss_tokens_lower_95": 4.657723051902329,
            "loss_tokens_upper_95": 4.679864362186279,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.814733334515643,
            "data_time": 0.04844425949785444,
            "batch_time": 0.09043948517905341,
            "samples_per_second": 4438057.816489934,
            "samples_per_second_per_gpu": 554757.2270612417,
            "loss_sequences_lower_95": 7.242921720927315,
            "loss_sequences_upper_95": 7.504797518968469,
            "loss_tokens_lower_95": 6.6881761712234935,
            "loss_tokens_upper_95": 6.884208935511973,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.674809441248576,
            "data_time": 0.043150210132201515,
            "batch_time": 0.08684746424357097,
            "samples_per_second": 4591420.134964929,
            "samples_per_second_per_gpu": 573927.5168706161,
            "loss_sequences_lower_95": 7.059917171223958,
            "loss_sequences_upper_95": 7.2424625976562504,
            "loss_tokens_lower_95": 6.576876535475629,
            "loss_tokens_upper_95": 6.7042181603773585,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.747384638204278,
            "data_time": 0.0688410500685374,
            "batch_time": 0.10907808442910512,
            "samples_per_second": 4002471.9406198855,
            "samples_per_second_per_gpu": 500308.9925774857,
            "loss_sequences_lower_95": 4.850386886076099,
            "loss_sequences_upper_95": 4.919225528187784,
            "loss_tokens_lower_95": 4.721953029212242,
            "loss_tokens_upper_95": 4.756772749088504,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.5434328360991043,
            "data_time": 0.3609689772129059,
            "batch_time": 0.40373626351356506,
            "samples_per_second": 1935107.4210290804,
            "samples_per_second_per_gpu": 241888.42762863505,
            "loss_sequences_lower_95": 3.5530540188876065,
            "loss_sequences_upper_95": 3.68464824329723,
            "loss_tokens_lower_95": 3.512056790411166,
            "loss_tokens_upper_95": 3.5652553411266754,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.7926513769188706,
            "data_time": 0.3328106254339218,
            "batch_time": 0.3785061091184616,
            "samples_per_second": 2716064.3628618764,
            "samples_per_second_per_gpu": 339508.04535773455,
            "loss_sequences_lower_95": 4.850320957728795,
            "loss_sequences_upper_95": 5.0720187440210465,
            "loss_tokens_lower_95": 4.738042048387779,
            "loss_tokens_upper_95": 4.844761525685511,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.5679259109497075,
            "data_time": 0.18497613817453384,
            "batch_time": 0.21669849008321762,
            "samples_per_second": 2619136.849199726,
            "samples_per_second_per_gpu": 327392.1061499657,
            "loss_sequences_lower_95": 4.55745176188151,
            "loss_sequences_upper_95": 4.673211822509766,
            "loss_tokens_lower_95": 4.452778623602152,
            "loss_tokens_upper_95": 4.677991658176568,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.162147786191268,
            "data_time": 0.02449418343603611,
            "batch_time": 0.06872435547411442,
            "samples_per_second": 4518131.52893682,
            "samples_per_second_per_gpu": 564766.4411171025,
            "loss_sequences_lower_95": 9.256374390255154,
            "loss_sequences_upper_95": 9.331963028548053,
            "loss_tokens_lower_95": 9.103312464164556,
            "loss_tokens_upper_95": 9.183401731747814,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.669696358518569,
            "data_time": 0.045813530683517456,
            "batch_time": 0.08781123906373978,
            "samples_per_second": 4444525.383708354,
            "samples_per_second_per_gpu": 555565.6729635443,
            "loss_sequences_lower_95": 6.8020189895373,
            "loss_sequences_upper_95": 7.082878405638415,
            "loss_tokens_lower_95": 5.528118835504655,
            "loss_tokens_upper_95": 5.673005453139192,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.371594164558645,
            "data_time": 0.08006691634654999,
            "batch_time": 0.12227429747581482,
            "samples_per_second": 4383242.203399967,
            "samples_per_second_per_gpu": 547905.2754249959,
            "loss_sequences_lower_95": 6.092100696759013,
            "loss_sequences_upper_95": 6.404927891024957,
            "loss_tokens_lower_95": 5.266905576269563,
            "loss_tokens_upper_95": 5.433579632542755,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.177905272131097,
            "data_time": 0.3673690855503082,
            "batch_time": 0.40908992290496826,
            "samples_per_second": 2584082.57500056,
            "samples_per_second_per_gpu": 323010.32187507,
            "loss_sequences_lower_95": 5.109328512949486,
            "loss_sequences_upper_95": 5.246317266873573,
            "loss_tokens_lower_95": 5.111082730227954,
            "loss_tokens_upper_95": 5.244049824753853,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.909293036460877,
            "data_time": 0.3143030256032944,
            "batch_time": 0.3390995115041733,
            "samples_per_second": 1901922.6198489538,
            "samples_per_second_per_gpu": 237740.32748111922,
            "loss_sequences_lower_95": 4.82984107208252,
            "loss_sequences_upper_95": 5.284230560302735,
            "loss_tokens_lower_95": 4.646122618864602,
            "loss_tokens_upper_95": 5.142236076996451,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.25873464376436,
            "data_time": 0.05388688109815121,
            "batch_time": 0.09734771400690079,
            "samples_per_second": 4509122.8220047,
            "samples_per_second_per_gpu": 563640.3527505875,
            "loss_sequences_lower_95": 4.218848480880565,
            "loss_sequences_upper_95": 4.299131264193641,
            "loss_tokens_lower_95": 4.218365309841191,
            "loss_tokens_upper_95": 4.298899230644671,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.085615552125848,
            "data_time": 0.07753253877162933,
            "batch_time": 0.12104812860488892,
            "samples_per_second": 4430323.258516943,
            "samples_per_second_per_gpu": 553790.4073146179,
            "loss_sequences_lower_95": 5.0382210046516684,
            "loss_sequences_upper_95": 5.132608249673679,
            "loss_tokens_lower_95": 5.036931748198838,
            "loss_tokens_upper_95": 5.132930921199964,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.243835469713927,
            "data_time": 0.05915670096874237,
            "batch_time": 0.10067063011229038,
            "samples_per_second": 4097434.3719048123,
            "samples_per_second_per_gpu": 512179.29648810154,
            "loss_sequences_lower_95": 5.480301358261357,
            "loss_sequences_upper_95": 5.593899008135541,
            "loss_tokens_lower_95": 5.204485748626373,
            "loss_tokens_upper_95": 5.265366917037974,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.60355854511261,
            "data_time": 0.2009846419095993,
            "batch_time": 0.24611590057611465,
            "samples_per_second": 3441237.459398994,
            "samples_per_second_per_gpu": 430154.6824248743,
            "loss_sequences_lower_95": 7.265133435058594,
            "loss_sequences_upper_95": 7.769649633789063,
            "loss_tokens_lower_95": 6.37043138727457,
            "loss_tokens_upper_95": 6.719722086857834,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.3048374354839325,
            "data_time": 0.16891738772392273,
            "batch_time": 0.18628548085689545,
            "samples_per_second": 827763.0846109841,
            "samples_per_second_per_gpu": 103470.38557637301,
            "loss_sequences_lower_95": 5.001300847530365,
            "loss_sequences_upper_95": 5.747036004066467,
            "loss_tokens_lower_95": 4.746330787395609,
            "loss_tokens_upper_95": 5.64500529848296,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.612699741604684,
            "data_time": 0.35677386820316315,
            "batch_time": 0.3917820602655411,
            "samples_per_second": 2027094.167412303,
            "samples_per_second_per_gpu": 253386.77092653787,
            "loss_sequences_lower_95": 7.1400169986417925,
            "loss_sequences_upper_95": 7.921883190637348,
            "loss_tokens_lower_95": 5.268494822126903,
            "loss_tokens_upper_95": 5.7341536495032415,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.707676575661956,
            "data_time": 0.05161395503415002,
            "batch_time": 0.09597112238407135,
            "samples_per_second": 4556165.715567575,
            "samples_per_second_per_gpu": 569520.7144459469,
            "loss_sequences_lower_95": 5.685184173117355,
            "loss_sequences_upper_95": 5.7302664513877515,
            "loss_tokens_lower_95": 5.685490900775448,
            "loss_tokens_upper_95": 5.730203128413063,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.098020881373929,
            "data_time": 0.03372686320827121,
            "batch_time": 0.07698296932947069,
            "samples_per_second": 4452901.741176702,
            "samples_per_second_per_gpu": 556612.7176470878,
            "loss_sequences_lower_95": 6.226073375415413,
            "loss_sequences_upper_95": 6.445352733692752,
            "loss_tokens_lower_95": 5.958651950348044,
            "loss_tokens_upper_95": 6.176034166514764,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.34549385374719,
            "data_time": 0.1866299957036972,
            "batch_time": 0.21587198227643967,
            "samples_per_second": 1977719.2062618888,
            "samples_per_second_per_gpu": 247214.9007827361,
            "loss_sequences_lower_95": 4.27256583748283,
            "loss_sequences_upper_95": 4.6381484035170555,
            "loss_tokens_lower_95": 4.139489847724907,
            "loss_tokens_upper_95": 4.469241680247601,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.725315488034002,
            "data_time": 0.07423316538333893,
            "batch_time": 0.11914722025394439,
            "samples_per_second": 4451669.294877511,
            "samples_per_second_per_gpu": 556458.6618596889,
            "loss_sequences_lower_95": 4.778208474126875,
            "loss_sequences_upper_95": 4.916715058023382,
            "loss_tokens_lower_95": 4.644475174399059,
            "loss_tokens_upper_95": 4.799702713954026,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.612848197541585,
            "data_time": 0.358124703168869,
            "batch_time": 0.39258767664432526,
            "samples_per_second": 2356692.625400665,
            "samples_per_second_per_gpu": 294586.57817508315,
            "loss_sequences_lower_95": 4.405361063887433,
            "loss_sequences_upper_95": 4.84529261705352,
            "loss_tokens_lower_95": 4.42704004782448,
            "loss_tokens_upper_95": 4.825149421629664,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.362504462881979,
            "data_time": 0.02902007962469023,
            "batch_time": 0.07273054347142904,
            "samples_per_second": 4460057.753360116,
            "samples_per_second_per_gpu": 557507.2191700145,
            "loss_sequences_lower_95": 4.349539683327291,
            "loss_sequences_upper_95": 4.375639288013852,
            "loss_tokens_lower_95": 4.34925290722207,
            "loss_tokens_upper_95": 4.375542501078341,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.768160218174018,
            "data_time": 0.3466312885284424,
            "batch_time": 0.3725906163454056,
            "samples_per_second": 1335695.5486570229,
            "samples_per_second_per_gpu": 166961.94358212786,
            "loss_sequences_lower_95": 3.661479535149139,
            "loss_sequences_upper_95": 4.0274994414986915,
            "loss_tokens_lower_95": 3.5158303579996164,
            "loss_tokens_upper_95": 3.9182953724231546,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.780395793789838,
            "data_time": 0.023800301055113476,
            "batch_time": 0.06783492157856623,
            "samples_per_second": 4476280.4953016285,
            "samples_per_second_per_gpu": 559535.0619127036,
            "loss_sequences_lower_95": 6.574536317069575,
            "loss_sequences_upper_95": 6.623244285983359,
            "loss_tokens_lower_95": 5.68304960106383,
            "loss_tokens_upper_95": 5.7304492504835585,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.5711472826004025,
            "data_time": 0.1056654341518879,
            "batch_time": 0.15098649263381958,
            "samples_per_second": 4157723.4879085347,
            "samples_per_second_per_gpu": 519715.43598856684,
            "loss_sequences_lower_95": 5.639401245117188,
            "loss_sequences_upper_95": 5.86219697265625,
            "loss_tokens_lower_95": 5.45206232742109,
            "loss_tokens_upper_95": 5.6591616271359,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.459297209200652,
            "data_time": 0.3430445194244385,
            "batch_time": 0.3856290429830551,
            "samples_per_second": 2538349.791977596,
            "samples_per_second_per_gpu": 317293.7239971995,
            "loss_sequences_lower_95": 4.339823044486668,
            "loss_sequences_upper_95": 4.577612397567085,
            "loss_tokens_lower_95": 4.338540642779806,
            "loss_tokens_upper_95": 4.576817998471467,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.082593527345946,
            "data_time": 0.06919176131486893,
            "batch_time": 0.10905908793210983,
            "samples_per_second": 4027654.1819944917,
            "samples_per_second_per_gpu": 503456.77274931147,
            "loss_sequences_lower_95": 8.957018025716145,
            "loss_sequences_upper_95": 9.209286443536932,
            "loss_tokens_lower_95": 8.958319369229402,
            "loss_tokens_upper_95": 9.208568670099432,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.2881073179244997,
            "data_time": 0.06555203845103581,
            "batch_time": 0.10994301984707515,
            "samples_per_second": 4449829.3170495145,
            "samples_per_second_per_gpu": 556228.6646311893,
            "loss_sequences_lower_95": 2.442221122233073,
            "loss_sequences_upper_95": 2.5441439656575517,
            "loss_tokens_lower_95": 2.234621880002001,
            "loss_tokens_upper_95": 2.320140935280362,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.336023498716808,
            "data_time": 0.36497339606285095,
            "batch_time": 0.4067491441965103,
            "samples_per_second": 2704301.3718968476,
            "samples_per_second_per_gpu": 338037.67148710595,
            "loss_sequences_lower_95": 5.986705133347284,
            "loss_sequences_upper_95": 6.68674556187221,
            "loss_tokens_lower_95": 5.985017162504651,
            "loss_tokens_upper_95": 6.685269412086123,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.819000884890556,
            "data_time": 0.16396617889404297,
            "batch_time": 0.1834505945444107,
            "samples_per_second": 715029.6416231155,
            "samples_per_second_per_gpu": 89378.70520288944,
            "loss_sequences_lower_95": 4.495716333389282,
            "loss_sequences_upper_95": 5.771781158447266,
            "loss_tokens_lower_95": 4.2309342154276735,
            "loss_tokens_upper_95": 4.831271469273518,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.624796941280365,
            "data_time": 0.1009368859231472,
            "batch_time": 0.14534413814544678,
            "samples_per_second": 4126486.0585436183,
            "samples_per_second_per_gpu": 515810.7573179523,
            "loss_sequences_lower_95": 7.744752514648438,
            "loss_sequences_upper_95": 8.069460400390625,
            "loss_tokens_lower_95": 7.463326711380341,
            "loss_tokens_upper_95": 7.752514379924122,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.572881213665008,
            "data_time": 0.09839846566319466,
            "batch_time": 0.1425301767885685,
            "samples_per_second": 4327171.909193627,
            "samples_per_second_per_gpu": 540896.4886492033,
            "loss_sequences_lower_95": 7.854639025878906,
            "loss_sequences_upper_95": 8.129279956054688,
            "loss_tokens_lower_95": 7.45134716234508,
            "loss_tokens_upper_95": 7.660589272580746,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.417803657402578,
            "data_time": 0.04014997432629267,
            "batch_time": 0.08391742408275604,
            "samples_per_second": 4627947.297937018,
            "samples_per_second_per_gpu": 578493.4122421272,
            "loss_sequences_lower_95": 4.397789492017265,
            "loss_sequences_upper_95": 4.438222510567696,
            "loss_tokens_lower_95": 4.397365892073312,
            "loss_tokens_upper_95": 4.43801794151232,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.883510004357076,
            "data_time": 0.13122254113356271,
            "batch_time": 0.17144196728865305,
            "samples_per_second": 3842898.443949864,
            "samples_per_second_per_gpu": 480362.305493733,
            "loss_sequences_lower_95": 4.815541566535258,
            "loss_sequences_upper_95": 4.950870965116767,
            "loss_tokens_lower_95": 4.814513982019849,
            "loss_tokens_upper_95": 4.951472719254032,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 10.109966729164123,
            "data_time": 0.09465570747852325,
            "batch_time": 0.13944155350327492,
            "samples_per_second": 4373594.999755922,
            "samples_per_second_per_gpu": 546699.3749694902,
            "loss_sequences_lower_95": 10.061556103515624,
            "loss_sequences_upper_95": 10.1587287109375,
            "loss_tokens_lower_95": 10.061428540039064,
            "loss_tokens_upper_95": 10.157473364257813,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.190454858682465,
            "data_time": 0.02776302893956502,
            "batch_time": 0.07163508059013457,
            "samples_per_second": 4495010.679267796,
            "samples_per_second_per_gpu": 561876.3349084745,
            "loss_sequences_lower_95": 6.166476623477412,
            "loss_sequences_upper_95": 6.266839522232734,
            "loss_tokens_lower_95": 5.080895908093567,
            "loss_tokens_upper_95": 5.148585073702704,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.661928113716752,
            "data_time": 0.23277827671595983,
            "batch_time": 0.26568020241601126,
            "samples_per_second": 1974481.9862863738,
            "samples_per_second_per_gpu": 246810.24828579673,
            "loss_sequences_lower_95": 4.549174750029151,
            "loss_sequences_upper_95": 4.772616394953942,
            "loss_tokens_lower_95": 4.549108180715077,
            "loss_tokens_upper_95": 4.773432592135757,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.649724677964753,
            "data_time": 0.1829230859875679,
            "batch_time": 0.22879932075738907,
            "samples_per_second": 3729041.758503528,
            "samples_per_second_per_gpu": 466130.219812941,
            "loss_sequences_lower_95": 4.571081913966759,
            "loss_sequences_upper_95": 4.725950628542432,
            "loss_tokens_lower_95": 4.573850935393689,
            "loss_tokens_upper_95": 4.727256936465992,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.939466590109616,
            "data_time": 0.02934883162379265,
            "batch_time": 0.07283485727384686,
            "samples_per_second": 4465371.928082611,
            "samples_per_second_per_gpu": 558171.4910103264,
            "loss_sequences_lower_95": 6.8650867590700555,
            "loss_sequences_upper_95": 6.978806049280503,
            "loss_tokens_lower_95": 5.813314703092349,
            "loss_tokens_upper_95": 5.900686819481614,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.560208588050156,
            "data_time": 0.3429170250892639,
            "batch_time": 0.38025468587875366,
            "samples_per_second": 2003941.6213762583,
            "samples_per_second_per_gpu": 250492.7026720323,
            "loss_sequences_lower_95": 4.495491245814732,
            "loss_sequences_upper_95": 4.626024995026765,
            "loss_tokens_lower_95": 4.493427393676112,
            "loss_tokens_upper_95": 4.624048658905836,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.155693154816234,
            "data_time": 0.05061003565788269,
            "batch_time": 0.09475774489916287,
            "samples_per_second": 4318117.5630100155,
            "samples_per_second_per_gpu": 539764.6953762519,
            "loss_sequences_lower_95": 8.109865252293577,
            "loss_sequences_upper_95": 8.201881122180811,
            "loss_tokens_lower_95": 8.109064859279435,
            "loss_tokens_upper_95": 8.201294587394878,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.646121756544391,
            "data_time": 0.38028447329998016,
            "batch_time": 0.42006509006023407,
            "samples_per_second": 2602749.9377932083,
            "samples_per_second_per_gpu": 325343.74222415104,
            "loss_sequences_lower_95": 4.515610467114494,
            "loss_sequences_upper_95": 4.777336098383931,
            "loss_tokens_lower_95": 4.513427882518583,
            "loss_tokens_upper_95": 4.777727538173639,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.785463484128316,
            "data_time": 0.2968544661998749,
            "batch_time": 0.3173277974128723,
            "samples_per_second": 1421045.2223201839,
            "samples_per_second_per_gpu": 177630.65279002298,
            "loss_sequences_lower_95": 5.57270243326823,
            "loss_sequences_upper_95": 6.323483276367187,
            "loss_tokens_lower_95": 5.001017761230469,
            "loss_tokens_upper_95": 6.382206683688693,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.2014081080754595,
            "data_time": 0.30720245838165283,
            "batch_time": 0.3271874040365219,
            "samples_per_second": 1060058.3646905678,
            "samples_per_second_per_gpu": 132507.29558632098,
            "loss_sequences_lower_95": 5.148133239746094,
            "loss_sequences_upper_95": 6.0642292531331385,
            "loss_tokens_lower_95": 4.355159888106785,
            "loss_tokens_upper_95": 5.8280288524841986,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.431014824334932,
            "data_time": 0.04192811782870974,
            "batch_time": 0.08464629841702324,
            "samples_per_second": 4401569.85922431,
            "samples_per_second_per_gpu": 550196.2324030388,
            "loss_sequences_lower_95": 6.376560371410162,
            "loss_sequences_upper_95": 6.484599508698453,
            "loss_tokens_lower_95": 6.377172427282769,
            "loss_tokens_upper_95": 6.484569118763806,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.4822988866259705,
            "data_time": 0.02250585031561456,
            "batch_time": 0.06698534458745516,
            "samples_per_second": 4513670.317090994,
            "samples_per_second_per_gpu": 564208.7896363742,
            "loss_sequences_lower_95": 4.234574212605997,
            "loss_sequences_upper_95": 4.271852020094629,
            "loss_tokens_lower_95": 3.4054189966669024,
            "loss_tokens_upper_95": 3.434965094202002,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.6724491175704115,
            "data_time": 0.34912557899951935,
            "batch_time": 0.3799029141664505,
            "samples_per_second": 1413959.840297844,
            "samples_per_second_per_gpu": 176744.9800372305,
            "loss_sequences_lower_95": 3.6762819725697438,
            "loss_sequences_upper_95": 4.05881503848579,
            "loss_tokens_lower_95": 3.511029327100221,
            "loss_tokens_upper_95": 3.6997641833049313,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.892902851104736,
            "data_time": 0.22425635159015656,
            "batch_time": 0.24460367858409882,
            "samples_per_second": 1049453.5937700686,
            "samples_per_second_per_gpu": 131181.69922125858,
            "loss_sequences_lower_95": 5.509687980445655,
            "loss_sequences_upper_95": 6.33853451496846,
            "loss_tokens_lower_95": 5.338057247208961,
            "loss_tokens_upper_95": 6.290847071894893,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.5016075416309076,
            "data_time": 0.3795050084590912,
            "batch_time": 0.4146392196416855,
            "samples_per_second": 1724899.291720232,
            "samples_per_second_per_gpu": 215612.411465029,
            "loss_sequences_lower_95": 3.5425461187595273,
            "loss_sequences_upper_95": 3.8686345821473656,
            "loss_tokens_lower_95": 3.3744412157012196,
            "loss_tokens_upper_95": 3.530639020119323,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.56853302659058,
            "data_time": 0.3803575187921524,
            "batch_time": 0.41492360830307007,
            "samples_per_second": 1894346.2517671343,
            "samples_per_second_per_gpu": 236793.2814708918,
            "loss_sequences_lower_95": 3.6974372119438357,
            "loss_sequences_upper_95": 3.9828337134384526,
            "loss_tokens_lower_95": 3.4522858113382493,
            "loss_tokens_upper_95": 3.5853576495525243,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.634492253384939,
            "data_time": 0.3809186667203903,
            "batch_time": 0.4160964787006378,
            "samples_per_second": 1826277.964916612,
            "samples_per_second_per_gpu": 228284.7456145765,
            "loss_sequences_lower_95": 3.3849658407816072,
            "loss_sequences_upper_95": 3.7651277588634953,
            "loss_tokens_lower_95": 3.4946513688721965,
            "loss_tokens_upper_95": 3.7074558918334892,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.672623096442804,
            "data_time": 0.3504611551761627,
            "batch_time": 0.3854802995920181,
            "samples_per_second": 2323777.518893133,
            "samples_per_second_per_gpu": 290472.1898616416,
            "loss_sequences_lower_95": 3.8178658555193645,
            "loss_sequences_upper_95": 4.084355740430878,
            "loss_tokens_lower_95": 3.564842998944339,
            "loss_tokens_upper_95": 3.68538402902003,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.282478557610364,
            "data_time": 0.3504427522420883,
            "batch_time": 0.38584116101264954,
            "samples_per_second": 1808467.3265278381,
            "samples_per_second_per_gpu": 226058.41581597977,
            "loss_sequences_lower_95": 3.2908982270993064,
            "loss_sequences_upper_95": 3.4479444989506502,
            "loss_tokens_lower_95": 3.205807836264259,
            "loss_tokens_upper_95": 3.2982356294786483,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.7492129424723184,
            "data_time": 0.36026081442832947,
            "batch_time": 0.39562954008579254,
            "samples_per_second": 2153764.7879226343,
            "samples_per_second_per_gpu": 269220.5984903293,
            "loss_sequences_lower_95": 2.8740369517628737,
            "loss_sequences_upper_95": 3.061237656197897,
            "loss_tokens_lower_95": 2.6700076406155815,
            "loss_tokens_upper_95": 2.753407184768686,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-4.0/params.txt",
    "uuid": "772810a2-6994-45cc-acce-cb57d3b77413",
    "creation_date": "2023_12_14-05_57_35"
}