{
    "name": "rpj-d=512_l=8_h=4-4.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 6313123840,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 4.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "1262624768",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=512_l=8_h=4-4.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.1687608540058134,
            "data_time": 0.0345478393137455,
            "batch_time": 0.3510488085448742,
            "samples_per_second": 1714371.5115479496,
            "samples_per_second_per_gpu": 214296.4389434937,
            "loss_sequences_lower_95": 3.097598114013672,
            "loss_sequences_upper_95": 3.2356397946675615,
            "loss_tokens_lower_95": 3.1563562520345054,
            "loss_tokens_upper_95": 3.1811565717061363,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6344822539735,
            "data_time": 0.0013999370166313742,
            "batch_time": 0.015071331247246647,
            "samples_per_second": 2275257.0586798145,
            "samples_per_second_per_gpu": 284407.1323349768,
            "loss_sequences_lower_95": 3.631911101527665,
            "loss_sequences_upper_95": 3.637004019900234,
            "loss_tokens_lower_95": 3.6235074375000003,
            "loss_tokens_upper_95": 3.645532197916667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.952224299372459,
            "data_time": 0.010243830680847167,
            "batch_time": 0.02391408348083496,
            "samples_per_second": 2214606.757180911,
            "samples_per_second_per_gpu": 276825.8446476139,
            "loss_sequences_lower_95": 2.925379377092634,
            "loss_sequences_upper_95": 2.9789980394013074,
            "loss_tokens_lower_95": 2.9403865416666664,
            "loss_tokens_upper_95": 2.9641018645833332,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4756607857930293,
            "data_time": 0.0016105785769851583,
            "batch_time": 0.014859498037319434,
            "samples_per_second": 2354694.798614802,
            "samples_per_second_per_gpu": 294336.84982685023,
            "loss_sequences_lower_95": 3.462686704655283,
            "loss_sequences_upper_95": 3.4881637000644328,
            "loss_tokens_lower_95": 3.4645500208333333,
            "loss_tokens_upper_95": 3.4865702291666665,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.622356563867231,
            "data_time": 0.010234769122059126,
            "batch_time": 0.0242948522605744,
            "samples_per_second": 2152785.568941556,
            "samples_per_second_per_gpu": 269098.1961176945,
            "loss_sequences_lower_95": 3.587271167887204,
            "loss_sequences_upper_95": 3.6564374313820647,
            "loss_tokens_lower_95": 3.61129359375,
            "loss_tokens_upper_95": 3.6332100416666666,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4039754243548592,
            "data_time": 0.0041151344776153564,
            "batch_time": 0.01824584668097289,
            "samples_per_second": 2219052.7984336866,
            "samples_per_second_per_gpu": 277381.5998042108,
            "loss_sequences_lower_95": 3.3607336981777913,
            "loss_sequences_upper_95": 3.447206705950459,
            "loss_tokens_lower_95": 3.3926801979166665,
            "loss_tokens_upper_95": 3.4152934427083332,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.0135703622078407,
            "data_time": 0.0016107737142926912,
            "batch_time": 0.014898897675861153,
            "samples_per_second": 2352370.0733326175,
            "samples_per_second_per_gpu": 294046.2591665772,
            "loss_sequences_lower_95": 1.9902963269292093,
            "loss_sequences_upper_95": 2.0365646125637755,
            "loss_tokens_lower_95": 2.0031015364583333,
            "loss_tokens_upper_95": 2.0244773124999997,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9525358105205117,
            "data_time": 0.0016744614445180985,
            "batch_time": 0.015003926282345825,
            "samples_per_second": 2345859.617403747,
            "samples_per_second_per_gpu": 293232.4521754684,
            "loss_sequences_lower_95": 3.9434840375490836,
            "loss_sequences_upper_95": 3.961581325670812,
            "loss_tokens_lower_95": 3.941563052083333,
            "loss_tokens_upper_95": 3.9633734166666663,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.787562276289715,
            "data_time": 0.011460285338144454,
            "batch_time": 0.02537015127757239,
            "samples_per_second": 2219932.026732236,
            "samples_per_second_per_gpu": 277491.5033415295,
            "loss_sequences_lower_95": 3.7476287841796876,
            "loss_sequences_upper_95": 3.8320576613511497,
            "loss_tokens_lower_95": 3.7763540208333333,
            "loss_tokens_upper_95": 3.7989911770833333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.414281844150407,
            "data_time": 0.01118350587785244,
            "batch_time": 0.024959336034953594,
            "samples_per_second": 2228659.5821768763,
            "samples_per_second_per_gpu": 278582.44777210953,
            "loss_sequences_lower_95": 4.384842087817287,
            "loss_sequences_upper_95": 4.441065381920856,
            "loss_tokens_lower_95": 4.4022060312499995,
            "loss_tokens_upper_95": 4.426503020833333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5621319353637655,
            "data_time": 0.0012531645757330708,
            "batch_time": 0.014606078966704746,
            "samples_per_second": 2346578.5895530577,
            "samples_per_second_per_gpu": 293322.3236941322,
            "loss_sequences_lower_95": 3.553771014093977,
            "loss_sequences_upper_95": 3.570358725118199,
            "loss_tokens_lower_95": 3.5510022500000002,
            "loss_tokens_upper_95": 3.5732018645833334,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.495804090491934,
            "data_time": 0.003042673687454465,
            "batch_time": 0.018044792940773435,
            "samples_per_second": 2352215.237909996,
            "samples_per_second_per_gpu": 294026.9047387495,
            "loss_sequences_lower_95": 3.485330615824136,
            "loss_sequences_upper_95": 3.5061735344648066,
            "loss_tokens_lower_95": 3.4850995,
            "loss_tokens_upper_95": 3.506685291666667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9214922303360327,
            "data_time": 0.01010446397683366,
            "batch_time": 0.023903384039053333,
            "samples_per_second": 2176144.2307091225,
            "samples_per_second_per_gpu": 272018.0288386403,
            "loss_sequences_lower_95": 3.883252937363321,
            "loss_sequences_upper_95": 3.9595264984192764,
            "loss_tokens_lower_95": 3.9101887395833335,
            "loss_tokens_upper_95": 3.9327408020833334,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.3444582165622907,
            "data_time": 0.010628562524499171,
            "batch_time": 0.024404536205458925,
            "samples_per_second": 2193882.6604986596,
            "samples_per_second_per_gpu": 274235.33256233245,
            "loss_sequences_lower_95": 3.281026712017487,
            "loss_sequences_upper_95": 3.406080848087847,
            "loss_tokens_lower_95": 3.33293796875,
            "loss_tokens_upper_95": 3.3560057031249997,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.252804788676175,
            "data_time": 0.08885090691702706,
            "batch_time": 0.10558530262538365,
            "samples_per_second": 1125080.9337415234,
            "samples_per_second_per_gpu": 140635.11671769043,
            "loss_sequences_lower_95": 4.18697246204723,
            "loss_sequences_upper_95": 4.31906328201294,
            "loss_tokens_lower_95": 4.23209375034679,
            "loss_tokens_upper_95": 4.273906369642778,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.027635870800074,
            "data_time": 0.015181315216151152,
            "batch_time": 0.02919458801096136,
            "samples_per_second": 2121772.5965660955,
            "samples_per_second_per_gpu": 265221.57457076194,
            "loss_sequences_lower_95": 2.926770442150772,
            "loss_sequences_upper_95": 3.1283868461586644,
            "loss_tokens_lower_95": 3.016473463541667,
            "loss_tokens_upper_95": 3.0387145677083334,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.908768891659135,
            "data_time": 0.013583829005559286,
            "batch_time": 0.02763440211613973,
            "samples_per_second": 2177192.48419937,
            "samples_per_second_per_gpu": 272149.0605249212,
            "loss_sequences_lower_95": 5.862423601376979,
            "loss_sequences_upper_95": 5.952350127288094,
            "loss_tokens_lower_95": 5.897398083333333,
            "loss_tokens_upper_95": 5.92006375,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.893877226798261,
            "data_time": 0.03801567479968071,
            "batch_time": 0.052472785115242004,
            "samples_per_second": 1928935.4690480852,
            "samples_per_second_per_gpu": 241116.93363101064,
            "loss_sequences_lower_95": 3.8444117999467693,
            "loss_sequences_upper_95": 3.948978180181785,
            "loss_tokens_lower_95": 3.8811960126532883,
            "loss_tokens_upper_95": 3.9063312030229413,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.55691325275702,
            "data_time": 0.0019224527077685728,
            "batch_time": 0.015501847130019463,
            "samples_per_second": 2284580.049066742,
            "samples_per_second_per_gpu": 285572.50613334274,
            "loss_sequences_lower_95": 5.532498347591155,
            "loss_sequences_upper_95": 5.581405754833714,
            "loss_tokens_lower_95": 5.532547433035714,
            "loss_tokens_upper_95": 5.581062824361736,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.406880472195953,
            "data_time": 0.002196785371015026,
            "batch_time": 0.01584554662939849,
            "samples_per_second": 2267357.949800022,
            "samples_per_second_per_gpu": 283419.74372500274,
            "loss_sequences_lower_95": 3.3948937801079215,
            "loss_sequences_upper_95": 3.420725453096993,
            "loss_tokens_lower_95": 3.3921953730640957,
            "loss_tokens_upper_95": 3.4118869850898013,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.725403307826665,
            "data_time": 0.0035242508538594008,
            "batch_time": 0.017363320196944514,
            "samples_per_second": 2294982.1876037875,
            "samples_per_second_per_gpu": 286872.77345047344,
            "loss_sequences_lower_95": 4.979368283538026,
            "loss_sequences_upper_95": 5.281938483481931,
            "loss_tokens_lower_95": 4.175207865644358,
            "loss_tokens_upper_95": 4.391517320314362,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.087962139983972,
            "data_time": 0.004504002313664619,
            "batch_time": 0.01805966077966893,
            "samples_per_second": 2265861.8417997346,
            "samples_per_second_per_gpu": 283232.7302249668,
            "loss_sequences_lower_95": 5.2451763997395835,
            "loss_sequences_upper_95": 5.4493903564453126,
            "loss_tokens_lower_95": 4.726474044319968,
            "loss_tokens_upper_95": 4.869027712264151,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.3078350957548013,
            "data_time": 0.0049191003111872375,
            "batch_time": 0.018552735560260387,
            "samples_per_second": 2249808.435974808,
            "samples_per_second_per_gpu": 281226.054496851,
            "loss_sequences_lower_95": 3.352052845061955,
            "loss_sequences_upper_95": 3.414622403854127,
            "loss_tokens_lower_95": 3.210736742722147,
            "loss_tokens_upper_95": 3.242451729271529,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.385145517912778,
            "data_time": 0.022572032042912075,
            "batch_time": 0.038932546973228455,
            "samples_per_second": 2073410.9875922154,
            "samples_per_second_per_gpu": 259176.37344902693,
            "loss_sequences_lower_95": 2.364884206598455,
            "loss_sequences_upper_95": 2.478136284568093,
            "loss_tokens_lower_95": 2.3183158445026706,
            "loss_tokens_upper_95": 2.3649850764952633,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6114744877328677,
            "data_time": 0.021918101236224174,
            "batch_time": 0.036306802183389664,
            "samples_per_second": 1968739.5264056474,
            "samples_per_second_per_gpu": 246092.44080070592,
            "loss_sequences_lower_95": 3.6071886210538904,
            "loss_sequences_upper_95": 3.804335713289222,
            "loss_tokens_lower_95": 3.474076622799594,
            "loss_tokens_upper_95": 3.5690184918754233,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.871741618315379,
            "data_time": 0.016558780119969294,
            "batch_time": 0.030677191722087372,
            "samples_per_second": 2032215.9175269508,
            "samples_per_second_per_gpu": 254026.98969086885,
            "loss_sequences_lower_95": 3.8413695068359375,
            "loss_sequences_upper_95": 3.95372666422526,
            "loss_tokens_lower_95": 3.7274066033852558,
            "loss_tokens_upper_95": 3.9394350332441674,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.5480768569900905,
            "data_time": 0.0018333052218564559,
            "batch_time": 0.015481996860059272,
            "samples_per_second": 2268442.5232583745,
            "samples_per_second_per_gpu": 283555.3154072968,
            "loss_sequences_lower_95": 6.564728009971212,
            "loss_sequences_upper_95": 6.6382855751131835,
            "loss_tokens_lower_95": 6.4005050557838405,
            "loss_tokens_upper_95": 6.477661733569449,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.95689557337199,
            "data_time": 0.002895812660255688,
            "batch_time": 0.016581109306156236,
            "samples_per_second": 2258272.171687508,
            "samples_per_second_per_gpu": 282284.0214609385,
            "loss_sequences_lower_95": 5.537495293922295,
            "loss_sequences_upper_95": 5.848852682916404,
            "loss_tokens_lower_95": 4.170012601736348,
            "loss_tokens_upper_95": 4.3120916687067155,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.450064925610409,
            "data_time": 0.00513477623462677,
            "batch_time": 0.018765845411532634,
            "samples_per_second": 2241921.8481019908,
            "samples_per_second_per_gpu": 280240.23101274885,
            "loss_sequences_lower_95": 4.91868347584591,
            "loss_sequences_upper_95": 5.260899587377347,
            "loss_tokens_lower_95": 4.002501240004336,
            "loss_tokens_upper_95": 4.161320790539791,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.815471601268472,
            "data_time": 0.0250887040581022,
            "batch_time": 0.039180953587804525,
            "samples_per_second": 2048175.4161360373,
            "samples_per_second_per_gpu": 256021.92701700467,
            "loss_sequences_lower_95": 5.738151264626142,
            "loss_sequences_upper_95": 5.893449018643871,
            "loss_tokens_lower_95": 5.739484868855237,
            "loss_tokens_upper_95": 5.8910825407124,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.721998538970947,
            "data_time": 0.048478626287900485,
            "batch_time": 0.06364366182914147,
            "samples_per_second": 1661708.471960592,
            "samples_per_second_per_gpu": 207713.558995074,
            "loss_sequences_lower_95": 3.5854639434814453,
            "loss_sequences_upper_95": 3.9558006134033206,
            "loss_tokens_lower_95": 3.4088796707726545,
            "loss_tokens_upper_95": 3.875695451385006,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.565454111006511,
            "data_time": 0.0034856508602150137,
            "batch_time": 0.016936069861023948,
            "samples_per_second": 2290917.0263775284,
            "samples_per_second_per_gpu": 286364.62829719106,
            "loss_sequences_lower_95": 5.5127161531633835,
            "loss_sequences_upper_95": 5.619339685820752,
            "loss_tokens_lower_95": 5.511534957339112,
            "loss_tokens_upper_95": 5.6203253817289855,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.008139941459987,
            "data_time": 0.004793552551145071,
            "batch_time": 0.018577171966652303,
            "samples_per_second": 2225366.608158587,
            "samples_per_second_per_gpu": 278170.8260198234,
            "loss_sequences_lower_95": 5.9429499445734795,
            "loss_sequences_upper_95": 6.072109382998056,
            "loss_tokens_lower_95": 5.9405039294443585,
            "loss_tokens_upper_95": 6.074535382994855,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9542064470876417,
            "data_time": 0.0035185965811974544,
            "batch_time": 0.01721709104130129,
            "samples_per_second": 2237490.7775643608,
            "samples_per_second_per_gpu": 279686.3471955451,
            "loss_sequences_lower_95": 4.124908048777119,
            "loss_sequences_upper_95": 4.25316159452785,
            "loss_tokens_lower_95": 3.763586463365791,
            "loss_tokens_upper_95": 3.8220621917945383,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.75471975183487,
            "data_time": 0.010911586694419384,
            "batch_time": 0.025364574044942856,
            "samples_per_second": 2083036.8424605266,
            "samples_per_second_per_gpu": 260379.60530756583,
            "loss_sequences_lower_95": 5.966056115722656,
            "loss_sequences_upper_95": 6.509665759277343,
            "loss_tokens_lower_95": 5.13165173331765,
            "loss_tokens_upper_95": 5.496469913990114,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.114316940307617,
            "data_time": 0.15383024513721466,
            "batch_time": 0.17045296728610992,
            "samples_per_second": 973042.1288664321,
            "samples_per_second_per_gpu": 121630.26610830401,
            "loss_sequences_lower_95": 3.885265600681305,
            "loss_sequences_upper_95": 4.420074450969696,
            "loss_tokens_lower_95": 3.682567508741357,
            "loss_tokens_upper_95": 4.437012227376302,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.575971873327234,
            "data_time": 0.027343656154389075,
            "batch_time": 0.04125579874566261,
            "samples_per_second": 1889500.6002101984,
            "samples_per_second_per_gpu": 236187.5750262748,
            "loss_sequences_lower_95": 6.0854004607803525,
            "loss_sequences_upper_95": 6.954833072355424,
            "loss_tokens_lower_95": 3.9966757308805136,
            "loss_tokens_upper_95": 4.492356265303516,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5353089178842767,
            "data_time": 0.003067384577459759,
            "batch_time": 0.016792041146092944,
            "samples_per_second": 2246049.301747507,
            "samples_per_second_per_gpu": 280756.16271843837,
            "loss_sequences_lower_95": 3.515097036565845,
            "loss_sequences_upper_95": 3.5554497061779844,
            "loss_tokens_lower_95": 3.514790170252089,
            "loss_tokens_upper_95": 3.5553958277871067,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.3598061047630234,
            "data_time": 0.002894808258897443,
            "batch_time": 0.016385380871128473,
            "samples_per_second": 2295400.4987835344,
            "samples_per_second_per_gpu": 286925.0623479418,
            "loss_sequences_lower_95": 3.3306660144879197,
            "loss_sequences_upper_95": 3.5002207830996506,
            "loss_tokens_lower_95": 3.1712246140032585,
            "loss_tokens_upper_95": 3.3364634214506244,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.448386298867809,
            "data_time": 0.01835915942986806,
            "batch_time": 0.032441169023513794,
            "samples_per_second": 2013642.4583643717,
            "samples_per_second_per_gpu": 251705.30729554646,
            "loss_sequences_lower_95": 3.305477966755738,
            "loss_sequences_upper_95": 3.6970760359432235,
            "loss_tokens_lower_95": 3.1905122797539893,
            "loss_tokens_upper_95": 3.4875889667860345,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8612062045694246,
            "data_time": 0.005006304755806923,
            "batch_time": 0.018646826967597008,
            "samples_per_second": 2242560.0479045887,
            "samples_per_second_per_gpu": 280320.0059880736,
            "loss_sequences_lower_95": 3.892899843380032,
            "loss_sequences_upper_95": 4.042408363444652,
            "loss_tokens_lower_95": 3.7171962495009665,
            "loss_tokens_upper_95": 3.8634701111794,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.071975209364077,
            "data_time": 0.032576478662944976,
            "batch_time": 0.0471947363444737,
            "samples_per_second": 1919616.4589052578,
            "samples_per_second_per_gpu": 239952.05736315722,
            "loss_sequences_lower_95": 2.9134624667283964,
            "loss_sequences_upper_95": 3.385250352068645,
            "loss_tokens_lower_95": 2.811570284222701,
            "loss_tokens_upper_95": 3.1877376039887717,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.900633594755603,
            "data_time": 0.0021176402015396466,
            "batch_time": 0.015690562775551295,
            "samples_per_second": 2276617.72903628,
            "samples_per_second_per_gpu": 284577.216129535,
            "loss_sequences_lower_95": 4.887053406775105,
            "loss_sequences_upper_95": 4.914167364722945,
            "loss_tokens_lower_95": 4.887065743226771,
            "loss_tokens_upper_95": 4.914133744717693,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.2071991788530814,
            "data_time": 0.04566807313398882,
            "batch_time": 0.06118095571344549,
            "samples_per_second": 1595771.1945928887,
            "samples_per_second_per_gpu": 199471.3993241111,
            "loss_sequences_lower_95": 1.1562343615930057,
            "loss_sequences_upper_95": 1.3133611105020762,
            "loss_tokens_lower_95": 1.0330117263715854,
            "loss_tokens_upper_95": 1.2700332563511547,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.706780487864397,
            "data_time": 0.001574881068818827,
            "batch_time": 0.015046709041483544,
            "samples_per_second": 2296598.4637487587,
            "samples_per_second_per_gpu": 287074.80796859483,
            "loss_sequences_lower_95": 6.156249979526992,
            "loss_sequences_upper_95": 6.209685166077044,
            "loss_tokens_lower_95": 5.012386061411992,
            "loss_tokens_upper_95": 5.0647483196324945,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.762823852777481,
            "data_time": 0.005973395847138904,
            "batch_time": 0.019504874471634154,
            "samples_per_second": 2254595.812502025,
            "samples_per_second_per_gpu": 281824.47656275315,
            "loss_sequences_lower_95": 5.759655187988281,
            "loss_sequences_upper_95": 6.039048254394531,
            "loss_tokens_lower_95": 5.468056521670172,
            "loss_tokens_upper_95": 5.717119894005583,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.079572714929995,
            "data_time": 0.022672782510013905,
            "batch_time": 0.036545513039928375,
            "samples_per_second": 2051812.250857825,
            "samples_per_second_per_gpu": 256476.53135722812,
            "loss_sequences_lower_95": 5.881768851902175,
            "loss_sequences_upper_95": 6.277253510848335,
            "loss_tokens_lower_95": 5.880941984757134,
            "loss_tokens_upper_95": 6.274561130689538,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.731925190940048,
            "data_time": 0.0048459836517471865,
            "batch_time": 0.018491255231650478,
            "samples_per_second": 2250151.5395952202,
            "samples_per_second_per_gpu": 281268.94244940253,
            "loss_sequences_lower_95": 5.675059305826823,
            "loss_sequences_upper_95": 5.787255489464962,
            "loss_tokens_lower_95": 5.67411113392223,
            "loss_tokens_upper_95": 5.788035907167377,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.1632419023911158,
            "data_time": 0.0042445906933317795,
            "batch_time": 0.017669752874272936,
            "samples_per_second": 2290755.99491743,
            "samples_per_second_per_gpu": 286344.49936467875,
            "loss_sequences_lower_95": 1.2078812581380207,
            "loss_sequences_upper_95": 1.2694199157714843,
            "loss_tokens_lower_95": 1.0825066256971538,
            "loss_tokens_upper_95": 1.1438712477178372,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.775268835113162,
            "data_time": 0.025045205439840044,
            "batch_time": 0.040250963398388455,
            "samples_per_second": 1859996.4312666708,
            "samples_per_second_per_gpu": 232499.55390833385,
            "loss_sequences_lower_95": 5.450546061197917,
            "loss_sequences_upper_95": 6.099932672409784,
            "loss_tokens_lower_95": 5.453800746372767,
            "loss_tokens_upper_95": 6.104678926013765,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.5150885842740536,
            "data_time": 0.15902596712112427,
            "batch_time": 0.1773787885904312,
            "samples_per_second": 731390.8571209181,
            "samples_per_second_per_gpu": 91423.85714011476,
            "loss_sequences_lower_95": 2.331413996219635,
            "loss_sequences_upper_95": 3.475977349281311,
            "loss_tokens_lower_95": 1.952235944295667,
            "loss_tokens_upper_95": 2.4993502209850194,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.599992986679077,
            "data_time": 0.006226802629137796,
            "batch_time": 0.020078768332799275,
            "samples_per_second": 2205296.3299153917,
            "samples_per_second_per_gpu": 275662.04123942397,
            "loss_sequences_lower_95": 7.546964831542969,
            "loss_sequences_upper_95": 7.896565747070313,
            "loss_tokens_lower_95": 7.291008705616805,
            "loss_tokens_upper_95": 7.600379016960131,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.836861347198487,
            "data_time": 0.0058931415043179955,
            "batch_time": 0.01942740830164107,
            "samples_per_second": 2249087.52584034,
            "samples_per_second_per_gpu": 281135.9407300425,
            "loss_sequences_lower_95": 6.9097478515625,
            "loss_sequences_upper_95": 7.134986401367187,
            "loss_tokens_lower_95": 6.621606200805115,
            "loss_tokens_upper_95": 6.808145323453931,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.180763267407752,
            "data_time": 0.0038548656131910243,
            "batch_time": 0.017447167814375962,
            "samples_per_second": 2260603.815622225,
            "samples_per_second_per_gpu": 282575.4769527781,
            "loss_sequences_lower_95": 5.1463578523743925,
            "loss_sequences_upper_95": 5.2151212256825135,
            "loss_tokens_lower_95": 5.1472186956555905,
            "loss_tokens_upper_95": 5.214535476423693,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.487356489521384,
            "data_time": 0.009263875621323138,
            "batch_time": 0.022751958708748715,
            "samples_per_second": 2229915.523845398,
            "samples_per_second_per_gpu": 278739.44048067473,
            "loss_sequences_lower_95": 5.369896110851094,
            "loss_sequences_upper_95": 5.603897811959966,
            "loss_tokens_lower_95": 5.364111740651402,
            "loss_tokens_upper_95": 5.602480999408963,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.341219706535339,
            "data_time": 0.007365585319579594,
            "batch_time": 0.02090877436456226,
            "samples_per_second": 2256700.125014461,
            "samples_per_second_per_gpu": 282087.51562680764,
            "loss_sequences_lower_95": 7.267196838378906,
            "loss_sequences_upper_95": 7.416466235351562,
            "loss_tokens_lower_95": 7.267991052246094,
            "loss_tokens_upper_95": 7.4176591186523435,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.785675843526647,
            "data_time": 0.0022501917552407162,
            "batch_time": 0.01591415959630436,
            "samples_per_second": 2262555.1691198,
            "samples_per_second_per_gpu": 282819.396139975,
            "loss_sequences_lower_95": 4.416541684528737,
            "loss_sequences_upper_95": 4.526210142945837,
            "loss_tokens_lower_95": 3.018343020504522,
            "loss_tokens_upper_95": 3.0911868681482746,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.3488620181581865,
            "data_time": 0.0189471891948155,
            "batch_time": 0.03299616575241089,
            "samples_per_second": 2030765.3109868858,
            "samples_per_second_per_gpu": 253845.66387336073,
            "loss_sequences_lower_95": 6.136834762345499,
            "loss_sequences_upper_95": 6.555809704225455,
            "loss_tokens_lower_95": 6.140100029333314,
            "loss_tokens_upper_95": 6.553166528957993,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.292860018038282,
            "data_time": 0.010858635418117046,
            "batch_time": 0.025180591270327568,
            "samples_per_second": 2123083.338974593,
            "samples_per_second_per_gpu": 265385.41737182415,
            "loss_sequences_lower_95": 6.137141280828738,
            "loss_sequences_upper_95": 6.446061245787377,
            "loss_tokens_lower_95": 6.14031369676777,
            "loss_tokens_upper_95": 6.44260757745481,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.266233242351012,
            "data_time": 0.002116055093905007,
            "batch_time": 0.015635267917321127,
            "samples_per_second": 2284415.2567413733,
            "samples_per_second_per_gpu": 285551.90709267167,
            "loss_sequences_lower_95": 4.705437873841288,
            "loss_sequences_upper_95": 4.8101251047147064,
            "loss_tokens_lower_95": 3.563731563565458,
            "loss_tokens_upper_95": 3.6467348149310777,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.556187077174111,
            "data_time": 0.027128877739111584,
            "batch_time": 0.04105894764264425,
            "samples_per_second": 2049348.53161629,
            "samples_per_second_per_gpu": 256168.56645203626,
            "loss_sequences_lower_95": 5.431298828125,
            "loss_sequences_upper_95": 5.674977394386574,
            "loss_tokens_lower_95": 5.433101432033316,
            "loss_tokens_upper_95": 5.675097171843998,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.017540030814821,
            "data_time": 0.003926905956897107,
            "batch_time": 0.01737533631633344,
            "samples_per_second": 2284819.692584372,
            "samples_per_second_per_gpu": 285602.4615730465,
            "loss_sequences_lower_95": 5.984696100917431,
            "loss_sequences_upper_95": 6.049936090405199,
            "loss_tokens_lower_95": 5.985678472023126,
            "loss_tokens_upper_95": 6.049282226562499,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.213005984871133,
            "data_time": 0.025613193078474566,
            "batch_time": 0.0403974251313643,
            "samples_per_second": 1882659.2066303752,
            "samples_per_second_per_gpu": 235332.4008287969,
            "loss_sequences_lower_95": 5.998471158222087,
            "loss_sequences_upper_95": 6.425539079684656,
            "loss_tokens_lower_95": 5.996049529140436,
            "loss_tokens_upper_95": 6.429113562130233,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.207017058134079,
            "data_time": 0.0812845304608345,
            "batch_time": 0.09614594280719757,
            "samples_per_second": 1447228.0081140194,
            "samples_per_second_per_gpu": 180903.50101425243,
            "loss_sequences_lower_95": 3.853965721130371,
            "loss_sequences_upper_95": 4.776090545654297,
            "loss_tokens_lower_95": 3.4708261277940538,
            "loss_tokens_upper_95": 4.630585522121853,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6525023261706036,
            "data_time": 0.08540202677249908,
            "batch_time": 0.1001804918050766,
            "samples_per_second": 1394526.0411396413,
            "samples_per_second_per_gpu": 174315.75514245516,
            "loss_sequences_lower_95": 3.449626865386963,
            "loss_sequences_upper_95": 4.207792841593424,
            "loss_tokens_lower_95": 2.8424234840307343,
            "loss_tokens_upper_95": 3.96705954048071,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4369319094825035,
            "data_time": 0.0034935303194653365,
            "batch_time": 0.017229338124473352,
            "samples_per_second": 2237374.6923774523,
            "samples_per_second_per_gpu": 279671.83654718153,
            "loss_sequences_lower_95": 3.4141362599525955,
            "loss_sequences_upper_95": 3.4596803951122976,
            "loss_tokens_lower_95": 3.41446891683542,
            "loss_tokens_upper_95": 3.460172415776878,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 0.9008063948378876,
            "data_time": 0.0015417183841083453,
            "batch_time": 0.015136547777924026,
            "samples_per_second": 2273729.55712434,
            "samples_per_second_per_gpu": 284216.1946405425,
            "loss_sequences_lower_95": 1.0690903293265532,
            "loss_sequences_upper_95": 1.094026895838854,
            "loss_tokens_lower_95": 0.7167728313315592,
            "loss_tokens_upper_95": 0.7294558643619373,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.0176585236872273,
            "data_time": 0.04113495349884033,
            "batch_time": 0.056400448083877563,
            "samples_per_second": 1909322.6376660096,
            "samples_per_second_per_gpu": 238665.3297082512,
            "loss_sequences_lower_95": 1.9389521381047767,
            "loss_sequences_upper_95": 2.1978971646526664,
            "loss_tokens_lower_95": 1.7997103296372206,
            "loss_tokens_upper_95": 1.9368220075112073,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6951936837789177,
            "data_time": 0.12664897101266043,
            "batch_time": 0.14235225177946545,
            "samples_per_second": 1031498.4663833523,
            "samples_per_second_per_gpu": 128937.30829791904,
            "loss_sequences_lower_95": 3.3067189242388753,
            "loss_sequences_upper_95": 4.127965277594489,
            "loss_tokens_lower_95": 3.2002447057653356,
            "loss_tokens_upper_95": 4.123360000421972,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.8738091737031937,
            "data_time": 0.03490609498251052,
            "batch_time": 0.04967484303883144,
            "samples_per_second": 1908491.7026295583,
            "samples_per_second_per_gpu": 238561.46282869478,
            "loss_sequences_lower_95": 1.8170276641845704,
            "loss_sequences_upper_95": 2.0376186789535895,
            "loss_tokens_lower_95": 1.6933458230393188,
            "loss_tokens_upper_95": 1.8041340707790159,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.9303619032952843,
            "data_time": 0.03382685922441028,
            "batch_time": 0.04901507071086338,
            "samples_per_second": 1875244.811035117,
            "samples_per_second_per_gpu": 234405.6013793896,
            "loss_sequences_lower_95": 1.9035196676486876,
            "loss_sequences_upper_95": 2.108508337997809,
            "loss_tokens_lower_95": 1.7406932774889687,
            "loss_tokens_upper_95": 1.8346572691661414,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.8645789412463583,
            "data_time": 0.03339363279796782,
            "batch_time": 0.047606635661352245,
            "samples_per_second": 1973336.5519580257,
            "samples_per_second_per_gpu": 246667.0689947532,
            "loss_sequences_lower_95": 1.723982503937512,
            "loss_sequences_upper_95": 1.9713342620105279,
            "loss_tokens_lower_95": 1.7581105438987026,
            "loss_tokens_upper_95": 1.901714414011959,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.019205582578008,
            "data_time": 0.03321804319109235,
            "batch_time": 0.04773487931206113,
            "samples_per_second": 1903874.0587570295,
            "samples_per_second_per_gpu": 237984.25734462868,
            "loss_sequences_lower_95": 1.9872417682554662,
            "loss_sequences_upper_95": 2.1830854183290063,
            "loss_tokens_lower_95": 1.8294813684956677,
            "loss_tokens_upper_95": 1.9187894399292371,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.6261022705469073,
            "data_time": 0.03282475177152657,
            "batch_time": 0.0480039178589244,
            "samples_per_second": 1897810.7229554458,
            "samples_per_second_per_gpu": 237226.34036943072,
            "loss_sequences_lower_95": 1.572341333709148,
            "loss_sequences_upper_95": 1.6890420570136597,
            "loss_tokens_lower_95": 1.5662505683999342,
            "loss_tokens_upper_95": 1.6336669536277053,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.5195683887818965,
            "data_time": 0.03290140344983056,
            "batch_time": 0.04747226124718076,
            "samples_per_second": 1987985.418789615,
            "samples_per_second_per_gpu": 248498.17734870187,
            "loss_sequences_lower_95": 1.4929777657113426,
            "loss_sequences_upper_95": 1.6241038392229779,
            "loss_tokens_lower_95": 1.3743209564779235,
            "loss_tokens_upper_95": 1.4323289897198992,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-4.0/params.txt",
    "uuid": "2c5de962-5816-4449-bc39-c1ba4c1f39da",
    "creation_date": "2023_12_14-06_34_28"
}