{
    "name": "rpj-d=1024_l=24_h=8-8.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 65858600960,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 8.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "13171720192",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=1024_l=24_h=8-8.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 2.383926523725192,
            "data_time": 0.048986658453941345,
            "batch_time": 0.4179750308394432,
            "samples_per_second": 682925.7133814791,
            "samples_per_second_per_gpu": 85365.7141726849,
            "loss_sequences_lower_95": 2.3184783999125163,
            "loss_sequences_upper_95": 2.446043853759766,
            "loss_tokens_lower_95": 2.373009484608968,
            "loss_tokens_upper_95": 2.3946564356486,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.908766024148731,
            "data_time": 0.0009916116077109056,
            "batch_time": 0.03661377038478436,
            "samples_per_second": 900292.939372363,
            "samples_per_second_per_gpu": 112536.61742154538,
            "loss_sequences_lower_95": 2.906079382183908,
            "loss_sequences_upper_95": 2.911424886248662,
            "loss_tokens_lower_95": 2.8985288541666665,
            "loss_tokens_upper_95": 2.9189935208333333,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.453839049047353,
            "data_time": 0.00977869701385498,
            "batch_time": 0.04520308208465576,
            "samples_per_second": 868210.0863696887,
            "samples_per_second_per_gpu": 108526.26079621109,
            "loss_sequences_lower_95": 2.4303681976941167,
            "loss_sequences_upper_95": 2.477284527214206,
            "loss_tokens_lower_95": 2.4430940416666664,
            "loss_tokens_upper_95": 2.464779973958333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7585911536462526,
            "data_time": 0.0016487502541981246,
            "batch_time": 0.03710025646968892,
            "samples_per_second": 901606.1172487584,
            "samples_per_second_per_gpu": 112700.7646560948,
            "loss_sequences_lower_95": 2.746927341736469,
            "loss_sequences_upper_95": 2.7699571973663013,
            "loss_tokens_lower_95": 2.748380822916667,
            "loss_tokens_upper_95": 2.7686545885416667,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.912290720735694,
            "data_time": 0.010010107579934169,
            "batch_time": 0.04535455627745366,
            "samples_per_second": 867626.5199461237,
            "samples_per_second_per_gpu": 108453.31499326546,
            "loss_sequences_lower_95": 2.8786133309731414,
            "loss_sequences_upper_95": 2.945416844012538,
            "loss_tokens_lower_95": 2.90199971875,
            "loss_tokens_upper_95": 2.9224748177083333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.718065524409208,
            "data_time": 0.0038494607028753862,
            "batch_time": 0.03922161071196846,
            "samples_per_second": 901732.079065646,
            "samples_per_second_per_gpu": 112716.50988320576,
            "loss_sequences_lower_95": 2.678820323619933,
            "loss_sequences_upper_95": 2.7578302006102353,
            "loss_tokens_lower_95": 2.7076674322916667,
            "loss_tokens_upper_95": 2.7285200416666666,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.4677974363370818,
            "data_time": 0.0015856134560913286,
            "batch_time": 0.03697476145882785,
            "samples_per_second": 906771.0947861829,
            "samples_per_second_per_gpu": 113346.38684827286,
            "loss_sequences_lower_95": 1.4476280741788905,
            "loss_sequences_upper_95": 1.488483958964445,
            "loss_tokens_lower_95": 1.4588104322916666,
            "loss_tokens_upper_95": 1.4771869479166666,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3479590911266066,
            "data_time": 0.0020635685428123203,
            "batch_time": 0.03807146556204904,
            "samples_per_second": 904052.7638106523,
            "samples_per_second_per_gpu": 113006.59547633154,
            "loss_sequences_lower_95": 3.339870735847513,
            "loss_sequences_upper_95": 3.3560389909195028,
            "loss_tokens_lower_95": 3.337620932291667,
            "loss_tokens_upper_95": 3.358318697916667,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.109924955581262,
            "data_time": 0.010091011486356221,
            "batch_time": 0.04568034883529421,
            "samples_per_second": 862496.0891823936,
            "samples_per_second_per_gpu": 107812.0111477992,
            "loss_sequences_lower_95": 3.0693641662597657,
            "loss_sequences_upper_95": 3.1550414232703727,
            "loss_tokens_lower_95": 3.0993468333333336,
            "loss_tokens_upper_95": 3.120865864583333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7886864823314985,
            "data_time": 0.00952716451138258,
            "batch_time": 0.04533170163631439,
            "samples_per_second": 871559.9094287358,
            "samples_per_second_per_gpu": 108944.98867859198,
            "loss_sequences_lower_95": 3.7664278837060743,
            "loss_sequences_upper_95": 3.809648120261935,
            "loss_tokens_lower_95": 3.7768618333333337,
            "loss_tokens_upper_95": 3.8009811770833335,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.836626648951903,
            "data_time": 0.0013251147161741459,
            "batch_time": 0.03679787253444899,
            "samples_per_second": 904191.2975636679,
            "samples_per_second_per_gpu": 113023.91219545848,
            "loss_sequences_lower_95": 2.8288172124460393,
            "loss_sequences_upper_95": 2.844506518089626,
            "loss_tokens_lower_95": 2.826455640625,
            "loss_tokens_upper_95": 2.8465891354166666,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.742096747323306,
            "data_time": 0.002569453702381906,
            "batch_time": 0.037988136848938855,
            "samples_per_second": 902963.1802225845,
            "samples_per_second_per_gpu": 112870.39752782306,
            "loss_sequences_lower_95": 2.7322015485084337,
            "loss_sequences_upper_95": 2.751830597114483,
            "loss_tokens_lower_95": 2.732263005208333,
            "loss_tokens_upper_95": 2.752270729166667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2983032714039027,
            "data_time": 0.010191913649969893,
            "batch_time": 0.045637590611876236,
            "samples_per_second": 861724.114787021,
            "samples_per_second_per_gpu": 107715.51434837762,
            "loss_sequences_lower_95": 3.2669879781788795,
            "loss_sequences_upper_95": 3.3283487192273866,
            "loss_tokens_lower_95": 3.287536489583333,
            "loss_tokens_upper_95": 3.3089430677083334,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6608073534159455,
            "data_time": 0.009962629986949176,
            "batch_time": 0.0452073304301714,
            "samples_per_second": 867196.4206863258,
            "samples_per_second_per_gpu": 108399.55258579072,
            "loss_sequences_lower_95": 2.602017218129455,
            "loss_sequences_upper_95": 2.7179455603698606,
            "loss_tokens_lower_95": 2.650213244791667,
            "loss_tokens_upper_95": 2.671305354166667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.348704554817893,
            "data_time": 0.08094123431614467,
            "batch_time": 0.11587197440011161,
            "samples_per_second": 528798.5611901976,
            "samples_per_second_per_gpu": 66099.8201487747,
            "loss_sequences_lower_95": 3.291324806213379,
            "loss_sequences_upper_95": 3.4054531010714446,
            "loss_tokens_lower_95": 3.3290344498374247,
            "loss_tokens_upper_95": 3.368838275562633,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.3343563642863283,
            "data_time": 0.014395604079419916,
            "batch_time": 0.050048206340182914,
            "samples_per_second": 850957.1422494903,
            "samples_per_second_per_gpu": 106369.64278118628,
            "loss_sequences_lower_95": 2.24848545174557,
            "loss_sequences_upper_95": 2.419814458463352,
            "loss_tokens_lower_95": 2.3242824062499996,
            "loss_tokens_upper_95": 2.34424515625,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.417475257511189,
            "data_time": 0.01289639746149381,
            "batch_time": 0.04861235866943995,
            "samples_per_second": 863245.2589139179,
            "samples_per_second_per_gpu": 107905.65736423974,
            "loss_sequences_lower_95": 5.368093123222089,
            "loss_sequences_upper_95": 5.463990755508946,
            "loss_tokens_lower_95": 5.405933697916667,
            "loss_tokens_upper_95": 5.428876708333333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.894582253987672,
            "data_time": 0.03823043406009674,
            "batch_time": 0.07454979047179222,
            "samples_per_second": 768166.2996417708,
            "samples_per_second_per_gpu": 96020.78745522135,
            "loss_sequences_lower_95": 2.8528027456314837,
            "loss_sequences_upper_95": 2.9391627014660444,
            "loss_tokens_lower_95": 2.8832518405601624,
            "loss_tokens_upper_95": 2.9060046336689935,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0282676298552778,
            "data_time": 0.0017260618864267997,
            "batch_time": 0.03721715669425582,
            "samples_per_second": 899532.954721291,
            "samples_per_second_per_gpu": 112441.61934016137,
            "loss_sequences_lower_95": 2.017224647903388,
            "loss_sequences_upper_95": 2.0392527460865173,
            "loss_tokens_lower_95": 2.017187082725039,
            "loss_tokens_upper_95": 2.0391756858331256,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.689113746789128,
            "data_time": 0.0017350239169066119,
            "batch_time": 0.037182105906829716,
            "samples_per_second": 899542.0588922609,
            "samples_per_second_per_gpu": 112442.75736153261,
            "loss_sequences_lower_95": 2.6814604345343307,
            "loss_sequences_upper_95": 2.705940824902597,
            "loss_tokens_lower_95": 2.671497350528778,
            "loss_tokens_upper_95": 2.6896097987071648,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9006316614199568,
            "data_time": 0.0032388222431035452,
            "batch_time": 0.038952949247081585,
            "samples_per_second": 895837.018779051,
            "samples_per_second_per_gpu": 111979.62734738138,
            "loss_sequences_lower_95": 3.1233037416136487,
            "loss_sequences_upper_95": 3.39180683898205,
            "loss_tokens_lower_95": 2.409350812801555,
            "loss_tokens_upper_95": 2.5949723842458154,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9704602635353803,
            "data_time": 0.0035062592397344873,
            "batch_time": 0.03906550131579663,
            "samples_per_second": 890766.1561104083,
            "samples_per_second_per_gpu": 111345.76951380103,
            "loss_sequences_lower_95": 3.0208981689453127,
            "loss_sequences_upper_95": 3.2084004231770833,
            "loss_tokens_lower_95": 2.7901498132861633,
            "loss_tokens_upper_95": 2.9223189367138365,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.228493643403505,
            "data_time": 0.004612876998533131,
            "batch_time": 0.0401763545621754,
            "samples_per_second": 889375.1149900076,
            "samples_per_second_per_gpu": 111171.88937375095,
            "loss_sequences_lower_95": 2.2653667643722755,
            "loss_sequences_upper_95": 2.3152207103437146,
            "loss_tokens_lower_95": 2.153240770849466,
            "loss_tokens_upper_95": 2.1823858726668117,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.7768375635147096,
            "data_time": 0.023526949541909353,
            "batch_time": 0.060258522629737854,
            "samples_per_second": 815314.8210856259,
            "samples_per_second_per_gpu": 101914.35263570324,
            "loss_sequences_lower_95": 1.7617534984241832,
            "loss_sequences_upper_95": 1.8528502377596767,
            "loss_tokens_lower_95": 1.718982111477124,
            "loss_tokens_upper_95": 1.7594489522447847,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.705938735300181,
            "data_time": 0.020365595817565918,
            "batch_time": 0.05577069893479347,
            "samples_per_second": 824453.9424653806,
            "samples_per_second_per_gpu": 103056.74280817258,
            "loss_sequences_lower_95": 2.6883015971281092,
            "loss_sequences_upper_95": 2.857028223154496,
            "loss_tokens_lower_95": 2.6114626618568044,
            "loss_tokens_upper_95": 2.6962175360739677,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.530460435549418,
            "data_time": 0.01721826272133069,
            "batch_time": 0.0525782490387941,
            "samples_per_second": 832957.044932572,
            "samples_per_second_per_gpu": 104119.6306165715,
            "loss_sequences_lower_95": 2.515576853434245,
            "loss_sequences_upper_95": 2.597458323160807,
            "loss_tokens_lower_95": 2.412436403479653,
            "loss_tokens_upper_95": 2.5774947158191055,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.416039293946503,
            "data_time": 0.0014096454728859263,
            "batch_time": 0.03689585435039684,
            "samples_per_second": 900511.9957791036,
            "samples_per_second_per_gpu": 112563.99947238795,
            "loss_sequences_lower_95": 4.418141139400128,
            "loss_sequences_upper_95": 4.496709761453668,
            "loss_tokens_lower_95": 4.292686163263295,
            "loss_tokens_upper_95": 4.371945980308423,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3827084377533456,
            "data_time": 0.0031819499579051995,
            "batch_time": 0.038572518817530384,
            "samples_per_second": 897308.4791970327,
            "samples_per_second_per_gpu": 112163.5598996291,
            "loss_sequences_lower_95": 3.7636955184165877,
            "loss_sequences_upper_95": 4.020180719549005,
            "loss_tokens_lower_95": 2.833121866751107,
            "loss_tokens_upper_95": 2.951456705729167,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.236639886433354,
            "data_time": 0.005161327687469688,
            "batch_time": 0.04058094604595287,
            "samples_per_second": 885420.1555579391,
            "samples_per_second_per_gpu": 110677.51944474239,
            "loss_sequences_lower_95": 3.5263189843083405,
            "loss_sequences_upper_95": 3.811045446819006,
            "loss_tokens_lower_95": 2.909209537413783,
            "loss_tokens_upper_95": 3.045227054720273,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.047667215948236,
            "data_time": 0.023572395954813277,
            "batch_time": 0.06009791578565325,
            "samples_per_second": 820577.0206936566,
            "samples_per_second_per_gpu": 102572.12758670708,
            "loss_sequences_lower_95": 4.9514510063275905,
            "loss_sequences_upper_95": 5.1422176134640765,
            "loss_tokens_lower_95": 4.9495068693814215,
            "loss_tokens_upper_95": 5.143313779787386,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9746315097808838,
            "data_time": 0.04963604762003972,
            "batch_time": 0.08594123675273015,
            "samples_per_second": 742394.5762301374,
            "samples_per_second_per_gpu": 92799.32202876717,
            "loss_sequences_lower_95": 2.8412463607788085,
            "loss_sequences_upper_95": 3.197198959350586,
            "loss_tokens_lower_95": 2.6824286554708463,
            "loss_tokens_upper_95": 3.1045532444935153,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.3069700862880618,
            "data_time": 0.0033630725064891977,
            "batch_time": 0.038927328610956546,
            "samples_per_second": 895713.2243898065,
            "samples_per_second_per_gpu": 111964.15304872581,
            "loss_sequences_lower_95": 2.270026315710562,
            "loss_sequences_upper_95": 2.3436326713132516,
            "loss_tokens_lower_95": 2.2699831662915013,
            "loss_tokens_upper_95": 2.345269081951286,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.2411691792962203,
            "data_time": 0.004903211095788156,
            "batch_time": 0.04043466154163852,
            "samples_per_second": 888524.7608520662,
            "samples_per_second_per_gpu": 111065.59510650828,
            "loss_sequences_lower_95": 2.2027949802608005,
            "loss_sequences_upper_95": 2.279481510096744,
            "loss_tokens_lower_95": 2.2035944508295584,
            "loss_tokens_upper_95": 2.2797716545336173,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0662797153060923,
            "data_time": 0.0037602108961562616,
            "batch_time": 0.03919066131501167,
            "samples_per_second": 891286.1400692636,
            "samples_per_second_per_gpu": 111410.76750865795,
            "loss_sequences_lower_95": 3.219055541062381,
            "loss_sequences_upper_95": 3.35217195496336,
            "loss_tokens_lower_95": 2.885230191932213,
            "loss_tokens_upper_95": 2.9384991401145837,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.921554500579834,
            "data_time": 0.010611352510750294,
            "batch_time": 0.04595544375479221,
            "samples_per_second": 863682.7132331169,
            "samples_per_second_per_gpu": 107960.33915413961,
            "loss_sequences_lower_95": 5.093713720703126,
            "loss_sequences_upper_95": 5.652003869628905,
            "loss_tokens_lower_95": 4.34849883407098,
            "loss_tokens_upper_95": 4.707745252264642,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1973627656698227,
            "data_time": 0.16332729160785675,
            "batch_time": 0.20253251492977142,
            "samples_per_second": 485515.6022980852,
            "samples_per_second_per_gpu": 60689.45028726065,
            "loss_sequences_lower_95": 2.9966641545295714,
            "loss_sequences_upper_95": 3.4163099110126494,
            "loss_tokens_lower_95": 2.7671189669905036,
            "loss_tokens_upper_95": 3.5478423370712107,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8549596996142945,
            "data_time": 0.02678864560228713,
            "batch_time": 0.061783252878391995,
            "samples_per_second": 787675.2686247163,
            "samples_per_second_per_gpu": 98459.40857808954,
            "loss_sequences_lower_95": 4.216499723237137,
            "loss_sequences_upper_95": 4.9080804583670075,
            "loss_tokens_lower_95": 2.7759233029611754,
            "loss_tokens_upper_95": 3.1831762750361117,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5319992243436396,
            "data_time": 0.002958122433887588,
            "batch_time": 0.0384464491572645,
            "samples_per_second": 892442.1561564704,
            "samples_per_second_per_gpu": 111555.2695195588,
            "loss_sequences_lower_95": 2.509134896387956,
            "loss_sequences_upper_95": 2.5542828921096823,
            "loss_tokens_lower_95": 2.509307202713726,
            "loss_tokens_upper_95": 2.55447200776813,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.9180632284697055,
            "data_time": 0.0024584632165958404,
            "batch_time": 0.03800278437151482,
            "samples_per_second": 896741.8834185319,
            "samples_per_second_per_gpu": 112092.73542731648,
            "loss_sequences_lower_95": 1.8929931261598218,
            "loss_sequences_upper_95": 2.0096258446611803,
            "loss_tokens_lower_95": 1.8046027138682557,
            "loss_tokens_upper_95": 1.918169905676096,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.896921165041871,
            "data_time": 0.02032728162076738,
            "batch_time": 0.056052330467436046,
            "samples_per_second": 807977.5959006989,
            "samples_per_second_per_gpu": 100997.19948758736,
            "loss_sequences_lower_95": 2.767021693414821,
            "loss_sequences_upper_95": 3.1812244121844953,
            "loss_tokens_lower_95": 2.6408661255673276,
            "loss_tokens_upper_95": 2.9226467644253002,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.376582715392771,
            "data_time": 0.004939326271414757,
            "batch_time": 0.04040663167834282,
            "samples_per_second": 883921.8501650523,
            "samples_per_second_per_gpu": 110490.23127063154,
            "loss_sequences_lower_95": 3.4219696117181333,
            "loss_sequences_upper_95": 3.579522478665154,
            "loss_tokens_lower_95": 3.232599310939864,
            "loss_tokens_upper_95": 3.3727790795327994,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.331355399474865,
            "data_time": 0.03178518726712182,
            "batch_time": 0.06801978463218325,
            "samples_per_second": 809467.579223217,
            "samples_per_second_per_gpu": 101183.44740290212,
            "loss_sequences_lower_95": 2.2006556441144247,
            "loss_sequences_upper_95": 2.6307490185993476,
            "loss_tokens_lower_95": 2.054401102735012,
            "loss_tokens_upper_95": 2.3669724830017587,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.900167730122143,
            "data_time": 0.0021230905855173497,
            "batch_time": 0.0376504135284241,
            "samples_per_second": 895635.4129183311,
            "samples_per_second_per_gpu": 111954.42661479139,
            "loss_sequences_lower_95": 2.884737670190288,
            "loss_sequences_upper_95": 2.9153169256898255,
            "loss_tokens_lower_95": 2.884749977339218,
            "loss_tokens_upper_95": 2.915483965933812,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.9443069687746104,
            "data_time": 0.048923154310746626,
            "batch_time": 0.08585741736672141,
            "samples_per_second": 729152.6816470282,
            "samples_per_second_per_gpu": 91144.08520587852,
            "loss_sequences_lower_95": 0.887277692035564,
            "loss_sequences_upper_95": 1.052319934067217,
            "loss_tokens_lower_95": 0.7700461193385363,
            "loss_tokens_upper_95": 1.009611676781302,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3270741644676627,
            "data_time": 0.001180616315932006,
            "batch_time": 0.0366903520989398,
            "samples_per_second": 899115.6681665446,
            "samples_per_second_per_gpu": 112389.45852081808,
            "loss_sequences_lower_95": 3.594621075324292,
            "loss_sequences_upper_95": 3.6297714188613734,
            "loss_tokens_lower_95": 2.909439905705996,
            "loss_tokens_upper_95": 2.944409018375242,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.213018984079361,
            "data_time": 0.005962471167246501,
            "batch_time": 0.041586803538458686,
            "samples_per_second": 881963.542201396,
            "samples_per_second_per_gpu": 110245.4427751745,
            "loss_sequences_lower_95": 4.233140612792969,
            "loss_sequences_upper_95": 4.486534985351563,
            "loss_tokens_lower_95": 3.923312314203886,
            "loss_tokens_upper_95": 4.150946710502465,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.978082091393678,
            "data_time": 0.022732813479536672,
            "batch_time": 0.05918965703350002,
            "samples_per_second": 820518.2598367169,
            "samples_per_second_per_gpu": 102564.78247958962,
            "loss_sequences_lower_95": 1.9304129824431047,
            "loss_sequences_upper_95": 2.0263973932680877,
            "loss_tokens_lower_95": 1.9308168427840522,
            "loss_tokens_upper_95": 2.0282423865276833,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.9937186512080105,
            "data_time": 0.004789725484618221,
            "batch_time": 0.040407321898334,
            "samples_per_second": 886749.2439624337,
            "samples_per_second_per_gpu": 110843.6554953042,
            "loss_sequences_lower_95": 5.93830672755386,
            "loss_sequences_upper_95": 6.048072093616832,
            "loss_tokens_lower_95": 5.938126590613162,
            "loss_tokens_upper_95": 6.048893414121686,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.1250280400911967,
            "data_time": 0.004219122072483631,
            "batch_time": 0.03971434432141324,
            "samples_per_second": 892725.651872733,
            "samples_per_second_per_gpu": 111590.70648409163,
            "loss_sequences_lower_95": 1.1687484639485677,
            "loss_sequences_upper_95": 1.2240173014322915,
            "loss_tokens_lower_95": 1.0410730444521559,
            "loss_tokens_upper_95": 1.1113402822066327,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.729459490094866,
            "data_time": 0.02446473070553371,
            "batch_time": 0.05952967703342438,
            "samples_per_second": 802630.6164434722,
            "samples_per_second_per_gpu": 100328.82705543403,
            "loss_sequences_lower_95": 5.3667586844308035,
            "loss_sequences_upper_95": 6.0967503284272695,
            "loss_tokens_lower_95": 5.368636125837054,
            "loss_tokens_upper_95": 6.092555818103608,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.7536131255328655,
            "data_time": 0.16307298839092255,
            "batch_time": 0.20545043051242828,
            "samples_per_second": 448756.89313677826,
            "samples_per_second_per_gpu": 56094.61164209728,
            "loss_sequences_lower_95": 1.5970334738492966,
            "loss_sequences_upper_95": 2.280312460660934,
            "loss_tokens_lower_95": 1.3485748448322727,
            "loss_tokens_upper_95": 1.7532067556479543,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.694226032733917,
            "data_time": 0.006477176670044187,
            "batch_time": 0.042094098670142036,
            "samples_per_second": 881471.2132664684,
            "samples_per_second_per_gpu": 110183.90165830855,
            "loss_sequences_lower_95": 7.6227264404296875,
            "loss_sequences_upper_95": 7.993464453125,
            "loss_tokens_lower_95": 7.374760589341423,
            "loss_tokens_upper_95": 7.700250337087563,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.343075075149536,
            "data_time": 0.0058732884270804265,
            "batch_time": 0.04141015050903199,
            "samples_per_second": 884023.726809699,
            "samples_per_second_per_gpu": 110502.96585121237,
            "loss_sequences_lower_95": 6.425645068359375,
            "loss_sequences_upper_95": 6.640580297851563,
            "loss_tokens_lower_95": 6.110561398738733,
            "loss_tokens_upper_95": 6.299276307038683,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.2994889377788374,
            "data_time": 0.004243329057725377,
            "batch_time": 0.039853364568091555,
            "samples_per_second": 887424.4854552133,
            "samples_per_second_per_gpu": 110928.06068190167,
            "loss_sequences_lower_95": 2.279961440022104,
            "loss_sequences_upper_95": 2.3190177894461845,
            "loss_tokens_lower_95": 2.280238788846432,
            "loss_tokens_upper_95": 2.3193101934726474,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.2608403749363397,
            "data_time": 0.009153283254617652,
            "batch_time": 0.044760284106897086,
            "samples_per_second": 868718.8845684021,
            "samples_per_second_per_gpu": 108589.86057105026,
            "loss_sequences_lower_95": 2.1923684397051413,
            "loss_sequences_upper_95": 2.3321764452299947,
            "loss_tokens_lower_95": 2.1909187844272036,
            "loss_tokens_upper_95": 2.331063908402638,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.700289136886597,
            "data_time": 0.006118000972838629,
            "batch_time": 0.04174491668504382,
            "samples_per_second": 882113.5012131379,
            "samples_per_second_per_gpu": 110264.18765164223,
            "loss_sequences_lower_95": 3.6291932006835936,
            "loss_sequences_upper_95": 3.771226727294922,
            "loss_tokens_lower_95": 3.6308181457519533,
            "loss_tokens_upper_95": 3.7722085815429685,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.596049599587748,
            "data_time": 0.0017869279154757696,
            "batch_time": 0.03726540734051306,
            "samples_per_second": 898036.2995709279,
            "samples_per_second_per_gpu": 112254.53744636598,
            "loss_sequences_lower_95": 3.0797030215231787,
            "loss_sequences_upper_95": 3.1748613872546123,
            "loss_tokens_lower_95": 2.026496786254159,
            "loss_tokens_upper_95": 2.087374700237011,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.9016500809299413,
            "data_time": 0.020061675139835904,
            "batch_time": 0.05556176560265677,
            "samples_per_second": 831010.9201335248,
            "samples_per_second_per_gpu": 103876.3650166906,
            "loss_sequences_lower_95": 1.8396024988658393,
            "loss_sequences_upper_95": 1.9646633034321799,
            "loss_tokens_lower_95": 1.8395306316774283,
            "loss_tokens_upper_95": 1.9625375149854973,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.9118220780410018,
            "data_time": 0.01118539460003376,
            "batch_time": 0.04687586519867182,
            "samples_per_second": 873650.0577407953,
            "samples_per_second_per_gpu": 109206.25721759941,
            "loss_sequences_lower_95": 1.868718007405599,
            "loss_sequences_upper_95": 1.955379285625383,
            "loss_tokens_lower_95": 1.8690699259440104,
            "loss_tokens_upper_95": 1.9537041727701823,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.118994520520053,
            "data_time": 0.0020763496662205712,
            "batch_time": 0.03757393488319971,
            "samples_per_second": 896538.3340500805,
            "samples_per_second_per_gpu": 112067.29175626006,
            "loss_sequences_lower_95": 2.2818204382555116,
            "loss_sequences_upper_95": 2.358180249240574,
            "loss_tokens_lower_95": 1.8081883174240203,
            "loss_tokens_upper_95": 1.865978924695027,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.242504624462632,
            "data_time": 0.028265451391537983,
            "batch_time": 0.06481972585121791,
            "samples_per_second": 812978.8428890771,
            "samples_per_second_per_gpu": 101622.35536113464,
            "loss_sequences_lower_95": 2.1862603646737555,
            "loss_sequences_upper_95": 2.2976349280624793,
            "loss_tokens_lower_95": 2.187910812620133,
            "loss_tokens_upper_95": 2.2969748280035756,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.334340840669218,
            "data_time": 0.0036098118666764143,
            "batch_time": 0.03908297153910729,
            "samples_per_second": 894922.791665849,
            "samples_per_second_per_gpu": 111865.34895823113,
            "loss_sequences_lower_95": 3.3040516891246177,
            "loss_sequences_upper_95": 3.3646299664325303,
            "loss_tokens_lower_95": 3.304357775766915,
            "loss_tokens_upper_95": 3.3648438694571863,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.8243092292720833,
            "data_time": 0.024231152100996536,
            "batch_time": 0.05916672186418013,
            "samples_per_second": 799660.0841884962,
            "samples_per_second_per_gpu": 99957.51052356203,
            "loss_sequences_lower_95": 1.7710782689955626,
            "loss_sequences_upper_95": 1.87863866194938,
            "loss_tokens_lower_95": 1.7705493149248142,
            "loss_tokens_upper_95": 1.8796262759606814,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.300969388584296,
            "data_time": 0.0824202448129654,
            "batch_time": 0.11918292194604874,
            "samples_per_second": 634916.2103428333,
            "samples_per_second_per_gpu": 79364.52629285416,
            "loss_sequences_lower_95": 1.1799639320373534,
            "loss_sequences_upper_95": 1.56290176709493,
            "loss_tokens_lower_95": 1.0419919437832303,
            "loss_tokens_upper_95": 1.4243179056379531,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.19575401643912,
            "data_time": 0.079412542283535,
            "batch_time": 0.11708829551935196,
            "samples_per_second": 626185.2578215146,
            "samples_per_second_per_gpu": 78273.15722768933,
            "loss_sequences_lower_95": 1.0957112979888917,
            "loss_sequences_upper_95": 1.4596164798736573,
            "loss_tokens_lower_95": 0.8997643374325184,
            "loss_tokens_upper_95": 1.3160682828238839,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4736724850004306,
            "data_time": 0.003049756303262963,
            "batch_time": 0.038562880606423974,
            "samples_per_second": 895080.0457945602,
            "samples_per_second_per_gpu": 111885.00572432003,
            "loss_sequences_lower_95": 3.4536632527729196,
            "loss_sequences_upper_95": 3.4931529233592595,
            "loss_tokens_lower_95": 3.453728203527706,
            "loss_tokens_upper_95": 3.4940993907630706,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.43252899255025895,
            "data_time": 0.0011857809902411321,
            "batch_time": 0.03666057496349973,
            "samples_per_second": 899916.2151060762,
            "samples_per_second_per_gpu": 112489.52688825953,
            "loss_sequences_lower_95": 0.4974302425304956,
            "loss_sequences_upper_95": 0.5082896160278862,
            "loss_tokens_lower_95": 0.3655595806386374,
            "loss_tokens_upper_95": 0.37204188143190575,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.178232081762449,
            "data_time": 0.04105207696557045,
            "batch_time": 0.07738341763615608,
            "samples_per_second": 796133.5128723555,
            "samples_per_second_per_gpu": 99516.68910904444,
            "loss_sequences_lower_95": 1.0953319309264655,
            "loss_sequences_upper_95": 1.2883175196610097,
            "loss_tokens_lower_95": 1.0470806694358727,
            "loss_tokens_upper_95": 1.1516617694612041,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8319462376671867,
            "data_time": 0.13260025069827125,
            "batch_time": 0.17065076600937618,
            "samples_per_second": 487540.4958989439,
            "samples_per_second_per_gpu": 60942.56198736799,
            "loss_sequences_lower_95": 3.380893392820616,
            "loss_sequences_upper_95": 4.299964657345333,
            "loss_tokens_lower_95": 3.1771334330240886,
            "loss_tokens_upper_95": 4.382994136103878,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.0501385549219644,
            "data_time": 0.03167061862491426,
            "batch_time": 0.0682683785756429,
            "samples_per_second": 791068.6615642984,
            "samples_per_second_per_gpu": 98883.5826955373,
            "loss_sequences_lower_95": 1.0011584816909418,
            "loss_sequences_upper_95": 1.1550956749334569,
            "loss_tokens_lower_95": 0.9371278611063656,
            "loss_tokens_upper_95": 1.0188341289292473,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.0959704303886832,
            "data_time": 0.03236748774846395,
            "batch_time": 0.06843902951195127,
            "samples_per_second": 803429.5507946642,
            "samples_per_second_per_gpu": 100428.69384933302,
            "loss_sequences_lower_95": 1.0680988125684785,
            "loss_sequences_upper_95": 1.2108228823033775,
            "loss_tokens_lower_95": 0.9763921312448692,
            "loss_tokens_upper_95": 1.045984374885474,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.0273268703462146,
            "data_time": 0.03247337681906564,
            "batch_time": 0.06920423961821057,
            "samples_per_second": 799561.7994610628,
            "samples_per_second_per_gpu": 99945.22493263285,
            "loss_sequences_lower_95": 0.9258388472766411,
            "loss_sequences_upper_95": 1.1003604005022747,
            "loss_tokens_lower_95": 0.9557044934650617,
            "loss_tokens_upper_95": 1.0639173897057173,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.1527619678072814,
            "data_time": 0.03200828461419968,
            "batch_time": 0.06822854848135085,
            "samples_per_second": 800784.9373227266,
            "samples_per_second_per_gpu": 100098.11716534082,
            "loss_sequences_lower_95": 1.1149275570380979,
            "loss_sequences_upper_95": 1.2449347472772367,
            "loss_tokens_lower_95": 1.0404584961888204,
            "loss_tokens_upper_95": 1.1094157804209868,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.9863905795612691,
            "data_time": 0.03155197037590875,
            "batch_time": 0.06751472567334588,
            "samples_per_second": 815835.0320596123,
            "samples_per_second_per_gpu": 101979.37900745154,
            "loss_sequences_lower_95": 0.9558322598475106,
            "loss_sequences_upper_95": 1.0439372258156723,
            "loss_tokens_lower_95": 0.9310450346981025,
            "loss_tokens_upper_95": 0.9819362386662767,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.9509874382397023,
            "data_time": 0.03199264549073719,
            "batch_time": 0.06768712543305896,
            "samples_per_second": 809919.0430438942,
            "samples_per_second_per_gpu": 101239.88038048678,
            "loss_sequences_lower_95": 0.9444278275094381,
            "loss_sequences_upper_95": 1.0417496309047793,
            "loss_tokens_lower_95": 0.8489171737845652,
            "loss_tokens_upper_95": 0.8931624716861626,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-8.0/params.txt",
    "uuid": "ed1b1707-5ab3-427c-bf92-72e01e47d004",
    "creation_date": "2023_12_14-08_11_23"
}