{
    "name": "rpj-d=512_l=8_h=4-2.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 3156561920,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 2.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "631312384",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=512_l=8_h=4-2.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.3197100877761843,
            "data_time": 0.028862494975328445,
            "batch_time": 0.3217583857476711,
            "samples_per_second": 1755502.2851449137,
            "samples_per_second_per_gpu": 219437.7856431142,
            "loss_sequences_lower_95": 3.2464280319213867,
            "loss_sequences_upper_95": 3.3883664894104,
            "loss_tokens_lower_95": 3.307231814066569,
            "loss_tokens_upper_95": 3.332261136372884,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7441492913284,
            "data_time": 0.0014089424436300993,
            "batch_time": 0.015177771211739072,
            "samples_per_second": 2262100.668381982,
            "samples_per_second_per_gpu": 282762.58354774775,
            "loss_sequences_lower_95": 3.741606500517653,
            "loss_sequences_upper_95": 3.746663609645408,
            "loss_tokens_lower_95": 3.733102885416667,
            "loss_tokens_upper_95": 3.75522084375,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.030323263090484,
            "data_time": 0.008719368934631348,
            "batch_time": 0.023152304649353028,
            "samples_per_second": 2118434.9925877308,
            "samples_per_second_per_gpu": 264804.37407346634,
            "loss_sequences_lower_95": 3.002739158163265,
            "loss_sequences_upper_95": 3.057790346729512,
            "loss_tokens_lower_95": 3.0184210729166665,
            "loss_tokens_upper_95": 3.0424319791666665,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5839505056007623,
            "data_time": 0.0014290611602758106,
            "batch_time": 0.014950374043301531,
            "samples_per_second": 2314987.7773583243,
            "samples_per_second_per_gpu": 289373.47216979053,
            "loss_sequences_lower_95": 3.570604824420103,
            "loss_sequences_upper_95": 3.59676799089884,
            "loss_tokens_lower_95": 3.572614197916667,
            "loss_tokens_upper_95": 3.595132427083333,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.731479899218029,
            "data_time": 0.008639721281499977,
            "batch_time": 0.02237919792236085,
            "samples_per_second": 2197592.735010935,
            "samples_per_second_per_gpu": 274699.0918763669,
            "loss_sequences_lower_95": 3.6959845206888047,
            "loss_sequences_upper_95": 3.7654444987807882,
            "loss_tokens_lower_95": 3.720339822916667,
            "loss_tokens_upper_95": 3.7426368020833336,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.509946044815551,
            "data_time": 0.0032533242002777433,
            "batch_time": 0.016804889816304912,
            "samples_per_second": 2304931.6568895807,
            "samples_per_second_per_gpu": 288116.4571111976,
            "loss_sequences_lower_95": 3.466525946953603,
            "loss_sequences_upper_95": 3.5532260586176494,
            "loss_tokens_lower_95": 3.4984945416666666,
            "loss_tokens_upper_95": 3.521284291666667,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.1211031373909544,
            "data_time": 0.0014288819244596348,
            "batch_time": 0.01476675155306718,
            "samples_per_second": 2346277.8868956156,
            "samples_per_second_per_gpu": 293284.73586195195,
            "loss_sequences_lower_95": 2.0970666802853954,
            "loss_sequences_upper_95": 2.144582071109694,
            "loss_tokens_lower_95": 2.1103451406250002,
            "loss_tokens_upper_95": 2.1321409427083333,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.032798482335675,
            "data_time": 0.0014531184618864932,
            "batch_time": 0.014849752524267026,
            "samples_per_second": 2336331.132931847,
            "samples_per_second_per_gpu": 292041.3916164809,
            "loss_sequences_lower_95": 4.023557765461387,
            "loss_sequences_upper_95": 4.041907108965968,
            "loss_tokens_lower_95": 4.021908989583333,
            "loss_tokens_upper_95": 4.043676895833333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8851905566890066,
            "data_time": 0.01092894682808528,
            "batch_time": 0.025673112225910975,
            "samples_per_second": 2158876.7710112096,
            "samples_per_second_per_gpu": 269859.5963764012,
            "loss_sequences_lower_95": 3.8419592446428004,
            "loss_sequences_upper_95": 3.931380264158171,
            "loss_tokens_lower_95": 3.8739781979166668,
            "loss_tokens_upper_95": 3.8965301145833333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.482046595675201,
            "data_time": 0.009109744802117348,
            "batch_time": 0.023051977157592773,
            "samples_per_second": 2199542.279791219,
            "samples_per_second_per_gpu": 274942.7849739024,
            "loss_sequences_lower_95": 4.445856991492713,
            "loss_sequences_upper_95": 4.514185052803853,
            "loss_tokens_lower_95": 4.46994790625,
            "loss_tokens_upper_95": 4.49409765625,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6790906833069967,
            "data_time": 0.0012261056677875012,
            "batch_time": 0.014774236720893506,
            "samples_per_second": 2319405.0634369813,
            "samples_per_second_per_gpu": 289925.63292962266,
            "loss_sequences_lower_95": 3.670756095184665,
            "loss_sequences_upper_95": 3.6873374760175412,
            "loss_tokens_lower_95": 3.6678459479166667,
            "loss_tokens_upper_95": 3.6903446041666665,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.617102778091971,
            "data_time": 0.002368604213768596,
            "batch_time": 0.017335342229355583,
            "samples_per_second": 2310821.589797883,
            "samples_per_second_per_gpu": 288852.69872473535,
            "loss_sequences_lower_95": 3.606339155560183,
            "loss_sequences_upper_95": 3.6276723484355475,
            "loss_tokens_lower_95": 3.606153385416667,
            "loss_tokens_upper_95": 3.6281486875,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.00399604917298,
            "data_time": 0.008778120689241312,
            "batch_time": 0.022742132895548824,
            "samples_per_second": 2178188.2352784197,
            "samples_per_second_per_gpu": 272273.52940980246,
            "loss_sequences_lower_95": 3.9665630224510333,
            "loss_sequences_upper_95": 4.040428548507227,
            "loss_tokens_lower_95": 3.992842375,
            "loss_tokens_upper_95": 4.015192187499999,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.45745058846328,
            "data_time": 0.00901400520507083,
            "batch_time": 0.023024103081083866,
            "samples_per_second": 2163225.6334053474,
            "samples_per_second_per_gpu": 270403.2041756684,
            "loss_sequences_lower_95": 3.393034664239514,
            "loss_sequences_upper_95": 3.5204541860918406,
            "loss_tokens_lower_95": 3.4458206666666666,
            "loss_tokens_upper_95": 3.468904770833333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.388010285117409,
            "data_time": 0.07920655182429723,
            "batch_time": 0.09500724077224731,
            "samples_per_second": 1083249.0840837075,
            "samples_per_second_per_gpu": 135406.13551046344,
            "loss_sequences_lower_95": 4.320867815884677,
            "loss_sequences_upper_95": 4.45532358342951,
            "loss_tokens_lower_95": 4.367145607688211,
            "loss_tokens_upper_95": 4.40989666852084,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.140334654827507,
            "data_time": 0.012927452271634882,
            "batch_time": 0.027010015465996483,
            "samples_per_second": 2116792.623169383,
            "samples_per_second_per_gpu": 264599.0778961729,
            "loss_sequences_lower_95": 3.0369979458022045,
            "loss_sequences_upper_95": 3.24323894178207,
            "loss_tokens_lower_95": 3.1288578697916667,
            "loss_tokens_upper_95": 3.151573578125,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.967248232194805,
            "data_time": 0.012036086370547613,
            "batch_time": 0.026078655074040096,
            "samples_per_second": 2166477.795860128,
            "samples_per_second_per_gpu": 270809.724482516,
            "loss_sequences_lower_95": 5.911750298250948,
            "loss_sequences_upper_95": 6.017581297539784,
            "loss_tokens_lower_95": 5.95593325,
            "loss_tokens_upper_95": 5.978521989583333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.019662958676697,
            "data_time": 0.033362045884132385,
            "batch_time": 0.048291418701410294,
            "samples_per_second": 1770304.0451064548,
            "samples_per_second_per_gpu": 221288.00563830684,
            "loss_sequences_lower_95": 3.974997010778208,
            "loss_sequences_upper_95": 4.061954710913486,
            "loss_tokens_lower_95": 4.006878118045995,
            "loss_tokens_upper_95": 4.0322150683793865,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.4284003652274615,
            "data_time": 0.0019188656690180437,
            "batch_time": 0.015830156879164364,
            "samples_per_second": 2234826.8060007,
            "samples_per_second_per_gpu": 279353.3507500875,
            "loss_sequences_lower_95": 5.407244060230024,
            "loss_sequences_upper_95": 5.450062911154928,
            "loss_tokens_lower_95": 5.407060250609778,
            "loss_tokens_upper_95": 5.449598776215995,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.520042902324998,
            "data_time": 0.0020332938166940286,
            "batch_time": 0.015529698152451,
            "samples_per_second": 2293217.2889929386,
            "samples_per_second_per_gpu": 286652.1611241173,
            "loss_sequences_lower_95": 3.507542433109067,
            "loss_sequences_upper_95": 3.533537725537119,
            "loss_tokens_lower_95": 3.506288882310411,
            "loss_tokens_upper_95": 3.5263763640204298,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.082777435029516,
            "data_time": 0.0028455980985823046,
            "batch_time": 0.016406538574118078,
            "samples_per_second": 2284989.699849191,
            "samples_per_second_per_gpu": 285623.7124811489,
            "loss_sequences_lower_95": 5.3406371666678965,
            "loss_sequences_upper_95": 5.653221722126682,
            "loss_tokens_lower_95": 4.523210682481535,
            "loss_tokens_upper_95": 4.7453782789418035,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.412860117177169,
            "data_time": 0.0033818419943464563,
            "batch_time": 0.017210088828776744,
            "samples_per_second": 2228413.3972568475,
            "samples_per_second_per_gpu": 278551.67465710593,
            "loss_sequences_lower_95": 5.5755658203125,
            "loss_sequences_upper_95": 5.78593076171875,
            "loss_tokens_lower_95": 5.042131792944182,
            "loss_tokens_upper_95": 5.184961649960692,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5099288296572992,
            "data_time": 0.003973191320266839,
            "batch_time": 0.01768780581731782,
            "samples_per_second": 2239676.0135336537,
            "samples_per_second_per_gpu": 279959.5016917067,
            "loss_sequences_lower_95": 3.555673539376422,
            "loss_sequences_upper_95": 3.620326992957496,
            "loss_tokens_lower_95": 3.4108495517132904,
            "loss_tokens_upper_95": 3.443146770288856,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.4876732436093416,
            "data_time": 0.021659086857523237,
            "batch_time": 0.03626115833009992,
            "samples_per_second": 2000588.4199264494,
            "samples_per_second_per_gpu": 250073.55249080618,
            "loss_sequences_lower_95": 2.464003871570934,
            "loss_sequences_upper_95": 2.581575130115856,
            "loss_tokens_lower_95": 2.41717538326363,
            "loss_tokens_upper_95": 2.4657126160498337,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7606063132383385,
            "data_time": 0.01941612921655178,
            "batch_time": 0.0333100613206625,
            "samples_per_second": 2037494.9950590462,
            "samples_per_second_per_gpu": 254686.87438238077,
            "loss_sequences_lower_95": 3.7533431072624364,
            "loss_sequences_upper_95": 3.9592806508589766,
            "loss_tokens_lower_95": 3.6187810688367468,
            "loss_tokens_upper_95": 3.714695348203707,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.22330624739329,
            "data_time": 0.01583621287957216,
            "batch_time": 0.029932055717859514,
            "samples_per_second": 2036439.4526575,
            "samples_per_second_per_gpu": 254554.9315821875,
            "loss_sequences_lower_95": 4.183753763834635,
            "loss_sequences_upper_95": 4.299713470458984,
            "loss_tokens_lower_95": 4.08428839646989,
            "loss_tokens_upper_95": 4.311236564721106,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.408725061960903,
            "data_time": 0.001664049805366269,
            "batch_time": 0.015384407332534572,
            "samples_per_second": 2262367.7736254423,
            "samples_per_second_per_gpu": 282795.9717031803,
            "loss_sequences_lower_95": 7.428457072578859,
            "loss_sequences_upper_95": 7.510263858816003,
            "loss_tokens_lower_95": 7.243650724622533,
            "loss_tokens_upper_95": 7.32929011852573,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.162731855304956,
            "data_time": 0.0026284564261468465,
            "batch_time": 0.01631538179896822,
            "samples_per_second": 2261779.454401335,
            "samples_per_second_per_gpu": 282722.4318001669,
            "loss_sequences_lower_95": 5.765499538845487,
            "loss_sequences_upper_95": 6.0806637439663564,
            "loss_tokens_lower_95": 4.3511516408201345,
            "loss_tokens_upper_95": 4.496388570316048,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.622817357652424,
            "data_time": 0.004552734864724649,
            "batch_time": 0.01846973195269301,
            "samples_per_second": 2196951.206370576,
            "samples_per_second_per_gpu": 274618.900796322,
            "loss_sequences_lower_95": 5.093220181513972,
            "loss_sequences_upper_95": 5.440662100209312,
            "loss_tokens_lower_95": 4.156879117066493,
            "loss_tokens_upper_95": 4.318380614210229,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.7830568796967805,
            "data_time": 0.020422697067260742,
            "batch_time": 0.03472926786967686,
            "samples_per_second": 2043032.2802814075,
            "samples_per_second_per_gpu": 255379.03503517594,
            "loss_sequences_lower_95": 5.711601250356735,
            "loss_sequences_upper_95": 5.854354161650079,
            "loss_tokens_lower_95": 5.713473482872254,
            "loss_tokens_upper_95": 5.8510256414544095,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8830521535873412,
            "data_time": 0.04382205009460449,
            "batch_time": 0.0594367843407851,
            "samples_per_second": 1719186.870479857,
            "samples_per_second_per_gpu": 214898.35880998214,
            "loss_sequences_lower_95": 3.739509574890137,
            "loss_sequences_upper_95": 4.11523755645752,
            "loss_tokens_lower_95": 3.5605017482914523,
            "loss_tokens_upper_95": 4.035783143290893,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.867508925581418,
            "data_time": 0.0032454604751493304,
            "batch_time": 0.017047238008620062,
            "samples_per_second": 2240822.441343958,
            "samples_per_second_per_gpu": 280102.80516799475,
            "loss_sequences_lower_95": 5.815807518231832,
            "loss_sequences_upper_95": 5.919887290493859,
            "loss_tokens_lower_95": 5.814450563647486,
            "loss_tokens_upper_95": 5.91995939569073,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.973341246299525,
            "data_time": 0.004498510920787519,
            "batch_time": 0.018412737434100948,
            "samples_per_second": 2209062.9546164083,
            "samples_per_second_per_gpu": 276132.86932705104,
            "loss_sequences_lower_95": 5.916281890244292,
            "loss_sequences_upper_95": 6.02840278244331,
            "loss_tokens_lower_95": 5.916047527241375,
            "loss_tokens_upper_95": 6.030139949964169,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.086748721290854,
            "data_time": 0.0033161660764953744,
            "batch_time": 0.016952906520011357,
            "samples_per_second": 2255737.9580488713,
            "samples_per_second_per_gpu": 281967.2447561089,
            "loss_sequences_lower_95": 4.256405410517546,
            "loss_sequences_upper_95": 4.385469574871209,
            "loss_tokens_lower_95": 3.8975009250075097,
            "loss_tokens_upper_95": 3.955875616410924,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.884597738027573,
            "data_time": 0.010047604329884052,
            "batch_time": 0.02377373445779085,
            "samples_per_second": 2151391.1371332933,
            "samples_per_second_per_gpu": 268923.89214166166,
            "loss_sequences_lower_95": 6.0833318847656255,
            "loss_sequences_upper_95": 6.632648266601563,
            "loss_tokens_lower_95": 5.248475437934315,
            "loss_tokens_upper_95": 5.609934002859241,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.488590240478516,
            "data_time": 0.14287793636322021,
            "batch_time": 0.1611369252204895,
            "samples_per_second": 936321.950999798,
            "samples_per_second_per_gpu": 117040.24387497475,
            "loss_sequences_lower_95": 4.222405326366425,
            "loss_sequences_upper_95": 4.846308588981628,
            "loss_tokens_lower_95": 4.001770361538591,
            "loss_tokens_upper_95": 4.797963495364135,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.372135854315483,
            "data_time": 0.02475275384618881,
            "batch_time": 0.03891364310650115,
            "samples_per_second": 1888990.0822207585,
            "samples_per_second_per_gpu": 236123.76027759482,
            "loss_sequences_lower_95": 5.808376733187972,
            "loss_sequences_upper_95": 6.614013724491514,
            "loss_tokens_lower_95": 3.9688010438319123,
            "loss_tokens_upper_95": 4.445516406965632,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.202355532210797,
            "data_time": 0.0027276641792721218,
            "batch_time": 0.016218727661503687,
            "samples_per_second": 2279361.5912353476,
            "samples_per_second_per_gpu": 284920.19890441844,
            "loss_sequences_lower_95": 3.1855254527000736,
            "loss_sequences_upper_95": 3.219019109310152,
            "loss_tokens_lower_95": 3.185493065936954,
            "loss_tokens_upper_95": 3.2191278366815474,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6474606473665405,
            "data_time": 0.0025526950837661926,
            "batch_time": 0.016258014002464332,
            "samples_per_second": 2265903.9498445163,
            "samples_per_second_per_gpu": 283237.99373056454,
            "loss_sequences_lower_95": 3.615474801162551,
            "loss_sequences_upper_95": 3.796342495800383,
            "loss_tokens_lower_95": 3.441522152268274,
            "loss_tokens_upper_95": 3.616753280367071,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.479153967602349,
            "data_time": 0.017624835173288982,
            "batch_time": 0.032011701001061335,
            "samples_per_second": 1932722.155714482,
            "samples_per_second_per_gpu": 241590.26946431026,
            "loss_sequences_lower_95": 3.334491961692279,
            "loss_sequences_upper_95": 3.7111933110834476,
            "loss_tokens_lower_95": 3.219814416989477,
            "loss_tokens_upper_95": 3.5180499532948373,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.939286728684321,
            "data_time": 0.004589627683162689,
            "batch_time": 0.01827341765165329,
            "samples_per_second": 2232355.063156782,
            "samples_per_second_per_gpu": 279044.3828945978,
            "loss_sequences_lower_95": 3.9734292335179067,
            "loss_sequences_upper_95": 4.122237472714828,
            "loss_tokens_lower_95": 3.7988776097479615,
            "loss_tokens_upper_95": 3.9472403942626912,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.2038446054226015,
            "data_time": 0.027310283411116826,
            "batch_time": 0.041899388744717554,
            "samples_per_second": 1928026.994366621,
            "samples_per_second_per_gpu": 241003.3742958276,
            "loss_sequences_lower_95": 3.0339508289244117,
            "loss_sequences_upper_95": 3.500265056330983,
            "loss_tokens_lower_95": 2.9344180830540116,
            "loss_tokens_upper_95": 3.3184781944187782,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.963086417351372,
            "data_time": 0.002025617916870727,
            "batch_time": 0.015563569400389577,
            "samples_per_second": 2284337.302844563,
            "samples_per_second_per_gpu": 285542.1628555704,
            "loss_sequences_lower_95": 4.952762759583167,
            "loss_sequences_upper_95": 4.973639698642854,
            "loss_tokens_lower_95": 4.9526397271641835,
            "loss_tokens_upper_95": 4.973671423737873,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.1781042073536845,
            "data_time": 0.04391164779663086,
            "batch_time": 0.059091528979214755,
            "samples_per_second": 1698017.9389517454,
            "samples_per_second_per_gpu": 212252.24236896817,
            "loss_sequences_lower_95": 1.1245193555517103,
            "loss_sequences_upper_95": 1.2829494513354254,
            "loss_tokens_lower_95": 1.0007303071973754,
            "loss_tokens_upper_95": 1.238728815115828,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.72207507909094,
            "data_time": 0.0015494156213538145,
            "batch_time": 0.015203904575993809,
            "samples_per_second": 2271029.407209757,
            "samples_per_second_per_gpu": 283878.6759012196,
            "loss_sequences_lower_95": 6.150665864124738,
            "loss_sequences_upper_95": 6.205524641312893,
            "loss_tokens_lower_95": 5.039064784816247,
            "loss_tokens_upper_95": 5.091425108800774,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.0024566359519955,
            "data_time": 0.005268463539698768,
            "batch_time": 0.01944730868415227,
            "samples_per_second": 2165493.276068524,
            "samples_per_second_per_gpu": 270686.6595085655,
            "loss_sequences_lower_95": 6.995322766113281,
            "loss_sequences_upper_95": 7.291043994140625,
            "loss_tokens_lower_95": 6.686705608323666,
            "loss_tokens_upper_95": 6.952131481716454,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.502083032027535,
            "data_time": 0.021098338951498777,
            "batch_time": 0.03512936527446165,
            "samples_per_second": 2026475.3496122826,
            "samples_per_second_per_gpu": 253309.41870153532,
            "loss_sequences_lower_95": 5.331702774711276,
            "loss_sequences_upper_95": 5.6724655284052306,
            "loss_tokens_lower_95": 5.330945553986923,
            "loss_tokens_upper_95": 5.670260699728261,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.978523450909239,
            "data_time": 0.004258630146463233,
            "batch_time": 0.017872893666646565,
            "samples_per_second": 2254975.978505225,
            "samples_per_second_per_gpu": 281871.9973131531,
            "loss_sequences_lower_95": 5.926311090642756,
            "loss_sequences_upper_95": 6.02997910008286,
            "loss_tokens_lower_95": 5.926623544404,
            "loss_tokens_upper_95": 6.028372802734375,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.0460421902338664,
            "data_time": 0.003899683977695222,
            "batch_time": 0.017846419139111294,
            "samples_per_second": 2224025.4496420855,
            "samples_per_second_per_gpu": 278003.1812052607,
            "loss_sequences_lower_95": 1.079640018717448,
            "loss_sequences_upper_95": 1.135340370686849,
            "loss_tokens_lower_95": 0.9750177629645609,
            "loss_tokens_upper_95": 1.030367680275235,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.3450276579175675,
            "data_time": 0.0210422830922263,
            "batch_time": 0.035244520221437724,
            "samples_per_second": 1953770.8330033186,
            "samples_per_second_per_gpu": 244221.35412541483,
            "loss_sequences_lower_95": 5.9991114007859005,
            "loss_sequences_upper_95": 6.693852393740699,
            "loss_tokens_lower_95": 5.998086664109003,
            "loss_tokens_upper_95": 6.690064522879465,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.80902211740613,
            "data_time": 0.1501188725233078,
            "batch_time": 0.1691790372133255,
            "samples_per_second": 932521.9199886543,
            "samples_per_second_per_gpu": 116565.23999858179,
            "loss_sequences_lower_95": 2.5733555793762206,
            "loss_sequences_upper_95": 3.864438021183014,
            "loss_tokens_lower_95": 2.1707949923485823,
            "loss_tokens_upper_95": 2.7671170075406732,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.504902596473694,
            "data_time": 0.005265731187093826,
            "batch_time": 0.01912968735846262,
            "samples_per_second": 2203353.8517999025,
            "samples_per_second_per_gpu": 275419.2314749878,
            "loss_sequences_lower_95": 7.436273986816406,
            "loss_sequences_upper_95": 7.807804614257812,
            "loss_tokens_lower_95": 7.2015796022366745,
            "loss_tokens_upper_95": 7.527577687116645,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.732975253582,
            "data_time": 0.005409980104083107,
            "batch_time": 0.019168684406886027,
            "samples_per_second": 2214323.286549288,
            "samples_per_second_per_gpu": 276790.410818661,
            "loss_sequences_lower_95": 6.822751159667969,
            "loss_sequences_upper_95": 7.069081726074218,
            "loss_tokens_lower_95": 6.493577108827304,
            "loss_tokens_upper_95": 6.691988581016055,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.582556670198815,
            "data_time": 0.003626152663725276,
            "batch_time": 0.017378958730793318,
            "samples_per_second": 2238600.461318802,
            "samples_per_second_per_gpu": 279825.05766485026,
            "loss_sequences_lower_95": 5.554856475103713,
            "loss_sequences_upper_95": 5.610355507380364,
            "loss_tokens_lower_95": 5.5547667413520365,
            "loss_tokens_upper_95": 5.610767804669167,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.209320169440063,
            "data_time": 0.007693228764836521,
            "batch_time": 0.02146351229388188,
            "samples_per_second": 2187208.4915760886,
            "samples_per_second_per_gpu": 273401.0614470111,
            "loss_sequences_lower_95": 5.102000396775394,
            "loss_sequences_upper_95": 5.314711441532258,
            "loss_tokens_lower_95": 5.099013199344758,
            "loss_tokens_upper_95": 5.313378193704398,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.605408092021942,
            "data_time": 0.005482744129877242,
            "batch_time": 0.01916511617009602,
            "samples_per_second": 2228543.7345818505,
            "samples_per_second_per_gpu": 278567.9668227313,
            "loss_sequences_lower_95": 6.5274212890625005,
            "loss_sequences_upper_95": 6.681107995605469,
            "loss_tokens_lower_95": 6.529253247070312,
            "loss_tokens_upper_95": 6.684340112304687,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7227531678290897,
            "data_time": 0.0019726742868838103,
            "batch_time": 0.015637265216200024,
            "samples_per_second": 2266916.1132460353,
            "samples_per_second_per_gpu": 283364.5141557544,
            "loss_sequences_lower_95": 4.345908914528146,
            "loss_sequences_upper_95": 4.456141534117786,
            "loss_tokens_lower_95": 2.9636778569295736,
            "loss_tokens_upper_95": 3.0340476768118414,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.623913767622478,
            "data_time": 0.01766176564352853,
            "batch_time": 0.03231841325759888,
            "samples_per_second": 1969189.961600089,
            "samples_per_second_per_gpu": 246148.74520001112,
            "loss_sequences_lower_95": 5.439914942499417,
            "loss_sequences_upper_95": 5.805727227054425,
            "loss_tokens_lower_95": 5.444138142599988,
            "loss_tokens_upper_95": 5.802761112042328,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.6228613451415415,
            "data_time": 0.009847231209278107,
            "batch_time": 0.024234244599938393,
            "samples_per_second": 2123017.5955477655,
            "samples_per_second_per_gpu": 265377.1994434707,
            "loss_sequences_lower_95": 5.49072540881587,
            "loss_sequences_upper_95": 5.749858925015318,
            "loss_tokens_lower_95": 5.494803634344363,
            "loss_tokens_upper_95": 5.7499397427428,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.472075996849302,
            "data_time": 0.0021452467928650557,
            "batch_time": 0.015832783550463726,
            "samples_per_second": 2258931.1919996175,
            "samples_per_second_per_gpu": 282366.3989999522,
            "loss_sequences_lower_95": 5.000892130151572,
            "loss_sequences_upper_95": 5.1150779023589195,
            "loss_tokens_lower_95": 3.678009833146974,
            "loss_tokens_upper_95": 3.7641692176559007,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.142621232088281,
            "data_time": 0.02519798030455907,
            "batch_time": 0.03996458897988001,
            "samples_per_second": 1977041.420655814,
            "samples_per_second_per_gpu": 247130.17758197675,
            "loss_sequences_lower_95": 5.06231845270389,
            "loss_sequences_upper_95": 5.22297605484251,
            "loss_tokens_lower_95": 5.063323643598608,
            "loss_tokens_upper_95": 5.2216795179579,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.2254400056436525,
            "data_time": 0.0034206316730187197,
            "batch_time": 0.01720390535245157,
            "samples_per_second": 2237897.609964888,
            "samples_per_second_per_gpu": 279737.201245611,
            "loss_sequences_lower_95": 6.1958170423595185,
            "loss_sequences_upper_95": 6.254369594562309,
            "loss_tokens_lower_95": 6.196950885177753,
            "loss_tokens_upper_95": 6.254835835961392,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.691103407480185,
            "data_time": 0.022180830348621717,
            "batch_time": 0.036781601472334426,
            "samples_per_second": 1936873.7164279066,
            "samples_per_second_per_gpu": 242109.21455348833,
            "loss_sequences_lower_95": 5.513540353126896,
            "loss_sequences_upper_95": 5.869295375786939,
            "loss_tokens_lower_95": 5.511330702698347,
            "loss_tokens_upper_95": 5.868966185930864,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.596971243619919,
            "data_time": 0.07041089236736298,
            "batch_time": 0.08615290373563766,
            "samples_per_second": 1382172.3852575773,
            "samples_per_second_per_gpu": 172771.54815719716,
            "loss_sequences_lower_95": 3.2643732452392578,
            "loss_sequences_upper_95": 4.062477995554605,
            "loss_tokens_lower_95": 2.9696087996164957,
            "loss_tokens_upper_95": 4.037133280436198,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.2669045607248943,
            "data_time": 0.07159024477005005,
            "batch_time": 0.08655814081430435,
            "samples_per_second": 1429172.256533935,
            "samples_per_second_per_gpu": 178646.53206674187,
            "loss_sequences_lower_95": 3.0415744018554687,
            "loss_sequences_upper_95": 3.7436018562316895,
            "loss_tokens_lower_95": 2.5261447692185306,
            "loss_tokens_upper_95": 3.5824733348374953,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.804016175782733,
            "data_time": 0.0034340761071306736,
            "batch_time": 0.01698475167497317,
            "samples_per_second": 2269986.139662097,
            "samples_per_second_per_gpu": 283748.26745776215,
            "loss_sequences_lower_95": 5.785501311671576,
            "loss_sequences_upper_95": 5.821918851827135,
            "loss_tokens_lower_95": 5.7862043503773934,
            "loss_tokens_upper_95": 5.821577011805044,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 0.88066921168046,
            "data_time": 0.0014458084377079094,
            "batch_time": 0.015065830592188323,
            "samples_per_second": 2274860.8940693103,
            "samples_per_second_per_gpu": 284357.6117586638,
            "loss_sequences_lower_95": 1.0605675442261335,
            "loss_sequences_upper_95": 1.091332350003526,
            "loss_tokens_lower_95": 0.6845704865593062,
            "loss_tokens_upper_95": 0.699398707982636,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.187485343827976,
            "data_time": 0.03592391312122345,
            "batch_time": 0.05107009410858154,
            "samples_per_second": 1902567.3875913688,
            "samples_per_second_per_gpu": 237820.9234489211,
            "loss_sequences_lower_95": 2.101409107118141,
            "loss_sequences_upper_95": 2.3889912102166124,
            "loss_tokens_lower_95": 1.9538019609931527,
            "loss_tokens_upper_95": 2.0919068972035433,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.772197910257288,
            "data_time": 0.10840372812180292,
            "batch_time": 0.12483576365879603,
            "samples_per_second": 1000375.4684903973,
            "samples_per_second_per_gpu": 125046.93356129967,
            "loss_sequences_lower_95": 3.405823279715873,
            "loss_sequences_upper_95": 4.196719638721363,
            "loss_tokens_lower_95": 3.3028726271641107,
            "loss_tokens_upper_95": 4.188343519045983,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.024635014010639,
            "data_time": 0.027994456745329358,
            "batch_time": 0.04310554833639236,
            "samples_per_second": 1898565.3881995874,
            "samples_per_second_per_gpu": 237320.67352494842,
            "loss_sequences_lower_95": 1.963258343208127,
            "loss_sequences_upper_95": 2.2108293579845895,
            "loss_tokens_lower_95": 1.8277990178524495,
            "loss_tokens_upper_95": 1.9432560926329898,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.0816892603548562,
            "data_time": 0.027162160192217146,
            "batch_time": 0.04236100968860444,
            "samples_per_second": 1844087.3853132906,
            "samples_per_second_per_gpu": 230510.92316416133,
            "loss_sequences_lower_95": 2.0610611985369425,
            "loss_sequences_upper_95": 2.2851503511754476,
            "loss_tokens_lower_95": 1.8779332635717794,
            "loss_tokens_upper_95": 1.9742830906605489,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.032789694826777,
            "data_time": 0.02774312382652646,
            "batch_time": 0.04324299380892799,
            "samples_per_second": 1841673.317633228,
            "samples_per_second_per_gpu": 230209.1647041535,
            "loss_sequences_lower_95": 1.8778686290834008,
            "loss_sequences_upper_95": 2.1521020144951053,
            "loss_tokens_lower_95": 1.9090131903325367,
            "loss_tokens_upper_95": 2.0606606902988425,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.1767945863851685,
            "data_time": 0.027312800997779482,
            "batch_time": 0.041754745301746186,
            "samples_per_second": 1946801.3456317927,
            "samples_per_second_per_gpu": 243350.16820397408,
            "loss_sequences_lower_95": 2.154172906643007,
            "loss_sequences_upper_95": 2.3639401273029605,
            "loss_tokens_lower_95": 1.9732124732662213,
            "loss_tokens_upper_95": 2.065751619428118,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.7759645000007582,
            "data_time": 0.03081700536939833,
            "batch_time": 0.04501411649915907,
            "samples_per_second": 2002804.4477358542,
            "samples_per_second_per_gpu": 250350.55596698177,
            "loss_sequences_lower_95": 1.7219664674367963,
            "loss_sequences_upper_95": 1.8545317987477556,
            "loss_tokens_lower_95": 1.7054792782520474,
            "loss_tokens_upper_95": 1.7761080685370356,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.6190281982828931,
            "data_time": 0.028497227600642612,
            "batch_time": 0.0438801646232605,
            "samples_per_second": 1811576.6470009387,
            "samples_per_second_per_gpu": 226447.08087511733,
            "loss_sequences_lower_95": 1.5972454443210509,
            "loss_sequences_upper_95": 1.7371185581858566,
            "loss_tokens_lower_95": 1.4638361688375903,
            "loss_tokens_upper_95": 1.523386608673335,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-2.0/params.txt",
    "uuid": "3cee7676-e579-48b7-ae42-c5a12e3821e0",
    "creation_date": "2023_12_14-06_31_11"
}