{
    "name": "c4_original-d=512_l=8_h=4-16.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 25252495360,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 16.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "5050499072",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=512_l=8_h=4-16.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 4.161810634533564,
            "data_time": 0.03376055136322975,
            "batch_time": 0.3527144007384777,
            "samples_per_second": 1755661.783997986,
            "samples_per_second_per_gpu": 219457.72299974825,
            "loss_sequences_lower_95": 4.033459199269613,
            "loss_sequences_upper_95": 4.289619445800781,
            "loss_tokens_lower_95": 4.14541394551595,
            "loss_tokens_upper_95": 4.1779842821757,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.372367425963273,
            "data_time": 0.0014511684197564799,
            "batch_time": 0.015196083736507186,
            "samples_per_second": 2263315.7943798313,
            "samples_per_second_per_gpu": 282914.4742974789,
            "loss_sequences_lower_95": 3.3695414523259095,
            "loss_sequences_upper_95": 3.3751084663496043,
            "loss_tokens_lower_95": 3.3616209322916664,
            "loss_tokens_upper_95": 3.3830175156249997,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6972989919234296,
            "data_time": 0.01000368309020996,
            "batch_time": 0.02415619373321533,
            "samples_per_second": 2149920.857863816,
            "samples_per_second_per_gpu": 268740.107232977,
            "loss_sequences_lower_95": 3.674961529167331,
            "loss_sequences_upper_95": 3.7211255817024074,
            "loss_tokens_lower_95": 3.68188828125,
            "loss_tokens_upper_95": 3.71324296875,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.3643962196959665,
            "data_time": 0.0016322731971740723,
            "batch_time": 0.01501863743913801,
            "samples_per_second": 2327805.382049053,
            "samples_per_second_per_gpu": 290975.6727561316,
            "loss_sequences_lower_95": 3.353304007933312,
            "loss_sequences_upper_95": 3.3757779025048325,
            "loss_tokens_lower_95": 3.3531983072916667,
            "loss_tokens_upper_95": 3.3752903020833336,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.3902748699343617,
            "data_time": 0.01030569247515553,
            "batch_time": 0.024661260772036366,
            "samples_per_second": 2111434.4003333286,
            "samples_per_second_per_gpu": 263929.3000416661,
            "loss_sequences_lower_95": 3.355746161622088,
            "loss_sequences_upper_95": 3.424999229291306,
            "loss_tokens_lower_95": 3.379332380208333,
            "loss_tokens_upper_95": 3.400715583333333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.873045581607572,
            "data_time": 0.0039467050329498625,
            "batch_time": 0.017387341545975727,
            "samples_per_second": 2315033.1333196205,
            "samples_per_second_per_gpu": 289379.14166495256,
            "loss_sequences_lower_95": 3.830109294007053,
            "loss_sequences_upper_95": 3.917104946942981,
            "loss_tokens_lower_95": 3.86019603125,
            "loss_tokens_upper_95": 3.8857630520833335,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6032334083683635,
            "data_time": 0.0016297194347101556,
            "batch_time": 0.015093804669808019,
            "samples_per_second": 2325657.4509222917,
            "samples_per_second_per_gpu": 290707.18136528647,
            "loss_sequences_lower_95": 3.5688287727200256,
            "loss_sequences_upper_95": 3.6365201988998725,
            "loss_tokens_lower_95": 3.587096947916667,
            "loss_tokens_upper_95": 3.61954840625,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.083205883765096,
            "data_time": 0.001645292312417854,
            "batch_time": 0.014989329202062463,
            "samples_per_second": 2342021.7720326344,
            "samples_per_second_per_gpu": 292752.7215040793,
            "loss_sequences_lower_95": 4.069668684554974,
            "loss_sequences_upper_95": 4.098182141688482,
            "loss_tokens_lower_95": 4.0707801875,
            "loss_tokens_upper_95": 4.095688864583333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.786212984139357,
            "data_time": 0.010566945113832988,
            "batch_time": 0.02510081302551996,
            "samples_per_second": 2186009.3245251975,
            "samples_per_second_per_gpu": 273251.1655656497,
            "loss_sequences_lower_95": 3.734542623380335,
            "loss_sequences_upper_95": 3.846910870559816,
            "loss_tokens_lower_95": 3.7745908020833334,
            "loss_tokens_upper_95": 3.797792708333333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.9020934199156025,
            "data_time": 0.010618551634252071,
            "batch_time": 0.03135236445814371,
            "samples_per_second": 2209549.364895341,
            "samples_per_second_per_gpu": 276193.6706119176,
            "loss_sequences_lower_95": 4.849264309246078,
            "loss_sequences_upper_95": 4.968933600022388,
            "loss_tokens_lower_95": 4.888299479166666,
            "loss_tokens_upper_95": 4.916297239583333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7415824511516123,
            "data_time": 0.0012809484747868102,
            "batch_time": 0.014777206335697608,
            "samples_per_second": 2324946.400138675,
            "samples_per_second_per_gpu": 290618.30001733435,
            "loss_sequences_lower_95": 3.7342415373505378,
            "loss_sequences_upper_95": 3.7489909300268947,
            "loss_tokens_lower_95": 3.729971072916667,
            "loss_tokens_upper_95": 3.75309859375,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5577653266647764,
            "data_time": 0.0026503851967588452,
            "batch_time": 0.016058964693576074,
            "samples_per_second": 2331294.722834704,
            "samples_per_second_per_gpu": 291411.840354338,
            "loss_sequences_lower_95": 3.5487508053285093,
            "loss_sequences_upper_95": 3.566703245392545,
            "loss_tokens_lower_95": 3.54615265625,
            "loss_tokens_upper_95": 3.5692962604166665,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.2383216583221,
            "data_time": 0.010097166295108116,
            "batch_time": 0.02376869356208168,
            "samples_per_second": 2180904.690030047,
            "samples_per_second_per_gpu": 272613.0862537559,
            "loss_sequences_lower_95": 4.188892418666012,
            "loss_sequences_upper_95": 4.296441415163857,
            "loss_tokens_lower_95": 4.22421946875,
            "loss_tokens_upper_95": 4.252061010416667,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.379261640327283,
            "data_time": 0.010502192128701989,
            "batch_time": 0.02438528984191408,
            "samples_per_second": 2178404.5253235786,
            "samples_per_second_per_gpu": 272300.5656654473,
            "loss_sequences_lower_95": 3.316484029424166,
            "loss_sequences_upper_95": 3.4419434712520682,
            "loss_tokens_lower_95": 3.3671265052083332,
            "loss_tokens_upper_95": 3.3912282135416665,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.951091679659757,
            "data_time": 0.08891259772436959,
            "batch_time": 0.10529932805470057,
            "samples_per_second": 949693.2363629501,
            "samples_per_second_per_gpu": 118711.65454536876,
            "loss_sequences_lower_95": 4.876524465734308,
            "loss_sequences_upper_95": 5.026997522874312,
            "loss_tokens_lower_95": 4.920557368885388,
            "loss_tokens_upper_95": 4.981739165566185,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9778445886105906,
            "data_time": 0.014387274330312555,
            "batch_time": 0.02831018783829429,
            "samples_per_second": 2133282.847580313,
            "samples_per_second_per_gpu": 266660.35594753915,
            "loss_sequences_lower_95": 3.8881184035765535,
            "loss_sequences_upper_95": 4.066421811295668,
            "loss_tokens_lower_95": 3.96350553125,
            "loss_tokens_upper_95": 3.992081677083333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.027961281799075,
            "data_time": 0.013247419148683548,
            "batch_time": 0.027126137167215347,
            "samples_per_second": 2177898.920754996,
            "samples_per_second_per_gpu": 272237.3650943745,
            "loss_sequences_lower_95": 5.959920644634318,
            "loss_sequences_upper_95": 6.103720970455764,
            "loss_tokens_lower_95": 6.01550490625,
            "loss_tokens_upper_95": 6.040595427083334,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.115154029893094,
            "data_time": 0.03821713104844093,
            "batch_time": 0.052717532962560654,
            "samples_per_second": 1867442.5934004043,
            "samples_per_second_per_gpu": 233430.32417505054,
            "loss_sequences_lower_95": 4.003894118011974,
            "loss_sequences_upper_95": 4.299810553378746,
            "loss_tokens_lower_95": 4.100143563942831,
            "loss_tokens_upper_95": 4.1302438392013805,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.370335566999226,
            "data_time": 0.001934557270623557,
            "batch_time": 0.01555199910948108,
            "samples_per_second": 2275748.310149274,
            "samples_per_second_per_gpu": 284468.53876865923,
            "loss_sequences_lower_95": 5.34908199508617,
            "loss_sequences_upper_95": 5.392350751873843,
            "loss_tokens_lower_95": 5.348520203897237,
            "loss_tokens_upper_95": 5.391868034289987,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.086836316485349,
            "data_time": 0.002036496000305103,
            "batch_time": 0.015679314020712664,
            "samples_per_second": 2271962.995564835,
            "samples_per_second_per_gpu": 283995.3744456044,
            "loss_sequences_lower_95": 3.09218506880477,
            "loss_sequences_upper_95": 3.1182055873145287,
            "loss_tokens_lower_95": 3.0627825823564363,
            "loss_tokens_upper_95": 3.0820171273223127,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.71795369821953,
            "data_time": 0.003139731796365319,
            "batch_time": 0.01691906346564693,
            "samples_per_second": 2247548.2628062875,
            "samples_per_second_per_gpu": 280943.53285078594,
            "loss_sequences_lower_95": 4.955102181558514,
            "loss_sequences_upper_95": 5.256540120250502,
            "loss_tokens_lower_95": 4.185835120324041,
            "loss_tokens_upper_95": 4.403872546145312,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.94709580480059,
            "data_time": 0.004173216984627095,
            "batch_time": 0.01798259546148016,
            "samples_per_second": 2226694.8641704195,
            "samples_per_second_per_gpu": 278336.85802130244,
            "loss_sequences_lower_95": 5.084084415690104,
            "loss_sequences_upper_95": 5.298151708984375,
            "loss_tokens_lower_95": 4.592100530660377,
            "loss_tokens_upper_95": 4.739099265428459,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.422771788619521,
            "data_time": 0.004835833251745992,
            "batch_time": 0.018558488891854723,
            "samples_per_second": 2240956.382029094,
            "samples_per_second_per_gpu": 280119.5477536367,
            "loss_sequences_lower_95": 3.4643297220017533,
            "loss_sequences_upper_95": 3.5322059428899735,
            "loss_tokens_lower_95": 3.326183709829639,
            "loss_tokens_upper_95": 3.3594039884006066,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8895598465746097,
            "data_time": 0.02421631557600839,
            "batch_time": 0.03885449681963239,
            "samples_per_second": 1979795.2494542263,
            "samples_per_second_per_gpu": 247474.4061817783,
            "loss_sequences_lower_95": 3.7827295268665657,
            "loss_sequences_upper_95": 4.063817006891424,
            "loss_tokens_lower_95": 3.7767828214753605,
            "loss_tokens_upper_95": 3.861184991622997,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.897968258176531,
            "data_time": 0.021543873474001884,
            "batch_time": 0.03651588410139084,
            "samples_per_second": 1906860.7097003073,
            "samples_per_second_per_gpu": 238357.58871253842,
            "loss_sequences_lower_95": 3.8880592688735653,
            "loss_sequences_upper_95": 4.116373839086416,
            "loss_tokens_lower_95": 3.7680220080293667,
            "loss_tokens_upper_95": 3.87222970971564,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.453385992050171,
            "data_time": 0.01771231798025278,
            "batch_time": 0.03197966172144963,
            "samples_per_second": 2011530.355173419,
            "samples_per_second_per_gpu": 251441.29439667737,
            "loss_sequences_lower_95": 4.421258555094401,
            "loss_sequences_upper_95": 4.547352762858073,
            "loss_tokens_lower_95": 4.292356044547667,
            "loss_tokens_upper_95": 4.521105179945825,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.034220279562021,
            "data_time": 0.0017286756237875863,
            "batch_time": 0.015342353425290146,
            "samples_per_second": 2276161.190984265,
            "samples_per_second_per_gpu": 284520.1488730331,
            "loss_sequences_lower_95": 6.045619140913341,
            "loss_sequences_upper_95": 6.124681277603833,
            "loss_tokens_lower_95": 5.893649707418534,
            "loss_tokens_upper_95": 5.97574299753034,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.808818123155973,
            "data_time": 0.0028798904194927855,
            "batch_time": 0.016484918410346012,
            "samples_per_second": 2270175.6490453193,
            "samples_per_second_per_gpu": 283771.9561306649,
            "loss_sequences_lower_95": 5.4305114026824235,
            "loss_sequences_upper_95": 5.7563829415575025,
            "loss_tokens_lower_95": 3.989888642161245,
            "loss_tokens_upper_95": 4.133771988178361,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.362357813445374,
            "data_time": 0.005331968133513992,
            "batch_time": 0.01914141991653958,
            "samples_per_second": 2208429.0265049958,
            "samples_per_second_per_gpu": 276053.62831312447,
            "loss_sequences_lower_95": 4.841779022737574,
            "loss_sequences_upper_95": 5.222863779946806,
            "loss_tokens_lower_95": 3.9131765507774685,
            "loss_tokens_upper_95": 4.0824030937399165,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.899117032142534,
            "data_time": 0.024248838424682617,
            "batch_time": 0.03875191935471126,
            "samples_per_second": 1958693.6199766765,
            "samples_per_second_per_gpu": 244836.70249708457,
            "loss_sequences_lower_95": 5.800551462391195,
            "loss_sequences_upper_95": 5.9957184813338325,
            "loss_tokens_lower_95": 5.799726177894906,
            "loss_tokens_upper_95": 5.996853456540739,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5507659268379212,
            "data_time": 0.04939809670815101,
            "batch_time": 0.06375301342744094,
            "samples_per_second": 1815029.7210060032,
            "samples_per_second_per_gpu": 226878.7151257504,
            "loss_sequences_lower_95": 3.4115456161499025,
            "loss_sequences_upper_95": 3.7930312423706054,
            "loss_tokens_lower_95": 3.2374126782357586,
            "loss_tokens_upper_95": 3.6917982070731776,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.403355131861863,
            "data_time": 0.003338909831758901,
            "batch_time": 0.01715196519785376,
            "samples_per_second": 2246316.325644241,
            "samples_per_second_per_gpu": 280789.54070553015,
            "loss_sequences_lower_95": 5.34146344937468,
            "loss_sequences_upper_95": 5.466528232851682,
            "loss_tokens_lower_95": 5.339913993530899,
            "loss_tokens_upper_95": 5.466092140720957,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.418196709404023,
            "data_time": 0.005001246637465709,
            "batch_time": 0.01872673003552983,
            "samples_per_second": 2233627.2930911146,
            "samples_per_second_per_gpu": 279203.4116363893,
            "loss_sequences_lower_95": 5.3556821381040125,
            "loss_sequences_upper_95": 5.479602360705877,
            "loss_tokens_lower_95": 5.354265792559249,
            "loss_tokens_upper_95": 5.48130890567414,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.498994362691541,
            "data_time": 0.003663330819642827,
            "batch_time": 0.017442064717830386,
            "samples_per_second": 2231709.896166512,
            "samples_per_second_per_gpu": 278963.737020814,
            "loss_sequences_lower_95": 3.660993640389095,
            "loss_sequences_upper_95": 3.787197520657644,
            "loss_tokens_lower_95": 3.30973048253711,
            "loss_tokens_upper_95": 3.3650987821973013,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.593441911935806,
            "data_time": 0.010885894298553467,
            "batch_time": 0.024754773825407028,
            "samples_per_second": 2131757.34954168,
            "samples_per_second_per_gpu": 266469.66869271,
            "loss_sequences_lower_95": 5.78698837890625,
            "loss_sequences_upper_95": 6.361471887207031,
            "loss_tokens_lower_95": 4.9712988216619785,
            "loss_tokens_upper_95": 5.340351097162475,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.806530848145485,
            "data_time": 0.155678853392601,
            "batch_time": 0.17203398048877716,
            "samples_per_second": 935849.9716095593,
            "samples_per_second_per_gpu": 116981.24645119491,
            "loss_sequences_lower_95": 3.5701292395591735,
            "loss_sequences_upper_95": 4.11155743598938,
            "loss_tokens_lower_95": 3.390727505738708,
            "loss_tokens_upper_95": 4.107831687488775,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.375233960562739,
            "data_time": 0.029013184790915632,
            "batch_time": 0.043874210499702616,
            "samples_per_second": 1797597.1538441342,
            "samples_per_second_per_gpu": 224699.64423051677,
            "loss_sequences_lower_95": 5.893365759137033,
            "loss_sequences_upper_95": 6.829719368068651,
            "loss_tokens_lower_95": 3.7145504853636733,
            "loss_tokens_upper_95": 4.180225620893373,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.9179410690717584,
            "data_time": 0.003015824697083897,
            "batch_time": 0.01657718068195714,
            "samples_per_second": 2269308.503803734,
            "samples_per_second_per_gpu": 283663.5629754668,
            "loss_sequences_lower_95": 2.891098205113109,
            "loss_sequences_upper_95": 2.9446345964650913,
            "loss_tokens_lower_95": 2.890218584253154,
            "loss_tokens_upper_95": 2.944811025130379,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.315068046960666,
            "data_time": 0.00266632424984785,
            "batch_time": 0.016443040882565044,
            "samples_per_second": 2250680.893517542,
            "samples_per_second_per_gpu": 281335.11168969277,
            "loss_sequences_lower_95": 3.284988075817485,
            "loss_sequences_upper_95": 3.4523203829989813,
            "loss_tokens_lower_95": 3.1354505360235145,
            "loss_tokens_upper_95": 3.30045594574542,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.335071738822993,
            "data_time": 0.019085342685381573,
            "batch_time": 0.03371143506632911,
            "samples_per_second": 1958783.936611157,
            "samples_per_second_per_gpu": 244847.9920763946,
            "loss_sequences_lower_95": 3.1804786262931404,
            "loss_sequences_upper_95": 3.57352792928507,
            "loss_tokens_lower_95": 3.093838932908982,
            "loss_tokens_upper_95": 3.390681569031496,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.682157641145876,
            "data_time": 0.005089149251580239,
            "batch_time": 0.019534217566251753,
            "samples_per_second": 2122558.4667404722,
            "samples_per_second_per_gpu": 265319.80834255903,
            "loss_sequences_lower_95": 3.7093191386210043,
            "loss_sequences_upper_95": 3.8530803629236137,
            "loss_tokens_lower_95": 3.540928326977223,
            "loss_tokens_upper_95": 3.686296746630211,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.0124881790905462,
            "data_time": 0.03224029427482968,
            "batch_time": 0.0468572009177435,
            "samples_per_second": 1921822.7912155702,
            "samples_per_second_per_gpu": 240227.84890194627,
            "loss_sequences_lower_95": 2.849105030152856,
            "loss_sequences_upper_95": 3.3447839783459172,
            "loss_tokens_lower_95": 2.713152588288897,
            "loss_tokens_upper_95": 3.0829841451971607,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.913388351794123,
            "data_time": 0.0021549418979390257,
            "batch_time": 0.015836427890916275,
            "samples_per_second": 2259437.9535705177,
            "samples_per_second_per_gpu": 282429.7441963147,
            "loss_sequences_lower_95": 4.899087737469369,
            "loss_sequences_upper_95": 4.927470582007026,
            "loss_tokens_lower_95": 4.899529183180386,
            "loss_tokens_upper_95": 4.927246963064488,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 0.9446213392956743,
            "data_time": 0.050572208924727004,
            "batch_time": 0.06513395309448242,
            "samples_per_second": 1801909.6902251549,
            "samples_per_second_per_gpu": 225238.71127814436,
            "loss_sequences_lower_95": 0.8961496927205799,
            "loss_sequences_upper_95": 1.0290883314262316,
            "loss_tokens_lower_95": 0.8047242444759748,
            "loss_tokens_upper_95": 0.9942457060486901,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.972343485363499,
            "data_time": 0.0017074903118840207,
            "batch_time": 0.015250196101179867,
            "samples_per_second": 2284913.2008902165,
            "samples_per_second_per_gpu": 285614.15011127706,
            "loss_sequences_lower_95": 5.36304548283543,
            "loss_sequences_upper_95": 5.409951335659067,
            "loss_tokens_lower_95": 4.379523815280464,
            "loss_tokens_upper_95": 4.427809054642166,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.23215624666214,
            "data_time": 0.0058259959258730445,
            "batch_time": 0.019585512933276948,
            "samples_per_second": 2219157.556432747,
            "samples_per_second_per_gpu": 277394.69455409335,
            "loss_sequences_lower_95": 7.208294616699218,
            "loss_sequences_upper_95": 7.549038037109375,
            "loss_tokens_lower_95": 6.904178108988302,
            "loss_tokens_upper_95": 7.21293994404969,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.6390994921974515,
            "data_time": 0.02247401213241836,
            "batch_time": 0.037562243009017685,
            "samples_per_second": 1906562.9546867611,
            "samples_per_second_per_gpu": 238320.36933584514,
            "loss_sequences_lower_95": 5.447877860691237,
            "loss_sequences_upper_95": 5.835118089758832,
            "loss_tokens_lower_95": 5.445073640242867,
            "loss_tokens_upper_95": 5.828376517917799,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.126613347458117,
            "data_time": 0.004823927060667291,
            "batch_time": 0.018762991370924985,
            "samples_per_second": 2208777.2325987476,
            "samples_per_second_per_gpu": 276097.15407484345,
            "loss_sequences_lower_95": 6.086117757161459,
            "loss_sequences_upper_95": 6.165928566672585,
            "loss_tokens_lower_95": 6.086101998993845,
            "loss_tokens_upper_95": 6.166383824203954,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 0.8741038254102071,
            "data_time": 0.0041149594682328245,
            "batch_time": 0.017749888782805586,
            "samples_per_second": 2259424.9947393867,
            "samples_per_second_per_gpu": 282428.12434242334,
            "loss_sequences_lower_95": 0.894498252360026,
            "loss_sequences_upper_95": 0.9324054748535157,
            "loss_tokens_lower_95": 0.8213927758603441,
            "loss_tokens_upper_95": 0.8678062846232243,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.427892147927057,
            "data_time": 0.024820915290287564,
            "batch_time": 0.03972481404032026,
            "samples_per_second": 1862394.4401623625,
            "samples_per_second_per_gpu": 232799.30502029532,
            "loss_sequences_lower_95": 6.118447091238839,
            "loss_sequences_upper_95": 6.736645129975818,
            "loss_tokens_lower_95": 6.116891159784227,
            "loss_tokens_upper_95": 6.742823893229167,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.68281739205122,
            "data_time": 0.15283511579036713,
            "batch_time": 0.17043328285217285,
            "samples_per_second": 1035850.7308187187,
            "samples_per_second_per_gpu": 129481.34135233983,
            "loss_sequences_lower_95": 2.4103115797042847,
            "loss_sequences_upper_95": 3.6476192712783813,
            "loss_tokens_lower_95": 2.0507640846488404,
            "loss_tokens_upper_95": 2.5919983098924773,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.319920742273331,
            "data_time": 0.006095229160218011,
            "batch_time": 0.019747845237217253,
            "samples_per_second": 2228232.7128005787,
            "samples_per_second_per_gpu": 278529.08910007233,
            "loss_sequences_lower_95": 7.2639110595703125,
            "loss_sequences_upper_95": 7.604123120117188,
            "loss_tokens_lower_95": 7.022132339090261,
            "loss_tokens_upper_95": 7.321267188987151,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.767825087547302,
            "data_time": 0.0059416114337860594,
            "batch_time": 0.020207803873788743,
            "samples_per_second": 2192820.797493202,
            "samples_per_second_per_gpu": 274102.59968665027,
            "loss_sequences_lower_95": 6.853610498046875,
            "loss_sequences_upper_95": 7.066281396484375,
            "loss_tokens_lower_95": 6.531115887698736,
            "loss_tokens_upper_95": 6.72341625230815,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.908474052747623,
            "data_time": 0.004264787048799138,
            "batch_time": 0.018019886080636628,
            "samples_per_second": 2235264.8554502795,
            "samples_per_second_per_gpu": 279408.10693128494,
            "loss_sequences_lower_95": 4.868743509771518,
            "loss_sequences_upper_95": 4.948965573798399,
            "loss_tokens_lower_95": 4.868941138456461,
            "loss_tokens_upper_95": 4.949077372690035,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.3952618758616175,
            "data_time": 0.008916457254000662,
            "batch_time": 0.02275014427853495,
            "samples_per_second": 2164352.439585997,
            "samples_per_second_per_gpu": 270544.0549482496,
            "loss_sequences_lower_95": 5.280947786908362,
            "loss_sequences_upper_95": 5.506499625351022,
            "loss_tokens_lower_95": 5.2782958421839,
            "loss_tokens_upper_95": 5.504401206677227,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.49849003982544,
            "data_time": 0.005914472398303804,
            "batch_time": 0.02003575601275005,
            "samples_per_second": 2162647.8068710626,
            "samples_per_second_per_gpu": 270330.9758588828,
            "loss_sequences_lower_95": 8.416758154296875,
            "loss_sequences_upper_95": 8.580127856445312,
            "loss_tokens_lower_95": 8.416775317382813,
            "loss_tokens_upper_95": 8.581874389648437,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6255240823484978,
            "data_time": 0.0021539605182150138,
            "batch_time": 0.01610852135151681,
            "samples_per_second": 2222673.576585545,
            "samples_per_second_per_gpu": 277834.19707319315,
            "loss_sequences_lower_95": 4.223946291834201,
            "loss_sequences_upper_95": 4.333787473391674,
            "loss_tokens_lower_95": 2.9065978600355113,
            "loss_tokens_upper_95": 2.975884884013885,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.956089408540014,
            "data_time": 0.019732533182416644,
            "batch_time": 0.03584799085344587,
            "samples_per_second": 1895024.876210003,
            "samples_per_second_per_gpu": 236878.10952625037,
            "loss_sequences_lower_95": 5.733253228486474,
            "loss_sequences_upper_95": 6.176128307741079,
            "loss_tokens_lower_95": 5.736576433324102,
            "loss_tokens_upper_95": 6.17749034824656,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.880319568222644,
            "data_time": 0.01111003290861845,
            "batch_time": 0.025203706696629524,
            "samples_per_second": 2153560.653654347,
            "samples_per_second_per_gpu": 269195.0817067934,
            "loss_sequences_lower_95": 5.719380373487286,
            "loss_sequences_upper_95": 6.037737103630515,
            "loss_tokens_lower_95": 5.721431525735294,
            "loss_tokens_upper_95": 6.035021027209712,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.155100935296607,
            "data_time": 0.0022575492768265003,
            "batch_time": 0.015889502966752737,
            "samples_per_second": 2269260.2550592585,
            "samples_per_second_per_gpu": 283657.5318824073,
            "loss_sequences_lower_95": 4.640593769083521,
            "loss_sequences_upper_95": 4.754464932317112,
            "loss_tokens_lower_95": 3.4432179440325186,
            "loss_tokens_upper_95": 3.5253958377250063,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.412759480652986,
            "data_time": 0.02823365479707718,
            "batch_time": 0.042737955848375954,
            "samples_per_second": 1971562.2772254355,
            "samples_per_second_per_gpu": 246445.28465317943,
            "loss_sequences_lower_95": 5.231575528906767,
            "loss_sequences_upper_95": 5.585599287729415,
            "loss_tokens_lower_95": 5.2365801775896985,
            "loss_tokens_upper_95": 5.585315877924521,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.482082894891045,
            "data_time": 0.004160263308384189,
            "batch_time": 0.017802137012737865,
            "samples_per_second": 2253487.173628745,
            "samples_per_second_per_gpu": 281685.89670359314,
            "loss_sequences_lower_95": 4.4371224331637045,
            "loss_sequences_upper_95": 4.527084333787271,
            "loss_tokens_lower_95": 4.436816816884079,
            "loss_tokens_upper_95": 4.525706477267297,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.9819895329984645,
            "data_time": 0.026222957264293325,
            "batch_time": 0.041618665781888095,
            "samples_per_second": 1785727.981015303,
            "samples_per_second_per_gpu": 223215.99762691287,
            "loss_sequences_lower_95": 5.7648633345816895,
            "loss_sequences_upper_95": 6.196613141402458,
            "loss_tokens_lower_95": 5.766990661621094,
            "loss_tokens_upper_95": 6.200558056877655,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.283180738488833,
            "data_time": 0.08227302134037018,
            "batch_time": 0.09829133749008179,
            "samples_per_second": 1345634.9115238818,
            "samples_per_second_per_gpu": 168204.36394048523,
            "loss_sequences_lower_95": 3.0000812403361,
            "loss_sequences_upper_95": 3.7085757509867348,
            "loss_tokens_lower_95": 2.7052575641208225,
            "loss_tokens_upper_95": 3.6092711554633246,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.8582006971041363,
            "data_time": 0.08685334771871567,
            "batch_time": 0.10198289901018143,
            "samples_per_second": 1466138.535972108,
            "samples_per_second_per_gpu": 183267.3169965135,
            "loss_sequences_lower_95": 2.692158260345459,
            "loss_sequences_upper_95": 3.2595759137471516,
            "loss_tokens_lower_95": 2.2086581712358457,
            "loss_tokens_upper_95": 3.097975973064979,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.955897544726005,
            "data_time": 0.0038494071657339075,
            "batch_time": 0.017737250527330817,
            "samples_per_second": 2222676.1491946494,
            "samples_per_second_per_gpu": 277834.5186493312,
            "loss_sequences_lower_95": 3.9372094474871133,
            "loss_sequences_upper_95": 3.975241307299337,
            "loss_tokens_lower_95": 3.936494881316734,
            "loss_tokens_upper_95": 3.9746570453447165,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 0.776476027956,
            "data_time": 0.0015336726211353778,
            "batch_time": 0.015269551458778736,
            "samples_per_second": 2256045.880159527,
            "samples_per_second_per_gpu": 282005.7350199409,
            "loss_sequences_lower_95": 0.9380075797625103,
            "loss_sequences_upper_95": 0.9647129946055657,
            "loss_tokens_lower_95": 0.6128484833079888,
            "loss_tokens_upper_95": 0.6255726357773103,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.61473494251882,
            "data_time": 0.04433784261345863,
            "batch_time": 0.05969109386205673,
            "samples_per_second": 1837829.9206269195,
            "samples_per_second_per_gpu": 229728.74007836494,
            "loss_sequences_lower_95": 4.6501987247016485,
            "loss_sequences_upper_95": 5.041650763083631,
            "loss_tokens_lower_95": 4.254955017194793,
            "loss_tokens_upper_95": 4.461055978068964,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.316598222062394,
            "data_time": 0.11897168840680804,
            "batch_time": 0.1351671332404727,
            "samples_per_second": 997645.5184602411,
            "samples_per_second_per_gpu": 124705.68980753014,
            "loss_sequences_lower_95": 6.8588560568319785,
            "loss_sequences_upper_95": 7.994752419961465,
            "loss_tokens_lower_95": 6.588385122793692,
            "loss_tokens_upper_95": 7.724118946216724,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.442952252015835,
            "data_time": 0.03217896393367222,
            "batch_time": 0.04633070457549322,
            "samples_per_second": 1972368.4186101393,
            "samples_per_second_per_gpu": 246546.0523262674,
            "loss_sequences_lower_95": 4.408482323623286,
            "loss_sequences_upper_95": 4.768407830959413,
            "loss_tokens_lower_95": 4.064139540843145,
            "loss_tokens_upper_95": 4.238979652347035,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.546560931496504,
            "data_time": 0.03293942553656442,
            "batch_time": 0.049106262979053315,
            "samples_per_second": 1744039.2845724714,
            "samples_per_second_per_gpu": 218004.91057155892,
            "loss_sequences_lower_95": 4.536923254989996,
            "loss_sequences_upper_95": 4.867564615389196,
            "loss_tokens_lower_95": 4.184217252000997,
            "loss_tokens_upper_95": 4.329338459360707,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.666304070775102,
            "data_time": 0.032119197504861016,
            "batch_time": 0.04690247206460862,
            "samples_per_second": 1907725.9342597283,
            "samples_per_second_per_gpu": 238465.74178246604,
            "loss_sequences_lower_95": 4.601210021972657,
            "loss_sequences_upper_95": 5.023910578285776,
            "loss_tokens_lower_95": 4.278138961923609,
            "loss_tokens_upper_95": 4.514393971088359,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.586131548009267,
            "data_time": 0.035620081992376415,
            "batch_time": 0.05009611731483823,
            "samples_per_second": 1884611.0351445961,
            "samples_per_second_per_gpu": 235576.37939307452,
            "loss_sequences_lower_95": 4.561740121608827,
            "loss_sequences_upper_95": 4.854960213637933,
            "loss_tokens_lower_95": 4.265142698674188,
            "loss_tokens_upper_95": 4.39754963812427,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.922255359081007,
            "data_time": 0.03343695475731367,
            "batch_time": 0.04876283951747565,
            "samples_per_second": 1886294.5316193223,
            "samples_per_second_per_gpu": 235786.81645241528,
            "loss_sequences_lower_95": 4.904203237213704,
            "loss_sequences_upper_95": 5.20897209214868,
            "loss_tokens_lower_95": 4.656773636297163,
            "loss_tokens_upper_95": 4.7725978676148,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.993829735895482,
            "data_time": 0.03248429298400879,
            "batch_time": 0.04877403804234096,
            "samples_per_second": 1718456.0645012408,
            "samples_per_second_per_gpu": 214807.0080626551,
            "loss_sequences_lower_95": 5.0314829198325555,
            "loss_sequences_upper_95": 5.358778976812595,
            "loss_tokens_lower_95": 4.636502717186558,
            "loss_tokens_upper_95": 4.769904945562085,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-16.0/params.txt",
    "uuid": "b33ca1f4-24cf-4494-895d-0db8f5321f9b",
    "creation_date": "2023_12_14-04_59_31"
}