{
    "name": "c4_original-d=512_l=8_h=4-0.25",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 394570240,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.25
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "78914048",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=512_l=8_h=4-0.25",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 6.010154406229655,
            "data_time": 0.03422216698527336,
            "batch_time": 0.3554202802479267,
            "samples_per_second": 1705890.8379555563,
            "samples_per_second_per_gpu": 213236.35474444454,
            "loss_sequences_lower_95": 5.827329546610515,
            "loss_sequences_upper_95": 6.194333178202312,
            "loss_tokens_lower_95": 5.99523696899414,
            "loss_tokens_upper_95": 6.024817021687826,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.838835880174883,
            "data_time": 0.0015458675299527052,
            "batch_time": 0.015507707774654555,
            "samples_per_second": 2223958.6211928837,
            "samples_per_second_per_gpu": 277994.82764911046,
            "loss_sequences_lower_95": 4.8365907154054755,
            "loss_sequences_upper_95": 4.840997780120943,
            "loss_tokens_lower_95": 4.827928333333333,
            "loss_tokens_upper_95": 4.849887604166667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.914589289256504,
            "data_time": 0.01093670654296875,
            "batch_time": 0.02536546802520752,
            "samples_per_second": 2100046.315952179,
            "samples_per_second_per_gpu": 262505.7894940224,
            "loss_sequences_lower_95": 5.884679092095823,
            "loss_sequences_upper_95": 5.949978413484534,
            "loss_tokens_lower_95": 5.901235375000001,
            "loss_tokens_upper_95": 5.928139739583333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.957269642131845,
            "data_time": 0.0016184915837488677,
            "batch_time": 0.015064838783521401,
            "samples_per_second": 2316771.395075383,
            "samples_per_second_per_gpu": 289596.4243844229,
            "loss_sequences_lower_95": 4.9338587105347935,
            "loss_sequences_upper_95": 4.981676496053479,
            "loss_tokens_lower_95": 4.945724729166667,
            "loss_tokens_upper_95": 4.96880459375,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.851833287178863,
            "data_time": 0.010516369960222587,
            "batch_time": 0.024322100369578815,
            "samples_per_second": 2186753.78194291,
            "samples_per_second_per_gpu": 273344.22274286376,
            "loss_sequences_lower_95": 4.810623734546048,
            "loss_sequences_upper_95": 4.901280880557058,
            "loss_tokens_lower_95": 4.8405588645833335,
            "loss_tokens_upper_95": 4.863134541666667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.682573581373991,
            "data_time": 0.0038175793445628624,
            "batch_time": 0.017301870752935825,
            "samples_per_second": 2307550.801290265,
            "samples_per_second_per_gpu": 288443.8501612831,
            "loss_sequences_lower_95": 5.630201722228713,
            "loss_sequences_upper_95": 5.736088648108217,
            "loss_tokens_lower_95": 5.669867885416667,
            "loss_tokens_upper_95": 5.6948031979166664,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.2247161036121605,
            "data_time": 0.0015571589959970694,
            "batch_time": 0.014969136839395247,
            "samples_per_second": 2333360.953421169,
            "samples_per_second_per_gpu": 291670.1191776461,
            "loss_sequences_lower_95": 7.197324577487245,
            "loss_sequences_upper_95": 7.251428411989796,
            "loss_tokens_lower_95": 7.211010604166666,
            "loss_tokens_upper_95": 7.2385241875,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.135338687497284,
            "data_time": 0.0016028705574587666,
            "batch_time": 0.014967743994450779,
            "samples_per_second": 2338506.077713584,
            "samples_per_second_per_gpu": 292313.259714198,
            "loss_sequences_lower_95": 5.120025942817408,
            "loss_sequences_upper_95": 5.15183580456479,
            "loss_tokens_lower_95": 5.123652614583333,
            "loss_tokens_upper_95": 5.147193770833333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.198580774834485,
            "data_time": 0.012124780624631851,
            "batch_time": 0.02596240384238107,
            "samples_per_second": 2200578.08012236,
            "samples_per_second_per_gpu": 275072.260015295,
            "loss_sequences_lower_95": 5.137068039808816,
            "loss_sequences_upper_95": 5.265897363182006,
            "loss_tokens_lower_95": 5.187125541666666,
            "loss_tokens_upper_95": 5.210342947916667,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.990153946895373,
            "data_time": 0.00977701973170042,
            "batch_time": 0.02341450285166502,
            "samples_per_second": 2232303.984370004,
            "samples_per_second_per_gpu": 279037.9980462505,
            "loss_sequences_lower_95": 5.927599989095696,
            "loss_sequences_upper_95": 6.06713528237324,
            "loss_tokens_lower_95": 5.977383104166666,
            "loss_tokens_upper_95": 6.00297321875,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.59939129371847,
            "data_time": 0.0012590183431594527,
            "batch_time": 0.014792978877921926,
            "samples_per_second": 2321382.0579153257,
            "samples_per_second_per_gpu": 290172.7572394157,
            "loss_sequences_lower_95": 5.589818844109737,
            "loss_sequences_upper_95": 5.609258567305057,
            "loss_tokens_lower_95": 5.58742384375,
            "loss_tokens_upper_95": 5.611340354166667,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.239089486798561,
            "data_time": 0.0026023235051062183,
            "batch_time": 0.016027914892128365,
            "samples_per_second": 2326391.9316826905,
            "samples_per_second_per_gpu": 290798.9914603363,
            "loss_sequences_lower_95": 5.221628368144133,
            "loss_sequences_upper_95": 5.257566437568331,
            "loss_tokens_lower_95": 5.227140302083334,
            "loss_tokens_upper_95": 5.251097645833333,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.393702638560328,
            "data_time": 0.00958054150517279,
            "batch_time": 0.023212774940159008,
            "samples_per_second": 2202510.5673005083,
            "samples_per_second_per_gpu": 275313.82091256353,
            "loss_sequences_lower_95": 5.327090138402479,
            "loss_sequences_upper_95": 5.466856238808158,
            "loss_tokens_lower_95": 5.380949343749999,
            "loss_tokens_upper_95": 5.406206354166666,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.460338928548964,
            "data_time": 0.009950426945173408,
            "batch_time": 0.023795466024087244,
            "samples_per_second": 2186470.713895908,
            "samples_per_second_per_gpu": 273308.8392369885,
            "loss_sequences_lower_95": 5.398584854529977,
            "loss_sequences_upper_95": 5.529901694863002,
            "loss_tokens_lower_95": 5.448147416666667,
            "loss_tokens_upper_95": 5.472061947916667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.6800303134051235,
            "data_time": 0.08067728791918073,
            "batch_time": 0.09658774307795934,
            "samples_per_second": 1049732.236431972,
            "samples_per_second_per_gpu": 131216.5295539965,
            "loss_sequences_lower_95": 6.62617978182706,
            "loss_sequences_upper_95": 6.7422931671142585,
            "loss_tokens_lower_95": 6.653885407881304,
            "loss_tokens_upper_95": 6.706434180519798,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.2632689072856405,
            "data_time": 0.014113988388668407,
            "batch_time": 0.028226684440266003,
            "samples_per_second": 2122425.2608978483,
            "samples_per_second_per_gpu": 265303.15761223104,
            "loss_sequences_lower_95": 6.121055901085322,
            "loss_sequences_upper_95": 6.4074711123986425,
            "loss_tokens_lower_95": 6.24979653125,
            "loss_tokens_upper_95": 6.276537989583333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.770350716673919,
            "data_time": 0.01299745092789332,
            "batch_time": 0.02712770054737727,
            "samples_per_second": 2129765.539520552,
            "samples_per_second_per_gpu": 266220.692440069,
            "loss_sequences_lower_95": 6.701489547689231,
            "loss_sequences_upper_95": 6.845120746542092,
            "loss_tokens_lower_95": 6.7585435,
            "loss_tokens_upper_95": 6.782103583333334,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.1095875013070025,
            "data_time": 0.036994386464357376,
            "batch_time": 0.05202962085604668,
            "samples_per_second": 1869063.716750672,
            "samples_per_second_per_gpu": 233632.964593834,
            "loss_sequences_lower_95": 6.016090518138448,
            "loss_sequences_upper_95": 6.256743359174885,
            "loss_tokens_lower_95": 6.095350922131147,
            "loss_tokens_upper_95": 6.12394136522637,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.988785216550578,
            "data_time": 0.0018997010785517768,
            "batch_time": 0.015490133765075093,
            "samples_per_second": 2276049.2410628367,
            "samples_per_second_per_gpu": 284506.1551328546,
            "loss_sequences_lower_95": 4.9701738534396815,
            "loss_sequences_upper_95": 5.007457732828301,
            "loss_tokens_lower_95": 4.970089702989246,
            "loss_tokens_upper_95": 5.00765244723419,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.505786184864769,
            "data_time": 0.0021038079147885557,
            "batch_time": 0.01569196747936261,
            "samples_per_second": 2273969.7573494827,
            "samples_per_second_per_gpu": 284246.21966868534,
            "loss_sequences_lower_95": 4.515139305541725,
            "loss_sequences_upper_95": 4.54137380813085,
            "loss_tokens_lower_95": 4.479795560085593,
            "loss_tokens_upper_95": 4.4998542919630555,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.048705069535624,
            "data_time": 0.003014058015124063,
            "batch_time": 0.016793697314033185,
            "samples_per_second": 2245569.7777947304,
            "samples_per_second_per_gpu": 280696.2222243413,
            "loss_sequences_lower_95": 7.216651924432423,
            "loss_sequences_upper_95": 7.5015308781958545,
            "loss_tokens_lower_95": 6.6158255943605555,
            "loss_tokens_upper_95": 6.817462773353735,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.871212533712387,
            "data_time": 0.004009450845261837,
            "batch_time": 0.017638663028148895,
            "samples_per_second": 2248930.264609155,
            "samples_per_second_per_gpu": 281116.28307614435,
            "loss_sequences_lower_95": 7.013977571614584,
            "loss_sequences_upper_95": 7.201689127604166,
            "loss_tokens_lower_95": 6.498598073899371,
            "loss_tokens_upper_95": 6.6261292501965405,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.5299251388653925,
            "data_time": 0.004562647633962502,
            "batch_time": 0.01824670319823298,
            "samples_per_second": 2237494.3931500115,
            "samples_per_second_per_gpu": 279686.79914375144,
            "loss_sequences_lower_95": 5.549030452609695,
            "loss_sequences_upper_95": 5.617798554924422,
            "loss_tokens_lower_95": 5.4493120548389955,
            "loss_tokens_upper_95": 5.485358652054496,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.321043623577465,
            "data_time": 0.023459391934531077,
            "batch_time": 0.037729237760816305,
            "samples_per_second": 2030055.5090157955,
            "samples_per_second_per_gpu": 253756.93862697444,
            "loss_sequences_lower_95": 6.18751894864169,
            "loss_sequences_upper_95": 6.480270025079901,
            "loss_tokens_lower_95": 6.2592189547012,
            "loss_tokens_upper_95": 6.337984181321174,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.415672695393465,
            "data_time": 0.02075730822980404,
            "batch_time": 0.03450676240026951,
            "samples_per_second": 2027094.9360635947,
            "samples_per_second_per_gpu": 253386.86700794933,
            "loss_sequences_lower_95": 5.368019147600446,
            "loss_sequences_upper_95": 5.567772054866869,
            "loss_tokens_lower_95": 5.325256471627454,
            "loss_tokens_upper_95": 5.423485938028944,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.433468960126241,
            "data_time": 0.01690918818498269,
            "batch_time": 0.031026016443203658,
            "samples_per_second": 2057999.0872788208,
            "samples_per_second_per_gpu": 257249.8859098526,
            "loss_sequences_lower_95": 5.355414896647135,
            "loss_sequences_upper_95": 5.517827087402344,
            "loss_tokens_lower_95": 5.306977336869399,
            "loss_tokens_upper_95": 5.520513893382068,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.083910319409874,
            "data_time": 0.0017327045012313473,
            "batch_time": 0.01551779606318617,
            "samples_per_second": 2247408.8975792523,
            "samples_per_second_per_gpu": 280926.11219740653,
            "loss_sequences_lower_95": 8.090367453926973,
            "loss_sequences_upper_95": 8.166599765175187,
            "loss_tokens_lower_95": 7.955522615151225,
            "loss_tokens_upper_95": 8.032826871506044,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.240740536890849,
            "data_time": 0.002836349026468776,
            "batch_time": 0.016720390559842924,
            "samples_per_second": 2226600.1896032956,
            "samples_per_second_per_gpu": 278325.02370041196,
            "loss_sequences_lower_95": 6.810814802895491,
            "loss_sequences_upper_95": 7.120967877834333,
            "loss_tokens_lower_95": 5.495290203596872,
            "loss_tokens_upper_95": 5.641712102101782,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.7146179212238195,
            "data_time": 0.0053145905604233615,
            "batch_time": 0.018952440571140598,
            "samples_per_second": 2234747.350266486,
            "samples_per_second_per_gpu": 279343.41878331074,
            "loss_sequences_lower_95": 6.104298531073352,
            "loss_sequences_upper_95": 6.43949947422275,
            "loss_tokens_lower_95": 5.350407326456114,
            "loss_tokens_upper_95": 5.513691822210794,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.7154544172766,
            "data_time": 0.022639953664370945,
            "batch_time": 0.03680138077054705,
            "samples_per_second": 2056491.5346282837,
            "samples_per_second_per_gpu": 257061.44182853546,
            "loss_sequences_lower_95": 5.647868758249501,
            "loss_sequences_upper_95": 5.780979494417095,
            "loss_tokens_lower_95": 5.648915441504352,
            "loss_tokens_upper_95": 5.781974088764626,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.9260962963104244,
            "data_time": 0.04975859476969792,
            "batch_time": 0.06404315966826218,
            "samples_per_second": 1780896.455429027,
            "samples_per_second_per_gpu": 222612.0569286284,
            "loss_sequences_lower_95": 4.754325828552246,
            "loss_sequences_upper_95": 5.232099456787109,
            "loss_tokens_lower_95": 4.565969788580333,
            "loss_tokens_upper_95": 5.065104203062109,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.755653237979268,
            "data_time": 0.0033998216344291203,
            "batch_time": 0.017143723667277393,
            "samples_per_second": 2242391.0846400266,
            "samples_per_second_per_gpu": 280298.8855800033,
            "loss_sequences_lower_95": 4.72001629519975,
            "loss_sequences_upper_95": 4.791022771798234,
            "loss_tokens_lower_95": 4.72000313859391,
            "loss_tokens_upper_95": 4.791415820712321,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.075917686818566,
            "data_time": 0.005159416945490129,
            "batch_time": 0.018941791761563225,
            "samples_per_second": 2224823.154857197,
            "samples_per_second_per_gpu": 278102.8943571496,
            "loss_sequences_lower_95": 5.020671792620035,
            "loss_sequences_upper_95": 5.1301493676737815,
            "loss_tokens_lower_95": 5.018071844927313,
            "loss_tokens_upper_95": 5.131063115450322,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.977832117978325,
            "data_time": 0.0035240639878143243,
            "batch_time": 0.0172265572372832,
            "samples_per_second": 2232874.284117803,
            "samples_per_second_per_gpu": 279109.2855147254,
            "loss_sequences_lower_95": 5.098774010260813,
            "loss_sequences_upper_95": 5.21022828343903,
            "loss_tokens_lower_95": 4.841996592530476,
            "loss_tokens_upper_95": 4.898535564485344,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.904199193477631,
            "data_time": 0.010569191537797451,
            "batch_time": 0.024351217783987522,
            "samples_per_second": 2148173.156195879,
            "samples_per_second_per_gpu": 268521.6445244849,
            "loss_sequences_lower_95": 7.106221228027343,
            "loss_sequences_upper_95": 7.657911535644532,
            "loss_tokens_lower_95": 6.257676453404203,
            "loss_tokens_upper_95": 6.610410149786979,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.368837118148804,
            "data_time": 0.15560737252235413,
            "batch_time": 0.17197637259960175,
            "samples_per_second": 871645.5699419389,
            "samples_per_second_per_gpu": 108955.69624274236,
            "loss_sequences_lower_95": 5.032259857654571,
            "loss_sequences_upper_95": 5.903403854370117,
            "loss_tokens_lower_95": 4.80560153654252,
            "loss_tokens_upper_95": 5.742448609450768,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.448426346669252,
            "data_time": 0.02693703326773136,
            "batch_time": 0.04103948968522092,
            "samples_per_second": 1876209.3053817786,
            "samples_per_second_per_gpu": 234526.16317272233,
            "loss_sequences_lower_95": 6.809954693673671,
            "loss_sequences_upper_95": 7.5141590513032055,
            "loss_tokens_lower_95": 5.232273684117813,
            "loss_tokens_upper_95": 5.673662125889586,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.471110965654599,
            "data_time": 0.00290292604929871,
            "batch_time": 0.016446505569749408,
            "samples_per_second": 2263770.140364878,
            "samples_per_second_per_gpu": 282971.26754560973,
            "loss_sequences_lower_95": 4.42671086155936,
            "loss_sequences_upper_95": 4.514518890447521,
            "loss_tokens_lower_95": 4.426349279587838,
            "loss_tokens_upper_95": 4.515671278993966,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.836941278065533,
            "data_time": 0.0024403100981469947,
            "batch_time": 0.016038438170095908,
            "samples_per_second": 2272799.2587524024,
            "samples_per_second_per_gpu": 284099.9073440503,
            "loss_sequences_lower_95": 6.802954854878226,
            "loss_sequences_upper_95": 7.005429350166166,
            "loss_tokens_lower_95": 6.603997569096337,
            "loss_tokens_upper_95": 6.804126038865977,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.472222212906722,
            "data_time": 0.01901373267173767,
            "batch_time": 0.03308520052168104,
            "samples_per_second": 2005101.6392695021,
            "samples_per_second_per_gpu": 250637.70490868777,
            "loss_sequences_lower_95": 4.339707952366644,
            "loss_sequences_upper_95": 4.697407727363782,
            "loss_tokens_lower_95": 4.234827484497861,
            "loss_tokens_upper_95": 4.549731050080221,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.691903134077293,
            "data_time": 0.005018476024270057,
            "batch_time": 0.018949511647224426,
            "samples_per_second": 2198201.575337297,
            "samples_per_second_per_gpu": 274775.1969171621,
            "loss_sequences_lower_95": 4.702181108409999,
            "loss_sequences_upper_95": 4.832925716120141,
            "loss_tokens_lower_95": 4.557431913123529,
            "loss_tokens_upper_95": 4.701683236547109,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.111224767638416,
            "data_time": 0.03255600304830642,
            "batch_time": 0.04744000661940802,
            "samples_per_second": 1843250.2813176012,
            "samples_per_second_per_gpu": 230406.28516470015,
            "loss_sequences_lower_95": 5.798596386793183,
            "loss_sequences_upper_95": 6.315732109255907,
            "loss_tokens_lower_95": 5.926671656348772,
            "loss_tokens_upper_95": 6.280188635472956,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.971999303797146,
            "data_time": 0.002157140054371278,
            "batch_time": 0.015781977107131476,
            "samples_per_second": 2267630.5640891576,
            "samples_per_second_per_gpu": 283453.8205111447,
            "loss_sequences_lower_95": 4.954813824092943,
            "loss_sequences_upper_95": 4.988809754138328,
            "loss_tokens_lower_95": 4.954667945307811,
            "loss_tokens_upper_95": 4.989110497880826,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.7923539518152625,
            "data_time": 0.047297521071000534,
            "batch_time": 0.06278091343966398,
            "samples_per_second": 1683758.1142697923,
            "samples_per_second_per_gpu": 210469.76428372404,
            "loss_sequences_lower_95": 4.640519255110362,
            "loss_sequences_upper_95": 5.0097896390748256,
            "loss_tokens_lower_95": 4.516714436932255,
            "loss_tokens_upper_95": 4.91730573453093,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.413286216953765,
            "data_time": 0.0017047201822709358,
            "batch_time": 0.015362333072301029,
            "samples_per_second": 2258505.4250021027,
            "samples_per_second_per_gpu": 282313.17812526284,
            "loss_sequences_lower_95": 6.740063619873559,
            "loss_sequences_upper_95": 6.787820812041405,
            "loss_tokens_lower_95": 5.899643592843327,
            "loss_tokens_upper_95": 5.947033558994197,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.242234563827514,
            "data_time": 0.005888519306031484,
            "batch_time": 0.019916522124456982,
            "samples_per_second": 2188242.724233555,
            "samples_per_second_per_gpu": 273530.3405291944,
            "loss_sequences_lower_95": 7.0997204589843745,
            "loss_sequences_upper_95": 7.374481457519531,
            "loss_tokens_lower_95": 7.100968783231825,
            "loss_tokens_upper_95": 7.37332881682618,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.823333556755729,
            "data_time": 0.022474074767807783,
            "batch_time": 0.03717859114630748,
            "samples_per_second": 1999974.6041471236,
            "samples_per_second_per_gpu": 249996.82551839045,
            "loss_sequences_lower_95": 4.695531244692596,
            "loss_sequences_upper_95": 4.948436942722487,
            "loss_tokens_lower_95": 4.697437425696331,
            "loss_tokens_upper_95": 4.945557317319123,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 10.558291097843286,
            "data_time": 0.004689816968986787,
            "batch_time": 0.018522605120417583,
            "samples_per_second": 2210563.688748167,
            "samples_per_second_per_gpu": 276320.46109352086,
            "loss_sequences_lower_95": 10.45208915941643,
            "loss_sequences_upper_95": 10.663715450402462,
            "loss_tokens_lower_95": 10.453926613547585,
            "loss_tokens_upper_95": 10.662026940548058,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.907226369222005,
            "data_time": 0.004286702643049524,
            "batch_time": 0.01805121625991578,
            "samples_per_second": 2230578.1190640493,
            "samples_per_second_per_gpu": 278822.26488300617,
            "loss_sequences_lower_95": 3.950950708007812,
            "loss_sequences_upper_95": 4.034055485026042,
            "loss_tokens_lower_95": 3.804499487294918,
            "loss_tokens_upper_95": 3.8930983408988595,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.274428369885399,
            "data_time": 0.023955953972680227,
            "batch_time": 0.03813277184963226,
            "samples_per_second": 1923422.9297810975,
            "samples_per_second_per_gpu": 240427.86622263718,
            "loss_sequences_lower_95": 5.965091145833333,
            "loss_sequences_upper_95": 6.584280540829613,
            "loss_tokens_lower_95": 5.959196864536831,
            "loss_tokens_upper_95": 6.588927699497768,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.174779713153839,
            "data_time": 0.15841567516326904,
            "batch_time": 0.17574889957904816,
            "samples_per_second": 742324.4886905793,
            "samples_per_second_per_gpu": 92790.56108632241,
            "loss_sequences_lower_95": 4.966819787025452,
            "loss_sequences_upper_95": 6.1351630806922905,
            "loss_tokens_lower_95": 4.729940210716012,
            "loss_tokens_upper_95": 5.288930248771746,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.149841237545013,
            "data_time": 0.005845761488354396,
            "batch_time": 0.019492053796374607,
            "samples_per_second": 2228214.446603022,
            "samples_per_second_per_gpu": 278526.80582537776,
            "loss_sequences_lower_95": 8.104309887695312,
            "loss_sequences_upper_95": 8.483939794921875,
            "loss_tokens_lower_95": 7.795017961479483,
            "loss_tokens_upper_95": 8.131886645218644,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.319965886592865,
            "data_time": 0.005680934304282779,
            "batch_time": 0.019342763083321706,
            "samples_per_second": 2227888.5426238556,
            "samples_per_second_per_gpu": 278486.06782798195,
            "loss_sequences_lower_95": 7.446035791015625,
            "loss_sequences_upper_95": 7.687044580078124,
            "loss_tokens_lower_95": 7.046376204932399,
            "loss_tokens_upper_95": 7.2478903413714315,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.0887437118526435,
            "data_time": 0.0037621566683153644,
            "batch_time": 0.017516678232811764,
            "samples_per_second": 2230472.05537747,
            "samples_per_second_per_gpu": 278809.00692218373,
            "loss_sequences_lower_95": 5.049697259077481,
            "loss_sequences_upper_95": 5.126785132024178,
            "loss_tokens_lower_95": 5.050926408502346,
            "loss_tokens_upper_95": 5.127488826658356,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.0041215906861005,
            "data_time": 0.008581010236480805,
            "batch_time": 0.022683688160876132,
            "samples_per_second": 2126080.186524802,
            "samples_per_second_per_gpu": 265760.02331560024,
            "loss_sequences_lower_95": 4.911733432189661,
            "loss_sequences_upper_95": 5.09396477624568,
            "loss_tokens_lower_95": 4.910686965215773,
            "loss_tokens_upper_95": 5.091890893517185,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 9.121418684959412,
            "data_time": 0.006043748723136054,
            "batch_time": 0.019972894872937883,
            "samples_per_second": 2222731.6889804737,
            "samples_per_second_per_gpu": 277841.4611225592,
            "loss_sequences_lower_95": 9.0698435546875,
            "loss_sequences_upper_95": 9.172277294921875,
            "loss_tokens_lower_95": 9.071898022460937,
            "loss_tokens_upper_95": 9.172969921875001,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.58352896801796,
            "data_time": 0.0021756314600347797,
            "batch_time": 0.01580749225075619,
            "samples_per_second": 2261047.355429317,
            "samples_per_second_per_gpu": 282630.9194286646,
            "loss_sequences_lower_95": 6.964108214581363,
            "loss_sequences_upper_95": 7.0525284746038315,
            "loss_tokens_lower_95": 6.094750465224161,
            "loss_tokens_upper_95": 6.158982275492685,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.812453287750928,
            "data_time": 0.019673757893698556,
            "batch_time": 0.03397782019206456,
            "samples_per_second": 2006544.870881526,
            "samples_per_second_per_gpu": 250818.10886019075,
            "loss_sequences_lower_95": 4.646243809941989,
            "loss_sequences_upper_95": 4.978117461702716,
            "loss_tokens_lower_95": 4.647837920687092,
            "loss_tokens_upper_95": 4.9746735416241545,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.983314011143703,
            "data_time": 0.011320920661091805,
            "batch_time": 0.025490744039416313,
            "samples_per_second": 2130536.151027003,
            "samples_per_second_per_gpu": 266317.01887837535,
            "loss_sequences_lower_95": 4.877268724628523,
            "loss_sequences_upper_95": 5.085966940487133,
            "loss_tokens_lower_95": 4.8776431932636335,
            "loss_tokens_upper_95": 5.085502666398591,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.694981219549608,
            "data_time": 0.0025613065897509227,
            "batch_time": 0.016139924988266108,
            "samples_per_second": 2267463.376082908,
            "samples_per_second_per_gpu": 283432.9220103635,
            "loss_sequences_lower_95": 6.933734992523175,
            "loss_sequences_upper_95": 7.0250964207143305,
            "loss_tokens_lower_95": 6.263417124546321,
            "loss_tokens_upper_95": 6.338676156881255,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.470880584111289,
            "data_time": 0.0274980291724205,
            "batch_time": 0.04181548704703649,
            "samples_per_second": 1983548.8383703434,
            "samples_per_second_per_gpu": 247943.60479629293,
            "loss_sequences_lower_95": 5.343424503386967,
            "loss_sequences_upper_95": 5.601257291925016,
            "loss_tokens_lower_95": 5.339583535169168,
            "loss_tokens_upper_95": 5.600474072007275,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.105439534901844,
            "data_time": 0.0036983313869061777,
            "batch_time": 0.01735186635065137,
            "samples_per_second": 2244934.8323601484,
            "samples_per_second_per_gpu": 280616.85404501855,
            "loss_sequences_lower_95": 8.086309444882454,
            "loss_sequences_upper_95": 8.124148449445718,
            "loss_tokens_lower_95": 8.08648453925363,
            "loss_tokens_upper_95": 8.124167413274082,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.14462338373499,
            "data_time": 0.023956342176957565,
            "batch_time": 0.038622466000643646,
            "samples_per_second": 1839298.2714170124,
            "samples_per_second_per_gpu": 229912.28392712655,
            "loss_sequences_lower_95": 4.962421906110153,
            "loss_sequences_upper_95": 5.3202917895270785,
            "loss_tokens_lower_95": 4.961951668748577,
            "loss_tokens_upper_95": 5.324539391971329,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.383658210436503,
            "data_time": 0.08080355823040009,
            "batch_time": 0.09693779051303864,
            "samples_per_second": 1340398.3842036435,
            "samples_per_second_per_gpu": 167549.79802545544,
            "loss_sequences_lower_95": 5.012597745259603,
            "loss_sequences_upper_95": 5.888985188802083,
            "loss_tokens_lower_95": 4.672084967295329,
            "loss_tokens_upper_95": 5.937108124627008,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.959076432387034,
            "data_time": 0.08280397206544876,
            "batch_time": 0.09938009083271027,
            "samples_per_second": 1287030.1317572473,
            "samples_per_second_per_gpu": 160878.7664696559,
            "loss_sequences_lower_95": 4.581176147460937,
            "loss_sequences_upper_95": 5.712994639078776,
            "loss_tokens_lower_95": 3.970557009236196,
            "loss_tokens_upper_95": 5.388167323423235,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.103381384454876,
            "data_time": 0.0035444156361580455,
            "batch_time": 0.01725417028250871,
            "samples_per_second": 2242520.6128063337,
            "samples_per_second_per_gpu": 280315.0766007917,
            "loss_sequences_lower_95": 8.08358661634757,
            "loss_sequences_upper_95": 8.123362783620214,
            "loss_tokens_lower_95": 8.083386600814617,
            "loss_tokens_upper_95": 8.123688486630154,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.014392364622671,
            "data_time": 0.0015327817061406518,
            "batch_time": 0.01519072802505034,
            "samples_per_second": 2259472.6897870093,
            "samples_per_second_per_gpu": 282434.08622337616,
            "loss_sequences_lower_95": 5.31129938175306,
            "loss_sequences_upper_95": 5.342971247670621,
            "loss_tokens_lower_95": 4.6377484240725675,
            "loss_tokens_upper_95": 4.669659738922092,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.1912794864083835,
            "data_time": 0.03842691704630852,
            "batch_time": 0.053225915879011154,
            "samples_per_second": 1840593.5996971726,
            "samples_per_second_per_gpu": 230074.19996214658,
            "loss_sequences_lower_95": 7.18073547843873,
            "loss_sequences_upper_95": 7.5054001695527806,
            "loss_tokens_lower_95": 7.015740539505803,
            "loss_tokens_upper_95": 7.213940519643515,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 9.283564206716177,
            "data_time": 0.11431592986697242,
            "batch_time": 0.13016928945268905,
            "samples_per_second": 936639.1065927697,
            "samples_per_second_per_gpu": 117079.88832409622,
            "loss_sequences_lower_95": 8.927922800424938,
            "loss_sequences_upper_95": 9.795054812044711,
            "loss_tokens_lower_95": 8.731239639093845,
            "loss_tokens_upper_95": 9.592504411862219,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.209911477275011,
            "data_time": 0.030907259100959414,
            "batch_time": 0.04541140510922387,
            "samples_per_second": 1927981.6853917646,
            "samples_per_second_per_gpu": 240997.71067397058,
            "loss_sequences_lower_95": 7.14069254805402,
            "loss_sequences_upper_95": 7.386507583245999,
            "loss_tokens_lower_95": 7.024197822487384,
            "loss_tokens_upper_95": 7.1923546079951635,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.199715035717662,
            "data_time": 0.031004119487035842,
            "batch_time": 0.04637963998885382,
            "samples_per_second": 1842923.2048087472,
            "samples_per_second_per_gpu": 230365.4006010934,
            "loss_sequences_lower_95": 7.120128166384813,
            "loss_sequences_upper_95": 7.341743878620427,
            "loss_tokens_lower_95": 7.044221732254897,
            "loss_tokens_upper_95": 7.1854410415994785,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.391300788739833,
            "data_time": 0.03225138641539074,
            "batch_time": 0.04712965658732823,
            "samples_per_second": 1902716.738349829,
            "samples_per_second_per_gpu": 237839.59229372864,
            "loss_sequences_lower_95": 7.368556027296113,
            "loss_sequences_upper_95": 7.68689163952339,
            "loss_tokens_lower_95": 7.1286378918604445,
            "loss_tokens_upper_95": 7.345511546910941,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.206182064079657,
            "data_time": 0.031076224077315556,
            "batch_time": 0.0461840828259786,
            "samples_per_second": 1838438.6877505337,
            "samples_per_second_per_gpu": 229804.8359688167,
            "loss_sequences_lower_95": 7.125563123749523,
            "loss_sequences_upper_95": 7.340789162240378,
            "loss_tokens_lower_95": 7.05964094976027,
            "loss_tokens_upper_95": 7.1864154375973515,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.549757756061435,
            "data_time": 0.03337329699669355,
            "batch_time": 0.04950631989373101,
            "samples_per_second": 1778215.1728024164,
            "samples_per_second_per_gpu": 222276.89660030205,
            "loss_sequences_lower_95": 7.49589191697399,
            "loss_sequences_upper_95": 7.669005892735831,
            "loss_tokens_lower_95": 7.465838855476747,
            "loss_tokens_upper_95": 7.568192956639368,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.202028576920672,
            "data_time": 0.03150678816295806,
            "batch_time": 0.0457645541145688,
            "samples_per_second": 1928378.4530774509,
            "samples_per_second_per_gpu": 241047.30663468136,
            "loss_sequences_lower_95": 7.164091231183308,
            "loss_sequences_upper_95": 7.375174117669826,
            "loss_tokens_lower_95": 7.044595883250452,
            "loss_tokens_upper_95": 7.1616570835217,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/params.txt",
    "uuid": "1fc79445-01d1-4a2a-9030-63c4aea3df94",
    "creation_date": "2023_12_14-04_59_25"
}