{
    "name": "c4_original-d=512_l=8_h=4-8.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 12626247680,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 8.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "2525249536",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=512_l=8_h=4-8.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 4.1912501136461895,
            "data_time": 0.030791964381933212,
            "batch_time": 0.3263601176440716,
            "samples_per_second": 1716504.1057398221,
            "samples_per_second_per_gpu": 214563.01321747777,
            "loss_sequences_lower_95": 4.062015177408854,
            "loss_sequences_upper_95": 4.319698740641275,
            "loss_tokens_lower_95": 4.175454069773356,
            "loss_tokens_upper_95": 4.20715524037679,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4352680159635556,
            "data_time": 0.0015224127803912255,
            "batch_time": 0.015376733813070313,
            "samples_per_second": 2240405.9560668524,
            "samples_per_second_per_gpu": 280050.74450835655,
            "loss_sequences_lower_95": 3.4325104835792186,
            "loss_sequences_upper_95": 3.4379820185384533,
            "loss_tokens_lower_95": 3.4244035416666665,
            "loss_tokens_upper_95": 3.4460473541666663,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6970339964847176,
            "data_time": 0.009973694801330566,
            "batch_time": 0.023618782997131348,
            "samples_per_second": 2215322.83480422,
            "samples_per_second_per_gpu": 276915.3543505275,
            "loss_sequences_lower_95": 3.676147566814812,
            "loss_sequences_upper_95": 3.718500939194037,
            "loss_tokens_lower_95": 3.682331666666667,
            "loss_tokens_upper_95": 3.7120908333333333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4276252374452416,
            "data_time": 0.0015713104880169819,
            "batch_time": 0.014972377372415442,
            "samples_per_second": 2324624.886786904,
            "samples_per_second_per_gpu": 290578.110848363,
            "loss_sequences_lower_95": 3.4171797781088915,
            "loss_sequences_upper_95": 3.4382269350032217,
            "loss_tokens_lower_95": 3.4166302239583333,
            "loss_tokens_upper_95": 3.4383106770833334,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4546526420383494,
            "data_time": 0.009787126366360728,
            "batch_time": 0.023634252320247816,
            "samples_per_second": 2192618.0946945623,
            "samples_per_second_per_gpu": 274077.2618368203,
            "loss_sequences_lower_95": 3.421918656918279,
            "loss_sequences_upper_95": 3.487247244840727,
            "loss_tokens_lower_95": 3.4436616406250002,
            "loss_tokens_upper_95": 3.4652638489583336,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.901494050820128,
            "data_time": 0.0036549257195514183,
            "batch_time": 0.017213960056719574,
            "samples_per_second": 2299346.32535531,
            "samples_per_second_per_gpu": 287418.29066941375,
            "loss_sequences_lower_95": 3.863871212251976,
            "loss_sequences_upper_95": 3.939862122785308,
            "loss_tokens_lower_95": 3.8888689270833336,
            "loss_tokens_upper_95": 3.9137471354166666,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6462739439399874,
            "data_time": 0.0015317363023368998,
            "batch_time": 0.01490754822533438,
            "samples_per_second": 2336550.5203473107,
            "samples_per_second_per_gpu": 292068.81504341384,
            "loss_sequences_lower_95": 3.6112292629942604,
            "loss_sequences_upper_95": 3.6807404237085457,
            "loss_tokens_lower_95": 3.630872666666667,
            "loss_tokens_upper_95": 3.6619809166666664,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.100225704103241,
            "data_time": 0.0016277439671049353,
            "batch_time": 0.015220719568403332,
            "samples_per_second": 2296951.9827289204,
            "samples_per_second_per_gpu": 287118.99784111505,
            "loss_sequences_lower_95": 4.091404174165576,
            "loss_sequences_upper_95": 4.109424410994764,
            "loss_tokens_lower_95": 4.088254468750001,
            "loss_tokens_upper_95": 4.11221,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.824233173839445,
            "data_time": 0.010787560826256162,
            "batch_time": 0.027038272411104233,
            "samples_per_second": 2190091.6448460286,
            "samples_per_second_per_gpu": 273761.4556057536,
            "loss_sequences_lower_95": 3.7842132072138592,
            "loss_sequences_upper_95": 3.8672957381581874,
            "loss_tokens_lower_95": 3.8128661041666665,
            "loss_tokens_upper_95": 3.8357113125,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.903951250046139,
            "data_time": 0.009943283163011074,
            "batch_time": 0.024346057325601578,
            "samples_per_second": 2213612.162315831,
            "samples_per_second_per_gpu": 276701.5202894789,
            "loss_sequences_lower_95": 4.878129028802803,
            "loss_sequences_upper_95": 4.936175549171659,
            "loss_tokens_lower_95": 4.890680729166667,
            "loss_tokens_upper_95": 4.9176036875,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.816642747768984,
            "data_time": 0.0013022118968550687,
            "batch_time": 0.01488024759174994,
            "samples_per_second": 2304254.70763392,
            "samples_per_second_per_gpu": 288031.83845424,
            "loss_sequences_lower_95": 3.8093238967533747,
            "loss_sequences_upper_95": 3.8241665627248356,
            "loss_tokens_lower_95": 3.8049690416666664,
            "loss_tokens_upper_95": 3.8281325729166666,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.620706160432942,
            "data_time": 0.002917417975686968,
            "batch_time": 0.01682475067792983,
            "samples_per_second": 2326468.15632184,
            "samples_per_second_per_gpu": 290808.51954023,
            "loss_sequences_lower_95": 3.6117710353433465,
            "loss_sequences_upper_95": 3.6294677225960537,
            "loss_tokens_lower_95": 3.6092462291666667,
            "loss_tokens_upper_95": 3.632140864583333,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.242444368211541,
            "data_time": 0.009912282581857071,
            "batch_time": 0.023680820766644985,
            "samples_per_second": 2180727.8030354306,
            "samples_per_second_per_gpu": 272590.9753794288,
            "loss_sequences_lower_95": 4.204092128668547,
            "loss_sequences_upper_95": 4.282500948335288,
            "loss_tokens_lower_95": 4.229015281250001,
            "loss_tokens_upper_95": 4.255637125,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4365184574166046,
            "data_time": 0.00948372590114396,
            "batch_time": 0.023910082668897166,
            "samples_per_second": 2095066.7461259775,
            "samples_per_second_per_gpu": 261883.3432657472,
            "loss_sequences_lower_95": 3.379992427165542,
            "loss_sequences_upper_95": 3.492276560361905,
            "loss_tokens_lower_95": 3.4242803229166663,
            "loss_tokens_upper_95": 3.4484213281250002,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.941994656216014,
            "data_time": 0.08075388840266637,
            "batch_time": 0.09604058946881976,
            "samples_per_second": 1017551.5078541404,
            "samples_per_second_per_gpu": 127193.93848176755,
            "loss_sequences_lower_95": 4.874102679165927,
            "loss_sequences_upper_95": 5.0111496925354,
            "loss_tokens_lower_95": 4.912705031308261,
            "loss_tokens_upper_95": 4.9716483376242895,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.035079079536238,
            "data_time": 0.013558405366810885,
            "batch_time": 0.028694789518009533,
            "samples_per_second": 1980171.5400193764,
            "samples_per_second_per_gpu": 247521.44250242205,
            "loss_sequences_lower_95": 3.9454420173133427,
            "loss_sequences_upper_95": 4.12419522566281,
            "loss_tokens_lower_95": 4.02090340625,
            "loss_tokens_upper_95": 4.049124750000001,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.971743681814551,
            "data_time": 0.012587596972783407,
            "batch_time": 0.026778349032004673,
            "samples_per_second": 2159807.4688002816,
            "samples_per_second_per_gpu": 269975.9336000352,
            "loss_sequences_lower_95": 5.916854399426946,
            "loss_sequences_upper_95": 6.02535524393449,
            "loss_tokens_lower_95": 5.959638,
            "loss_tokens_upper_95": 5.98402265625,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.13640944684138,
            "data_time": 0.03631066530942917,
            "batch_time": 0.0509830005466938,
            "samples_per_second": 1866788.7959637227,
            "samples_per_second_per_gpu": 233348.59949546534,
            "loss_sequences_lower_95": 4.075743640837122,
            "loss_sequences_upper_95": 4.213203805391906,
            "loss_tokens_lower_95": 4.122014367775838,
            "loss_tokens_upper_95": 4.151065082237369,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.420592777020333,
            "data_time": 0.001982230226379864,
            "batch_time": 0.015637406251968175,
            "samples_per_second": 2263578.113624842,
            "samples_per_second_per_gpu": 282947.26420310524,
            "loss_sequences_lower_95": 5.398684248593505,
            "loss_sequences_upper_95": 5.443058631026563,
            "loss_tokens_lower_95": 5.398391794482624,
            "loss_tokens_upper_95": 5.443271872440714,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.1503351108886832,
            "data_time": 0.002152177700950841,
            "batch_time": 0.015812432262927865,
            "samples_per_second": 2259719.068803238,
            "samples_per_second_per_gpu": 282464.8836004048,
            "loss_sequences_lower_95": 3.160117112619187,
            "loss_sequences_upper_95": 3.18589427334881,
            "loss_tokens_lower_95": 3.1239565748189038,
            "loss_tokens_upper_95": 3.143091040098852,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.971287548091539,
            "data_time": 0.0030419289447809594,
            "batch_time": 0.016694147332898857,
            "samples_per_second": 2261276.0766143096,
            "samples_per_second_per_gpu": 282659.5095767887,
            "loss_sequences_lower_95": 5.213705111338274,
            "loss_sequences_upper_95": 5.5230689249232405,
            "loss_tokens_lower_95": 4.429919587744966,
            "loss_tokens_upper_95": 4.649983153889683,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.075546332001686,
            "data_time": 0.004402287145878406,
            "batch_time": 0.01825765678857235,
            "samples_per_second": 2209883.6926570307,
            "samples_per_second_per_gpu": 276235.46158212883,
            "loss_sequences_lower_95": 5.218349682617188,
            "loss_sequences_upper_95": 5.425481551106771,
            "loss_tokens_lower_95": 4.739040757665094,
            "loss_tokens_upper_95": 4.883445017688679,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4800306391408715,
            "data_time": 0.004452778023652182,
            "batch_time": 0.018258797097529762,
            "samples_per_second": 2224025.6658012606,
            "samples_per_second_per_gpu": 278003.2082251576,
            "loss_sequences_lower_95": 3.522079474714509,
            "loss_sequences_upper_95": 3.590142040238225,
            "loss_tokens_lower_95": 3.381403525090343,
            "loss_tokens_upper_95": 3.4145876185751622,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9647225640036843,
            "data_time": 0.023317594613347734,
            "batch_time": 0.03751802444458008,
            "samples_per_second": 2044613.2769849356,
            "samples_per_second_per_gpu": 255576.65962311695,
            "loss_sequences_lower_95": 3.861431364579634,
            "loss_sequences_upper_95": 4.133113195245916,
            "loss_tokens_lower_95": 3.849368134636703,
            "loss_tokens_upper_95": 3.9335377406026404,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9425474273915193,
            "data_time": 0.019076170399785042,
            "batch_time": 0.03305329941213131,
            "samples_per_second": 2041815.0183704835,
            "samples_per_second_per_gpu": 255226.87729631044,
            "loss_sequences_lower_95": 3.935868648606904,
            "loss_sequences_upper_95": 4.158706210389429,
            "loss_tokens_lower_95": 3.8057241505162493,
            "loss_tokens_upper_95": 3.9089682223785545,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.468426428635915,
            "data_time": 0.016299490745250996,
            "batch_time": 0.030647082206530448,
            "samples_per_second": 2008389.838113277,
            "samples_per_second_per_gpu": 251048.72976415962,
            "loss_sequences_lower_95": 4.428519124348958,
            "loss_sequences_upper_95": 4.535832702636719,
            "loss_tokens_lower_95": 4.33163061342369,
            "loss_tokens_upper_95": 4.560697433415096,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.39660650027616,
            "data_time": 0.001791240450740154,
            "batch_time": 0.015440287405599138,
            "samples_per_second": 2263201.8006803673,
            "samples_per_second_per_gpu": 282900.2250850459,
            "loss_sequences_lower_95": 6.411241305561365,
            "loss_sequences_upper_95": 6.486662561974066,
            "loss_tokens_lower_95": 6.2510339085419755,
            "loss_tokens_upper_95": 6.3302712631097995,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.9896553550103695,
            "data_time": 0.0027811967286487556,
            "batch_time": 0.016447519896014423,
            "samples_per_second": 2248501.3089144873,
            "samples_per_second_per_gpu": 281062.6636143109,
            "loss_sequences_lower_95": 5.662333859016599,
            "loss_sequences_upper_95": 6.011978550149937,
            "loss_tokens_lower_95": 4.1070176459787975,
            "loss_tokens_upper_95": 4.255762117889986,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.467628978421664,
            "data_time": 0.0047510975116008035,
            "batch_time": 0.01837857873053164,
            "samples_per_second": 2227216.948723828,
            "samples_per_second_per_gpu": 278402.1185904785,
            "loss_sequences_lower_95": 4.964667008435767,
            "loss_sequences_upper_95": 5.360428554287543,
            "loss_tokens_lower_95": 3.9918154120406784,
            "loss_tokens_upper_95": 4.162447217096241,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.189686457316081,
            "data_time": 0.02193096705845424,
            "batch_time": 0.036485137684004645,
            "samples_per_second": 1979080.844539302,
            "samples_per_second_per_gpu": 247385.10556741274,
            "loss_sequences_lower_95": 6.084039055811216,
            "loss_sequences_upper_95": 6.292926415569705,
            "loss_tokens_lower_95": 6.085200953374715,
            "loss_tokens_upper_95": 6.292260853667237,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5772179341316224,
            "data_time": 0.04732231910412128,
            "batch_time": 0.06305992603302002,
            "samples_per_second": 1633455.9467784562,
            "samples_per_second_per_gpu": 204181.99334730703,
            "loss_sequences_lower_95": 3.440855941772461,
            "loss_sequences_upper_95": 3.8079063415527346,
            "loss_tokens_lower_95": 3.2711487471524205,
            "loss_tokens_upper_95": 3.72341572824659,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.680251980511168,
            "data_time": 0.003169403973288819,
            "batch_time": 0.017004723441625177,
            "samples_per_second": 2241933.913980557,
            "samples_per_second_per_gpu": 280241.73924756964,
            "loss_sequences_lower_95": 5.613964681322768,
            "loss_sequences_upper_95": 5.748630163686668,
            "loss_tokens_lower_95": 5.610917833810453,
            "loss_tokens_upper_95": 5.747527395226938,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.677122469909068,
            "data_time": 0.004659747221730467,
            "batch_time": 0.01835695915471088,
            "samples_per_second": 2234610.6839222712,
            "samples_per_second_per_gpu": 279326.3354902839,
            "loss_sequences_lower_95": 5.621127961679719,
            "loss_sequences_upper_95": 5.73226283367885,
            "loss_tokens_lower_95": 5.619420117027539,
            "loss_tokens_upper_95": 5.733104059092828,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5620496465669493,
            "data_time": 0.003322222835301838,
            "batch_time": 0.01689506323754659,
            "samples_per_second": 2248327.2384946477,
            "samples_per_second_per_gpu": 281040.90481183096,
            "loss_sequences_lower_95": 3.716158503054016,
            "loss_sequences_upper_95": 3.839353210648463,
            "loss_tokens_lower_95": 3.377736868413477,
            "loss_tokens_upper_95": 3.432614865203134,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.679988601446151,
            "data_time": 0.00999725703150034,
            "batch_time": 0.02418697439134121,
            "samples_per_second": 2095195.7645419582,
            "samples_per_second_per_gpu": 261899.47056774478,
            "loss_sequences_lower_95": 5.872765014648437,
            "loss_sequences_upper_95": 6.436683679199219,
            "loss_tokens_lower_95": 5.028136851565085,
            "loss_tokens_upper_95": 5.392248518998697,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8090193569660187,
            "data_time": 0.14700748026371002,
            "batch_time": 0.16384761035442352,
            "samples_per_second": 862706.7630802251,
            "samples_per_second_per_gpu": 107838.34538502814,
            "loss_sequences_lower_95": 3.5532151460647583,
            "loss_sequences_upper_95": 4.153567337989807,
            "loss_tokens_lower_95": 3.371734601601787,
            "loss_tokens_upper_95": 4.147741760604683,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.498310396726104,
            "data_time": 0.02677374190472542,
            "batch_time": 0.04088496654591662,
            "samples_per_second": 1860988.113257933,
            "samples_per_second_per_gpu": 232623.51415724162,
            "loss_sequences_lower_95": 6.000320364414961,
            "loss_sequences_upper_95": 6.889417310692798,
            "loss_tokens_lower_95": 3.8995169552498767,
            "loss_tokens_upper_95": 4.377942967979319,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.6533198735765,
            "data_time": 0.002854489203956392,
            "batch_time": 0.016612137150433328,
            "samples_per_second": 2233354.6980696544,
            "samples_per_second_per_gpu": 279169.3372587068,
            "loss_sequences_lower_95": 2.629189278205548,
            "loss_sequences_upper_95": 2.677012106985856,
            "loss_tokens_lower_95": 2.628689061902714,
            "loss_tokens_upper_95": 2.6766804213084314,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5174460011587168,
            "data_time": 0.002868157095533496,
            "batch_time": 0.016563018533116772,
            "samples_per_second": 2255135.757162396,
            "samples_per_second_per_gpu": 281891.9696452995,
            "loss_sequences_lower_95": 3.4857232740636523,
            "loss_sequences_upper_95": 3.659733881507132,
            "loss_tokens_lower_95": 3.3287973702236373,
            "loss_tokens_upper_95": 3.4983110150411276,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.395430782775739,
            "data_time": 0.018314687742127314,
            "batch_time": 0.03218409584628211,
            "samples_per_second": 2008257.4966486911,
            "samples_per_second_per_gpu": 251032.1870810864,
            "loss_sequences_lower_95": 3.239898972284226,
            "loss_sequences_upper_95": 3.6368896204909995,
            "loss_tokens_lower_95": 3.134376654329397,
            "loss_tokens_upper_95": 3.4417061014378323,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.739310857811215,
            "data_time": 0.004589086771011353,
            "batch_time": 0.01831537261605263,
            "samples_per_second": 2223930.291677606,
            "samples_per_second_per_gpu": 277991.2864597007,
            "loss_sequences_lower_95": 3.772697958159654,
            "loss_sequences_upper_95": 3.9224385177985646,
            "loss_tokens_lower_95": 3.590811333180997,
            "loss_tokens_upper_95": 3.7363860801841695,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.156139182608302,
            "data_time": 0.030633213974180677,
            "batch_time": 0.045592149098714195,
            "samples_per_second": 1872673.4167377313,
            "samples_per_second_per_gpu": 234084.1770922164,
            "loss_sequences_lower_95": 2.9808261638734397,
            "loss_sequences_upper_95": 3.4663659630752193,
            "loss_tokens_lower_95": 2.8746318723793713,
            "loss_tokens_upper_95": 3.2525916244233026,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.614116759771441,
            "data_time": 0.0022381404091061183,
            "batch_time": 0.015911241507368282,
            "samples_per_second": 2254319.1641485873,
            "samples_per_second_per_gpu": 281789.8955185734,
            "loss_sequences_lower_95": 4.598319009505025,
            "loss_sequences_upper_95": 4.62968552694914,
            "loss_tokens_lower_95": 4.598517027624275,
            "loss_tokens_upper_95": 4.6298426286819865,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.1720019460303113,
            "data_time": 0.0499074632471258,
            "batch_time": 0.064770845933394,
            "samples_per_second": 1707788.672516782,
            "samples_per_second_per_gpu": 213473.58406459776,
            "loss_sequences_lower_95": 1.1213219392646865,
            "loss_sequences_upper_95": 1.2773853894576284,
            "loss_tokens_lower_95": 1.0050906459531128,
            "loss_tokens_upper_95": 1.2339941081395536,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.016209839218817,
            "data_time": 0.0016436110198347887,
            "batch_time": 0.01537081789091067,
            "samples_per_second": 2249237.442757019,
            "samples_per_second_per_gpu": 281154.68034462736,
            "loss_sequences_lower_95": 5.413976236979167,
            "loss_sequences_upper_95": 5.463099613469602,
            "loss_tokens_lower_95": 4.40813125,
            "loss_tokens_upper_95": 4.458187560444874,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.155022721290589,
            "data_time": 0.005513333138965425,
            "batch_time": 0.019329183631473117,
            "samples_per_second": 2210318.9909990625,
            "samples_per_second_per_gpu": 276289.8738748828,
            "loss_sequences_lower_95": 7.129339270019532,
            "loss_sequences_upper_95": 7.4893691406250005,
            "loss_tokens_lower_95": 6.806968751510538,
            "loss_tokens_upper_95": 7.135029308204031,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.471925967672597,
            "data_time": 0.022482425479565638,
            "batch_time": 0.036731093616808874,
            "samples_per_second": 1994300.6165673297,
            "samples_per_second_per_gpu": 249287.57707091622,
            "loss_sequences_lower_95": 5.3036861386506455,
            "loss_sequences_upper_95": 5.638107631517493,
            "loss_tokens_lower_95": 5.3075992086659305,
            "loss_tokens_upper_95": 5.636745472783628,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.235910525466457,
            "data_time": 0.004354407629334783,
            "batch_time": 0.018190689474703318,
            "samples_per_second": 2224898.394008918,
            "samples_per_second_per_gpu": 278112.2992511148,
            "loss_sequences_lower_95": 6.1912539117986505,
            "loss_sequences_upper_95": 6.280132723721591,
            "loss_tokens_lower_95": 6.1925110048236265,
            "loss_tokens_upper_95": 6.279050477923769,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.2372222588857016,
            "data_time": 0.004347406486247448,
            "batch_time": 0.01807183122381251,
            "samples_per_second": 2242931.0561586623,
            "samples_per_second_per_gpu": 280366.3820198328,
            "loss_sequences_lower_95": 1.2772870259602864,
            "loss_sequences_upper_95": 1.335597139485677,
            "loss_tokens_lower_95": 1.1585887362757603,
            "loss_tokens_upper_95": 1.22056696897509,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.396540891556513,
            "data_time": 0.02359503081866673,
            "batch_time": 0.03747491538524628,
            "samples_per_second": 1971795.6235434322,
            "samples_per_second_per_gpu": 246474.45294292903,
            "loss_sequences_lower_95": 6.064930172874814,
            "loss_sequences_upper_95": 6.720667375837054,
            "loss_tokens_lower_95": 6.064684055873326,
            "loss_tokens_upper_95": 6.725463038853237,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.6509213373064995,
            "data_time": 0.14982378482818604,
            "batch_time": 0.1664235144853592,
            "samples_per_second": 787578.4631353182,
            "samples_per_second_per_gpu": 98447.30789191478,
            "loss_sequences_lower_95": 2.425546634197235,
            "loss_sequences_upper_95": 3.6438258230686187,
            "loss_tokens_lower_95": 2.0554758445503785,
            "loss_tokens_upper_95": 2.6152645213333603,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.42408404302597,
            "data_time": 0.005661133735898941,
            "batch_time": 0.019343387985986376,
            "samples_per_second": 2217771.679383748,
            "samples_per_second_per_gpu": 277221.4599229685,
            "loss_sequences_lower_95": 7.353881750488281,
            "loss_sequences_upper_95": 7.699587573242187,
            "loss_tokens_lower_95": 7.134305062592534,
            "loss_tokens_upper_95": 7.436869881721923,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.835725369453431,
            "data_time": 0.005766340191402133,
            "batch_time": 0.019577603964578538,
            "samples_per_second": 2198410.485298056,
            "samples_per_second_per_gpu": 274801.310662257,
            "loss_sequences_lower_95": 6.907257116699219,
            "loss_sequences_upper_95": 7.110937634277344,
            "loss_tokens_lower_95": 6.626739560023629,
            "loss_tokens_upper_95": 6.8059536448227025,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.643330931303847,
            "data_time": 0.004583140440210451,
            "batch_time": 0.018348416835568024,
            "samples_per_second": 2228224.89068834,
            "samples_per_second_per_gpu": 278528.1113360425,
            "loss_sequences_lower_95": 4.607786260816502,
            "loss_sequences_upper_95": 4.678261515776903,
            "loss_tokens_lower_95": 4.608755751995684,
            "loss_tokens_upper_95": 4.678307749447389,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.313056410366123,
            "data_time": 0.008290333330091033,
            "batch_time": 0.022251048477034553,
            "samples_per_second": 2161471.2601628723,
            "samples_per_second_per_gpu": 270183.90752035903,
            "loss_sequences_lower_95": 5.195245708225326,
            "loss_sequences_upper_95": 5.427671577230943,
            "loss_tokens_lower_95": 5.191836946314564,
            "loss_tokens_upper_95": 5.428158161497335,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.7116280384063725,
            "data_time": 0.005932995251246861,
            "batch_time": 0.019498298565546673,
            "samples_per_second": 2240636.6020329394,
            "samples_per_second_per_gpu": 280079.57525411743,
            "loss_sequences_lower_95": 7.644618200683594,
            "loss_sequences_upper_95": 7.779828186035156,
            "loss_tokens_lower_95": 7.643999609375,
            "loss_tokens_upper_95": 7.7794400390625,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.788546991258042,
            "data_time": 0.0020830647030489658,
            "batch_time": 0.015748166452058097,
            "samples_per_second": 2262678.6664257506,
            "samples_per_second_per_gpu": 282834.8333032188,
            "loss_sequences_lower_95": 4.396657218247398,
            "loss_sequences_upper_95": 4.5059344696073795,
            "loss_tokens_lower_95": 3.0368754132363605,
            "loss_tokens_upper_95": 3.107114897037495,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.697094323030159,
            "data_time": 0.017519143649509974,
            "batch_time": 0.035957699162619455,
            "samples_per_second": 2041724.4302855833,
            "samples_per_second_per_gpu": 255215.5537856979,
            "loss_sequences_lower_95": 5.501819189270931,
            "loss_sequences_upper_95": 5.887584561020581,
            "loss_tokens_lower_95": 5.504759045501253,
            "loss_tokens_upper_95": 5.886632879456478,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.701829226811727,
            "data_time": 0.009855668060481548,
            "batch_time": 0.02362260315567255,
            "samples_per_second": 2188683.241029484,
            "samples_per_second_per_gpu": 273585.4051286855,
            "loss_sequences_lower_95": 5.563321736653646,
            "loss_sequences_upper_95": 5.83724136651731,
            "loss_tokens_lower_95": 5.566009174421723,
            "loss_tokens_upper_95": 5.836060108857996,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.305935173707843,
            "data_time": 0.0023218227285359853,
            "batch_time": 0.016073278618860494,
            "samples_per_second": 2247233.076466865,
            "samples_per_second_per_gpu": 280904.1345583581,
            "loss_sequences_lower_95": 4.762008868454842,
            "loss_sequences_upper_95": 4.870882718061036,
            "loss_tokens_lower_95": 3.614357233772959,
            "loss_tokens_upper_95": 3.695734881092587,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.940429585320609,
            "data_time": 0.02534062663714091,
            "batch_time": 0.04014809429645538,
            "samples_per_second": 1944110.470760231,
            "samples_per_second_per_gpu": 243013.80884502886,
            "loss_sequences_lower_95": 4.79780028812469,
            "loss_sequences_upper_95": 5.075133583028481,
            "loss_tokens_lower_95": 4.799824322089947,
            "loss_tokens_upper_95": 5.074842697224289,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.575469713911004,
            "data_time": 0.0034522234447418696,
            "batch_time": 0.017280921540126403,
            "samples_per_second": 2228609.6355974954,
            "samples_per_second_per_gpu": 278576.2044496869,
            "loss_sequences_lower_95": 4.5334529921038795,
            "loss_sequences_upper_95": 4.617988034869553,
            "loss_tokens_lower_95": 4.534133017070432,
            "loss_tokens_upper_95": 4.616073606531919,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.146595328756907,
            "data_time": 0.022710323333740234,
            "batch_time": 0.03701335516842929,
            "samples_per_second": 1883063.7600641737,
            "samples_per_second_per_gpu": 235382.97000802172,
            "loss_sequences_lower_95": 5.927847660398021,
            "loss_sequences_upper_95": 6.364759722959648,
            "loss_tokens_lower_95": 5.9257016820815,
            "loss_tokens_upper_95": 6.367234728174302,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5399208784103395,
            "data_time": 0.07988160848617554,
            "batch_time": 0.0950930267572403,
            "samples_per_second": 1465274.8498259056,
            "samples_per_second_per_gpu": 183159.3562282382,
            "loss_sequences_lower_95": 3.2790598297119145,
            "loss_sequences_upper_95": 3.9500188954671223,
            "loss_tokens_lower_95": 2.9415711720784508,
            "loss_tokens_upper_95": 3.87911655637953,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.03447603782018,
            "data_time": 0.07970498502254486,
            "batch_time": 0.09600306302309036,
            "samples_per_second": 1358983.2960418933,
            "samples_per_second_per_gpu": 169872.91200523666,
            "loss_sequences_lower_95": 2.837936019897461,
            "loss_sequences_upper_95": 3.545541222890218,
            "loss_tokens_lower_95": 2.3307374932792753,
            "loss_tokens_upper_95": 3.3428266975317107,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.419711300269845,
            "data_time": 0.0037499398887683674,
            "batch_time": 0.017447604930983213,
            "samples_per_second": 2243041.1332405778,
            "samples_per_second_per_gpu": 280380.1416550722,
            "loss_sequences_lower_95": 4.403570170114598,
            "loss_sequences_upper_95": 4.436254595165225,
            "loss_tokens_lower_95": 4.403173130580358,
            "loss_tokens_upper_95": 4.436313785783321,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 0.7576571177994189,
            "data_time": 0.0015518420276026236,
            "batch_time": 0.015232919114162937,
            "samples_per_second": 2259415.9443733804,
            "samples_per_second_per_gpu": 282426.99304667255,
            "loss_sequences_lower_95": 0.9054043715379881,
            "loss_sequences_upper_95": 0.9310984568402516,
            "loss_tokens_lower_95": 0.601758153711497,
            "loss_tokens_upper_95": 0.6142377970612158,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.697593396104227,
            "data_time": 0.03816215321421623,
            "batch_time": 0.0540156252682209,
            "samples_per_second": 1782245.3551853243,
            "samples_per_second_per_gpu": 222780.66939816554,
            "loss_sequences_lower_95": 4.71149161030927,
            "loss_sequences_upper_95": 5.0846770158902865,
            "loss_tokens_lower_95": 4.341217690198201,
            "loss_tokens_upper_95": 4.544940825733849,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.370092984792349,
            "data_time": 0.11458257266453334,
            "batch_time": 0.13035841215224492,
            "samples_per_second": 991971.2691929167,
            "samples_per_second_per_gpu": 123996.40864911459,
            "loss_sequences_lower_95": 6.971750826449008,
            "loss_sequences_upper_95": 7.995522555789432,
            "loss_tokens_lower_95": 6.693023191852334,
            "loss_tokens_upper_95": 7.763891808780622,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.4897459033058915,
            "data_time": 0.02961302087420509,
            "batch_time": 0.045048228331974575,
            "samples_per_second": 1859371.1605235566,
            "samples_per_second_per_gpu": 232421.39506544458,
            "loss_sequences_lower_95": 4.463708942692454,
            "loss_sequences_upper_95": 4.814780900536514,
            "loss_tokens_lower_95": 4.1259373685870475,
            "loss_tokens_upper_95": 4.298290646026072,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.6056097411527865,
            "data_time": 0.030732907000042144,
            "batch_time": 0.04514157772064209,
            "samples_per_second": 1920527.4282887105,
            "samples_per_second_per_gpu": 240065.92853608882,
            "loss_sequences_lower_95": 4.600447780330007,
            "loss_sequences_upper_95": 4.922759972549066,
            "loss_tokens_lower_95": 4.257456702331564,
            "loss_tokens_upper_95": 4.401835002968511,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.710773422950652,
            "data_time": 0.03058362290972755,
            "batch_time": 0.04483324856985183,
            "samples_per_second": 1960677.4379691859,
            "samples_per_second_per_gpu": 245084.67974614823,
            "loss_sequences_lower_95": 4.6514536043492765,
            "loss_sequences_upper_95": 5.051928794674756,
            "loss_tokens_lower_95": 4.325653893951714,
            "loss_tokens_upper_95": 4.556795567956171,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.689111059758721,
            "data_time": 0.029379668689909436,
            "batch_time": 0.04346387726919992,
            "samples_per_second": 1969635.7434086162,
            "samples_per_second_per_gpu": 246204.46792607702,
            "loss_sequences_lower_95": 4.675929102083532,
            "loss_sequences_upper_95": 4.970815742306593,
            "loss_tokens_lower_95": 4.36887622488622,
            "loss_tokens_upper_95": 4.5023389489479895,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.79229600088937,
            "data_time": 0.03210375338424871,
            "batch_time": 0.04694699652401017,
            "samples_per_second": 1907393.2805287724,
            "samples_per_second_per_gpu": 238424.16006609655,
            "loss_sequences_lower_95": 4.769216084924544,
            "loss_sequences_upper_95": 5.055628313337054,
            "loss_tokens_lower_95": 4.5396194862350105,
            "loss_tokens_upper_95": 4.65461168287511,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.981202616924193,
            "data_time": 0.030931739580063594,
            "batch_time": 0.04572373060953049,
            "samples_per_second": 1911777.1453037546,
            "samples_per_second_per_gpu": 238972.14316296933,
            "loss_sequences_lower_95": 5.02389796187238,
            "loss_sequences_upper_95": 5.350166562708413,
            "loss_tokens_lower_95": 4.61954659009569,
            "loss_tokens_upper_95": 4.751032271888185,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-8.0/params.txt",
    "uuid": "5fd794d7-56bd-4df0-825d-8f3476238bae",
    "creation_date": "2023_12_14-04_59_29"
}