{
    "name": "c4_original-d=512_l=8_h=4-0.5",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 789140480,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.5
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "157828096",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=512_l=8_h=4-0.5",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 4.888268778721492,
            "data_time": 0.03438437357544899,
            "batch_time": 0.3365783207118511,
            "samples_per_second": 1747507.7382731002,
            "samples_per_second_per_gpu": 218438.46728413753,
            "loss_sequences_lower_95": 4.755009282430013,
            "loss_sequences_upper_95": 5.02227481842041,
            "loss_tokens_lower_95": 4.8724826812744135,
            "loss_tokens_upper_95": 4.903930702209473,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.057563001976937,
            "data_time": 0.0013944024106035502,
            "batch_time": 0.015237092837174887,
            "samples_per_second": 2248859.103460167,
            "samples_per_second_per_gpu": 281107.3879325209,
            "loss_sequences_lower_95": 4.0550844623907345,
            "loss_sequences_upper_95": 4.0599728379157405,
            "loss_tokens_lower_95": 4.04626359375,
            "loss_tokens_upper_95": 4.068884552083333,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.488147540968292,
            "data_time": 0.010129109382629394,
            "batch_time": 0.02428726863861084,
            "samples_per_second": 2126291.272555217,
            "samples_per_second_per_gpu": 265786.40906940214,
            "loss_sequences_lower_95": 4.460106637137277,
            "loss_sequences_upper_95": 4.521412141761002,
            "loss_tokens_lower_95": 4.473156937500001,
            "loss_tokens_upper_95": 4.50352828125,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.1462494227812465,
            "data_time": 0.0015807042976743297,
            "batch_time": 0.014931047335267067,
            "samples_per_second": 2335977.6665004753,
            "samples_per_second_per_gpu": 291997.2083125594,
            "loss_sequences_lower_95": 4.12558285679768,
            "loss_sequences_upper_95": 4.167419720521908,
            "loss_tokens_lower_95": 4.134435833333334,
            "loss_tokens_upper_95": 4.158112729166667,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.085412815971918,
            "data_time": 0.009930469125390527,
            "batch_time": 0.023631943174567356,
            "samples_per_second": 2209734.3947445387,
            "samples_per_second_per_gpu": 276216.79934306734,
            "loss_sequences_lower_95": 4.046282784953379,
            "loss_sequences_upper_95": 4.130134809381365,
            "loss_tokens_lower_95": 4.073914822916667,
            "loss_tokens_upper_95": 4.09674975,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.61877818013599,
            "data_time": 0.003795221771882928,
            "batch_time": 0.01719206636366637,
            "samples_per_second": 2326365.4478336144,
            "samples_per_second_per_gpu": 290795.6809792018,
            "loss_sequences_lower_95": 4.578722339580855,
            "loss_sequences_upper_95": 4.65965769525939,
            "loss_tokens_lower_95": 4.6058749895833335,
            "loss_tokens_upper_95": 4.631480729166666,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.817272578453531,
            "data_time": 0.0016095900807839621,
            "batch_time": 0.014918843354136482,
            "samples_per_second": 2350566.419246003,
            "samples_per_second_per_gpu": 293820.8024057504,
            "loss_sequences_lower_95": 4.783448102678572,
            "loss_sequences_upper_95": 4.850657007334184,
            "loss_tokens_lower_95": 4.8015878333333335,
            "loss_tokens_upper_95": 4.832858916666666,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.590178464020734,
            "data_time": 0.0015855580381347963,
            "batch_time": 0.01506143062995199,
            "samples_per_second": 2320940.623380053,
            "samples_per_second_per_gpu": 290117.5779225066,
            "loss_sequences_lower_95": 4.576400359947644,
            "loss_sequences_upper_95": 4.605112299574608,
            "loss_tokens_lower_95": 4.577991166666667,
            "loss_tokens_upper_95": 4.60228015625,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.4392156247201005,
            "data_time": 0.011416256427764893,
            "batch_time": 0.025637484732128325,
            "samples_per_second": 2212432.3101043403,
            "samples_per_second_per_gpu": 276554.03876304254,
            "loss_sequences_lower_95": 4.3845574975982915,
            "loss_sequences_upper_95": 4.501206858565168,
            "loss_tokens_lower_95": 4.4272288125,
            "loss_tokens_upper_95": 4.451122697916667,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.506318833988175,
            "data_time": 0.009568074718117714,
            "batch_time": 0.023548724129796028,
            "samples_per_second": 2184733.3783870405,
            "samples_per_second_per_gpu": 273091.67229838006,
            "loss_sequences_lower_95": 5.453182814526464,
            "loss_sequences_upper_95": 5.571757308296536,
            "loss_tokens_lower_95": 5.493136895833333,
            "loss_tokens_upper_95": 5.5196187708333335,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.4957911171977845,
            "data_time": 0.0012596059936665614,
            "batch_time": 0.014787005672522786,
            "samples_per_second": 2324235.0386038357,
            "samples_per_second_per_gpu": 290529.37982547947,
            "loss_sequences_lower_95": 4.48758427987315,
            "loss_sequences_upper_95": 4.504121543421098,
            "loss_tokens_lower_95": 4.4834989895833335,
            "loss_tokens_upper_95": 4.50813396875,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.331360270807218,
            "data_time": 0.002727471819328924,
            "batch_time": 0.017260999306354,
            "samples_per_second": 2322948.230759721,
            "samples_per_second_per_gpu": 290368.5288449651,
            "loss_sequences_lower_95": 4.317432565939322,
            "loss_sequences_upper_95": 4.345682239333871,
            "loss_tokens_lower_95": 4.3191761875,
            "loss_tokens_upper_95": 4.343343947916667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.777787012212417,
            "data_time": 0.009958941945916578,
            "batch_time": 0.023613488721282116,
            "samples_per_second": 2184465.646590353,
            "samples_per_second_per_gpu": 273058.2058237941,
            "loss_sequences_lower_95": 4.723100160053245,
            "loss_sequences_upper_95": 4.838782629666898,
            "loss_tokens_lower_95": 4.764194291666667,
            "loss_tokens_upper_95": 4.79067465625,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.155488245356107,
            "data_time": 0.010624109511356429,
            "batch_time": 0.024687995948639525,
            "samples_per_second": 2144283.0985791204,
            "samples_per_second_per_gpu": 268035.38732239004,
            "loss_sequences_lower_95": 4.0919373180133025,
            "loss_sequences_upper_95": 4.221988132577807,
            "loss_tokens_lower_95": 4.142816520833334,
            "loss_tokens_upper_95": 4.168015177083333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.580702033909884,
            "data_time": 0.0827926652772086,
            "batch_time": 0.09792992898396083,
            "samples_per_second": 1069530.8339977139,
            "samples_per_second_per_gpu": 133691.35424971423,
            "loss_sequences_lower_95": 5.508922403508967,
            "loss_sequences_upper_95": 5.653918725794012,
            "loss_tokens_lower_95": 5.550870791348545,
            "loss_tokens_upper_95": 5.610777577486905,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.903382336085461,
            "data_time": 0.014507912776686928,
            "batch_time": 0.02822175215591084,
            "samples_per_second": 2132822.8883378822,
            "samples_per_second_per_gpu": 266602.8610422353,
            "loss_sequences_lower_95": 4.805907402372221,
            "loss_sequences_upper_95": 5.000751506482894,
            "loss_tokens_lower_95": 4.889287416666667,
            "loss_tokens_upper_95": 4.917360020833334,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.343019747167904,
            "data_time": 0.013455308973789215,
            "batch_time": 0.02737082540988922,
            "samples_per_second": 2151833.704722348,
            "samples_per_second_per_gpu": 268979.2130902935,
            "loss_sequences_lower_95": 6.281069326274943,
            "loss_sequences_upper_95": 6.409523175322601,
            "loss_tokens_lower_95": 6.330924427083334,
            "loss_tokens_upper_95": 6.355108864583333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.979662082234367,
            "data_time": 0.037143904715776443,
            "batch_time": 0.05339378863573074,
            "samples_per_second": 1738365.8122590922,
            "samples_per_second_per_gpu": 217295.72653238653,
            "loss_sequences_lower_95": 4.882971866795274,
            "loss_sequences_upper_95": 5.1339118707375455,
            "loss_tokens_lower_95": 4.96506200071241,
            "loss_tokens_upper_95": 4.9944438371501985,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.731866793370352,
            "data_time": 0.001849083870581451,
            "batch_time": 0.015465619521966557,
            "samples_per_second": 2274290.1071022237,
            "samples_per_second_per_gpu": 284286.26338777796,
            "loss_sequences_lower_95": 4.714069732765988,
            "loss_sequences_upper_95": 4.749653856510824,
            "loss_tokens_lower_95": 4.714248451631712,
            "loss_tokens_upper_95": 4.7494994091386555,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.757503986904712,
            "data_time": 0.00202879252707123,
            "batch_time": 0.01565363092027652,
            "samples_per_second": 2268524.52653647,
            "samples_per_second_per_gpu": 283565.5658170587,
            "loss_sequences_lower_95": 3.7682487653418146,
            "loss_sequences_upper_95": 3.7948388059213802,
            "loss_tokens_lower_95": 3.73103239859794,
            "loss_tokens_upper_95": 3.751259497051277,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.167914063174825,
            "data_time": 0.003045191301926479,
            "batch_time": 0.016710280023803134,
            "samples_per_second": 2264308.1087785843,
            "samples_per_second_per_gpu": 283038.51359732304,
            "loss_sequences_lower_95": 6.350354171125856,
            "loss_sequences_upper_95": 6.640093045601825,
            "loss_tokens_lower_95": 5.73469737963873,
            "loss_tokens_upper_95": 5.942022075384948,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.1878005575339,
            "data_time": 0.004149559171909982,
            "batch_time": 0.017937279603582747,
            "samples_per_second": 2227922.194464631,
            "samples_per_second_per_gpu": 278490.27430807886,
            "loss_sequences_lower_95": 6.316680126953124,
            "loss_sequences_upper_95": 6.5131052734375,
            "loss_tokens_lower_95": 5.852303618808962,
            "loss_tokens_upper_95": 5.987292968749999,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.315246467973537,
            "data_time": 0.0045318891200185,
            "batch_time": 0.01820821460016173,
            "samples_per_second": 2242398.845031559,
            "samples_per_second_per_gpu": 280299.8556289449,
            "loss_sequences_lower_95": 4.35754507439348,
            "loss_sequences_upper_95": 4.433141710398503,
            "loss_tokens_lower_95": 4.207549260115187,
            "loss_tokens_upper_95": 4.243025201759268,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.707206562432376,
            "data_time": 0.023173185331480845,
            "batch_time": 0.0380003707749503,
            "samples_per_second": 1985872.893275827,
            "samples_per_second_per_gpu": 248234.11165947837,
            "loss_sequences_lower_95": 4.608115331476385,
            "loss_sequences_upper_95": 4.872409362792969,
            "loss_tokens_lower_95": 4.6011853774060265,
            "loss_tokens_upper_95": 4.682096526063186,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.557747535316311,
            "data_time": 0.020271534100174904,
            "batch_time": 0.034644631668925285,
            "samples_per_second": 1966361.3835751382,
            "samples_per_second_per_gpu": 245795.17294689227,
            "loss_sequences_lower_95": 4.53049945442044,
            "loss_sequences_upper_95": 4.7577093879544,
            "loss_tokens_lower_95": 4.4296936026891505,
            "loss_tokens_upper_95": 4.536078251946513,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.117558722496033,
            "data_time": 0.01705617171067458,
            "batch_time": 0.03131791108693832,
            "samples_per_second": 2016598.3610738148,
            "samples_per_second_per_gpu": 252074.79513422685,
            "loss_sequences_lower_95": 5.062156656901042,
            "loss_sequences_upper_95": 5.188087025960286,
            "loss_tokens_lower_95": 4.984258784234008,
            "loss_tokens_upper_95": 5.209883093156095,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.143271126053616,
            "data_time": 0.001743362194181791,
            "batch_time": 0.015405074992338019,
            "samples_per_second": 2267993.38503209,
            "samples_per_second_per_gpu": 283499.17312901124,
            "loss_sequences_lower_95": 8.162550863331036,
            "loss_sequences_upper_95": 8.239383596402735,
            "loss_tokens_lower_95": 7.988789292070811,
            "loss_tokens_upper_95": 8.068571289389126,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.589921011065795,
            "data_time": 0.0028861873101868087,
            "batch_time": 0.016794603943024705,
            "samples_per_second": 2229565.9989745314,
            "samples_per_second_per_gpu": 278695.7498718164,
            "loss_sequences_lower_95": 6.2169340294218225,
            "loss_sequences_upper_95": 6.534548282302188,
            "loss_tokens_lower_95": 4.787968568169562,
            "loss_tokens_upper_95": 4.935591379108481,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.046625143330251,
            "data_time": 0.005115038237056216,
            "batch_time": 0.018808735786257563,
            "samples_per_second": 2232616.0261174347,
            "samples_per_second_per_gpu": 279077.00326467934,
            "loss_sequences_lower_95": 5.50209289134159,
            "loss_sequences_upper_95": 5.856250541608895,
            "loss_tokens_lower_95": 4.612695714280312,
            "loss_tokens_upper_95": 4.782455364570728,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.687395840475004,
            "data_time": 0.024751422660691396,
            "batch_time": 0.03968106635979244,
            "samples_per_second": 1947120.748331851,
            "samples_per_second_per_gpu": 243390.0935414814,
            "loss_sequences_lower_95": 5.61036376953125,
            "loss_sequences_upper_95": 5.763751750231878,
            "loss_tokens_lower_95": 5.611294026135309,
            "loss_tokens_upper_95": 5.764518870175157,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.288930926322937,
            "data_time": 0.04971363911261925,
            "batch_time": 0.06470099779275748,
            "samples_per_second": 1695744.4559378696,
            "samples_per_second_per_gpu": 211968.0569922337,
            "loss_sequences_lower_95": 4.139150604248047,
            "loss_sequences_upper_95": 4.563534996032715,
            "loss_tokens_lower_95": 3.938831145093778,
            "loss_tokens_upper_95": 4.419890647710756,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.314207376452771,
            "data_time": 0.0034913153736137906,
            "batch_time": 0.017126255240177083,
            "samples_per_second": 2266130.7488166313,
            "samples_per_second_per_gpu": 283266.3436020789,
            "loss_sequences_lower_95": 5.262199547102898,
            "loss_sequences_upper_95": 5.366230541217535,
            "loss_tokens_lower_95": 5.261480844081851,
            "loss_tokens_upper_95": 5.366408998768551,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.042008522199276,
            "data_time": 0.004963335360827298,
            "batch_time": 0.018570706661529292,
            "samples_per_second": 2250081.9198164986,
            "samples_per_second_per_gpu": 281260.23997706233,
            "loss_sequences_lower_95": 4.992378283601428,
            "loss_sequences_upper_95": 5.0902934006449625,
            "loss_tokens_lower_95": 4.992052162913979,
            "loss_tokens_upper_95": 5.092122015925727,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.202725227042563,
            "data_time": 0.003591838717203161,
            "batch_time": 0.01743361218701686,
            "samples_per_second": 2254309.007850006,
            "samples_per_second_per_gpu": 281788.6259812508,
            "loss_sequences_lower_95": 4.342990135549851,
            "loss_sequences_upper_95": 4.466643483490887,
            "loss_tokens_lower_95": 4.041350045526797,
            "loss_tokens_upper_95": 4.099050664597361,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.305997536659241,
            "data_time": 0.011132764630019665,
            "batch_time": 0.02546169050037861,
            "samples_per_second": 2087564.6654131652,
            "samples_per_second_per_gpu": 260945.58317664565,
            "loss_sequences_lower_95": 6.5020875610351565,
            "loss_sequences_upper_95": 7.052536218261719,
            "loss_tokens_lower_95": 5.630825347193498,
            "loss_tokens_upper_95": 5.995881989111101,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.720055267214775,
            "data_time": 0.1593533605337143,
            "batch_time": 0.17780885100364685,
            "samples_per_second": 860474.0069244254,
            "samples_per_second_per_gpu": 107559.25086555317,
            "loss_sequences_lower_95": 4.433132457733154,
            "loss_sequences_upper_95": 5.125186824798584,
            "loss_tokens_lower_95": 4.1899348467245865,
            "loss_tokens_upper_95": 5.059106901322289,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.922328130952243,
            "data_time": 0.029344715970627804,
            "batch_time": 0.04362831724450943,
            "samples_per_second": 1848465.173863774,
            "samples_per_second_per_gpu": 231058.14673297174,
            "loss_sequences_lower_95": 6.373912100956358,
            "loss_sequences_upper_95": 7.210449815070492,
            "loss_tokens_lower_95": 4.491777442837514,
            "loss_tokens_upper_95": 4.969013146160689,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.166241340524716,
            "data_time": 0.0030092872265312406,
            "batch_time": 0.016759542955292597,
            "samples_per_second": 2229273.030373403,
            "samples_per_second_per_gpu": 278659.12879667536,
            "loss_sequences_lower_95": 4.126084191291913,
            "loss_sequences_upper_95": 4.206466601477174,
            "loss_tokens_lower_95": 4.124597023980177,
            "loss_tokens_upper_95": 4.2065522480135975,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.280184237384496,
            "data_time": 0.002863834382583802,
            "batch_time": 0.016741069102407566,
            "samples_per_second": 2232368.059464185,
            "samples_per_second_per_gpu": 279046.00743302313,
            "loss_sequences_lower_95": 5.2522464253984325,
            "loss_sequences_upper_95": 5.474819507447118,
            "loss_tokens_lower_95": 5.008972679747312,
            "loss_tokens_upper_95": 5.224028653191078,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8895980840200908,
            "data_time": 0.019877089394463435,
            "batch_time": 0.03523606724209256,
            "samples_per_second": 1850364.5676413213,
            "samples_per_second_per_gpu": 231295.57095516517,
            "loss_sequences_lower_95": 3.743866067404275,
            "loss_sequences_upper_95": 4.161045898884644,
            "loss_tokens_lower_95": 3.613325222414142,
            "loss_tokens_upper_95": 3.930149232933193,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.189096286151949,
            "data_time": 0.004752522706985474,
            "batch_time": 0.018998784199357032,
            "samples_per_second": 2141805.337545992,
            "samples_per_second_per_gpu": 267725.667193249,
            "loss_sequences_lower_95": 4.2239388168563785,
            "loss_sequences_upper_95": 4.367949403734215,
            "loss_tokens_lower_95": 4.054117649762565,
            "loss_tokens_upper_95": 4.203859876003319,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.847036054948481,
            "data_time": 0.03172196376891363,
            "batch_time": 0.04648100762140183,
            "samples_per_second": 1879967.3665136173,
            "samples_per_second_per_gpu": 234995.92081420217,
            "loss_sequences_lower_95": 3.6363696493753572,
            "loss_sequences_upper_95": 4.154288743181926,
            "loss_tokens_lower_95": 3.555986035395213,
            "loss_tokens_upper_95": 3.958230974156938,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.675273904587703,
            "data_time": 0.002211854070174994,
            "batch_time": 0.015985998591088887,
            "samples_per_second": 2236379.610301511,
            "samples_per_second_per_gpu": 279547.45128768886,
            "loss_sequences_lower_95": 4.657359968087367,
            "loss_sequences_upper_95": 4.69274790504976,
            "loss_tokens_lower_95": 4.6577210266272004,
            "loss_tokens_upper_95": 4.692606529118324,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.1498879541471165,
            "data_time": 0.04805013916709206,
            "batch_time": 0.06308725096962668,
            "samples_per_second": 1634173.1865706672,
            "samples_per_second_per_gpu": 204271.6483213334,
            "loss_sequences_lower_95": 2.034877640066795,
            "loss_sequences_upper_95": 2.3317079673693017,
            "loss_tokens_lower_95": 1.906930913700662,
            "loss_tokens_upper_95": 2.235343085047899,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.88244233926762,
            "data_time": 0.0016654810281931005,
            "batch_time": 0.015430028710361298,
            "samples_per_second": 2239945.207731703,
            "samples_per_second_per_gpu": 279993.15096646285,
            "loss_sequences_lower_95": 6.326658784558438,
            "loss_sequences_upper_95": 6.3799055784853245,
            "loss_tokens_lower_95": 5.206084344777563,
            "loss_tokens_upper_95": 5.260433002901354,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.4534660987854,
            "data_time": 0.006365089189438592,
            "batch_time": 0.020438069862032695,
            "samples_per_second": 2160648.667476803,
            "samples_per_second_per_gpu": 270081.0834346004,
            "loss_sequences_lower_95": 6.374698352050782,
            "loss_sequences_upper_95": 6.645425451660156,
            "loss_tokens_lower_95": 6.272580071705216,
            "loss_tokens_upper_95": 6.5197049429318925,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.019067418056985,
            "data_time": 0.02203665143352444,
            "batch_time": 0.0361473075414108,
            "samples_per_second": 1998151.7459946326,
            "samples_per_second_per_gpu": 249768.96824932907,
            "loss_sequences_lower_95": 4.889364451532779,
            "loss_sequences_upper_95": 5.1509459520422896,
            "loss_tokens_lower_95": 4.888840703549593,
            "loss_tokens_upper_95": 5.149005418860393,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.737285555131507,
            "data_time": 0.004598416477800852,
            "batch_time": 0.018620922622910464,
            "samples_per_second": 2191693.982359021,
            "samples_per_second_per_gpu": 273961.7477948776,
            "loss_sequences_lower_95": 8.639931936553031,
            "loss_sequences_upper_95": 8.835611794211648,
            "loss_tokens_lower_95": 8.643355139530067,
            "loss_tokens_upper_95": 8.832998231830018,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.4413612225453059,
            "data_time": 0.004211243796855845,
            "batch_time": 0.018113200334792443,
            "samples_per_second": 2212467.5065311776,
            "samples_per_second_per_gpu": 276558.4383163972,
            "loss_sequences_lower_95": 1.5014303833007812,
            "loss_sequences_upper_95": 1.589760518391927,
            "loss_tokens_lower_95": 1.3326067145608242,
            "loss_tokens_upper_95": 1.4019905774809922,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.238810429118929,
            "data_time": 0.024438698376928056,
            "batch_time": 0.039147945387022834,
            "samples_per_second": 1873631.744611712,
            "samples_per_second_per_gpu": 234203.968076464,
            "loss_sequences_lower_95": 5.942612144833519,
            "loss_sequences_upper_95": 6.531215704055059,
            "loss_tokens_lower_95": 5.939963233584449,
            "loss_tokens_upper_95": 6.530205296107701,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.3403720259666443,
            "data_time": 0.1576937586069107,
            "batch_time": 0.17504920065402985,
            "samples_per_second": 873652.9150333607,
            "samples_per_second_per_gpu": 109206.61437917009,
            "loss_sequences_lower_95": 3.075349497795105,
            "loss_sequences_upper_95": 4.4079714179039,
            "loss_tokens_lower_95": 2.7554814289525615,
            "loss_tokens_upper_95": 3.3222824631523835,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.747236303806305,
            "data_time": 0.0059161777534182105,
            "batch_time": 0.019717566077671354,
            "samples_per_second": 2201910.6212547827,
            "samples_per_second_per_gpu": 275238.82765684783,
            "loss_sequences_lower_95": 7.690684155273438,
            "loss_sequences_upper_95": 8.044695190429689,
            "loss_tokens_lower_95": 7.414843460831747,
            "loss_tokens_upper_95": 7.727953502571118,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.220006835460663,
            "data_time": 0.005955619944466485,
            "batch_time": 0.019742312885466077,
            "samples_per_second": 2206246.9050270934,
            "samples_per_second_per_gpu": 275780.86312838667,
            "loss_sequences_lower_95": 7.287921923828125,
            "loss_sequences_upper_95": 7.506754577636719,
            "loss_tokens_lower_95": 6.998393806530108,
            "loss_tokens_upper_95": 7.17916076996354,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.423830020839322,
            "data_time": 0.0041991933931076406,
            "batch_time": 0.018202541823371197,
            "samples_per_second": 2191394.338959593,
            "samples_per_second_per_gpu": 273924.2923699491,
            "loss_sequences_lower_95": 4.391813481145763,
            "loss_sequences_upper_95": 4.456688708344682,
            "loss_tokens_lower_95": 4.39096535642076,
            "loss_tokens_upper_95": 4.456084519634701,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.6927910129412345,
            "data_time": 0.008998368081729578,
            "batch_time": 0.02317837429910988,
            "samples_per_second": 2119077.1240979666,
            "samples_per_second_per_gpu": 264884.6405122458,
            "loss_sequences_lower_95": 4.602376639604935,
            "loss_sequences_upper_95": 4.779153147051411,
            "loss_tokens_lower_95": 4.601246242259505,
            "loss_tokens_upper_95": 4.780106101790515,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 9.399729832172394,
            "data_time": 0.006014776608300587,
            "batch_time": 0.01985635076250349,
            "samples_per_second": 2189841.0429709866,
            "samples_per_second_per_gpu": 273730.1303713733,
            "loss_sequences_lower_95": 9.302161767578125,
            "loss_sequences_upper_95": 9.497420556640625,
            "loss_tokens_lower_95": 9.302178442382813,
            "loss_tokens_upper_95": 9.495573095703126,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.766905339743128,
            "data_time": 0.0020956864203757735,
            "batch_time": 0.0157619419981266,
            "samples_per_second": 2257540.8054810525,
            "samples_per_second_per_gpu": 282192.60068513156,
            "loss_sequences_lower_95": 5.335212339980488,
            "loss_sequences_upper_95": 5.440566766571074,
            "loss_tokens_lower_95": 4.059181787007316,
            "loss_tokens_upper_95": 4.130937340037538,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.096128911224763,
            "data_time": 0.019129647527422224,
            "batch_time": 0.03371427059173584,
            "samples_per_second": 1948871.272416457,
            "samples_per_second_per_gpu": 243608.90905205713,
            "loss_sequences_lower_95": 4.9348808288574215,
            "loss_sequences_upper_95": 5.262175944314074,
            "loss_tokens_lower_95": 4.93210683794164,
            "loss_tokens_upper_95": 5.257116152635261,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.121186034819659,
            "data_time": 0.01119140163064003,
            "batch_time": 0.025340923108160496,
            "samples_per_second": 2143653.833723336,
            "samples_per_second_per_gpu": 267956.729215417,
            "loss_sequences_lower_95": 5.016065314797793,
            "loss_sequences_upper_95": 5.226941971124387,
            "loss_tokens_lower_95": 5.016657236136642,
            "loss_tokens_upper_95": 5.22641149184283,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.372252031210653,
            "data_time": 0.0024183560800898163,
            "batch_time": 0.016414497577717794,
            "samples_per_second": 2202124.9587248336,
            "samples_per_second_per_gpu": 275265.6198406042,
            "loss_sequences_lower_95": 5.834384859819304,
            "loss_sequences_upper_95": 5.947304428159839,
            "loss_tokens_lower_95": 4.6477144993378126,
            "loss_tokens_upper_95": 4.736492805580251,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.447501560998341,
            "data_time": 0.028249002993106842,
            "batch_time": 0.04284793138504028,
            "samples_per_second": 2006078.8172455542,
            "samples_per_second_per_gpu": 250759.85215569427,
            "loss_sequences_lower_95": 5.332273420707258,
            "loss_sequences_upper_95": 5.561072084134218,
            "loss_tokens_lower_95": 5.334086189068184,
            "loss_tokens_upper_95": 5.561240318338707,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.881960787175263,
            "data_time": 0.0036028601747729403,
            "batch_time": 0.017399168276524806,
            "samples_per_second": 2230701.354383337,
            "samples_per_second_per_gpu": 278837.6692979171,
            "loss_sequences_lower_95": 6.855799198442278,
            "loss_sequences_upper_95": 6.90812882262997,
            "loss_tokens_lower_95": 6.8550824702551605,
            "loss_tokens_upper_95": 6.909079970613532,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.229573275857759,
            "data_time": 0.023638707941228694,
            "batch_time": 0.038236680897799406,
            "samples_per_second": 1858452.4259558623,
            "samples_per_second_per_gpu": 232306.55324448278,
            "loss_sequences_lower_95": 5.070740783098832,
            "loss_sequences_upper_95": 5.3902127015937875,
            "loss_tokens_lower_95": 5.067912559138918,
            "loss_tokens_upper_95": 5.390385851813751,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.495747935771942,
            "data_time": 0.07517711818218231,
            "batch_time": 0.08999154716730118,
            "samples_per_second": 1508678.6020694573,
            "samples_per_second_per_gpu": 188584.82525868216,
            "loss_sequences_lower_95": 4.1865031560262045,
            "loss_sequences_upper_95": 4.991108843485515,
            "loss_tokens_lower_95": 3.742800129784478,
            "loss_tokens_upper_95": 4.93307032055325,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7533793489138287,
            "data_time": 0.08034874498844147,
            "batch_time": 0.09616968780755997,
            "samples_per_second": 1387279.3888186505,
            "samples_per_second_per_gpu": 173409.9236023313,
            "loss_sequences_lower_95": 3.457934602101644,
            "loss_sequences_upper_95": 4.334259414672851,
            "loss_tokens_lower_95": 2.8774613112546086,
            "loss_tokens_upper_95": 4.051212979434581,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.287016414367047,
            "data_time": 0.0036030424700687043,
            "batch_time": 0.017416084703801307,
            "samples_per_second": 2229014.211067747,
            "samples_per_second_per_gpu": 278626.77638346836,
            "loss_sequences_lower_95": 5.253861190054308,
            "loss_sequences_upper_95": 5.322094978253865,
            "loss_tokens_lower_95": 5.252918426109168,
            "loss_tokens_upper_95": 5.320854571290501,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.7530288222181356,
            "data_time": 0.0016325812387388175,
            "batch_time": 0.015327009529331841,
            "samples_per_second": 2254934.198020377,
            "samples_per_second_per_gpu": 281866.77475254715,
            "loss_sequences_lower_95": 2.0170299764551136,
            "loss_sequences_upper_95": 2.0520265257962627,
            "loss_tokens_lower_95": 1.470610495057821,
            "loss_tokens_upper_95": 1.491046701493263,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.611229964128629,
            "data_time": 0.04029830917716026,
            "batch_time": 0.07174096629023552,
            "samples_per_second": 1949805.2656916014,
            "samples_per_second_per_gpu": 243725.65821145018,
            "loss_sequences_lower_95": 5.640032382274238,
            "loss_sequences_upper_95": 6.049286549485575,
            "loss_tokens_lower_95": 5.24343009242201,
            "loss_tokens_upper_95": 5.464521923360353,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.97421222119718,
            "data_time": 0.12459987685793922,
            "batch_time": 0.1416188989366804,
            "samples_per_second": 965322.9456853586,
            "samples_per_second_per_gpu": 120665.36821066983,
            "loss_sequences_lower_95": 7.55999755859375,
            "loss_sequences_upper_95": 8.573950917011981,
            "loss_tokens_lower_95": 7.240049800166378,
            "loss_tokens_upper_95": 8.366844629358363,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.491837545139034,
            "data_time": 0.031199409848167783,
            "batch_time": 0.04610615684872582,
            "samples_per_second": 1867210.8539331872,
            "samples_per_second_per_gpu": 233401.3567416484,
            "loss_sequences_lower_95": 5.448071000634171,
            "loss_sequences_upper_95": 5.801369001807236,
            "loss_tokens_lower_95": 5.146445493192809,
            "loss_tokens_upper_95": 5.336734913793103,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.566205972578468,
            "data_time": 0.0314251666977292,
            "batch_time": 0.04612678289413452,
            "samples_per_second": 1909719.135804941,
            "samples_per_second_per_gpu": 238714.89197561762,
            "loss_sequences_lower_95": 5.534036729393936,
            "loss_sequences_upper_95": 5.8561722173923405,
            "loss_tokens_lower_95": 5.246448745987012,
            "loss_tokens_upper_95": 5.407072238825935,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.693588925570976,
            "data_time": 0.03202025663285028,
            "batch_time": 0.047527537459418887,
            "samples_per_second": 1811980.1430137726,
            "samples_per_second_per_gpu": 226497.51787672157,
            "loss_sequences_lower_95": 5.636311507806545,
            "loss_sequences_upper_95": 6.037434191820098,
            "loss_tokens_lower_95": 5.318354433223117,
            "loss_tokens_upper_95": 5.569022275059302,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.562075934759,
            "data_time": 0.030749110948471797,
            "batch_time": 0.044841417244502475,
            "samples_per_second": 1968756.3770357128,
            "samples_per_second_per_gpu": 246094.5471294641,
            "loss_sequences_lower_95": 5.505836133259098,
            "loss_sequences_upper_95": 5.79489519072742,
            "loss_tokens_lower_95": 5.283330081927813,
            "loss_tokens_upper_95": 5.428678737176913,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.889656212018884,
            "data_time": 0.03415759110156401,
            "batch_time": 0.04870286105591574,
            "samples_per_second": 1929328.2693111605,
            "samples_per_second_per_gpu": 241166.03366389507,
            "loss_sequences_lower_95": 5.864128491893318,
            "loss_sequences_upper_95": 6.132721250545904,
            "loss_tokens_lower_95": 5.646242653645071,
            "loss_tokens_upper_95": 5.768500383884176,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.863556277461168,
            "data_time": 0.030865260532924106,
            "batch_time": 0.04637302954991659,
            "samples_per_second": 1822519.6417080306,
            "samples_per_second_per_gpu": 227814.95521350382,
            "loss_sequences_lower_95": 5.886783153254811,
            "loss_sequences_upper_95": 6.193191481799614,
            "loss_tokens_lower_95": 5.526028139598402,
            "loss_tokens_upper_95": 5.656711309523809,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.5/params.txt",
    "uuid": "bdbc2aac-8626-40ec-bc7b-bd6a9fb310e5",
    "creation_date": "2023_12_14-04_59_23"
}