{
    "name": "c4_original-d=96_l=8_h=4-4.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 845544960,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 4.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "169108992",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=96_l=8_h=4-4.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 5.875282943248749,
            "data_time": 0.12845855951309204,
            "batch_time": 1.1540431082248688,
            "samples_per_second": 379000.7298591876,
            "samples_per_second_per_gpu": 47375.09123239845,
            "loss_sequences_lower_95": 5.708519528706868,
            "loss_sequences_upper_95": 6.0444269943237305,
            "loss_tokens_lower_95": 5.8593464279174805,
            "loss_tokens_upper_95": 5.89120194753011,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.77306441451568,
            "data_time": 0.018866943134179484,
            "batch_time": 0.06408047653674452,
            "samples_per_second": 4660869.362826856,
            "samples_per_second_per_gpu": 582608.670353357,
            "loss_sequences_lower_95": 4.770821908566572,
            "loss_sequences_upper_95": 4.775336355060059,
            "loss_tokens_lower_95": 4.761447364583334,
            "loss_tokens_upper_95": 4.784617520833334,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.494954681396484,
            "data_time": 0.09507498890161514,
            "batch_time": 0.14030493050813675,
            "samples_per_second": 4061636.2308137044,
            "samples_per_second_per_gpu": 507704.52885171305,
            "loss_sequences_lower_95": 5.456078217175542,
            "loss_sequences_upper_95": 5.544166197484853,
            "loss_tokens_lower_95": 5.4798319375,
            "loss_tokens_upper_95": 5.510007166666666,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.94040164170806,
            "data_time": 0.015171120825566743,
            "batch_time": 0.059222432343583355,
            "samples_per_second": 5255124.175891322,
            "samples_per_second_per_gpu": 656890.5219864153,
            "loss_sequences_lower_95": 4.91011189191366,
            "loss_sequences_upper_95": 4.971386728817654,
            "loss_tokens_lower_95": 4.928050385416667,
            "loss_tokens_upper_95": 4.952648083333333,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.805677925739172,
            "data_time": 0.09741900861263275,
            "batch_time": 0.14151063561439514,
            "samples_per_second": 4205858.513011142,
            "samples_per_second_per_gpu": 525732.3141263927,
            "loss_sequences_lower_95": 4.7567340672137535,
            "loss_sequences_upper_95": 4.862199299301489,
            "loss_tokens_lower_95": 4.7938753541666665,
            "loss_tokens_upper_95": 4.817498666666666,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.560585918595076,
            "data_time": 0.03417683641115824,
            "batch_time": 0.07736906905968984,
            "samples_per_second": 5025166.041967783,
            "samples_per_second_per_gpu": 628145.7552459729,
            "loss_sequences_lower_95": 5.50867085926878,
            "loss_sequences_upper_95": 5.614342456768354,
            "loss_tokens_lower_95": 5.5470747708333334,
            "loss_tokens_upper_95": 5.573855593749999,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.622241864009779,
            "data_time": 0.012860212475061417,
            "batch_time": 0.05552502647042275,
            "samples_per_second": 5224900.881416393,
            "samples_per_second_per_gpu": 653112.6101770492,
            "loss_sequences_lower_95": 6.5889488500478315,
            "loss_sequences_upper_95": 6.655535465162628,
            "loss_tokens_lower_95": 6.60599109375,
            "loss_tokens_upper_95": 6.638913052083333,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.181469903117075,
            "data_time": 0.01397208477321424,
            "batch_time": 0.057511628458374424,
            "samples_per_second": 5294046.247394939,
            "samples_per_second_per_gpu": 661755.7809243674,
            "loss_sequences_lower_95": 5.162080845058901,
            "loss_sequences_upper_95": 5.202285227830497,
            "loss_tokens_lower_95": 5.168921979166667,
            "loss_tokens_upper_95": 5.193941302083333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.171006878217061,
            "data_time": 0.09327902644872665,
            "batch_time": 0.13781704008579254,
            "samples_per_second": 4159837.9598943377,
            "samples_per_second_per_gpu": 519979.7449867922,
            "loss_sequences_lower_95": 5.099101158080062,
            "loss_sequences_upper_95": 5.254838140224054,
            "loss_tokens_lower_95": 5.158489447916667,
            "loss_tokens_upper_95": 5.183359625,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.209184735188843,
            "data_time": 0.09535824507474899,
            "batch_time": 0.14159663021564484,
            "samples_per_second": 4182247.133741295,
            "samples_per_second_per_gpu": 522780.89171766187,
            "loss_sequences_lower_95": 6.127047608869349,
            "loss_sequences_upper_95": 6.308077170632102,
            "loss_tokens_lower_95": 6.195458927083333,
            "loss_tokens_upper_95": 6.222557125,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.429508536943789,
            "data_time": 0.010166672283205492,
            "batch_time": 0.053593271251382496,
            "samples_per_second": 5386322.134137479,
            "samples_per_second_per_gpu": 673290.2667671848,
            "loss_sequences_lower_95": 5.419389991713205,
            "loss_sequences_upper_95": 5.439855727846203,
            "loss_tokens_lower_95": 5.416641489583333,
            "loss_tokens_upper_95": 5.44240390625,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.150417385077486,
            "data_time": 0.029639598727226258,
            "batch_time": 0.0895389050245285,
            "samples_per_second": 4977755.862429022,
            "samples_per_second_per_gpu": 622219.4828036277,
            "loss_sequences_lower_95": 5.130440929560599,
            "loss_sequences_upper_95": 5.170710578356024,
            "loss_tokens_lower_95": 5.137527958333333,
            "loss_tokens_upper_95": 5.1627465104166665,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.457046614215534,
            "data_time": 0.09899157285690308,
            "batch_time": 0.14509153366088867,
            "samples_per_second": 4108075.2717447747,
            "samples_per_second_per_gpu": 513509.40896809683,
            "loss_sequences_lower_95": 5.379093762875571,
            "loss_sequences_upper_95": 5.546868128902288,
            "loss_tokens_lower_95": 5.443616979166667,
            "loss_tokens_upper_95": 5.470488812499999,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.229042866079482,
            "data_time": 0.10088805109262466,
            "batch_time": 0.1462448090314865,
            "samples_per_second": 4080569.365227735,
            "samples_per_second_per_gpu": 510071.1706534669,
            "loss_sequences_lower_95": 5.160541604346996,
            "loss_sequences_upper_95": 5.307694071907619,
            "loss_tokens_lower_95": 5.216322166666667,
            "loss_tokens_upper_95": 5.24260303125,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.50255286693573,
            "data_time": 0.1509033888578415,
            "batch_time": 0.1846117526292801,
            "samples_per_second": 843416.3185352283,
            "samples_per_second_per_gpu": 105427.03981690353,
            "loss_sequences_lower_95": 6.432044584100897,
            "loss_sequences_upper_95": 6.5746530012650926,
            "loss_tokens_lower_95": 6.473432315479625,
            "loss_tokens_upper_95": 6.531886308843439,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.091032848413762,
            "data_time": 0.09879713505506516,
            "batch_time": 0.13444169610738754,
            "samples_per_second": 3296485.0109463753,
            "samples_per_second_per_gpu": 412060.6263682969,
            "loss_sequences_lower_95": 5.956883804721665,
            "loss_sequences_upper_95": 6.225365054294597,
            "loss_tokens_lower_95": 6.07640484375,
            "loss_tokens_upper_95": 6.10607834375,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.966599314697185,
            "data_time": 0.09356649219989777,
            "batch_time": 0.1300266534090042,
            "samples_per_second": 3774521.8771886313,
            "samples_per_second_per_gpu": 471815.2346485789,
            "loss_sequences_lower_95": 6.888717401751113,
            "loss_sequences_upper_95": 7.059171764202671,
            "loss_tokens_lower_95": 6.954305166666666,
            "loss_tokens_upper_95": 6.978838583333333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.058997732694031,
            "data_time": 0.160586878657341,
            "batch_time": 0.19012093544006348,
            "samples_per_second": 2186748.2181860814,
            "samples_per_second_per_gpu": 273343.5272732602,
            "loss_sequences_lower_95": 5.939642233926742,
            "loss_sequences_upper_95": 6.262165169637711,
            "loss_tokens_lower_95": 6.043676169973906,
            "loss_tokens_upper_95": 6.074178251672964,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.41610210771456,
            "data_time": 0.027574477954344315,
            "batch_time": 0.07216674495827068,
            "samples_per_second": 4480399.47762764,
            "samples_per_second_per_gpu": 560049.934703455,
            "loss_sequences_lower_95": 5.39369893945396,
            "loss_sequences_upper_95": 5.4378991513183665,
            "loss_tokens_lower_95": 5.3934796197345465,
            "loss_tokens_upper_95": 5.4382220108647275,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.456402782994042,
            "data_time": 0.030230264365673064,
            "batch_time": 0.0741940651088953,
            "samples_per_second": 4408279.252413895,
            "samples_per_second_per_gpu": 551034.9065517369,
            "loss_sequences_lower_95": 4.4725921831432975,
            "loss_sequences_upper_95": 4.498470107190426,
            "loss_tokens_lower_95": 4.444656846034423,
            "loss_tokens_upper_95": 4.465936723963826,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.043595433854584,
            "data_time": 0.050545050038231745,
            "batch_time": 0.09289822644657558,
            "samples_per_second": 4308430.013324443,
            "samples_per_second_per_gpu": 538553.7516655554,
            "loss_sequences_lower_95": 7.516739662829918,
            "loss_sequences_upper_95": 7.7979126380196035,
            "loss_tokens_lower_95": 6.907414208158357,
            "loss_tokens_upper_95": 7.1132415429734035,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.827769812583924,
            "data_time": 0.04624688376983007,
            "batch_time": 0.09114587431152661,
            "samples_per_second": 4321711.393421832,
            "samples_per_second_per_gpu": 540213.924177729,
            "loss_sequences_lower_95": 7.181013069661458,
            "loss_sequences_upper_95": 7.359356754557291,
            "loss_tokens_lower_95": 6.727056456367925,
            "loss_tokens_upper_95": 6.856575766509434,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.223872940317983,
            "data_time": 0.07463355859120686,
            "batch_time": 0.11480696747700374,
            "samples_per_second": 3963962.4296511686,
            "samples_per_second_per_gpu": 495495.3037063961,
            "loss_sequences_lower_95": 5.325128789269807,
            "loss_sequences_upper_95": 5.397552059887936,
            "loss_tokens_lower_95": 5.196634068580002,
            "loss_tokens_upper_95": 5.235530427233569,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.205353329398415,
            "data_time": 0.34248186647892,
            "batch_time": 0.3852546215057373,
            "samples_per_second": 2493627.5035997373,
            "samples_per_second_per_gpu": 311703.43794996716,
            "loss_sequences_lower_95": 6.109092462713068,
            "loss_sequences_upper_95": 6.417872480912642,
            "loss_tokens_lower_95": 6.154119029023481,
            "loss_tokens_upper_95": 6.246970894638718,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.238596421845105,
            "data_time": 0.3438364565372467,
            "batch_time": 0.38942600786685944,
            "samples_per_second": 2675016.768001508,
            "samples_per_second_per_gpu": 334377.0960001885,
            "loss_sequences_lower_95": 5.239069500358737,
            "loss_sequences_upper_95": 5.4668585329639665,
            "loss_tokens_lower_95": 5.196081994223934,
            "loss_tokens_upper_95": 5.306280321703622,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.191809407869974,
            "data_time": 0.1760781705379486,
            "batch_time": 0.2071809023618698,
            "samples_per_second": 2624150.4966400387,
            "samples_per_second_per_gpu": 328018.81208000483,
            "loss_sequences_lower_95": 5.151517964680989,
            "loss_sequences_upper_95": 5.26047466023763,
            "loss_tokens_lower_95": 5.095611338385545,
            "loss_tokens_upper_95": 5.288429560148525,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.376855570912966,
            "data_time": 0.024155854247510433,
            "batch_time": 0.06844510529190302,
            "samples_per_second": 4528924.137455581,
            "samples_per_second_per_gpu": 566115.5171819476,
            "loss_sequences_lower_95": 8.443763186789527,
            "loss_sequences_upper_95": 8.51081049545421,
            "loss_tokens_lower_95": 8.327201341440107,
            "loss_tokens_upper_95": 8.396723893879306,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.644303178546404,
            "data_time": 0.04551409035921097,
            "batch_time": 0.08787877410650254,
            "samples_per_second": 4475572.890668407,
            "samples_per_second_per_gpu": 559446.6113335509,
            "loss_sequences_lower_95": 6.776160130356297,
            "loss_sequences_upper_95": 7.058210050537931,
            "loss_tokens_lower_95": 5.503793581651623,
            "loss_tokens_upper_95": 5.6509695886374605,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.432091567296624,
            "data_time": 0.08102012574672698,
            "batch_time": 0.12294034957885742,
            "samples_per_second": 4240210.033924902,
            "samples_per_second_per_gpu": 530026.2542406127,
            "loss_sequences_lower_95": 6.092420725122654,
            "loss_sequences_upper_95": 6.415619250613267,
            "loss_tokens_lower_95": 5.333866440661302,
            "loss_tokens_upper_95": 5.508447470454178,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.6681828041599225,
            "data_time": 0.38063032925128937,
            "batch_time": 0.4230505973100662,
            "samples_per_second": 2181830.5789218154,
            "samples_per_second_per_gpu": 272728.8223652269,
            "loss_sequences_lower_95": 5.614130363290168,
            "loss_sequences_upper_95": 5.721210817659282,
            "loss_tokens_lower_95": 5.613885247217466,
            "loss_tokens_upper_95": 5.7221495188534535,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.946246156692505,
            "data_time": 0.3323343098163605,
            "batch_time": 0.35757413506507874,
            "samples_per_second": 1436444.1774375152,
            "samples_per_second_per_gpu": 179555.5221796894,
            "loss_sequences_lower_95": 4.882741668701172,
            "loss_sequences_upper_95": 5.3383248443603515,
            "loss_tokens_lower_95": 4.6816740796902945,
            "loss_tokens_upper_95": 5.183576957143057,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.790562986473466,
            "data_time": 0.053417036309838295,
            "batch_time": 0.09708080440759659,
            "samples_per_second": 4437508.619271675,
            "samples_per_second_per_gpu": 554688.5774089594,
            "loss_sequences_lower_95": 4.735514776879957,
            "loss_sequences_upper_95": 4.846085316278307,
            "loss_tokens_lower_95": 4.7341175028587195,
            "loss_tokens_upper_95": 4.845839701313811,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.460159741885149,
            "data_time": 0.07558091282844544,
            "batch_time": 0.11921969950199127,
            "samples_per_second": 4389579.271468261,
            "samples_per_second_per_gpu": 548697.4089335327,
            "loss_sequences_lower_95": 5.390706380208333,
            "loss_sequences_upper_95": 5.529819667856777,
            "loss_tokens_lower_95": 5.387378829468672,
            "loss_tokens_upper_95": 5.530656994219006,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.914225704132927,
            "data_time": 0.05345839448273182,
            "batch_time": 0.09523303993046284,
            "samples_per_second": 4200173.5525616165,
            "samples_per_second_per_gpu": 525021.6940702021,
            "loss_sequences_lower_95": 5.178840419698721,
            "loss_sequences_upper_95": 5.297638251602455,
            "loss_tokens_lower_95": 4.873887487796941,
            "loss_tokens_upper_95": 4.934249996871011,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.497804851531982,
            "data_time": 0.18011536449193954,
            "batch_time": 0.22512854635715485,
            "samples_per_second": 3745220.3178675785,
            "samples_per_second_per_gpu": 468152.5397334473,
            "loss_sequences_lower_95": 7.118255908203126,
            "loss_sequences_upper_95": 7.644260290527344,
            "loss_tokens_lower_95": 6.264907087607544,
            "loss_tokens_upper_95": 6.615633369612425,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.955220729112625,
            "data_time": 0.15027335286140442,
            "batch_time": 0.16780385375022888,
            "samples_per_second": 847324.07322417,
            "samples_per_second_per_gpu": 105915.50915302125,
            "loss_sequences_lower_95": 4.713339221477509,
            "loss_sequences_upper_95": 5.287017107009888,
            "loss_tokens_lower_95": 4.445284981563174,
            "loss_tokens_upper_95": 5.323365064599048,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.5718651064511,
            "data_time": 0.36148248612880707,
            "batch_time": 0.3974098861217499,
            "samples_per_second": 2184813.4208418187,
            "samples_per_second_per_gpu": 273101.67760522733,
            "loss_sequences_lower_95": 6.89676127817439,
            "loss_sequences_upper_95": 7.618989036823142,
            "loss_tokens_lower_95": 5.262445942256201,
            "loss_tokens_upper_95": 5.709957937350268,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.555571825861566,
            "data_time": 0.04793042606777615,
            "batch_time": 0.092493270834287,
            "samples_per_second": 4581725.689178152,
            "samples_per_second_per_gpu": 572715.711147269,
            "loss_sequences_lower_95": 4.5324185685957294,
            "loss_sequences_upper_95": 4.579023851333825,
            "loss_tokens_lower_95": 4.5324167874037515,
            "loss_tokens_upper_95": 4.578910441667349,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.5173068375534715,
            "data_time": 0.03312288153739203,
            "batch_time": 0.07652993500232697,
            "samples_per_second": 4364831.262946815,
            "samples_per_second_per_gpu": 545603.9078683519,
            "loss_sequences_lower_95": 6.630961541668689,
            "loss_sequences_upper_95": 6.853498000254706,
            "loss_tokens_lower_95": 6.379102808569541,
            "loss_tokens_upper_95": 6.598966118301131,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.383014145351591,
            "data_time": 0.17477043718099594,
            "batch_time": 0.2045050635933876,
            "samples_per_second": 1981620.4751658877,
            "samples_per_second_per_gpu": 247702.55939573597,
            "loss_sequences_lower_95": 4.296369213062328,
            "loss_sequences_upper_95": 4.681525138882927,
            "loss_tokens_lower_95": 4.175329284950277,
            "loss_tokens_upper_95": 4.511323291873843,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.66440876055253,
            "data_time": 0.0776668757200241,
            "batch_time": 0.12200624942779541,
            "samples_per_second": 4443605.376231597,
            "samples_per_second_per_gpu": 555450.6720289496,
            "loss_sequences_lower_95": 4.740186327277156,
            "loss_sequences_upper_95": 4.878811946588891,
            "loss_tokens_lower_95": 4.581330813871029,
            "loss_tokens_upper_95": 4.7345245622951335,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.194345174766168,
            "data_time": 0.30593574047088623,
            "batch_time": 0.34064507484436035,
            "samples_per_second": 2267918.281474893,
            "samples_per_second_per_gpu": 283489.7851843616,
            "loss_sequences_lower_95": 4.934583505770056,
            "loss_sequences_upper_95": 5.414956553389387,
            "loss_tokens_lower_95": 5.000268455119546,
            "loss_tokens_upper_95": 5.399028674774418,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.6626246978483525,
            "data_time": 0.0284829143446441,
            "batch_time": 0.07236979037616693,
            "samples_per_second": 4427568.9275769945,
            "samples_per_second_per_gpu": 553446.1159471243,
            "loss_sequences_lower_95": 5.6434379063312665,
            "loss_sequences_upper_95": 5.681975145029006,
            "loss_tokens_lower_95": 5.643436509567539,
            "loss_tokens_upper_95": 5.681608518969419,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.146676683888852,
            "data_time": 0.353636234998703,
            "batch_time": 0.38087978959083557,
            "samples_per_second": 1718907.5815849698,
            "samples_per_second_per_gpu": 214863.44769812122,
            "loss_sequences_lower_95": 4.013712044132566,
            "loss_sequences_upper_95": 4.412841567252446,
            "loss_tokens_lower_95": 3.8906684492847203,
            "loss_tokens_upper_95": 4.302227389629445,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.832752468582219,
            "data_time": 0.022669942378997804,
            "batch_time": 0.06731553912162781,
            "samples_per_second": 4476740.63654643,
            "samples_per_second_per_gpu": 559592.5795683038,
            "loss_sequences_lower_95": 6.70730266067217,
            "loss_sequences_upper_95": 6.758779112617924,
            "loss_tokens_lower_95": 5.732186158123791,
            "loss_tokens_upper_95": 5.784973899903289,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.9369026160240175,
            "data_time": 0.09423781931400299,
            "batch_time": 0.13939452916383743,
            "samples_per_second": 4290964.1515920535,
            "samples_per_second_per_gpu": 536370.5189490067,
            "loss_sequences_lower_95": 7.86387705078125,
            "loss_sequences_upper_95": 8.271745556640624,
            "loss_tokens_lower_95": 7.7144036171331205,
            "loss_tokens_upper_95": 8.107419420376546,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.177420218094536,
            "data_time": 0.34444360435009,
            "batch_time": 0.387214258313179,
            "samples_per_second": 2174712.3802973637,
            "samples_per_second_per_gpu": 271839.04753717047,
            "loss_sequences_lower_95": 5.0034663192085596,
            "loss_sequences_upper_95": 5.351303007706352,
            "loss_tokens_lower_95": 5.00403308370839,
            "loss_tokens_upper_95": 5.347179565429688,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.974193731221286,
            "data_time": 0.0707599272330602,
            "batch_time": 0.11103106290102005,
            "samples_per_second": 4034772.158729676,
            "samples_per_second_per_gpu": 504346.5198412095,
            "loss_sequences_lower_95": 9.859522242690579,
            "loss_sequences_upper_95": 10.08684141216856,
            "loss_tokens_lower_95": 9.861061049952651,
            "loss_tokens_upper_95": 10.08954963452888,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.7052595944404603,
            "data_time": 0.06828565895557404,
            "batch_time": 0.11295691629250844,
            "samples_per_second": 4366285.789580883,
            "samples_per_second_per_gpu": 545785.7236976104,
            "loss_sequences_lower_95": 2.86465302734375,
            "loss_sequences_upper_95": 2.9710655436197917,
            "loss_tokens_lower_95": 2.649055958320828,
            "loss_tokens_upper_95": 2.7393902873649463,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.510069308962141,
            "data_time": 0.3369651585817337,
            "batch_time": 0.378438800573349,
            "samples_per_second": 2503136.645143788,
            "samples_per_second_per_gpu": 312892.0806429735,
            "loss_sequences_lower_95": 6.201056242443266,
            "loss_sequences_upper_95": 6.825706205822172,
            "loss_tokens_lower_95": 6.204004429408482,
            "loss_tokens_upper_95": 6.82107421875,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.119089886546135,
            "data_time": 0.14183925092220306,
            "batch_time": 0.15854419767856598,
            "samples_per_second": 1065207.149676834,
            "samples_per_second_per_gpu": 133150.89370960425,
            "loss_sequences_lower_95": 4.767403221130371,
            "loss_sequences_upper_95": 6.1579507946968075,
            "loss_tokens_lower_95": 4.564269339964562,
            "loss_tokens_upper_95": 5.171660458279639,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.467766423225402,
            "data_time": 0.10141784325242043,
            "batch_time": 0.14594997093081474,
            "samples_per_second": 4247935.7220665235,
            "samples_per_second_per_gpu": 530991.9652583154,
            "loss_sequences_lower_95": 8.571478320312501,
            "loss_sequences_upper_95": 8.913347631835936,
            "loss_tokens_lower_95": 8.303946258493285,
            "loss_tokens_upper_95": 8.606468053517608,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.096776443481446,
            "data_time": 0.09676230698823929,
            "batch_time": 0.14173927158117294,
            "samples_per_second": 4382087.43185302,
            "samples_per_second_per_gpu": 547760.9289816275,
            "loss_sequences_lower_95": 8.4376580078125,
            "loss_sequences_upper_95": 8.71154521484375,
            "loss_tokens_lower_95": 7.9633003215761144,
            "loss_tokens_upper_95": 8.181900646379725,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.306237019511065,
            "data_time": 0.039701768507560097,
            "batch_time": 0.08342761794726054,
            "samples_per_second": 4586074.186631116,
            "samples_per_second_per_gpu": 573259.2733288895,
            "loss_sequences_lower_95": 6.279013563840932,
            "loss_sequences_upper_95": 6.333831277630532,
            "loss_tokens_lower_95": 6.278625066784697,
            "loss_tokens_upper_95": 6.333655804113937,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.10917991207492,
            "data_time": 0.11428548395633698,
            "batch_time": 0.15448888639609018,
            "samples_per_second": 4014836.9926913925,
            "samples_per_second_per_gpu": 501854.62408642407,
            "loss_sequences_lower_95": 4.990085265456989,
            "loss_sequences_upper_95": 5.2279412288636475,
            "loss_tokens_lower_95": 4.9893689696140555,
            "loss_tokens_upper_95": 5.225367223502304,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.01625797367096,
            "data_time": 0.0937727503478527,
            "batch_time": 0.13842172175645828,
            "samples_per_second": 4264258.203905299,
            "samples_per_second_per_gpu": 533032.2754881624,
            "loss_sequences_lower_95": 8.966994995117188,
            "loss_sequences_upper_95": 9.067090698242188,
            "loss_tokens_lower_95": 8.965459301757813,
            "loss_tokens_upper_95": 9.066297338867189,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.812314442451426,
            "data_time": 0.02816855481692723,
            "batch_time": 0.07250464317344484,
            "samples_per_second": 4469036.201592977,
            "samples_per_second_per_gpu": 558629.5251991221,
            "loss_sequences_lower_95": 6.721692400366603,
            "loss_sequences_upper_95": 6.819660023208373,
            "loss_tokens_lower_95": 5.7057886415987715,
            "loss_tokens_upper_95": 5.773966092623597,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.4816098097545,
            "data_time": 0.21212525027138845,
            "batch_time": 0.24403448615755355,
            "samples_per_second": 2072269.304022868,
            "samples_per_second_per_gpu": 259033.6630028585,
            "loss_sequences_lower_95": 5.315024771619199,
            "loss_sequences_upper_95": 5.647100807303813,
            "loss_tokens_lower_95": 5.31072706535681,
            "loss_tokens_upper_95": 5.6448632311465134,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.603294556748633,
            "data_time": 0.174391970038414,
            "batch_time": 0.22011788189411163,
            "samples_per_second": 4021640.674765082,
            "samples_per_second_per_gpu": 502705.08434563526,
            "loss_sequences_lower_95": 5.48275270948223,
            "loss_sequences_upper_95": 5.722072311102175,
            "loss_tokens_lower_95": 5.484953697054994,
            "loss_tokens_upper_95": 5.721736605775122,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.251568091150126,
            "data_time": 0.028689364902675152,
            "batch_time": 0.0725723379291594,
            "samples_per_second": 4440848.286338941,
            "samples_per_second_per_gpu": 555106.0357923676,
            "loss_sequences_lower_95": 6.786173083623011,
            "loss_sequences_upper_95": 6.892034207945791,
            "loss_tokens_lower_95": 6.154525919845474,
            "loss_tokens_upper_95": 6.237140307653884,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.948757698926976,
            "data_time": 0.34258368611335754,
            "batch_time": 0.3807718753814697,
            "samples_per_second": 2232403.605129456,
            "samples_per_second_per_gpu": 279050.450641182,
            "loss_sequences_lower_95": 5.842435903397818,
            "loss_sequences_upper_95": 6.059080513444527,
            "loss_tokens_lower_95": 5.842835554496321,
            "loss_tokens_upper_95": 6.058915863844453,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.866563897322441,
            "data_time": 0.04458169295237614,
            "batch_time": 0.08866594617183392,
            "samples_per_second": 4435637.0271947635,
            "samples_per_second_per_gpu": 554454.6283993454,
            "loss_sequences_lower_95": 7.825045378798739,
            "loss_sequences_upper_95": 7.909183808772935,
            "loss_tokens_lower_95": 7.824616258720375,
            "loss_tokens_upper_95": 7.908181159809825,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.471938130925003,
            "data_time": 0.33127525448799133,
            "batch_time": 0.37092025578022003,
            "samples_per_second": 2675452.755886088,
            "samples_per_second_per_gpu": 334431.594485761,
            "loss_sequences_lower_95": 5.281558908999545,
            "loss_sequences_upper_95": 5.658756470911711,
            "loss_tokens_lower_95": 5.2834211923543695,
            "loss_tokens_upper_95": 5.656036451024916,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.834539763132731,
            "data_time": 0.29402172565460205,
            "batch_time": 0.3144170343875885,
            "samples_per_second": 1209830.5012120465,
            "samples_per_second_per_gpu": 151228.8126515058,
            "loss_sequences_lower_95": 5.520116869608562,
            "loss_sequences_upper_95": 6.3323587163289385,
            "loss_tokens_lower_95": 5.074177985721164,
            "loss_tokens_upper_95": 6.527359941270617,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.034763089815775,
            "data_time": 0.2846435606479645,
            "batch_time": 0.30561332404613495,
            "samples_per_second": 1195482.5973618133,
            "samples_per_second_per_gpu": 149435.32467022666,
            "loss_sequences_lower_95": 4.846590372721354,
            "loss_sequences_upper_95": 5.8011016845703125,
            "loss_tokens_lower_95": 4.175300298112162,
            "loss_tokens_upper_95": 5.6637743146232005,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.961713286106296,
            "data_time": 0.04793078771659306,
            "batch_time": 0.09074562255825315,
            "samples_per_second": 4214119.442377863,
            "samples_per_second_per_gpu": 526764.9302972329,
            "loss_sequences_lower_95": 8.934605693114873,
            "loss_sequences_upper_95": 8.988132320623158,
            "loss_tokens_lower_95": 8.935665342530376,
            "loss_tokens_upper_95": 8.988721537302098,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.398371703491192,
            "data_time": 0.02291792318810542,
            "batch_time": 0.06768802875514635,
            "samples_per_second": 4455895.099460775,
            "samples_per_second_per_gpu": 556986.8874325969,
            "loss_sequences_lower_95": 5.186436900559051,
            "loss_sequences_upper_95": 5.223725667078404,
            "loss_tokens_lower_95": 4.316480565035188,
            "loss_tokens_upper_95": 4.349215296442338,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.681520927609421,
            "data_time": 0.35707348585128784,
            "batch_time": 0.386237695813179,
            "samples_per_second": 1914154.2593675696,
            "samples_per_second_per_gpu": 239269.2824209462,
            "loss_sequences_lower_95": 6.774770289143239,
            "loss_sequences_upper_95": 7.163490872120294,
            "loss_tokens_lower_95": 6.526603927743491,
            "loss_tokens_upper_95": 6.759760167668417,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.73953999699773,
            "data_time": 0.20732371509075165,
            "batch_time": 0.22470802068710327,
            "samples_per_second": 1050902.180557399,
            "samples_per_second_per_gpu": 131362.77256967488,
            "loss_sequences_lower_95": 8.333640186206715,
            "loss_sequences_upper_95": 9.315393726245777,
            "loss_tokens_lower_95": 8.048201874156057,
            "loss_tokens_upper_95": 9.120756964035976,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.717850894462772,
            "data_time": 0.3117311894893646,
            "batch_time": 0.3460249602794647,
            "samples_per_second": 2159744.2127235583,
            "samples_per_second_per_gpu": 269968.0265904448,
            "loss_sequences_lower_95": 6.778654368330793,
            "loss_sequences_upper_95": 7.088261004192073,
            "loss_tokens_lower_95": 6.564810075431034,
            "loss_tokens_upper_95": 6.766363639350295,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.7521131387571005,
            "data_time": 0.3369622975587845,
            "batch_time": 0.37226927280426025,
            "samples_per_second": 1873584.7396536116,
            "samples_per_second_per_gpu": 234198.09245670144,
            "loss_sequences_lower_95": 6.816776294243045,
            "loss_sequences_upper_95": 7.116750131002287,
            "loss_tokens_lower_95": 6.625974821706052,
            "loss_tokens_upper_95": 6.794929227563914,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.820386610380033,
            "data_time": 0.33266110718250275,
            "batch_time": 0.36722132563591003,
            "samples_per_second": 2150812.2830639784,
            "samples_per_second_per_gpu": 268851.5353829973,
            "loss_sequences_lower_95": 6.88935319853992,
            "loss_sequences_upper_95": 7.258391738519436,
            "loss_tokens_lower_95": 6.642112615233252,
            "loss_tokens_upper_95": 6.903546356674993,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.728571929582736,
            "data_time": 0.3656644821166992,
            "batch_time": 0.40075647830963135,
            "samples_per_second": 2230974.8785503265,
            "samples_per_second_per_gpu": 278871.8598187908,
            "loss_sequences_lower_95": 6.74385560198528,
            "loss_sequences_upper_95": 7.022434960342035,
            "loss_tokens_lower_95": 6.611605331088152,
            "loss_tokens_upper_95": 6.768536909347011,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.038111843677782,
            "data_time": 0.37073075771331787,
            "batch_time": 0.40475888550281525,
            "samples_per_second": 1835753.2477618163,
            "samples_per_second_per_gpu": 229469.15597022703,
            "loss_sequences_lower_95": 7.05290834415033,
            "loss_sequences_upper_95": 7.273819671062209,
            "loss_tokens_lower_95": 6.9521410454445745,
            "loss_tokens_upper_95": 7.071893668653114,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.8809242568364954,
            "data_time": 0.30538664758205414,
            "batch_time": 0.3397977501153946,
            "samples_per_second": 2347056.0841975864,
            "samples_per_second_per_gpu": 293382.0105246983,
            "loss_sequences_lower_95": 7.018739839879478,
            "loss_sequences_upper_95": 7.2787751360637385,
            "loss_tokens_lower_95": 6.755915661260548,
            "loss_tokens_upper_95": 6.898428952870705,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-4.0/params.txt",
    "uuid": "90a1ca21-c439-49a1-9539-13ec1d73b1d7",
    "creation_date": "2023_12_14-04_59_15"
}