{
    "name": "c4_original-d=96_l=8_h=4-0.25",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 52846560,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.25
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "10569312",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=96_l=8_h=4-0.25",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 7.255416627724966,
            "data_time": 0.1315077543258667,
            "batch_time": 1.330711916089058,
            "samples_per_second": 382239.80988340144,
            "samples_per_second_per_gpu": 47779.97623542518,
            "loss_sequences_lower_95": 7.0702697245279955,
            "loss_sequences_upper_95": 7.442781473795573,
            "loss_tokens_lower_95": 7.240033124287923,
            "loss_tokens_upper_95": 7.270396003723145,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.974923739894816,
            "data_time": 0.018836384811085807,
            "batch_time": 0.06400187922321914,
            "samples_per_second": 4676799.3569774935,
            "samples_per_second_per_gpu": 584599.9196221867,
            "loss_sequences_lower_95": 5.972415061500723,
            "loss_sequences_upper_95": 5.977423015638419,
            "loss_tokens_lower_95": 5.963214020833334,
            "loss_tokens_upper_95": 5.986461770833333,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.0682651753328285,
            "data_time": 0.10272525995969772,
            "batch_time": 0.14775314927101135,
            "samples_per_second": 3993930.1709321854,
            "samples_per_second_per_gpu": 499241.2713665232,
            "loss_sequences_lower_95": 7.031960474131059,
            "loss_sequences_upper_95": 7.1119611093949295,
            "loss_tokens_lower_95": 7.0553437500000005,
            "loss_tokens_upper_95": 7.081457604166666,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.151588620844575,
            "data_time": 0.014179137976546036,
            "batch_time": 0.0580237256853204,
            "samples_per_second": 5402024.771122811,
            "samples_per_second_per_gpu": 675253.0963903514,
            "loss_sequences_lower_95": 6.124062489932346,
            "loss_sequences_upper_95": 6.179711964400773,
            "loss_tokens_lower_95": 6.139223854166667,
            "loss_tokens_upper_95": 6.164124989583333,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.9889091202295,
            "data_time": 0.10147495567798615,
            "batch_time": 0.14573295414447784,
            "samples_per_second": 4104484.019849644,
            "samples_per_second_per_gpu": 513060.5024812055,
            "loss_sequences_lower_95": 5.939368719967222,
            "loss_sequences_upper_95": 6.044720123353169,
            "loss_tokens_lower_95": 5.976901291666667,
            "loss_tokens_upper_95": 6.0009886041666665,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.928043333710982,
            "data_time": 0.03709458063046137,
            "batch_time": 0.08024184157450993,
            "samples_per_second": 5028514.031594444,
            "samples_per_second_per_gpu": 628564.2539493055,
            "loss_sequences_lower_95": 6.864640155639234,
            "loss_sequences_upper_95": 6.992941182523581,
            "loss_tokens_lower_95": 6.9148353020833335,
            "loss_tokens_upper_95": 6.941152458333333,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.052596513981722,
            "data_time": 0.013092409074306487,
            "batch_time": 0.05565786138176918,
            "samples_per_second": 5241360.442328362,
            "samples_per_second_per_gpu": 655170.0552910452,
            "loss_sequences_lower_95": 9.024856026785713,
            "loss_sequences_upper_95": 9.080538225446428,
            "loss_tokens_lower_95": 9.039162729166666,
            "loss_tokens_upper_95": 9.066414583333334,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.99372550504994,
            "data_time": 0.014077832824305483,
            "batch_time": 0.057429726186551545,
            "samples_per_second": 5350020.220755267,
            "samples_per_second_per_gpu": 668752.5275944084,
            "loss_sequences_lower_95": 5.975017056609948,
            "loss_sequences_upper_95": 6.01354099517343,
            "loss_tokens_lower_95": 5.981365,
            "loss_tokens_upper_95": 6.005995552083334,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.304953088605307,
            "data_time": 0.10467910021543503,
            "batch_time": 0.1493564322590828,
            "samples_per_second": 4116465.7464830615,
            "samples_per_second_per_gpu": 514558.2183103827,
            "loss_sequences_lower_95": 6.2374257653709355,
            "loss_sequences_upper_95": 6.38100587178052,
            "loss_tokens_lower_95": 6.2926696458333335,
            "loss_tokens_upper_95": 6.317162197916667,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.997966279154238,
            "data_time": 0.10588332265615463,
            "batch_time": 0.15079275518655777,
            "samples_per_second": 4206915.725563786,
            "samples_per_second_per_gpu": 525864.4656954732,
            "loss_sequences_lower_95": 6.92498070034585,
            "loss_sequences_upper_95": 7.085911614338872,
            "loss_tokens_lower_95": 6.985331625,
            "loss_tokens_upper_95": 7.0104025,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.811336829104587,
            "data_time": 0.010263036037313527,
            "batch_time": 0.05383718167913371,
            "samples_per_second": 5375496.919481728,
            "samples_per_second_per_gpu": 671937.114935216,
            "loss_sequences_lower_95": 6.799145276954571,
            "loss_sequences_upper_95": 6.823436643483624,
            "loss_tokens_lower_95": 6.798303510416667,
            "loss_tokens_upper_95": 6.824864052083333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.469802833209183,
            "data_time": 0.023717793822288512,
            "batch_time": 0.07318979054689408,
            "samples_per_second": 5055174.244042287,
            "samples_per_second_per_gpu": 631896.7805052858,
            "loss_sequences_lower_95": 6.4499483552588766,
            "loss_sequences_upper_95": 6.4902853384657435,
            "loss_tokens_lower_95": 6.4570339062499995,
            "loss_tokens_upper_95": 6.4822504895833335,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.185376044461016,
            "data_time": 0.11584670096635818,
            "batch_time": 0.1906554326415062,
            "samples_per_second": 4178609.500009103,
            "samples_per_second_per_gpu": 522326.1875011379,
            "loss_sequences_lower_95": 6.110008069516196,
            "loss_sequences_upper_95": 6.270273118086809,
            "loss_tokens_lower_95": 6.172755135416667,
            "loss_tokens_upper_95": 6.198008447916666,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.831670697012162,
            "data_time": 0.09918766468763351,
            "batch_time": 0.14320389926433563,
            "samples_per_second": 4175707.800384443,
            "samples_per_second_per_gpu": 521963.47504805535,
            "loss_sequences_lower_95": 6.753587425216395,
            "loss_sequences_upper_95": 6.916646735974096,
            "loss_tokens_lower_95": 6.81966915625,
            "loss_tokens_upper_95": 6.844415489583334,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.805415045131337,
            "data_time": 0.1567896455526352,
            "batch_time": 0.18287192285060883,
            "samples_per_second": 992396.1462688022,
            "samples_per_second_per_gpu": 124049.51828360028,
            "loss_sequences_lower_95": 7.744367027282715,
            "loss_sequences_upper_95": 7.8768100044944065,
            "loss_tokens_lower_95": 7.7804575833407315,
            "loss_tokens_upper_95": 7.831523843245073,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.646695629153237,
            "data_time": 0.1030299961566925,
            "batch_time": 0.13789406418800354,
            "samples_per_second": 3323770.727194528,
            "samples_per_second_per_gpu": 415471.340899316,
            "loss_sequences_lower_95": 7.482871382021,
            "loss_sequences_upper_95": 7.808546523380557,
            "loss_tokens_lower_95": 7.632437270833333,
            "loss_tokens_upper_95": 7.661123395833333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.3856057541980595,
            "data_time": 0.10323899239301682,
            "batch_time": 0.14030881971120834,
            "samples_per_second": 3718348.457433918,
            "samples_per_second_per_gpu": 464793.55717923975,
            "loss_sequences_lower_95": 7.313382255936675,
            "loss_sequences_upper_95": 7.469420178899035,
            "loss_tokens_lower_95": 7.374506645833334,
            "loss_tokens_upper_95": 7.396602124999999,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.387862998931134,
            "data_time": 0.17347419261932373,
            "batch_time": 0.2027895450592041,
            "samples_per_second": 2224714.3398310374,
            "samples_per_second_per_gpu": 278089.2924788797,
            "loss_sequences_lower_95": 7.292384601030193,
            "loss_sequences_upper_95": 7.5472652998126915,
            "loss_tokens_lower_95": 7.3743360300533105,
            "loss_tokens_upper_95": 7.401150149986392,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.572078081672135,
            "data_time": 0.027538065205920826,
            "batch_time": 0.07213793478228829,
            "samples_per_second": 4510626.896447448,
            "samples_per_second_per_gpu": 563828.362055931,
            "loss_sequences_lower_95": 5.554119839143285,
            "loss_sequences_upper_95": 5.589276311689931,
            "loss_tokens_lower_95": 5.554236634404821,
            "loss_tokens_upper_95": 5.589627462478636,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.663127660751343,
            "data_time": 0.02829113192856312,
            "batch_time": 0.07249390110373496,
            "samples_per_second": 4457484.966617817,
            "samples_per_second_per_gpu": 557185.6208272271,
            "loss_sequences_lower_95": 5.667632408727097,
            "loss_sequences_upper_95": 5.694436170816944,
            "loss_tokens_lower_95": 5.649580210583803,
            "loss_tokens_upper_95": 5.672949429060633,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.368716203440007,
            "data_time": 0.05177338752481672,
            "batch_time": 0.09370610117912292,
            "samples_per_second": 4374778.611483189,
            "samples_per_second_per_gpu": 546847.3264353987,
            "loss_sequences_lower_95": 8.787278854953943,
            "loss_sequences_upper_95": 8.974069613176074,
            "loss_tokens_lower_95": 8.252506812902073,
            "loss_tokens_upper_95": 8.407471087048785,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.011825675010682,
            "data_time": 0.04518944397568703,
            "batch_time": 0.08925300339857738,
            "samples_per_second": 4583441.64499423,
            "samples_per_second_per_gpu": 572930.2056242788,
            "loss_sequences_lower_95": 8.38841875,
            "loss_sequences_upper_95": 8.523014713541667,
            "loss_tokens_lower_95": 7.923663202633648,
            "loss_tokens_upper_95": 8.033910242236635,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.991479040728635,
            "data_time": 0.06871516505877177,
            "batch_time": 0.10908221205075581,
            "samples_per_second": 4017290.304553006,
            "samples_per_second_per_gpu": 502161.28806912573,
            "loss_sequences_lower_95": 7.03393515817499,
            "loss_sequences_upper_95": 7.106943033607373,
            "loss_tokens_lower_95": 6.970799328679057,
            "loss_tokens_upper_95": 7.008124868922014,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.7868012645027855,
            "data_time": 0.4068398028612137,
            "batch_time": 0.4491241127252579,
            "samples_per_second": 1929891.5203625564,
            "samples_per_second_per_gpu": 241236.44004531956,
            "loss_sequences_lower_95": 7.651654135964134,
            "loss_sequences_upper_95": 7.952723471901634,
            "loss_tokens_lower_95": 7.74614048485841,
            "loss_tokens_upper_95": 7.828274101204273,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.665923383284588,
            "data_time": 0.3492831736803055,
            "batch_time": 0.39449410140514374,
            "samples_per_second": 2795647.4622668056,
            "samples_per_second_per_gpu": 349455.9327833507,
            "loss_sequences_lower_95": 6.61144407934072,
            "loss_sequences_upper_95": 6.8162607122927295,
            "loss_tokens_lower_95": 6.6266175498265065,
            "loss_tokens_upper_95": 6.729321801476811,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.996582250595093,
            "data_time": 0.18986117839813232,
            "batch_time": 0.222431942820549,
            "samples_per_second": 2564517.991448204,
            "samples_per_second_per_gpu": 320564.7489310255,
            "loss_sequences_lower_95": 5.914063598632812,
            "loss_sequences_upper_95": 6.045113118489583,
            "loss_tokens_lower_95": 5.906418122054619,
            "loss_tokens_upper_95": 6.0947501919325555,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 10.47790176632261,
            "data_time": 0.02448897548019886,
            "batch_time": 0.06924483440816402,
            "samples_per_second": 4475235.178846924,
            "samples_per_second_per_gpu": 559404.3973558655,
            "loss_sequences_lower_95": 10.534518420753162,
            "loss_sequences_upper_95": 10.588378151758034,
            "loss_tokens_lower_95": 10.438174602226576,
            "loss_tokens_upper_95": 10.493374045134741,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.0138500563624735,
            "data_time": 0.04987267255783081,
            "batch_time": 0.09208897352218628,
            "samples_per_second": 4406264.065999913,
            "samples_per_second_per_gpu": 550783.0082499891,
            "loss_sequences_lower_95": 8.002862733783143,
            "loss_sequences_upper_95": 8.223211053405144,
            "loss_tokens_lower_95": 6.875176841188408,
            "loss_tokens_upper_95": 7.0248245890632095,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.694322670279102,
            "data_time": 0.08816878199577331,
            "batch_time": 0.13061728775501252,
            "samples_per_second": 4182089.8714860543,
            "samples_per_second_per_gpu": 522761.2339357568,
            "loss_sequences_lower_95": 7.251551167875427,
            "loss_sequences_upper_95": 7.514274331571299,
            "loss_tokens_lower_95": 6.5964378787764195,
            "loss_tokens_upper_95": 6.77373141411544,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.186498589711647,
            "data_time": 0.3409120738506317,
            "batch_time": 0.383323535323143,
            "samples_per_second": 2687840.2963040476,
            "samples_per_second_per_gpu": 335980.03703800595,
            "loss_sequences_lower_95": 6.1554618243213115,
            "loss_sequences_upper_95": 6.217362578927654,
            "loss_tokens_lower_95": 6.155403255654252,
            "loss_tokens_upper_95": 6.217174470805686,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.922478346824646,
            "data_time": 0.3099255859851837,
            "batch_time": 0.3359133452177048,
            "samples_per_second": 1831374.8499014757,
            "samples_per_second_per_gpu": 228921.85623768446,
            "loss_sequences_lower_95": 5.814557891845704,
            "loss_sequences_upper_95": 6.3117800750732425,
            "loss_tokens_lower_95": 5.632807302730881,
            "loss_tokens_upper_95": 6.191393125803612,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.070078837834947,
            "data_time": 0.05538506433367729,
            "batch_time": 0.09870956093072891,
            "samples_per_second": 4534656.418257133,
            "samples_per_second_per_gpu": 566832.0522821416,
            "loss_sequences_lower_95": 5.020775492579324,
            "loss_sequences_upper_95": 5.120346259875576,
            "loss_tokens_lower_95": 5.018928582495042,
            "loss_tokens_upper_95": 5.120300280474348,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.580685809242442,
            "data_time": 0.07722758054733277,
            "batch_time": 0.12070781290531159,
            "samples_per_second": 4453685.904022721,
            "samples_per_second_per_gpu": 556710.7380028401,
            "loss_sequences_lower_95": 5.5299731905200655,
            "loss_sequences_upper_95": 5.630654594802544,
            "loss_tokens_lower_95": 5.52923846920429,
            "loss_tokens_upper_95": 5.630982145142301,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.28967261703541,
            "data_time": 0.05469699949026108,
            "batch_time": 0.09638720005750656,
            "samples_per_second": 4204930.8427683525,
            "samples_per_second_per_gpu": 525616.3553460441,
            "loss_sequences_lower_95": 6.347743253323925,
            "loss_sequences_upper_95": 6.4540585390242455,
            "loss_tokens_lower_95": 6.27689343952602,
            "loss_tokens_upper_95": 6.34087307019575,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.647149362564087,
            "data_time": 0.1929629221558571,
            "batch_time": 0.23878975212574005,
            "samples_per_second": 3683158.025487357,
            "samples_per_second_per_gpu": 460394.7531859196,
            "loss_sequences_lower_95": 8.358316284179688,
            "loss_sequences_upper_95": 8.818381591796875,
            "loss_tokens_lower_95": 7.414632005268655,
            "loss_tokens_upper_95": 7.744439006530237,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.801744639873505,
            "data_time": 0.1642787605524063,
            "batch_time": 0.18180106580257416,
            "samples_per_second": 867699.5394630334,
            "samples_per_second_per_gpu": 108462.44243287918,
            "loss_sequences_lower_95": 5.455307972431183,
            "loss_sequences_upper_95": 6.291845846176147,
            "loss_tokens_lower_95": 5.163011906064789,
            "loss_tokens_upper_95": 6.22062453346691,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.639636428876855,
            "data_time": 0.35066066682338715,
            "batch_time": 0.3863573670387268,
            "samples_per_second": 1711633.6338532793,
            "samples_per_second_per_gpu": 213954.2042316599,
            "loss_sequences_lower_95": 7.304721244724318,
            "loss_sequences_upper_95": 7.792820038192573,
            "loss_tokens_lower_95": 6.360991929321977,
            "loss_tokens_upper_95": 6.774809256535372,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.5344913977626735,
            "data_time": 0.051415089103910655,
            "batch_time": 0.09615341160032484,
            "samples_per_second": 4545028.600576679,
            "samples_per_second_per_gpu": 568128.5750720849,
            "loss_sequences_lower_95": 5.513849738218108,
            "loss_sequences_upper_95": 5.555042586487003,
            "loss_tokens_lower_95": 5.513744157263679,
            "loss_tokens_upper_95": 5.555772448565149,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.383814895729081,
            "data_time": 0.03241864485400064,
            "batch_time": 0.07585815446717399,
            "samples_per_second": 4428396.212433808,
            "samples_per_second_per_gpu": 553549.526554226,
            "loss_sequences_lower_95": 9.42456540789346,
            "loss_sequences_upper_95": 9.55534694974408,
            "loss_tokens_lower_95": 9.307528467348705,
            "loss_tokens_upper_95": 9.436706556319496,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.080625105253506,
            "data_time": 0.1945401206612587,
            "batch_time": 0.22423311322927475,
            "samples_per_second": 1913598.7624044432,
            "samples_per_second_per_gpu": 239199.8453005554,
            "loss_sequences_lower_95": 4.934585548931862,
            "loss_sequences_upper_95": 5.3351016313601765,
            "loss_tokens_lower_95": 4.846254044920068,
            "loss_tokens_upper_95": 5.203557564790992,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.472936267069972,
            "data_time": 0.08329433500766754,
            "batch_time": 0.12826971113681793,
            "samples_per_second": 4451633.350080466,
            "samples_per_second_per_gpu": 556454.1687600582,
            "loss_sequences_lower_95": 5.508681810193863,
            "loss_sequences_upper_95": 5.646050442497657,
            "loss_tokens_lower_95": 5.389161469495924,
            "loss_tokens_upper_95": 5.549506744831064,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.401086025121735,
            "data_time": 0.350749671459198,
            "batch_time": 0.3860897272825241,
            "samples_per_second": 1484347.4505834095,
            "samples_per_second_per_gpu": 185543.4313229262,
            "loss_sequences_lower_95": 7.210045028314358,
            "loss_sequences_upper_95": 7.720606510813643,
            "loss_tokens_lower_95": 7.2488208565393055,
            "loss_tokens_upper_95": 7.600454925985165,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.186108918541978,
            "data_time": 0.029070992828536556,
            "batch_time": 0.07285225802454455,
            "samples_per_second": 4442657.147356805,
            "samples_per_second_per_gpu": 555332.1434196007,
            "loss_sequences_lower_95": 5.173750355539858,
            "loss_sequences_upper_95": 5.198237499062313,
            "loss_tokens_lower_95": 5.174209627081666,
            "loss_tokens_upper_95": 5.1982602086823615,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.044398090214405,
            "data_time": 0.3266812413930893,
            "batch_time": 0.35281428694725037,
            "samples_per_second": 1933431.9145214334,
            "samples_per_second_per_gpu": 241678.98931517918,
            "loss_sequences_lower_95": 6.921760336866656,
            "loss_sequences_upper_95": 7.3047625495392134,
            "loss_tokens_lower_95": 6.787696836817106,
            "loss_tokens_upper_95": 7.200355490740148,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.411606093060295,
            "data_time": 0.024231241941452028,
            "batch_time": 0.06881301661332448,
            "samples_per_second": 4431072.515597513,
            "samples_per_second_per_gpu": 553884.0644496891,
            "loss_sequences_lower_95": 7.7613999238404086,
            "loss_sequences_upper_95": 7.799972770898846,
            "loss_tokens_lower_95": 7.354113986943908,
            "loss_tokens_upper_95": 7.391097207446808,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 10.780700837135315,
            "data_time": 0.11064006388187408,
            "batch_time": 0.1559719182550907,
            "samples_per_second": 4132526.962769255,
            "samples_per_second_per_gpu": 516565.8703461569,
            "loss_sequences_lower_95": 10.531262573242188,
            "loss_sequences_upper_95": 11.0045130859375,
            "loss_tokens_lower_95": 10.526477267921017,
            "loss_tokens_upper_95": 11.011733893138535,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.458714804442033,
            "data_time": 0.37632031738758087,
            "batch_time": 0.41954566538333893,
            "samples_per_second": 2013203.9396838648,
            "samples_per_second_per_gpu": 251650.4924604831,
            "loss_sequences_lower_95": 5.323917315939198,
            "loss_sequences_upper_95": 5.593507292374321,
            "loss_tokens_lower_95": 5.328016596255095,
            "loss_tokens_upper_95": 5.588755612580672,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 11.55728212775606,
            "data_time": 0.06869020809729894,
            "batch_time": 0.10874962558348973,
            "samples_per_second": 4054831.8463986814,
            "samples_per_second_per_gpu": 506853.9807998352,
            "loss_sequences_lower_95": 11.365574914180872,
            "loss_sequences_upper_95": 11.74754570238518,
            "loss_tokens_lower_95": 11.365159246271308,
            "loss_tokens_upper_95": 11.749948194099195,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.3345556694666545,
            "data_time": 0.07212061434984207,
            "batch_time": 0.11634840319554011,
            "samples_per_second": 4334490.911824307,
            "samples_per_second_per_gpu": 541811.3639780384,
            "loss_sequences_lower_95": 5.451103271484375,
            "loss_sequences_upper_95": 5.543455615234374,
            "loss_tokens_lower_95": 5.275173241171468,
            "loss_tokens_upper_95": 5.378097770358143,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.0120851607549755,
            "data_time": 0.36404871940612793,
            "batch_time": 0.4053656607866287,
            "samples_per_second": 2377143.5845063142,
            "samples_per_second_per_gpu": 297142.9480632893,
            "loss_sequences_lower_95": 6.662485961914062,
            "loss_sequences_upper_95": 7.3692910185314355,
            "loss_tokens_lower_95": 6.66660627092634,
            "loss_tokens_upper_95": 7.359744596935454,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.701567560434341,
            "data_time": 0.16858826577663422,
            "batch_time": 0.18707703053951263,
            "samples_per_second": 866251.2217349927,
            "samples_per_second_per_gpu": 108281.40271687409,
            "loss_sequences_lower_95": 7.545490956306457,
            "loss_sequences_upper_95": 8.959424901008607,
            "loss_tokens_lower_95": 7.276734531048647,
            "loss_tokens_upper_95": 7.85409645316527,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.589340348720551,
            "data_time": 0.10193059593439102,
            "batch_time": 0.14638026803731918,
            "samples_per_second": 4227470.071094713,
            "samples_per_second_per_gpu": 528433.7588868391,
            "loss_sequences_lower_95": 8.705147900390626,
            "loss_sequences_upper_95": 9.012396826171875,
            "loss_tokens_lower_95": 8.43948197986067,
            "loss_tokens_upper_95": 8.712802697196224,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.832394422054291,
            "data_time": 0.09872573614120483,
            "batch_time": 0.14332551881670952,
            "samples_per_second": 4350822.238285941,
            "samples_per_second_per_gpu": 543852.7797857426,
            "loss_sequences_lower_95": 8.092501611328125,
            "loss_sequences_upper_95": 8.3429484375,
            "loss_tokens_lower_95": 7.720626765343328,
            "loss_tokens_upper_95": 7.912977461798166,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.675173319548998,
            "data_time": 0.03998709966739019,
            "batch_time": 0.08393908912936847,
            "samples_per_second": 4549026.241300691,
            "samples_per_second_per_gpu": 568628.2801625864,
            "loss_sequences_lower_95": 4.653968320482736,
            "loss_sequences_upper_95": 4.696558509287001,
            "loss_tokens_lower_95": 4.65384864819917,
            "loss_tokens_upper_95": 4.696712171380008,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.665620502055882,
            "data_time": 0.13004306455453238,
            "batch_time": 0.17079496880372366,
            "samples_per_second": 3918147.46068199,
            "samples_per_second_per_gpu": 489768.4325852487,
            "loss_sequences_lower_95": 5.583156312629009,
            "loss_sequences_upper_95": 5.7466767341859875,
            "loss_tokens_lower_95": 5.583667573474702,
            "loss_tokens_upper_95": 5.746368792602727,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.794928985595703,
            "data_time": 0.09772083908319473,
            "batch_time": 0.14267072454094887,
            "samples_per_second": 4264282.315903804,
            "samples_per_second_per_gpu": 533035.2894879755,
            "loss_sequences_lower_95": 8.753071972656251,
            "loss_sequences_upper_95": 8.837715600585938,
            "loss_tokens_lower_95": 8.751888134765624,
            "loss_tokens_upper_95": 8.83661005859375,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.801430258529865,
            "data_time": 0.03205196062723795,
            "batch_time": 0.07591482748587926,
            "samples_per_second": 4414851.7409995,
            "samples_per_second_per_gpu": 551856.4676249375,
            "loss_sequences_lower_95": 8.332633226407285,
            "loss_sequences_upper_95": 8.398245254848629,
            "loss_tokens_lower_95": 7.716551165993048,
            "loss_tokens_upper_95": 7.77169740834151,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.48979102942481,
            "data_time": 0.22148582765034266,
            "batch_time": 0.2575854744229998,
            "samples_per_second": 1572902.2414936668,
            "samples_per_second_per_gpu": 196612.78018670835,
            "loss_sequences_lower_95": 5.34400880728195,
            "loss_sequences_upper_95": 5.6303635326784045,
            "loss_tokens_lower_95": 5.342686826791336,
            "loss_tokens_upper_95": 5.6293429588204,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.61543610423219,
            "data_time": 0.18184424936771393,
            "batch_time": 0.2277180626988411,
            "samples_per_second": 3675096.5475560874,
            "samples_per_second_per_gpu": 459387.0684445109,
            "loss_sequences_lower_95": 5.5064121261297485,
            "loss_sequences_upper_95": 5.7231799436083035,
            "loss_tokens_lower_95": 5.506194769167433,
            "loss_tokens_upper_95": 5.723840858609069,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.406958011432233,
            "data_time": 0.028140402864664793,
            "batch_time": 0.07174272928386927,
            "samples_per_second": 4485637.743434893,
            "samples_per_second_per_gpu": 560704.7179293616,
            "loss_sequences_lower_95": 7.788196254932356,
            "loss_sequences_upper_95": 7.859493391229801,
            "loss_tokens_lower_95": 7.330015429093192,
            "loss_tokens_upper_95": 7.3981389569279985,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.912548254406642,
            "data_time": 0.34234072268009186,
            "batch_time": 0.3799459785223007,
            "samples_per_second": 2439920.5437693466,
            "samples_per_second_per_gpu": 304990.0679711683,
            "loss_sequences_lower_95": 4.813128581375041,
            "loss_sequences_upper_95": 5.016743711441282,
            "loss_tokens_lower_95": 4.814975266229539,
            "loss_tokens_upper_95": 5.012947292428799,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.484866282845127,
            "data_time": 0.04448076394888071,
            "batch_time": 0.08865432670483223,
            "samples_per_second": 4471906.79476489,
            "samples_per_second_per_gpu": 558988.3493456113,
            "loss_sequences_lower_95": 8.453870487504778,
            "loss_sequences_upper_95": 8.516517584097858,
            "loss_tokens_lower_95": 8.45398100033448,
            "loss_tokens_upper_95": 8.515499256379014,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.335980778758966,
            "data_time": 0.3603452891111374,
            "batch_time": 0.40077924728393555,
            "samples_per_second": 2434397.7043211786,
            "samples_per_second_per_gpu": 304299.7130401473,
            "loss_sequences_lower_95": 5.1769870498805375,
            "loss_sequences_upper_95": 5.492929521579186,
            "loss_tokens_lower_95": 5.176566188775221,
            "loss_tokens_upper_95": 5.49126450621966,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.062113602956137,
            "data_time": 0.2967904210090637,
            "batch_time": 0.3168397545814514,
            "samples_per_second": 1413947.6939529984,
            "samples_per_second_per_gpu": 176743.4617441248,
            "loss_sequences_lower_95": 8.84439682006836,
            "loss_sequences_upper_95": 9.511211013793945,
            "loss_tokens_lower_95": 8.59023929172092,
            "loss_tokens_upper_95": 9.489992396036783,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.804209740956624,
            "data_time": 0.3300200402736664,
            "batch_time": 0.34983138740062714,
            "samples_per_second": 1288001.890242494,
            "samples_per_second_per_gpu": 161000.23628031174,
            "loss_sequences_lower_95": 8.580298411051432,
            "loss_sequences_upper_95": 9.417622502644857,
            "loss_tokens_lower_95": 8.225664100218355,
            "loss_tokens_upper_95": 9.202043271868417,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.611368156287035,
            "data_time": 0.04781624036175864,
            "batch_time": 0.09089523128100804,
            "samples_per_second": 4256500.656732385,
            "samples_per_second_per_gpu": 532062.5820915481,
            "loss_sequences_lower_95": 8.58954281342047,
            "loss_sequences_upper_95": 8.632960350699559,
            "loss_tokens_lower_95": 8.589893023978277,
            "loss_tokens_upper_95": 8.632725932552466,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.469468072542844,
            "data_time": 0.023566911639605028,
            "batch_time": 0.06804683986709628,
            "samples_per_second": 4438498.405546596,
            "samples_per_second_per_gpu": 554812.3006933245,
            "loss_sequences_lower_95": 7.9689960272344935,
            "loss_sequences_upper_95": 8.000702700262428,
            "loss_tokens_lower_95": 7.416306494677653,
            "loss_tokens_upper_95": 7.444933780103971,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.91211032116507,
            "data_time": 0.37136223912239075,
            "batch_time": 0.40189531445503235,
            "samples_per_second": 2041636.8361697295,
            "samples_per_second_per_gpu": 255204.60452121618,
            "loss_sequences_lower_95": 8.925572673166831,
            "loss_sequences_upper_95": 9.253955558716783,
            "loss_tokens_lower_95": 8.801623358242754,
            "loss_tokens_upper_95": 9.028334609509335,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 11.251737775029364,
            "data_time": 0.2375749796628952,
            "batch_time": 0.25448426604270935,
            "samples_per_second": 1016148.643765451,
            "samples_per_second_per_gpu": 127018.58047068138,
            "loss_sequences_lower_95": 10.897517931139147,
            "loss_sequences_upper_95": 11.814890289306641,
            "loss_tokens_lower_95": 10.715431665491174,
            "loss_tokens_upper_95": 11.556009400921102,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.057802938833468,
            "data_time": 0.3397585451602936,
            "batch_time": 0.37393178045749664,
            "samples_per_second": 2277585.2464837767,
            "samples_per_second_per_gpu": 284698.1558104721,
            "loss_sequences_lower_95": 9.012199997320408,
            "loss_sequences_upper_95": 9.250217307486185,
            "loss_tokens_lower_95": 8.932825526308873,
            "loss_tokens_upper_95": 9.119838805587678,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.974916167375518,
            "data_time": 0.351445734500885,
            "batch_time": 0.3861479312181473,
            "samples_per_second": 2241322.0021333154,
            "samples_per_second_per_gpu": 280165.2502666644,
            "loss_sequences_lower_95": 8.927516248749523,
            "loss_sequences_upper_95": 9.150512025414445,
            "loss_tokens_lower_95": 8.878108874750792,
            "loss_tokens_upper_95": 9.036920911149876,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.140951354329179,
            "data_time": 0.3350986838340759,
            "batch_time": 0.37073980271816254,
            "samples_per_second": 2127429.248924936,
            "samples_per_second_per_gpu": 265928.656115617,
            "loss_sequences_lower_95": 9.196041460735042,
            "loss_sequences_upper_95": 9.523612678341749,
            "loss_tokens_lower_95": 8.99069735772175,
            "loss_tokens_upper_95": 9.236684634892537,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.970700810595256,
            "data_time": 0.33802178502082825,
            "batch_time": 0.3737284243106842,
            "samples_per_second": 1857669.581619667,
            "samples_per_second_per_gpu": 232208.69770245836,
            "loss_sequences_lower_95": 8.927720883997475,
            "loss_sequences_upper_95": 9.142794334597705,
            "loss_tokens_lower_95": 8.877244743751216,
            "loss_tokens_upper_95": 9.021564443237686,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.244200232606497,
            "data_time": 0.34809158742427826,
            "batch_time": 0.3830626606941223,
            "samples_per_second": 2224091.608637085,
            "samples_per_second_per_gpu": 278011.45107963565,
            "loss_sequences_lower_95": 9.160143379543138,
            "loss_sequences_upper_95": 9.322462378081328,
            "loss_tokens_lower_95": 9.197101628765722,
            "loss_tokens_upper_95": 9.300890839792336,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.329001496477824,
            "data_time": 0.3432781249284744,
            "batch_time": 0.37810271978378296,
            "samples_per_second": 1930433.3033085396,
            "samples_per_second_per_gpu": 241304.16291356744,
            "loss_sequences_lower_95": 9.321266565090273,
            "loss_sequences_upper_95": 9.512592632014577,
            "loss_tokens_lower_95": 9.244478413200724,
            "loss_tokens_upper_95": 9.359197323312236,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.25/params.txt",
    "uuid": "bfbb401b-cd66-44d2-8096-28e6acf9e70f",
    "creation_date": "2023_12_14-04_59_07"
}