{
    "name": "c4_original-d=576_l=24_h=8-0.25",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 768386880,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.25
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "153677376",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=576_l=24_h=8-0.25",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 4.795222818851471,
            "data_time": 0.03694535419344902,
            "batch_time": 0.3873315639793873,
            "samples_per_second": 841277.5988999064,
            "samples_per_second_per_gpu": 105159.6998624883,
            "loss_sequences_lower_95": 4.662096913655599,
            "loss_sequences_upper_95": 4.928319142659505,
            "loss_tokens_lower_95": 4.779527371724447,
            "loss_tokens_upper_95": 4.810507342020671,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9700447294669683,
            "data_time": 0.001157585150819188,
            "batch_time": 0.03067596043468551,
            "samples_per_second": 1081406.455866058,
            "samples_per_second_per_gpu": 135175.80698325724,
            "loss_sequences_lower_95": 3.967606907507735,
            "loss_sequences_upper_95": 3.9724300524859117,
            "loss_tokens_lower_95": 3.9589217395833334,
            "loss_tokens_upper_95": 3.9813864999999997,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.419608717548604,
            "data_time": 0.009944470405578613,
            "batch_time": 0.038994930267333985,
            "samples_per_second": 1058295.1289143295,
            "samples_per_second_per_gpu": 132286.89111429118,
            "loss_sequences_lower_95": 4.383067514847736,
            "loss_sequences_upper_95": 4.465175507214605,
            "loss_tokens_lower_95": 4.405218927083333,
            "loss_tokens_upper_95": 4.4347837708333335,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.101097363196698,
            "data_time": 0.0016387035383989936,
            "batch_time": 0.0306067648098657,
            "samples_per_second": 1104983.0954809205,
            "samples_per_second_per_gpu": 138122.88693511506,
            "loss_sequences_lower_95": 4.071795546069588,
            "loss_sequences_upper_95": 4.131150229542525,
            "loss_tokens_lower_95": 4.088665833333334,
            "loss_tokens_upper_95": 4.113402197916666,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.010615311426446,
            "data_time": 0.010538829750273808,
            "batch_time": 0.03983349154194988,
            "samples_per_second": 1057356.5456768998,
            "samples_per_second_per_gpu": 132169.56820961248,
            "loss_sequences_lower_95": 3.9626174274141577,
            "loss_sequences_upper_95": 4.069247865385534,
            "loss_tokens_lower_95": 3.999200854166667,
            "loss_tokens_upper_95": 4.022094375,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.5573756226384665,
            "data_time": 0.003976447750692782,
            "batch_time": 0.033074765101723046,
            "samples_per_second": 1097273.63828995,
            "samples_per_second_per_gpu": 137159.20478624376,
            "loss_sequences_lower_95": 4.512435603968388,
            "loss_sequences_upper_95": 4.604646347948355,
            "loss_tokens_lower_95": 4.544502166666667,
            "loss_tokens_upper_95": 4.57008346875,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.791100103319907,
            "data_time": 0.001724006398467022,
            "batch_time": 0.03022580794370194,
            "samples_per_second": 1122457.7211354375,
            "samples_per_second_per_gpu": 140307.2151419297,
            "loss_sequences_lower_95": 4.757927863919005,
            "loss_sequences_upper_95": 4.823824099170919,
            "loss_tokens_lower_95": 4.7758463437500005,
            "loss_tokens_upper_95": 4.80624928125,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.518623494153247,
            "data_time": 0.0018629975919193028,
            "batch_time": 0.03047194076053072,
            "samples_per_second": 1117911.5581503548,
            "samples_per_second_per_gpu": 139738.94476879435,
            "loss_sequences_lower_95": 4.499660258507854,
            "loss_sequences_upper_95": 4.539439545157069,
            "loss_tokens_lower_95": 4.506471447916667,
            "loss_tokens_upper_95": 4.530903072916666,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.374848144325783,
            "data_time": 0.009988054396614196,
            "batch_time": 0.03934141378554087,
            "samples_per_second": 1049054.9353976087,
            "samples_per_second_per_gpu": 131131.86692470108,
            "loss_sequences_lower_95": 4.303458509987932,
            "loss_sequences_upper_95": 4.459147532393293,
            "loss_tokens_lower_95": 4.362821708333334,
            "loss_tokens_upper_95": 4.38690234375,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.429531429124915,
            "data_time": 0.010431976057589054,
            "batch_time": 0.04002696834504604,
            "samples_per_second": 1056270.5259112935,
            "samples_per_second_per_gpu": 132033.81573891168,
            "loss_sequences_lower_95": 5.344324876385715,
            "loss_sequences_upper_95": 5.5351306055845475,
            "loss_tokens_lower_95": 5.415966947916667,
            "loss_tokens_upper_95": 5.44322015625,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.459223772346671,
            "data_time": 0.0012981237598104522,
            "batch_time": 0.02958399872480598,
            "samples_per_second": 1132215.1246725467,
            "samples_per_second_per_gpu": 141526.89058406834,
            "loss_sequences_lower_95": 4.449923147729804,
            "loss_sequences_upper_95": 4.4688994955653865,
            "loss_tokens_lower_95": 4.447003270833333,
            "loss_tokens_upper_95": 4.471619958333333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.252067852496903,
            "data_time": 0.002663814654258963,
            "batch_time": 0.031391109852469234,
            "samples_per_second": 1114198.782494945,
            "samples_per_second_per_gpu": 139274.84781186812,
            "loss_sequences_lower_95": 4.233933675909777,
            "loss_sequences_upper_95": 4.27106961048131,
            "loss_tokens_lower_95": 4.239979541666666,
            "loss_tokens_upper_95": 4.264056885416667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.716188197687238,
            "data_time": 0.010560476732819448,
            "batch_time": 0.04020120032691202,
            "samples_per_second": 1045365.0816247519,
            "samples_per_second_per_gpu": 130670.63520309399,
            "loss_sequences_lower_95": 4.645165668975025,
            "loss_sequences_upper_95": 4.801909409962126,
            "loss_tokens_lower_95": 4.702718,
            "loss_tokens_upper_95": 4.729375083333333,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.136666847586389,
            "data_time": 0.01051168897712373,
            "batch_time": 0.040205127214530546,
            "samples_per_second": 1038798.8815454494,
            "samples_per_second_per_gpu": 129849.86019318117,
            "loss_sequences_lower_95": 4.06465099824179,
            "loss_sequences_upper_95": 4.21833722334044,
            "loss_tokens_lower_95": 4.123935635416666,
            "loss_tokens_upper_95": 4.149215854166666,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.54301223971627,
            "data_time": 0.08485447508948189,
            "batch_time": 0.11649880238941737,
            "samples_per_second": 565170.363307723,
            "samples_per_second_per_gpu": 70646.29541346537,
            "loss_sequences_lower_95": 5.471058316664262,
            "loss_sequences_upper_95": 5.617531438307329,
            "loss_tokens_lower_95": 5.51462066823786,
            "loss_tokens_upper_95": 5.572070156444203,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.828443190794297,
            "data_time": 0.01445670561356978,
            "batch_time": 0.04506435990333557,
            "samples_per_second": 1005628.8553830964,
            "samples_per_second_per_gpu": 125703.60692288705,
            "loss_sequences_lower_95": 4.73295935805963,
            "loss_sequences_upper_95": 4.925162757450916,
            "loss_tokens_lower_95": 4.814384927083333,
            "loss_tokens_upper_95": 4.841787708333333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.352858464132828,
            "data_time": 0.013350153962771097,
            "batch_time": 0.04340188577771187,
            "samples_per_second": 1037900.3671929818,
            "samples_per_second_per_gpu": 129737.54589912272,
            "loss_sequences_lower_95": 6.271720334659466,
            "loss_sequences_upper_95": 6.44971490623454,
            "loss_tokens_lower_95": 6.340393583333333,
            "loss_tokens_upper_95": 6.365375125,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.846745459759822,
            "data_time": 0.04138908162713051,
            "batch_time": 0.0726003460586071,
            "samples_per_second": 910264.8983573938,
            "samples_per_second_per_gpu": 113783.11229467423,
            "loss_sequences_lower_95": 4.718891544029361,
            "loss_sequences_upper_95": 5.065098284111649,
            "loss_tokens_lower_95": 4.832248687744141,
            "loss_tokens_upper_95": 4.8616119759981755,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.7586517937416675,
            "data_time": 0.0016892057820170234,
            "batch_time": 0.03060509315384276,
            "samples_per_second": 1101046.0432049036,
            "samples_per_second_per_gpu": 137630.75540061295,
            "loss_sequences_lower_95": 4.741753603586562,
            "loss_sequences_upper_95": 4.775866499274498,
            "loss_tokens_lower_95": 4.741815193370781,
            "loss_tokens_upper_95": 4.775824410140116,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6510090105231123,
            "data_time": 0.0019604036003161387,
            "batch_time": 0.030876255861133527,
            "samples_per_second": 1099249.271137915,
            "samples_per_second_per_gpu": 137406.1588922394,
            "loss_sequences_lower_95": 3.661760352612776,
            "loss_sequences_upper_95": 3.687486006040256,
            "loss_tokens_lower_95": 3.625719438128342,
            "loss_tokens_upper_95": 3.6458205353554782,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.9744805977712785,
            "data_time": 0.0031365357290226597,
            "batch_time": 0.03224439355812468,
            "samples_per_second": 1093128.1260388966,
            "samples_per_second_per_gpu": 136641.01575486208,
            "loss_sequences_lower_95": 6.14693365371841,
            "loss_sequences_upper_95": 6.435534523813917,
            "loss_tokens_lower_95": 5.544011591706129,
            "loss_tokens_upper_95": 5.753825800515993,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.00350182569027,
            "data_time": 0.004489122077505639,
            "batch_time": 0.033474514141995856,
            "samples_per_second": 1093815.940605926,
            "samples_per_second_per_gpu": 136726.99257574076,
            "loss_sequences_lower_95": 6.121363932291667,
            "loss_sequences_upper_95": 6.31360966796875,
            "loss_tokens_lower_95": 5.681645489386793,
            "loss_tokens_upper_95": 5.815606377751572,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.325710053151083,
            "data_time": 0.004739080978554658,
            "batch_time": 0.03451820556214852,
            "samples_per_second": 1068159.3590552972,
            "samples_per_second_per_gpu": 133519.91988191215,
            "loss_sequences_lower_95": 4.36630025520932,
            "loss_sequences_upper_95": 4.438452055889878,
            "loss_tokens_lower_95": 4.223398895012584,
            "loss_tokens_upper_95": 4.258846276780241,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.6073516683145,
            "data_time": 0.024621352553367615,
            "batch_time": 0.055266344121524265,
            "samples_per_second": 989029.9874988364,
            "samples_per_second_per_gpu": 123628.74843735454,
            "loss_sequences_lower_95": 4.519796614213424,
            "loss_sequences_upper_95": 4.7678098227761,
            "loss_tokens_lower_95": 4.507127467436764,
            "loss_tokens_upper_95": 4.58436102520659,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.5516586284248195,
            "data_time": 0.021184014156460762,
            "batch_time": 0.051183491945266724,
            "samples_per_second": 985682.7937589901,
            "samples_per_second_per_gpu": 123210.34921987377,
            "loss_sequences_lower_95": 4.526557667012117,
            "loss_sequences_upper_95": 4.749722975127551,
            "loss_tokens_lower_95": 4.415712936907583,
            "loss_tokens_upper_95": 4.521554552619329,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.9324708112080895,
            "data_time": 0.01621079750550099,
            "batch_time": 0.045417726039886475,
            "samples_per_second": 1016527.9556098413,
            "samples_per_second_per_gpu": 127065.99445123016,
            "loss_sequences_lower_95": 4.8723593343098965,
            "loss_sequences_upper_95": 4.9915546875,
            "loss_tokens_lower_95": 4.795986036906578,
            "loss_tokens_upper_95": 5.04839179530574,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.767405837819007,
            "data_time": 0.001460175710850233,
            "batch_time": 0.03013828154823792,
            "samples_per_second": 1110825.4445081418,
            "samples_per_second_per_gpu": 138853.18056351773,
            "loss_sequences_lower_95": 7.784896070413612,
            "loss_sequences_upper_95": 7.863723257344619,
            "loss_tokens_lower_95": 7.612545074268957,
            "loss_tokens_upper_95": 7.6942415801638875,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.347599666829061,
            "data_time": 0.0030205655818017537,
            "batch_time": 0.03164286141427571,
            "samples_per_second": 1109209.1455216294,
            "samples_per_second_per_gpu": 138651.14319020367,
            "loss_sequences_lower_95": 5.920355409564394,
            "loss_sequences_upper_95": 6.212359855472037,
            "loss_tokens_lower_95": 4.592968849784996,
            "loss_tokens_upper_95": 4.735999056255677,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.867088910967824,
            "data_time": 0.005224180785385338,
            "batch_time": 0.034139683923205814,
            "samples_per_second": 1086028.7953278285,
            "samples_per_second_per_gpu": 135753.59941597856,
            "loss_sequences_lower_95": 5.314117733691739,
            "loss_sequences_upper_95": 5.64280348637812,
            "loss_tokens_lower_95": 4.461331339242498,
            "loss_tokens_upper_95": 4.627360081187985,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.525215323112871,
            "data_time": 0.022968060203960965,
            "batch_time": 0.05338106410843985,
            "samples_per_second": 982111.5919331089,
            "samples_per_second_per_gpu": 122763.94899163861,
            "loss_sequences_lower_95": 5.429083795416845,
            "loss_sequences_upper_95": 5.619302702603275,
            "loss_tokens_lower_95": 5.432654714366617,
            "loss_tokens_upper_95": 5.621199781165275,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.105877721309662,
            "data_time": 0.049725665495945856,
            "batch_time": 0.0821408354319059,
            "samples_per_second": 845824.702642416,
            "samples_per_second_per_gpu": 105728.087830302,
            "loss_sequences_lower_95": 3.9524146881103515,
            "loss_sequences_upper_95": 4.381482421875,
            "loss_tokens_lower_95": 3.7706916822730325,
            "loss_tokens_upper_95": 4.265911603187192,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.783831580268446,
            "data_time": 0.0034875925820785554,
            "batch_time": 0.03237192079821004,
            "samples_per_second": 1100722.4856765668,
            "samples_per_second_per_gpu": 137590.31070957085,
            "loss_sequences_lower_95": 4.739834604097364,
            "loss_sequences_upper_95": 4.827562539482312,
            "loss_tokens_lower_95": 4.739251602781954,
            "loss_tokens_upper_95": 4.828388569420899,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.739131043720792,
            "data_time": 0.004950587162185843,
            "batch_time": 0.03426874286782294,
            "samples_per_second": 1079012.6276512996,
            "samples_per_second_per_gpu": 134876.57845641245,
            "loss_sequences_lower_95": 4.694179425563703,
            "loss_sequences_upper_95": 4.784760126337274,
            "loss_tokens_lower_95": 4.693270986496085,
            "loss_tokens_upper_95": 4.784994919235642,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.049916955410332,
            "data_time": 0.0035354388197863847,
            "batch_time": 0.03284410807277935,
            "samples_per_second": 1080209.5665264856,
            "samples_per_second_per_gpu": 135026.1958158107,
            "loss_sequences_lower_95": 4.174854166489561,
            "loss_sequences_upper_95": 4.291411901895318,
            "loss_tokens_lower_95": 3.898025793433503,
            "loss_tokens_upper_95": 3.9558939307773664,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.159653324604035,
            "data_time": 0.011119145900011063,
            "batch_time": 0.040950086899101734,
            "samples_per_second": 1028582.4676163975,
            "samples_per_second_per_gpu": 128572.80845204969,
            "loss_sequences_lower_95": 6.328966394042969,
            "loss_sequences_upper_95": 6.863792211914062,
            "loss_tokens_lower_95": 5.496005562076026,
            "loss_tokens_upper_95": 5.853758214499917,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.582189604640007,
            "data_time": 0.1559692919254303,
            "batch_time": 0.1916361302137375,
            "samples_per_second": 531811.2557907744,
            "samples_per_second_per_gpu": 66476.4069738468,
            "loss_sequences_lower_95": 4.285762536525726,
            "loss_sequences_upper_95": 4.976168632507324,
            "loss_tokens_lower_95": 4.061283953436489,
            "loss_tokens_upper_95": 4.914095341474161,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.904976863970702,
            "data_time": 0.030745516432092546,
            "batch_time": 0.06119543440798496,
            "samples_per_second": 913434.8038522151,
            "samples_per_second_per_gpu": 114179.35048152688,
            "loss_sequences_lower_95": 6.365192053783899,
            "loss_sequences_upper_95": 7.224701734520923,
            "loss_tokens_lower_95": 4.477004182594067,
            "loss_tokens_upper_95": 4.966179463452121,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.045123565452179,
            "data_time": 0.0031222382353411782,
            "batch_time": 0.032494312359227076,
            "samples_per_second": 1080297.52178578,
            "samples_per_second_per_gpu": 135037.1902232225,
            "loss_sequences_lower_95": 4.0196578532860965,
            "loss_sequences_upper_95": 4.07009749839586,
            "loss_tokens_lower_95": 4.018923394239433,
            "loss_tokens_upper_95": 4.070376953551633,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.15562239553535,
            "data_time": 0.0026090591047269966,
            "batch_time": 0.03160907336802366,
            "samples_per_second": 1097772.857055953,
            "samples_per_second_per_gpu": 137221.60713199413,
            "loss_sequences_lower_95": 5.127416466287841,
            "loss_sequences_upper_95": 5.341622939610421,
            "loss_tokens_lower_95": 4.893085666924899,
            "loss_tokens_upper_95": 5.103677480532834,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.732601361396985,
            "data_time": 0.01819121175342136,
            "batch_time": 0.04831569724612766,
            "samples_per_second": 969411.7148760379,
            "samples_per_second_per_gpu": 121176.46435950474,
            "loss_sequences_lower_95": 3.568824002332303,
            "loss_sequences_upper_95": 3.948172567178915,
            "loss_tokens_lower_95": 3.4829003827203544,
            "loss_tokens_upper_95": 3.796553992342883,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.061313323647184,
            "data_time": 0.0047353353351354596,
            "batch_time": 0.034209584817290306,
            "samples_per_second": 1072123.0899218242,
            "samples_per_second_per_gpu": 134015.38624022802,
            "loss_sequences_lower_95": 4.088988872428719,
            "loss_sequences_upper_95": 4.230326158001184,
            "loss_tokens_lower_95": 3.932373137160657,
            "loss_tokens_upper_95": 4.081095882383069,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8143353665747295,
            "data_time": 0.029992041133698962,
            "batch_time": 0.060930873666490824,
            "samples_per_second": 946361.3641492393,
            "samples_per_second_per_gpu": 118295.17051865491,
            "loss_sequences_lower_95": 3.5638946440161727,
            "loss_sequences_upper_95": 4.04143664662431,
            "loss_tokens_lower_95": 3.5509425700002546,
            "loss_tokens_upper_95": 3.954197117866155,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.058687415402468,
            "data_time": 0.001928316370468353,
            "batch_time": 0.031238985356930205,
            "samples_per_second": 1086231.4456596335,
            "samples_per_second_per_gpu": 135778.9307074542,
            "loss_sequences_lower_95": 4.040633995939813,
            "loss_sequences_upper_95": 4.076224971556812,
            "loss_tokens_lower_95": 4.040988734856346,
            "loss_tokens_upper_95": 4.076452175200665,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.1733174856426647,
            "data_time": 0.04731737917119806,
            "batch_time": 0.07748794122175737,
            "samples_per_second": 882967.8107914403,
            "samples_per_second_per_gpu": 110370.97634893004,
            "loss_sequences_lower_95": 2.0466379554526317,
            "loss_sequences_upper_95": 2.345997012240215,
            "loss_tokens_lower_95": 1.9368149506521077,
            "loss_tokens_upper_95": 2.2643488342222686,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.821812018389722,
            "data_time": 0.0012813142164355051,
            "batch_time": 0.030471205598950286,
            "samples_per_second": 1091719.0506764397,
            "samples_per_second_per_gpu": 136464.88133455496,
            "loss_sequences_lower_95": 6.268313249312107,
            "loss_sequences_upper_95": 6.325678895194575,
            "loss_tokens_lower_95": 5.14050622582205,
            "loss_tokens_upper_95": 5.19688959139265,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.932875911712647,
            "data_time": 0.005828690907311818,
            "batch_time": 0.0347686731626117,
            "samples_per_second": 1085275.3711856024,
            "samples_per_second_per_gpu": 135659.4213982003,
            "loss_sequences_lower_95": 6.877254821777344,
            "loss_sequences_upper_95": 7.125904382324219,
            "loss_tokens_lower_95": 6.720311801376401,
            "loss_tokens_upper_95": 6.95170330097762,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.76998436761939,
            "data_time": 0.022250080512741865,
            "batch_time": 0.052122358548439156,
            "samples_per_second": 993847.6171639058,
            "samples_per_second_per_gpu": 124230.95214548822,
            "loss_sequences_lower_95": 4.632871146824049,
            "loss_sequences_upper_95": 4.904199722953465,
            "loss_tokens_lower_95": 4.635337139627208,
            "loss_tokens_upper_95": 4.900837986158288,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.929107788837317,
            "data_time": 0.004606244793857436,
            "batch_time": 0.0338029714233904,
            "samples_per_second": 1082603.1905845585,
            "samples_per_second_per_gpu": 135325.39882306982,
            "loss_sequences_lower_95": 7.846201967181581,
            "loss_sequences_upper_95": 8.012268288352272,
            "loss_tokens_lower_95": 7.848345928770123,
            "loss_tokens_upper_95": 8.008650919596354,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.242427702029546,
            "data_time": 0.004077621913970785,
            "batch_time": 0.03310668024610966,
            "samples_per_second": 1091689.9171732017,
            "samples_per_second_per_gpu": 136461.2396466502,
            "loss_sequences_lower_95": 1.297294832356771,
            "loss_sequences_upper_95": 1.3809996663411457,
            "loss_tokens_lower_95": 1.140451385632378,
            "loss_tokens_upper_95": 1.20559682857518,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.665838433447338,
            "data_time": 0.024310935820852007,
            "batch_time": 0.053982162049838474,
            "samples_per_second": 959760.8419497529,
            "samples_per_second_per_gpu": 119970.10524371911,
            "loss_sequences_lower_95": 5.390806405203683,
            "loss_sequences_upper_95": 5.9409872291201635,
            "loss_tokens_lower_95": 5.3875272187732515,
            "loss_tokens_upper_95": 5.945817507789248,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.390149563550949,
            "data_time": 0.15875278413295746,
            "batch_time": 0.19387032091617584,
            "samples_per_second": 536241.4003576302,
            "samples_per_second_per_gpu": 67030.17504470378,
            "loss_sequences_lower_95": 3.0799318969249727,
            "loss_sequences_upper_95": 4.425414860248566,
            "loss_tokens_lower_95": 2.8147581749355672,
            "loss_tokens_upper_95": 3.4239070302432344,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.451583010196686,
            "data_time": 0.0057843947221362406,
            "batch_time": 0.03487368992396763,
            "samples_per_second": 1082448.5709236104,
            "samples_per_second_per_gpu": 135306.0713654513,
            "loss_sequences_lower_95": 7.394316137695313,
            "loss_sequences_upper_95": 7.7461956665039065,
            "loss_tokens_lower_95": 7.141949204704685,
            "loss_tokens_upper_95": 7.4567694494566945,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.926702518463134,
            "data_time": 0.00567808605375744,
            "batch_time": 0.03479235465564425,
            "samples_per_second": 1081529.428360688,
            "samples_per_second_per_gpu": 135191.178545086,
            "loss_sequences_lower_95": 7.004460375976563,
            "loss_sequences_upper_95": 7.234050085449218,
            "loss_tokens_lower_95": 6.697602018261768,
            "loss_tokens_upper_95": 6.881438625222991,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.269551862217298,
            "data_time": 0.0036021843403079437,
            "batch_time": 0.03261379127119696,
            "samples_per_second": 1091222.8180343646,
            "samples_per_second_per_gpu": 136402.85225429558,
            "loss_sequences_lower_95": 4.232147042878394,
            "loss_sequences_upper_95": 4.306184934027195,
            "loss_tokens_lower_95": 4.232136902407915,
            "loss_tokens_upper_95": 4.306432569390609,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.121005614964827,
            "data_time": 0.008483685035359824,
            "batch_time": 0.03843517605991882,
            "samples_per_second": 1039747.5868980958,
            "samples_per_second_per_gpu": 129968.44836226197,
            "loss_sequences_lower_95": 5.026036763227847,
            "loss_sequences_upper_95": 5.212398368495584,
            "loss_tokens_lower_95": 5.024755409346199,
            "loss_tokens_upper_95": 5.213250104256672,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 8.623392528533936,
            "data_time": 0.005942375887007941,
            "batch_time": 0.034957517234105916,
            "samples_per_second": 1082839.5610141966,
            "samples_per_second_per_gpu": 135354.94512677458,
            "loss_sequences_lower_95": 8.570028247070312,
            "loss_sequences_upper_95": 8.678566479492188,
            "loss_tokens_lower_95": 8.569263647460938,
            "loss_tokens_upper_95": 8.679091162109374,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.884088479175586,
            "data_time": 0.001807364534114845,
            "batch_time": 0.03054458923736457,
            "samples_per_second": 1105578.135740679,
            "samples_per_second_per_gpu": 138197.26696758487,
            "loss_sequences_lower_95": 5.505018570393803,
            "loss_sequences_upper_95": 5.619516347489948,
            "loss_tokens_lower_95": 4.158209848422237,
            "loss_tokens_upper_95": 4.231670326363413,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.131683588917576,
            "data_time": 0.019064790861947196,
            "batch_time": 0.048675715923309326,
            "samples_per_second": 990152.6970832224,
            "samples_per_second_per_gpu": 123769.0871354028,
            "loss_sequences_lower_95": 4.967960562634824,
            "loss_sequences_upper_95": 5.293085684705137,
            "loss_tokens_lower_95": 4.9686646646528105,
            "loss_tokens_upper_95": 5.290028404121968,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.226570849325142,
            "data_time": 0.011446856893599033,
            "batch_time": 0.04058909974992275,
            "samples_per_second": 1073803.329452481,
            "samples_per_second_per_gpu": 134225.4161815601,
            "loss_sequences_lower_95": 5.112813146254595,
            "loss_sequences_upper_95": 5.3380550249885115,
            "loss_tokens_lower_95": 5.114699156518076,
            "loss_tokens_upper_95": 5.336905086741727,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.195041298851339,
            "data_time": 0.002020728203558153,
            "batch_time": 0.03073725535828461,
            "samples_per_second": 1105394.1225373677,
            "samples_per_second_per_gpu": 138174.26531717097,
            "loss_sequences_lower_95": 5.602831945618815,
            "loss_sequences_upper_95": 5.712501639225541,
            "loss_tokens_lower_95": 4.55275476047815,
            "loss_tokens_upper_95": 4.638734155607288,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.774367727299847,
            "data_time": 0.02918725957473119,
            "batch_time": 0.05861988663673401,
            "samples_per_second": 1005421.5478753197,
            "samples_per_second_per_gpu": 125677.69348441497,
            "loss_sequences_lower_95": 4.613063008949239,
            "loss_sequences_upper_95": 4.924679242492353,
            "loss_tokens_lower_95": 4.618518211728051,
            "loss_tokens_upper_95": 4.925297231522817,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.091166173427477,
            "data_time": 0.00330343991699964,
            "batch_time": 0.032461361425117984,
            "samples_per_second": 1088259.7964443143,
            "samples_per_second_per_gpu": 136032.4745555393,
            "loss_sequences_lower_95": 7.064265427895642,
            "loss_sequences_upper_95": 7.118551850391819,
            "loss_tokens_lower_95": 7.06394256498471,
            "loss_tokens_upper_95": 7.118815970231269,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.212512056804398,
            "data_time": 0.024653647162697533,
            "batch_time": 0.05432307720184326,
            "samples_per_second": 959551.3161276996,
            "samples_per_second_per_gpu": 119943.91451596245,
            "loss_sequences_lower_95": 5.044309093882736,
            "loss_sequences_upper_95": 5.3794378039906325,
            "loss_tokens_lower_95": 5.045289330343598,
            "loss_tokens_upper_95": 5.379829777097239,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.3667629837989805,
            "data_time": 0.07802744954824448,
            "batch_time": 0.11184205114841461,
            "samples_per_second": 698943.2789180202,
            "samples_per_second_per_gpu": 87367.90986475252,
            "loss_sequences_lower_95": 5.03390126546224,
            "loss_sequences_upper_95": 5.942154057820638,
            "loss_tokens_lower_95": 4.459076415167914,
            "loss_tokens_upper_95": 5.784312693277995,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.701083827018738,
            "data_time": 0.08410567045211792,
            "batch_time": 0.11621767282485962,
            "samples_per_second": 730505.5261140357,
            "samples_per_second_per_gpu": 91313.19076425447,
            "loss_sequences_lower_95": 4.358848406473795,
            "loss_sequences_upper_95": 5.408567174275715,
            "loss_tokens_lower_95": 3.594918008868614,
            "loss_tokens_upper_95": 5.082913062277805,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.3136763332518635,
            "data_time": 0.0030399817028348764,
            "batch_time": 0.032178149495565446,
            "samples_per_second": 1091229.841021852,
            "samples_per_second_per_gpu": 136403.7301277315,
            "loss_sequences_lower_95": 6.276973174015096,
            "loss_sequences_upper_95": 6.352795198016384,
            "loss_tokens_lower_95": 6.274257150911266,
            "loss_tokens_upper_95": 6.352763758169183,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.0148804007229404,
            "data_time": 0.0011646049973479506,
            "batch_time": 0.029984810754363333,
            "samples_per_second": 1104309.6862046067,
            "samples_per_second_per_gpu": 138038.71077557583,
            "loss_sequences_lower_95": 2.332787102563171,
            "loss_sequences_upper_95": 2.3694605271874787,
            "loss_tokens_lower_95": 1.6722959361848853,
            "loss_tokens_upper_95": 1.693946189703646,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.497412644033357,
            "data_time": 0.040034931153059006,
            "batch_time": 0.07051732018589973,
            "samples_per_second": 966703.9260772165,
            "samples_per_second_per_gpu": 120837.99075965206,
            "loss_sequences_lower_95": 5.499956242118295,
            "loss_sequences_upper_95": 5.885888575756644,
            "loss_tokens_lower_95": 5.165507577414947,
            "loss_tokens_upper_95": 5.388395086338583,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.872388040697253,
            "data_time": 0.12104488554454985,
            "batch_time": 0.1557780901590983,
            "samples_per_second": 509243.06740653445,
            "samples_per_second_per_gpu": 63655.383425816806,
            "loss_sequences_lower_95": 7.436008742048934,
            "loss_sequences_upper_95": 8.524556402257971,
            "loss_tokens_lower_95": 7.144278387375819,
            "loss_tokens_upper_95": 8.31652184003665,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.416748872617396,
            "data_time": 0.030670211428687685,
            "batch_time": 0.061041136582692467,
            "samples_per_second": 962485.3545375151,
            "samples_per_second_per_gpu": 120310.66931718939,
            "loss_sequences_lower_95": 5.351958214364401,
            "loss_sequences_upper_95": 5.69438219768245,
            "loss_tokens_lower_95": 5.074257705726977,
            "loss_tokens_upper_95": 5.2649534305351136,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.5153658215592545,
            "data_time": 0.03050350859051659,
            "batch_time": 0.062274277210235596,
            "samples_per_second": 931753.3732722398,
            "samples_per_second_per_gpu": 116469.17165902998,
            "loss_sequences_lower_95": 5.4580286909894244,
            "loss_sequences_upper_95": 5.769633241979087,
            "loss_tokens_lower_95": 5.187111413561334,
            "loss_tokens_upper_95": 5.346816328372376,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.569822397173905,
            "data_time": 0.03136316935221354,
            "batch_time": 0.0630081608181908,
            "samples_per_second": 953676.7110075103,
            "samples_per_second_per_gpu": 119209.58887593879,
            "loss_sequences_lower_95": 5.4818621565655965,
            "loss_sequences_upper_95": 5.877208393376048,
            "loss_tokens_lower_95": 5.2088992276227355,
            "loss_tokens_upper_95": 5.459164384838628,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.533020801660491,
            "data_time": 0.03263682410830543,
            "batch_time": 0.06291692881357103,
            "samples_per_second": 970804.307757313,
            "samples_per_second_per_gpu": 121350.53846966413,
            "loss_sequences_lower_95": 5.466454901346347,
            "loss_sequences_upper_95": 5.753142668561238,
            "loss_tokens_lower_95": 5.232537471022561,
            "loss_tokens_upper_95": 5.377455115392572,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.728735325499351,
            "data_time": 0.03380553810684769,
            "batch_time": 0.06358323568179283,
            "samples_per_second": 1003083.0796780636,
            "samples_per_second_per_gpu": 125385.38495975795,
            "loss_sequences_lower_95": 5.703017780943687,
            "loss_sequences_upper_95": 5.948160074364324,
            "loss_tokens_lower_95": 5.518430553524422,
            "loss_tokens_upper_95": 5.638458213750365,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.8597049160701475,
            "data_time": 0.03090027968088786,
            "batch_time": 0.061031557264782134,
            "samples_per_second": 971503.925530669,
            "samples_per_second_per_gpu": 121437.99069133363,
            "loss_sequences_lower_95": 5.876092436255478,
            "loss_sequences_upper_95": 6.168622430940953,
            "loss_tokens_lower_95": 5.5068386805869505,
            "loss_tokens_upper_95": 5.63475030845012,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.25/params.txt",
    "uuid": "34fbe81e-1f06-493a-83b5-0c096e5ef91e",
    "creation_date": "2023_12_14-04_59_35"
}