{
    "name": "c4_original-d=576_l=24_h=8-32.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 98353520640,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 32.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "19670704128",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=576_l=24_h=8-32.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.7170440554618835,
            "data_time": 0.038602065294981,
            "batch_time": 0.38783323019742966,
            "samples_per_second": 826867.9772089083,
            "samples_per_second_per_gpu": 103358.49715111354,
            "loss_sequences_lower_95": 3.596650244394938,
            "loss_sequences_upper_95": 3.836738624572754,
            "loss_tokens_lower_95": 3.701688098907471,
            "loss_tokens_upper_95": 3.732195390065511,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.997552980701856,
            "data_time": 0.0011088610553680873,
            "batch_time": 0.030949847736299857,
            "samples_per_second": 1071998.0798788983,
            "samples_per_second_per_gpu": 133999.7599848623,
            "loss_sequences_lower_95": 2.994642887342792,
            "loss_sequences_upper_95": 3.000393216942258,
            "loss_tokens_lower_95": 2.9871685208333334,
            "loss_tokens_upper_95": 3.0078254843749996,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.558572522474795,
            "data_time": 0.010703217506408692,
            "batch_time": 0.04063748168945312,
            "samples_per_second": 1039709.9942055265,
            "samples_per_second_per_gpu": 129963.74927569082,
            "loss_sequences_lower_95": 3.5387671832649077,
            "loss_sequences_upper_95": 3.578191596829161,
            "loss_tokens_lower_95": 3.5428616354166667,
            "loss_tokens_upper_95": 3.5745838854166667,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.952293882271678,
            "data_time": 0.0017247524504598818,
            "batch_time": 0.03025257597236257,
            "samples_per_second": 1119414.8135095865,
            "samples_per_second_per_gpu": 139926.8516886983,
            "loss_sequences_lower_95": 2.9419160558956188,
            "loss_sequences_upper_95": 2.962489353455219,
            "loss_tokens_lower_95": 2.941824375,
            "loss_tokens_upper_95": 2.96258296875,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0048636457827813,
            "data_time": 0.010921005233825441,
            "batch_time": 0.04028381104488297,
            "samples_per_second": 1049971.2403277447,
            "samples_per_second_per_gpu": 131246.4050409681,
            "loss_sequences_lower_95": 2.969592508910387,
            "loss_sequences_upper_95": 3.038874708125159,
            "loss_tokens_lower_95": 2.9944411927083334,
            "loss_tokens_upper_95": 3.0150109427083334,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4975423404426498,
            "data_time": 0.004096555321112923,
            "batch_time": 0.03316333598416785,
            "samples_per_second": 1102391.3972402937,
            "samples_per_second_per_gpu": 137798.9246550367,
            "loss_sequences_lower_95": 3.460648605792722,
            "loss_sequences_upper_95": 3.5347489948908053,
            "loss_tokens_lower_95": 3.4855076145833332,
            "loss_tokens_upper_95": 3.509775770833333,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.255568581916848,
            "data_time": 0.0017341537732284572,
            "batch_time": 0.031208277623766205,
            "samples_per_second": 1093866.1803601356,
            "samples_per_second_per_gpu": 136733.27254501695,
            "loss_sequences_lower_95": 3.2226405602279975,
            "loss_sequences_upper_95": 3.2876996621890946,
            "loss_tokens_lower_95": 3.241335078125,
            "loss_tokens_upper_95": 3.269966526041667,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7202862176345906,
            "data_time": 0.001878290284083808,
            "batch_time": 0.030338258436163914,
            "samples_per_second": 1125506.4825578516,
            "samples_per_second_per_gpu": 140688.31031973145,
            "loss_sequences_lower_95": 3.7121685209424085,
            "loss_sequences_upper_95": 3.728518713187173,
            "loss_tokens_lower_95": 3.708392645833333,
            "loss_tokens_upper_95": 3.73220584375,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4183675026505944,
            "data_time": 0.015292238621484665,
            "batch_time": 0.06454905631050231,
            "samples_per_second": 1044226.1903991719,
            "samples_per_second_per_gpu": 130528.27379989649,
            "loss_sequences_lower_95": 3.377386152065866,
            "loss_sequences_upper_95": 3.4627964795120363,
            "loss_tokens_lower_95": 3.4073155625,
            "loss_tokens_upper_95": 3.4295476770833333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.43977001250497,
            "data_time": 0.010663101449608803,
            "batch_time": 0.04021425545215607,
            "samples_per_second": 1057053.5184149505,
            "samples_per_second_per_gpu": 132131.6898018688,
            "loss_sequences_lower_95": 4.409703950542706,
            "loss_sequences_upper_95": 4.466229163610888,
            "loss_tokens_lower_95": 4.42650796875,
            "loss_tokens_upper_95": 4.453633770833333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3821596272337744,
            "data_time": 0.0013493368415905698,
            "batch_time": 0.029751985815983335,
            "samples_per_second": 1128090.0442278855,
            "samples_per_second_per_gpu": 141011.2555284857,
            "loss_sequences_lower_95": 3.3741603061403658,
            "loss_sequences_upper_95": 3.3904363924172602,
            "loss_tokens_lower_95": 3.3710781822916664,
            "loss_tokens_upper_95": 3.393186484375,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1594457284751805,
            "data_time": 0.002799947295558145,
            "batch_time": 0.03173719496651553,
            "samples_per_second": 1107158.2958283431,
            "samples_per_second_per_gpu": 138394.7869785429,
            "loss_sequences_lower_95": 3.15023691097134,
            "loss_sequences_upper_95": 3.1685181935098656,
            "loss_tokens_lower_95": 3.1484857760416665,
            "loss_tokens_upper_95": 3.1707852656249997,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.868122534626154,
            "data_time": 0.01081962076571619,
            "batch_time": 0.03986339795259619,
            "samples_per_second": 1058932.9029761667,
            "samples_per_second_per_gpu": 132366.61287202084,
            "loss_sequences_lower_95": 3.8329510671371865,
            "loss_sequences_upper_95": 3.903173642419657,
            "loss_tokens_lower_95": 3.85469371875,
            "loss_tokens_upper_95": 3.8814623125,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0025538021580993,
            "data_time": 0.01133152593179528,
            "batch_time": 0.04026215864842631,
            "samples_per_second": 1068102.7289469114,
            "samples_per_second_per_gpu": 133512.84111836393,
            "loss_sequences_lower_95": 2.9471744886969358,
            "loss_sequences_upper_95": 3.057074161071156,
            "loss_tokens_lower_95": 2.9909416458333333,
            "loss_tokens_upper_95": 3.0139065104166667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.472220009023493,
            "data_time": 0.09035204138074603,
            "batch_time": 0.12292970078332084,
            "samples_per_second": 559715.2226406966,
            "samples_per_second_per_gpu": 69964.40283008707,
            "loss_sequences_lower_95": 4.403587783466686,
            "loss_sequences_upper_95": 4.542971246892756,
            "loss_tokens_lower_95": 4.4432154048572885,
            "loss_tokens_upper_95": 4.501204213229093,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5502559721643654,
            "data_time": 0.016187896782701664,
            "batch_time": 0.04559693146835674,
            "samples_per_second": 1038521.0534937561,
            "samples_per_second_per_gpu": 129815.13168671951,
            "loss_sequences_lower_95": 3.4711474894087098,
            "loss_sequences_upper_95": 3.6281829745011844,
            "loss_tokens_lower_95": 3.5370319895833333,
            "loss_tokens_upper_95": 3.563094291666667,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.789478597666154,
            "data_time": 0.014426127076148987,
            "batch_time": 0.044009858121474586,
            "samples_per_second": 1049868.3080777517,
            "samples_per_second_per_gpu": 131233.53850971896,
            "loss_sequences_lower_95": 5.7264130202321075,
            "loss_sequences_upper_95": 5.84935088547679,
            "loss_tokens_lower_95": 5.776481854166667,
            "loss_tokens_upper_95": 5.8026304374999995,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.529119427086877,
            "data_time": 0.04017632082104683,
            "batch_time": 0.07069481909275055,
            "samples_per_second": 916583.1065738256,
            "samples_per_second_per_gpu": 114572.8883217282,
            "loss_sequences_lower_95": 3.4786954223132525,
            "loss_sequences_upper_95": 3.5740061650510695,
            "loss_tokens_lower_95": 3.515108033477283,
            "loss_tokens_upper_95": 3.543177182557153,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.013402726707627,
            "data_time": 0.001735316797378123,
            "batch_time": 0.03078541132475086,
            "samples_per_second": 1097380.9208729444,
            "samples_per_second_per_gpu": 137172.61510911805,
            "loss_sequences_lower_95": 4.9942103377367895,
            "loss_sequences_upper_95": 5.033003236384596,
            "loss_tokens_lower_95": 4.993805386296468,
            "loss_tokens_upper_95": 5.032764429924512,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.737324937979222,
            "data_time": 0.002106482103751723,
            "batch_time": 0.031421563665198673,
            "samples_per_second": 1086079.5322268894,
            "samples_per_second_per_gpu": 135759.94152836117,
            "loss_sequences_lower_95": 2.742933376109403,
            "loss_sequences_upper_95": 2.768291651625672,
            "loss_tokens_lower_95": 2.7136366568576267,
            "loss_tokens_upper_95": 2.731917659449137,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.945196771711801,
            "data_time": 0.003290291668445181,
            "batch_time": 0.032716950866886176,
            "samples_per_second": 1082776.2542071976,
            "samples_per_second_per_gpu": 135347.0317758997,
            "loss_sequences_lower_95": 4.202737915442696,
            "loss_sequences_upper_95": 4.499502377516828,
            "loss_tokens_lower_95": 3.3962054892724716,
            "loss_tokens_upper_95": 3.60835285527609,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.979506579990188,
            "data_time": 0.0036461021988949877,
            "batch_time": 0.032681169027977804,
            "samples_per_second": 1087744.4820452046,
            "samples_per_second_per_gpu": 135968.06025565058,
            "loss_sequences_lower_95": 4.075129361979166,
            "loss_sequences_upper_95": 4.272508203125,
            "loss_tokens_lower_95": 3.716199562696541,
            "loss_tokens_upper_95": 3.858425019654088,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.8968825262545455,
            "data_time": 0.0048675778045194,
            "batch_time": 0.033950439585459895,
            "samples_per_second": 1089108.4243849078,
            "samples_per_second_per_gpu": 136138.55304811348,
            "loss_sequences_lower_95": 2.9365956940553333,
            "loss_sequences_upper_95": 3.0009230468379813,
            "loss_tokens_lower_95": 2.804260450444455,
            "loss_tokens_upper_95": 2.837243080846885,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1375215974721042,
            "data_time": 0.025271639227867126,
            "batch_time": 0.05622647702693939,
            "samples_per_second": 975362.0718748629,
            "samples_per_second_per_gpu": 121920.25898435786,
            "loss_sequences_lower_95": 3.0651756702769886,
            "loss_sequences_upper_95": 3.263208299116655,
            "loss_tokens_lower_95": 3.0453118780232793,
            "loss_tokens_upper_95": 3.109959193603245,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5218478212551196,
            "data_time": 0.023718375712633133,
            "batch_time": 0.053130462765693665,
            "samples_per_second": 999334.0576403809,
            "samples_per_second_per_gpu": 124916.75720504762,
            "loss_sequences_lower_95": 3.506093687719228,
            "loss_sequences_upper_95": 3.718683160275829,
            "loss_tokens_lower_95": 3.4089093839920444,
            "loss_tokens_upper_95": 3.5113434114759645,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.506118160088857,
            "data_time": 0.018625924220451943,
            "batch_time": 0.048525082759368114,
            "samples_per_second": 1003042.5897261816,
            "samples_per_second_per_gpu": 125380.3237157727,
            "loss_sequences_lower_95": 3.475642883300781,
            "loss_sequences_upper_95": 3.584602549235026,
            "loss_tokens_lower_95": 3.3753707904603485,
            "loss_tokens_upper_95": 3.567978075025108,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.831914010856054,
            "data_time": 0.001524928493441565,
            "batch_time": 0.03052427796629538,
            "samples_per_second": 1098906.8578001612,
            "samples_per_second_per_gpu": 137363.35722502015,
            "loss_sequences_lower_95": 4.833917057016141,
            "loss_sequences_upper_95": 4.917537613106516,
            "loss_tokens_lower_95": 4.700742820219552,
            "loss_tokens_upper_95": 4.785568236283984,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.099476803533156,
            "data_time": 0.003086759740074209,
            "batch_time": 0.0319524359383039,
            "samples_per_second": 1100614.6952758208,
            "samples_per_second_per_gpu": 137576.8369094776,
            "loss_sequences_lower_95": 4.645730714123658,
            "loss_sequences_upper_95": 4.960173789900963,
            "loss_tokens_lower_95": 3.361618249877597,
            "loss_tokens_upper_95": 3.497582359210022,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.738497243094363,
            "data_time": 0.005304409845455273,
            "batch_time": 0.034875238263929215,
            "samples_per_second": 1065852.2330219157,
            "samples_per_second_per_gpu": 133231.52912773946,
            "loss_sequences_lower_95": 4.14378645444486,
            "loss_sequences_upper_95": 4.492655220943099,
            "loss_tokens_lower_95": 3.346612472017183,
            "loss_tokens_upper_95": 3.504434142061956,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.791740136603787,
            "data_time": 0.0259550256388528,
            "batch_time": 0.055238764200891764,
            "samples_per_second": 1011811.1880785301,
            "samples_per_second_per_gpu": 126476.39850981627,
            "loss_sequences_lower_95": 5.69824001364512,
            "loss_sequences_upper_95": 5.883969687544592,
            "loss_tokens_lower_95": 5.700700608344927,
            "loss_tokens_upper_95": 5.882231439738513,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.130162239074707,
            "data_time": 0.055829955981327936,
            "batch_time": 0.0864299673300523,
            "samples_per_second": 899111.8573378505,
            "samples_per_second_per_gpu": 112388.98216723131,
            "loss_sequences_lower_95": 2.9879151306152343,
            "loss_sequences_upper_95": 3.3620862350463865,
            "loss_tokens_lower_95": 2.8330599271334136,
            "loss_tokens_upper_95": 3.2678629702873434,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.93218654949321,
            "data_time": 0.0035935179587522166,
            "batch_time": 0.03348748644924359,
            "samples_per_second": 1076475.3944182443,
            "samples_per_second_per_gpu": 134559.42430228053,
            "loss_sequences_lower_95": 4.879992850702886,
            "loss_sequences_upper_95": 4.984962499300313,
            "loss_tokens_lower_95": 4.879132723539854,
            "loss_tokens_upper_95": 4.985364944005086,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.054335847240701,
            "data_time": 0.005384568872498455,
            "batch_time": 0.03432991065170014,
            "samples_per_second": 1094321.3690567992,
            "samples_per_second_per_gpu": 136790.1711320999,
            "loss_sequences_lower_95": 4.998700136126894,
            "loss_sequences_upper_95": 5.10852099769336,
            "loss_tokens_lower_95": 4.997697859880477,
            "loss_tokens_upper_95": 5.108997271863483,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0834236463473075,
            "data_time": 0.003788545373706529,
            "batch_time": 0.033295468429252344,
            "samples_per_second": 1074526.732874051,
            "samples_per_second_per_gpu": 134315.84160925637,
            "loss_sequences_lower_95": 3.227473670536079,
            "loss_sequences_upper_95": 3.3525226381320556,
            "loss_tokens_lower_95": 2.904394988375804,
            "loss_tokens_upper_95": 2.9573918024778467,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.144466571688652,
            "data_time": 0.011391025967895985,
            "batch_time": 0.04046641010791063,
            "samples_per_second": 1052065.811652665,
            "samples_per_second_per_gpu": 131508.22645658313,
            "loss_sequences_lower_95": 5.303087780761718,
            "loss_sequences_upper_95": 5.875612121582031,
            "loss_tokens_lower_95": 4.547381393866851,
            "loss_tokens_upper_95": 4.908952399202721,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2975190430879593,
            "data_time": 0.1628507673740387,
            "batch_time": 0.1984376162290573,
            "samples_per_second": 525325.9729030716,
            "samples_per_second_per_gpu": 65665.74661288394,
            "loss_sequences_lower_95": 3.0836607575416566,
            "loss_sequences_upper_95": 3.534759497642517,
            "loss_tokens_lower_95": 2.8571179839386334,
            "loss_tokens_upper_95": 3.644943070685726,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.767917275428772,
            "data_time": 0.029932450740895372,
            "batch_time": 0.06040301475119084,
            "samples_per_second": 907363.3529602675,
            "samples_per_second_per_gpu": 113420.41912003343,
            "loss_sequences_lower_95": 5.278620673870218,
            "loss_sequences_upper_95": 6.158424675601652,
            "loss_tokens_lower_95": 3.2660833141580645,
            "loss_tokens_upper_95": 3.7231492791933833,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.740975885112091,
            "data_time": 0.003094624934924973,
            "batch_time": 0.031881399245725736,
            "samples_per_second": 1098440.6032302713,
            "samples_per_second_per_gpu": 137305.0754037839,
            "loss_sequences_lower_95": 2.7162937789257047,
            "loss_sequences_upper_95": 2.7654861203548218,
            "loss_tokens_lower_95": 2.7153702415509637,
            "loss_tokens_upper_95": 2.765785440600767,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.480247491491058,
            "data_time": 0.002580851513052523,
            "batch_time": 0.03177075891194993,
            "samples_per_second": 1092617.4441270002,
            "samples_per_second_per_gpu": 136577.18051587502,
            "loss_sequences_lower_95": 2.4507023176729574,
            "loss_sequences_upper_95": 2.5912124062287742,
            "loss_tokens_lower_95": 2.344770909224902,
            "loss_tokens_upper_95": 2.481020456189619,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0572648648813967,
            "data_time": 0.020710946785079107,
            "batch_time": 0.050393001900778875,
            "samples_per_second": 989807.7935029444,
            "samples_per_second_per_gpu": 123725.97418786805,
            "loss_sequences_lower_95": 2.913935801747081,
            "loss_sequences_upper_95": 3.3195331280048075,
            "loss_tokens_lower_95": 2.8086442155158706,
            "loss_tokens_upper_95": 3.098195962261867,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.466019930666417,
            "data_time": 0.005018612742424012,
            "batch_time": 0.0342134241014719,
            "samples_per_second": 1076714.791287011,
            "samples_per_second_per_gpu": 134589.34891087638,
            "loss_sequences_lower_95": 3.507577965065731,
            "loss_sequences_upper_95": 3.659956293625444,
            "loss_tokens_lower_95": 3.3157067495587493,
            "loss_tokens_upper_95": 3.457914137026706,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.645791263842001,
            "data_time": 0.032308757305145264,
            "batch_time": 0.06319799593516759,
            "samples_per_second": 951567.8896373588,
            "samples_per_second_per_gpu": 118945.98620466985,
            "loss_sequences_lower_95": 2.508974652174042,
            "loss_sequences_upper_95": 2.9530445610604636,
            "loss_tokens_lower_95": 2.3786863989775564,
            "loss_tokens_upper_95": 2.717307937398042,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.346830070149925,
            "data_time": 0.0019130296246128182,
            "batch_time": 0.030917706415265547,
            "samples_per_second": 1097452.5267934804,
            "samples_per_second_per_gpu": 137181.56584918505,
            "loss_sequences_lower_95": 4.331471724032307,
            "loss_sequences_upper_95": 4.362150643019229,
            "loss_tokens_lower_95": 4.331303467724795,
            "loss_tokens_upper_95": 4.361998395772904,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.8504527865104305,
            "data_time": 0.04938748533075506,
            "batch_time": 0.08337331251664595,
            "samples_per_second": 898692.5297619904,
            "samples_per_second_per_gpu": 112336.5662202488,
            "loss_sequences_lower_95": 0.8079420163793471,
            "loss_sequences_upper_95": 0.9335010047097808,
            "loss_tokens_lower_95": 0.7193380615484264,
            "loss_tokens_upper_95": 0.9041856529524773,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.871953012602134,
            "data_time": 0.001268895353875236,
            "batch_time": 0.03035127931748332,
            "samples_per_second": 1095546.8914044888,
            "samples_per_second_per_gpu": 136943.3614255611,
            "loss_sequences_lower_95": 5.252942687860325,
            "loss_sequences_upper_95": 5.300400441807521,
            "loss_tokens_lower_95": 4.2818552949709865,
            "loss_tokens_upper_95": 4.332407749032882,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.607674751281738,
            "data_time": 0.006377117501364814,
            "batch_time": 0.036246293120914035,
            "samples_per_second": 1056927.8598983106,
            "samples_per_second_per_gpu": 132115.98248728883,
            "loss_sequences_lower_95": 6.5791244140625,
            "loss_sequences_upper_95": 6.85933330078125,
            "loss_tokens_lower_95": 6.337242906515855,
            "loss_tokens_upper_95": 6.597047484501885,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.266498772994332,
            "data_time": 0.02457600124811722,
            "batch_time": 0.05441391063948809,
            "samples_per_second": 999560.2959158415,
            "samples_per_second_per_gpu": 124945.03698948018,
            "loss_sequences_lower_95": 5.108844073751698,
            "loss_sequences_upper_95": 5.427431375254756,
            "loss_tokens_lower_95": 5.1073692520805025,
            "loss_tokens_upper_95": 5.421148097826087,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.116714298725128,
            "data_time": 0.005088239549154259,
            "batch_time": 0.03434853884110968,
            "samples_per_second": 1080856.7194130837,
            "samples_per_second_per_gpu": 135107.08992663547,
            "loss_sequences_lower_95": 5.0767894952947445,
            "loss_sequences_upper_95": 5.157292406486742,
            "loss_tokens_lower_95": 5.076769381436435,
            "loss_tokens_upper_95": 5.1574244181315105,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.8698611257076263,
            "data_time": 0.004518591343088353,
            "batch_time": 0.03341031042819328,
            "samples_per_second": 1097038.6460156043,
            "samples_per_second_per_gpu": 137129.83075195053,
            "loss_sequences_lower_95": 0.898766758219401,
            "loss_sequences_upper_95": 0.9412456075032553,
            "loss_tokens_lower_95": 0.8122789975365146,
            "loss_tokens_upper_95": 0.8613284942883404,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.611387152898879,
            "data_time": 0.025052566613469805,
            "batch_time": 0.05583833370889936,
            "samples_per_second": 928516.9063071107,
            "samples_per_second_per_gpu": 116064.61328838883,
            "loss_sequences_lower_95": 5.287427150181362,
            "loss_sequences_upper_95": 5.937876659574963,
            "loss_tokens_lower_95": 5.2840507725306916,
            "loss_tokens_upper_95": 5.937458263578868,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.1791895665228367,
            "data_time": 0.1719100922346115,
            "batch_time": 0.20939627289772034,
            "samples_per_second": 525478.3600083381,
            "samples_per_second_per_gpu": 65684.79500104226,
            "loss_sequences_lower_95": 2.017193728685379,
            "loss_sequences_upper_95": 2.9087447047233583,
            "loss_tokens_lower_95": 1.7141954024796635,
            "loss_tokens_upper_95": 2.184864803982764,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.557689898014068,
            "data_time": 0.006286700093556964,
            "batch_time": 0.035550969933706614,
            "samples_per_second": 1073176.119153056,
            "samples_per_second_per_gpu": 134147.014894132,
            "loss_sequences_lower_95": 7.4558078613281245,
            "loss_sequences_upper_95": 7.8213618408203125,
            "loss_tokens_lower_95": 7.286995380743708,
            "loss_tokens_upper_95": 7.607776932304885,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.892666647911072,
            "data_time": 0.00643682669079493,
            "batch_time": 0.03621380196677314,
            "samples_per_second": 1059241.4361011854,
            "samples_per_second_per_gpu": 132405.17951264817,
            "loss_sequences_lower_95": 7.016687182617188,
            "loss_sequences_upper_95": 7.2496617187500005,
            "loss_tokens_lower_95": 6.630396473616675,
            "loss_tokens_upper_95": 6.819022197847552,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.168211343102195,
            "data_time": 0.004091090581887542,
            "batch_time": 0.03304720275776841,
            "samples_per_second": 1091247.5333277492,
            "samples_per_second_per_gpu": 136405.94166596865,
            "loss_sequences_lower_95": 4.136213502490676,
            "loss_sequences_upper_95": 4.199465918852665,
            "loss_tokens_lower_95": 4.137511130782769,
            "loss_tokens_upper_95": 4.199169479917449,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.983792443795505,
            "data_time": 0.009007787416348645,
            "batch_time": 0.037819543994085304,
            "samples_per_second": 1072113.8781900576,
            "samples_per_second_per_gpu": 134014.2347737572,
            "loss_sequences_lower_95": 4.8868411203317015,
            "loss_sequences_upper_95": 5.079750382149458,
            "loss_tokens_lower_95": 4.884963215770809,
            "loss_tokens_upper_95": 5.077780531078989,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.80710285282135,
            "data_time": 0.006163898441526625,
            "batch_time": 0.03589589822859991,
            "samples_per_second": 1058797.2148742194,
            "samples_per_second_per_gpu": 132349.65185927742,
            "loss_sequences_lower_95": 6.744473608398438,
            "loss_sequences_upper_95": 6.872213317871093,
            "loss_tokens_lower_95": 6.744759020996094,
            "loss_tokens_upper_95": 6.870441186523437,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1019605118876488,
            "data_time": 0.0020326949698262495,
            "batch_time": 0.030967675850837578,
            "samples_per_second": 1098059.2137860146,
            "samples_per_second_per_gpu": 137257.40172325182,
            "loss_sequences_lower_95": 3.636246452223274,
            "loss_sequences_upper_95": 3.7378413256859035,
            "loss_tokens_lower_95": 2.450769381951702,
            "loss_tokens_upper_95": 2.517074226581496,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.356389551910002,
            "data_time": 0.020053688117436002,
            "batch_time": 0.04924956049237932,
            "samples_per_second": 1004285.813416755,
            "samples_per_second_per_gpu": 125535.72667709437,
            "loss_sequences_lower_95": 5.19952952826201,
            "loss_sequences_upper_95": 5.5122264235766965,
            "loss_tokens_lower_95": 5.202311034700764,
            "loss_tokens_upper_95": 5.510548799429367,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.473827044169108,
            "data_time": 0.011685899458825588,
            "batch_time": 0.04099520668387413,
            "samples_per_second": 1066330.6890055505,
            "samples_per_second_per_gpu": 133291.3361256938,
            "loss_sequences_lower_95": 5.350750947840074,
            "loss_sequences_upper_95": 5.593373748180913,
            "loss_tokens_lower_95": 5.354514351639094,
            "loss_tokens_upper_95": 5.591482699525122,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.286281618209735,
            "data_time": 0.0020359753071889186,
            "batch_time": 0.030774066018354716,
            "samples_per_second": 1104740.0159997179,
            "samples_per_second_per_gpu": 138092.50199996473,
            "loss_sequences_lower_95": 3.687441061278655,
            "loss_sequences_upper_95": 3.783253543418436,
            "loss_tokens_lower_95": 2.695825767244382,
            "loss_tokens_upper_95": 2.7670618432539316,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.668308767692122,
            "data_time": 0.029491230845451355,
            "batch_time": 0.05950600405534109,
            "samples_per_second": 999537.00828363,
            "samples_per_second_per_gpu": 124942.12603545375,
            "loss_sequences_lower_95": 4.514630684020028,
            "loss_sequences_upper_95": 4.816637901023582,
            "loss_tokens_lower_95": 4.513707228564712,
            "loss_tokens_upper_95": 4.81430992651238,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.914221462926369,
            "data_time": 0.0034391264339069744,
            "batch_time": 0.03260934381985694,
            "samples_per_second": 1087579.8338223577,
            "samples_per_second_per_gpu": 135947.4792277947,
            "loss_sequences_lower_95": 4.8732994522888,
            "loss_sequences_upper_95": 4.955605774859041,
            "loss_tokens_lower_95": 4.873564602446483,
            "loss_tokens_upper_95": 4.955631585077408,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.482616618999,
            "data_time": 0.025996587493202904,
            "batch_time": 0.055169354785572396,
            "samples_per_second": 973641.4784355876,
            "samples_per_second_per_gpu": 121705.18480444845,
            "loss_sequences_lower_95": 5.320045160089881,
            "loss_sequences_upper_95": 5.647682293641915,
            "loss_tokens_lower_95": 5.316149872715033,
            "loss_tokens_upper_95": 5.648429515060869,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.8701342433691024,
            "data_time": 0.08670547604560852,
            "batch_time": 0.11891043186187744,
            "samples_per_second": 727551.0974867565,
            "samples_per_second_per_gpu": 90943.88718584456,
            "loss_sequences_lower_95": 2.64588098526001,
            "loss_sequences_upper_95": 3.2379567209879556,
            "loss_tokens_lower_95": 2.3360525872972278,
            "loss_tokens_upper_95": 3.0609688970777724,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.5978387117385866,
            "data_time": 0.08508175611495972,
            "batch_time": 0.11709889024496078,
            "samples_per_second": 748069.3442262625,
            "samples_per_second_per_gpu": 93508.66802828282,
            "loss_sequences_lower_95": 2.448490295410156,
            "loss_sequences_upper_95": 3.1160314242045084,
            "loss_tokens_lower_95": 1.9941333299272515,
            "loss_tokens_upper_95": 2.8462776226943793,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.8842165904122354,
            "data_time": 0.003464906337753754,
            "batch_time": 0.03251011502118304,
            "samples_per_second": 1092059.7420856792,
            "samples_per_second_per_gpu": 136507.4677607099,
            "loss_sequences_lower_95": 2.869562617935383,
            "loss_sequences_upper_95": 2.8984846741531665,
            "loss_tokens_lower_95": 2.8695361040477727,
            "loss_tokens_upper_95": 2.89909755701169,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.6080892694932376,
            "data_time": 0.0011852446462575926,
            "batch_time": 0.03018182393008562,
            "samples_per_second": 1098617.7590365312,
            "samples_per_second_per_gpu": 137327.2198795664,
            "loss_sequences_lower_95": 0.6987519109851561,
            "loss_sequences_upper_95": 0.7148736101865426,
            "loss_tokens_lower_95": 0.5121762814632917,
            "loss_tokens_upper_95": 0.5209176268477914,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.509919981318196,
            "data_time": 0.043452534824609756,
            "batch_time": 0.07709281146526337,
            "samples_per_second": 901440.2096978017,
            "samples_per_second_per_gpu": 112680.02621222522,
            "loss_sequences_lower_95": 4.57076079601378,
            "loss_sequences_upper_95": 4.950530321016086,
            "loss_tokens_lower_95": 4.1489579974975435,
            "loss_tokens_upper_95": 4.363100342456552,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.641481103123845,
            "data_time": 0.12830276716323125,
            "batch_time": 0.16080529349190847,
            "samples_per_second": 565913.0436741622,
            "samples_per_second_per_gpu": 70739.13045927028,
            "loss_sequences_lower_95": 6.2373639802674985,
            "loss_sequences_upper_95": 7.250852409568992,
            "loss_tokens_lower_95": 5.902260137487341,
            "loss_tokens_upper_95": 7.0682615680459095,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.358663922402917,
            "data_time": 0.03364150297074091,
            "batch_time": 0.06325670651027135,
            "samples_per_second": 979953.6402181346,
            "samples_per_second_per_gpu": 122494.20502726683,
            "loss_sequences_lower_95": 4.356219798762624,
            "loss_sequences_upper_95": 4.722654593863139,
            "loss_tokens_lower_95": 3.949618187881098,
            "loss_tokens_upper_95": 4.129195473480866,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.4604628958353185,
            "data_time": 0.03371325560978481,
            "batch_time": 0.06425035283679054,
            "samples_per_second": 963577.5370581927,
            "samples_per_second_per_gpu": 120447.19213227408,
            "loss_sequences_lower_95": 4.456656255954649,
            "loss_sequences_upper_95": 4.789440210854135,
            "loss_tokens_lower_95": 4.071347928821685,
            "loss_tokens_upper_95": 4.2214596259491906,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.6435810371143065,
            "data_time": 0.03519554081417266,
            "batch_time": 0.06597712494078137,
            "samples_per_second": 954396.1453319293,
            "samples_per_second_per_gpu": 119299.51816649117,
            "loss_sequences_lower_95": 4.658938226467225,
            "loss_sequences_upper_95": 5.0931929146371235,
            "loss_tokens_lower_95": 4.200768607755445,
            "loss_tokens_upper_95": 4.440893351119986,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.520779566067021,
            "data_time": 0.03420310644876389,
            "batch_time": 0.06485932781582787,
            "samples_per_second": 959307.4762183374,
            "samples_per_second_per_gpu": 119913.43452729218,
            "loss_sequences_lower_95": 4.474965397904559,
            "loss_sequences_upper_95": 4.773008495423852,
            "loss_tokens_lower_95": 4.173905284085378,
            "loss_tokens_upper_95": 4.309441520714685,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.767611853084209,
            "data_time": 0.037353112373823,
            "batch_time": 0.0688065422905816,
            "samples_per_second": 961565.8989123967,
            "samples_per_second_per_gpu": 120195.7373640496,
            "loss_sequences_lower_95": 4.7237635831655185,
            "loss_sequences_upper_95": 5.019884268245342,
            "loss_tokens_lower_95": 4.475860311860193,
            "loss_tokens_upper_95": 4.590818290252998,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.715683085162465,
            "data_time": 0.03364031655447824,
            "batch_time": 0.06425431228819348,
            "samples_per_second": 959257.5326495516,
            "samples_per_second_per_gpu": 119907.19158119395,
            "loss_sequences_lower_95": 4.772952782235494,
            "loss_sequences_upper_95": 5.1331657316626575,
            "loss_tokens_lower_95": 4.2985308002750156,
            "loss_tokens_upper_95": 4.423920271530289,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-32.0/params.txt",
    "uuid": "1be6349d-b756-4faa-8fc4-eefabc77388c",
    "creation_date": "2023_12_14-13_35_11"
}