{
    "name": "c4_original-d=1024_l=24_h=8-0.5",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 4116162560,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.5
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "823232512",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=1024_l=24_h=8-0.5",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.8188780387242636,
            "data_time": 0.04810350388288498,
            "batch_time": 0.47614162787795067,
            "samples_per_second": 687288.9203652531,
            "samples_per_second_per_gpu": 85911.11504565664,
            "loss_sequences_lower_95": 3.6983989397684733,
            "loss_sequences_upper_95": 3.9382339096069336,
            "loss_tokens_lower_95": 3.8037550099690756,
            "loss_tokens_upper_95": 3.833750476837158,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1631078963079227,
            "data_time": 0.0009968036837127157,
            "batch_time": 0.03655617695888222,
            "samples_per_second": 902227.8293638325,
            "samples_per_second_per_gpu": 112778.47867047906,
            "loss_sequences_lower_95": 3.16032574647937,
            "loss_sequences_upper_95": 3.165846087159849,
            "loss_tokens_lower_95": 3.1526320625,
            "loss_tokens_upper_95": 3.173532359375,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4726604408147383,
            "data_time": 0.010436086654663087,
            "batch_time": 0.045785429954528806,
            "samples_per_second": 872644.8060703268,
            "samples_per_second_per_gpu": 109080.60075879085,
            "loss_sequences_lower_95": 3.4521175337810908,
            "loss_sequences_upper_95": 3.4932513490015147,
            "loss_tokens_lower_95": 3.458306890625,
            "loss_tokens_upper_95": 3.48723771875,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.120641628294876,
            "data_time": 0.0016838606250913519,
            "batch_time": 0.036963255095638727,
            "samples_per_second": 907379.9668095891,
            "samples_per_second_per_gpu": 113422.49585119863,
            "loss_sequences_lower_95": 3.1111033897793168,
            "loss_sequences_upper_95": 3.130199863079897,
            "loss_tokens_lower_95": 3.1099404375,
            "loss_tokens_upper_95": 3.131048916666667,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.173881459867396,
            "data_time": 0.010402092420722384,
            "batch_time": 0.0458750810281214,
            "samples_per_second": 869699.9739922766,
            "samples_per_second_per_gpu": 108712.49674903457,
            "loss_sequences_lower_95": 3.14084738053518,
            "loss_sequences_upper_95": 3.2059475900684986,
            "loss_tokens_lower_95": 3.1633012291666667,
            "loss_tokens_upper_95": 3.1842146822916666,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.607032454961988,
            "data_time": 0.003954463030980981,
            "batch_time": 0.03960222005844116,
            "samples_per_second": 897424.8705905837,
            "samples_per_second_per_gpu": 112178.10882382296,
            "loss_sequences_lower_95": 3.5716116401470086,
            "loss_sequences_upper_95": 3.64245000511185,
            "loss_tokens_lower_95": 3.594983645833333,
            "loss_tokens_upper_95": 3.61885978125,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3577454210300837,
            "data_time": 0.001684376796734858,
            "batch_time": 0.036949866553115225,
            "samples_per_second": 910381.9495536298,
            "samples_per_second_per_gpu": 113797.74369420373,
            "loss_sequences_lower_95": 3.325194993622449,
            "loss_sequences_upper_95": 3.389307378029337,
            "loss_tokens_lower_95": 3.34312134375,
            "loss_tokens_upper_95": 3.372545296875,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8068761286311124,
            "data_time": 0.0016800437896932732,
            "batch_time": 0.03694068263734748,
            "samples_per_second": 908898.6918450299,
            "samples_per_second_per_gpu": 113612.33648062874,
            "loss_sequences_lower_95": 3.7991302356020946,
            "loss_sequences_upper_95": 3.8147511861910997,
            "loss_tokens_lower_95": 3.7953957708333337,
            "loss_tokens_upper_95": 3.818386625,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.535448055926377,
            "data_time": 0.010515441970219688,
            "batch_time": 0.045770597836327934,
            "samples_per_second": 866290.2702706266,
            "samples_per_second_per_gpu": 108286.28378382833,
            "loss_sequences_lower_95": 3.4960537173883703,
            "loss_sequences_upper_95": 3.5778999514696075,
            "loss_tokens_lower_95": 3.5244168645833334,
            "loss_tokens_upper_95": 3.5464801979166665,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.5516980815781904,
            "data_time": 0.010596288368105888,
            "batch_time": 0.04646127391606569,
            "samples_per_second": 870271.0621700353,
            "samples_per_second_per_gpu": 108783.88277125442,
            "loss_sequences_lower_95": 4.532766916629353,
            "loss_sequences_upper_95": 4.5702628033905635,
            "loss_tokens_lower_95": 4.538806916666666,
            "loss_tokens_upper_95": 4.5650551041666665,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5421149835237595,
            "data_time": 0.0013214119430044378,
            "batch_time": 0.03662455954202792,
            "samples_per_second": 910445.2618520259,
            "samples_per_second_per_gpu": 113805.65773150323,
            "loss_sequences_lower_95": 3.5348095281558343,
            "loss_sequences_upper_95": 3.5495056026877485,
            "loss_tokens_lower_95": 3.5306245520833333,
            "loss_tokens_upper_95": 3.5532038750000003,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.319299068017981,
            "data_time": 0.0026774049103011894,
            "batch_time": 0.037999537267057626,
            "samples_per_second": 906452.8983988044,
            "samples_per_second_per_gpu": 113306.61229985055,
            "loss_sequences_lower_95": 3.3105483544860865,
            "loss_sequences_upper_95": 3.3278784037331453,
            "loss_tokens_lower_95": 3.3082829270833334,
            "loss_tokens_upper_95": 3.330240828125,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.941998946013847,
            "data_time": 0.010464988678340384,
            "batch_time": 0.045731545436994835,
            "samples_per_second": 868115.1631765036,
            "samples_per_second_per_gpu": 108514.39539706295,
            "loss_sequences_lower_95": 3.908373509652716,
            "loss_sequences_upper_95": 3.9757164883565226,
            "loss_tokens_lower_95": 3.9290783125,
            "loss_tokens_upper_95": 3.9546654791666667,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.146750829370347,
            "data_time": 0.01030358850243557,
            "batch_time": 0.045641597998569684,
            "samples_per_second": 868681.547649502,
            "samples_per_second_per_gpu": 108585.19345618776,
            "loss_sequences_lower_95": 3.092784519545172,
            "loss_sequences_upper_95": 3.2009395063287616,
            "loss_tokens_lower_95": 3.135110078125,
            "loss_tokens_upper_95": 3.1580064322916668,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.583604487505826,
            "data_time": 0.08736008405685425,
            "batch_time": 0.12226160083498273,
            "samples_per_second": 506904.7130690214,
            "samples_per_second_per_gpu": 63363.089133627676,
            "loss_sequences_lower_95": 4.5160011464899235,
            "loss_sequences_upper_95": 4.653050223263827,
            "loss_tokens_lower_95": 4.555524635314942,
            "loss_tokens_upper_95": 4.612588232213801,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7161300133685677,
            "data_time": 0.014772652225060896,
            "batch_time": 0.05038269676945426,
            "samples_per_second": 850584.0973253101,
            "samples_per_second_per_gpu": 106323.01216566376,
            "loss_sequences_lower_95": 3.6338410135608377,
            "loss_sequences_upper_95": 3.797157425505079,
            "loss_tokens_lower_95": 3.7028318125,
            "loss_tokens_upper_95": 3.72929546875,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.7370209467442494,
            "data_time": 0.013807114213705063,
            "batch_time": 0.04948100075125694,
            "samples_per_second": 869551.9788420411,
            "samples_per_second_per_gpu": 108693.99735525514,
            "loss_sequences_lower_95": 5.6812256181460254,
            "loss_sequences_upper_95": 5.790924668123351,
            "loss_tokens_lower_95": 5.724734114583334,
            "loss_tokens_upper_95": 5.749578760416666,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.711774517278202,
            "data_time": 0.037908684462308884,
            "batch_time": 0.07417641207575798,
            "samples_per_second": 761391.1865173171,
            "samples_per_second_per_gpu": 95173.89831466464,
            "loss_sequences_lower_95": 3.6646489690561763,
            "loss_sequences_upper_95": 3.761940158781458,
            "loss_tokens_lower_95": 3.697913285552478,
            "loss_tokens_upper_95": 3.7256885340956387,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.106103178791265,
            "data_time": 0.0014983925433799726,
            "batch_time": 0.03688693643974009,
            "samples_per_second": 903379.5690589319,
            "samples_per_second_per_gpu": 112922.44613236649,
            "loss_sequences_lower_95": 5.083789076409165,
            "loss_sequences_upper_95": 5.128838526273857,
            "loss_tokens_lower_95": 5.083399494596567,
            "loss_tokens_upper_95": 5.128622002296681,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.872184525303479,
            "data_time": 0.0016655891089682366,
            "batch_time": 0.03717527960895733,
            "samples_per_second": 899718.8592134686,
            "samples_per_second_per_gpu": 112464.85740168358,
            "loss_sequences_lower_95": 2.875253218707989,
            "loss_sequences_upper_95": 2.900615072943637,
            "loss_tokens_lower_95": 2.850103643226868,
            "loss_tokens_upper_95": 2.8684713031912152,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.403285085704454,
            "data_time": 0.0035486389846873666,
            "batch_time": 0.04122108855863206,
            "samples_per_second": 898636.1522164806,
            "samples_per_second_per_gpu": 112329.51902706007,
            "loss_sequences_lower_95": 4.665902838258738,
            "loss_sequences_upper_95": 4.971788841539473,
            "loss_tokens_lower_95": 3.856006237481758,
            "loss_tokens_upper_95": 4.075779667768645,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.455308854540189,
            "data_time": 0.0036955627989261708,
            "batch_time": 0.03909954824980269,
            "samples_per_second": 896800.939189243,
            "samples_per_second_per_gpu": 112100.11739865538,
            "loss_sequences_lower_95": 4.577167797851563,
            "loss_sequences_upper_95": 4.792010595703125,
            "loss_tokens_lower_95": 4.1431108122051885,
            "loss_tokens_upper_95": 4.290718209512579,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2138864386705306,
            "data_time": 0.0048521507199892995,
            "batch_time": 0.040242370976581836,
            "samples_per_second": 892898.6721729839,
            "samples_per_second_per_gpu": 111612.33402162298,
            "loss_sequences_lower_95": 3.256363916867063,
            "loss_sequences_upper_95": 3.3232216511828323,
            "loss_tokens_lower_95": 3.115809693922821,
            "loss_tokens_upper_95": 3.148528145468332,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.449282334067605,
            "data_time": 0.023945540189743042,
            "batch_time": 0.06005786572183881,
            "samples_per_second": 832006.6381978084,
            "samples_per_second_per_gpu": 104000.82977472605,
            "loss_sequences_lower_95": 3.37743051702326,
            "loss_sequences_upper_95": 3.5915469707142225,
            "loss_tokens_lower_95": 3.3427593170538143,
            "loss_tokens_upper_95": 3.4153906486193693,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.628969804608092,
            "data_time": 0.020939625799655914,
            "batch_time": 0.05609212443232536,
            "samples_per_second": 826773.0116796799,
            "samples_per_second_per_gpu": 103346.62645995998,
            "loss_sequences_lower_95": 3.6133963074976085,
            "loss_sequences_upper_95": 3.8219704577387597,
            "loss_tokens_lower_95": 3.5079024072232565,
            "loss_tokens_upper_95": 3.608773782635833,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.322425569693247,
            "data_time": 0.017070421805748574,
            "batch_time": 0.052444837032220304,
            "samples_per_second": 834997.9799841825,
            "samples_per_second_per_gpu": 104374.74749802281,
            "loss_sequences_lower_95": 4.271890950520833,
            "loss_sequences_upper_95": 4.381398793538411,
            "loss_tokens_lower_95": 4.200398637278854,
            "loss_tokens_upper_95": 4.426909985442097,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.8955211094531785,
            "data_time": 0.0012985105977238146,
            "batch_time": 0.03662693435992562,
            "samples_per_second": 905679.391196564,
            "samples_per_second_per_gpu": 113209.9238995705,
            "loss_sequences_lower_95": 4.89897493856417,
            "loss_sequences_upper_95": 4.97661896675545,
            "loss_tokens_lower_95": 4.770293210452602,
            "loss_tokens_upper_95": 4.850196236382532,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.434637072762657,
            "data_time": 0.002878608719614528,
            "batch_time": 0.038185223637011226,
            "samples_per_second": 901739.7525131733,
            "samples_per_second_per_gpu": 112717.46906414666,
            "loss_sequences_lower_95": 5.021934586341935,
            "loss_sequences_upper_95": 5.350073118884154,
            "loss_tokens_lower_95": 3.6322225433820816,
            "loss_tokens_upper_95": 3.7739112126404972,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.009013257844456,
            "data_time": 0.005168406141770852,
            "batch_time": 0.04063033574336284,
            "samples_per_second": 887452.8401918317,
            "samples_per_second_per_gpu": 110931.60502397896,
            "loss_sequences_lower_95": 4.465996335390892,
            "loss_sequences_upper_95": 4.8433437204198215,
            "loss_tokens_lower_95": 3.572528546885084,
            "loss_tokens_upper_95": 3.735609694533519,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.4145251975211925,
            "data_time": 0.023756308215005056,
            "batch_time": 0.05935342184134892,
            "samples_per_second": 828549.6714649049,
            "samples_per_second_per_gpu": 103568.70893311311,
            "loss_sequences_lower_95": 5.338483808682934,
            "loss_sequences_upper_95": 5.4901798335384555,
            "loss_tokens_lower_95": 5.338606645418629,
            "loss_tokens_upper_95": 5.488121109357163,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.212699062824249,
            "data_time": 0.05107636176622831,
            "batch_time": 0.08694196205872756,
            "samples_per_second": 748066.8315366594,
            "samples_per_second_per_gpu": 93508.35394208242,
            "loss_sequences_lower_95": 3.0781823120117187,
            "loss_sequences_upper_95": 3.4412051620483397,
            "loss_tokens_lower_95": 2.9095030755604316,
            "loss_tokens_upper_95": 3.3471644474909854,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.833265181451639,
            "data_time": 0.0035924089954430826,
            "batch_time": 0.03963825478388,
            "samples_per_second": 884452.3067989687,
            "samples_per_second_per_gpu": 110556.53834987109,
            "loss_sequences_lower_95": 4.7645371498568645,
            "loss_sequences_upper_95": 4.902462809161496,
            "loss_tokens_lower_95": 4.762624019439292,
            "loss_tokens_upper_95": 4.902384844089847,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.25224290608774,
            "data_time": 0.005394173679694071,
            "batch_time": 0.04087971435285509,
            "samples_per_second": 891902.1965990914,
            "samples_per_second_per_gpu": 111487.77457488643,
            "loss_sequences_lower_95": 5.18709013967803,
            "loss_sequences_upper_95": 5.316475073262183,
            "loss_tokens_lower_95": 5.184091269003379,
            "loss_tokens_upper_95": 5.3190658631820735,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.204102458061409,
            "data_time": 0.0034508190196224733,
            "batch_time": 0.038818562519988026,
            "samples_per_second": 894988.9416007926,
            "samples_per_second_per_gpu": 111873.61770009907,
            "loss_sequences_lower_95": 3.355860517333719,
            "loss_sequences_upper_95": 3.482534912906352,
            "loss_tokens_lower_95": 3.0322852897209565,
            "loss_tokens_upper_95": 3.085199332273649,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.253456324100495,
            "data_time": 0.011234915815293789,
            "batch_time": 0.04649405647069216,
            "samples_per_second": 864993.7495294767,
            "samples_per_second_per_gpu": 108124.2186911846,
            "loss_sequences_lower_95": 5.4424905029296875,
            "loss_sequences_upper_95": 6.003268115234375,
            "loss_tokens_lower_95": 4.652407475388816,
            "loss_tokens_upper_95": 5.01508722816533,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7251798659563065,
            "data_time": 0.15452708303928375,
            "batch_time": 0.1940590888261795,
            "samples_per_second": 462095.2532576525,
            "samples_per_second_per_gpu": 57761.906657206564,
            "loss_sequences_lower_95": 3.5191259741783143,
            "loss_sequences_upper_95": 3.9898243665695188,
            "loss_tokens_lower_95": 3.2758570002413343,
            "loss_tokens_upper_95": 4.066961564688847,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.032001330934722,
            "data_time": 0.029770769971482296,
            "batch_time": 0.06462493602265702,
            "samples_per_second": 785859.1397386378,
            "samples_per_second_per_gpu": 98232.39246732973,
            "loss_sequences_lower_95": 5.5412534341044815,
            "loss_sequences_upper_95": 6.401735608331088,
            "loss_tokens_lower_95": 3.5057749717050095,
            "loss_tokens_upper_95": 3.9776823746983334,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.54217724218906,
            "data_time": 0.003093082457780838,
            "batch_time": 0.038393612330158554,
            "samples_per_second": 898093.31048454,
            "samples_per_second_per_gpu": 112261.6638105675,
            "loss_sequences_lower_95": 2.5180107146696837,
            "loss_sequences_upper_95": 2.5668329041407274,
            "loss_tokens_lower_95": 2.517375021118324,
            "loss_tokens_upper_95": 2.566501991735269,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8619050735589853,
            "data_time": 0.0026058652949601494,
            "batch_time": 0.038048665746497144,
            "samples_per_second": 900032.1746013747,
            "samples_per_second_per_gpu": 112504.02182517183,
            "loss_sequences_lower_95": 2.830244470378299,
            "loss_sequences_upper_95": 2.983236934188822,
            "loss_tokens_lower_95": 2.7044184913871674,
            "loss_tokens_upper_95": 2.85250789296049,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.152497744385576,
            "data_time": 0.018544286489486694,
            "batch_time": 0.05335248510042826,
            "samples_per_second": 831570.3210581993,
            "samples_per_second_per_gpu": 103946.29013227491,
            "loss_sequences_lower_95": 3.018059374910571,
            "loss_sequences_upper_95": 3.4213763170626574,
            "loss_tokens_lower_95": 2.8856935965143675,
            "loss_tokens_upper_95": 3.1759100166765437,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5180421461716542,
            "data_time": 0.004921150207519531,
            "batch_time": 0.04020156972110271,
            "samples_per_second": 891190.0869093426,
            "samples_per_second_per_gpu": 111398.76086366782,
            "loss_sequences_lower_95": 3.5521107469662585,
            "loss_sequences_upper_95": 3.7025496894577494,
            "loss_tokens_lower_95": 3.3760611847842075,
            "loss_tokens_upper_95": 3.5188564303743277,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.882684524466352,
            "data_time": 0.033713530926477345,
            "batch_time": 0.06919624124254499,
            "samples_per_second": 813631.0881128978,
            "samples_per_second_per_gpu": 101703.88601411223,
            "loss_sequences_lower_95": 2.7390497951972774,
            "loss_sequences_upper_95": 3.185045977336604,
            "loss_tokens_lower_95": 2.6033025645120693,
            "loss_tokens_upper_95": 2.9639504462237456,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.558855783750973,
            "data_time": 0.00198073697671193,
            "batch_time": 0.037360253926519864,
            "samples_per_second": 900961.1094235273,
            "samples_per_second_per_gpu": 112620.13867794091,
            "loss_sequences_lower_95": 4.542754507932837,
            "loss_sequences_upper_95": 4.574758516156357,
            "loss_tokens_lower_95": 4.542659674513028,
            "loss_tokens_upper_95": 4.574582035547734,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.8703335795587707,
            "data_time": 0.04888957630504261,
            "batch_time": 0.08632492585615678,
            "samples_per_second": 713842.8842792017,
            "samples_per_second_per_gpu": 89230.36053490022,
            "loss_sequences_lower_95": 0.819177131282473,
            "loss_sequences_upper_95": 0.9517067566658687,
            "loss_tokens_lower_95": 0.7323490217007781,
            "loss_tokens_upper_95": 0.9199747669294156,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.634971339769458,
            "data_time": 0.001164322526936559,
            "batch_time": 0.03654755147334816,
            "samples_per_second": 903574.4504703776,
            "samples_per_second_per_gpu": 112946.8063087972,
            "loss_sequences_lower_95": 5.0279884962165875,
            "loss_sequences_upper_95": 5.077856650042584,
            "loss_tokens_lower_95": 4.036001940280465,
            "loss_tokens_upper_95": 4.085691054158608,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.775599828720093,
            "data_time": 0.006141882567178635,
            "batch_time": 0.04183416470648751,
            "samples_per_second": 880459.4822862123,
            "samples_per_second_per_gpu": 110057.43528577653,
            "loss_sequences_lower_95": 6.7366602539062495,
            "loss_sequences_upper_95": 6.973867236328124,
            "loss_tokens_lower_95": 6.593996935119393,
            "loss_tokens_upper_95": 6.822505668292005,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.090713684455208,
            "data_time": 0.022743217015670517,
            "batch_time": 0.05845847170231706,
            "samples_per_second": 827387.2399086343,
            "samples_per_second_per_gpu": 103423.40498857929,
            "loss_sequences_lower_95": 4.932754344110903,
            "loss_sequences_upper_95": 5.250259001358696,
            "loss_tokens_lower_95": 4.935952546492866,
            "loss_tokens_upper_95": 5.246354356848675,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.212752810391513,
            "data_time": 0.004713418972061341,
            "batch_time": 0.04010304988148701,
            "samples_per_second": 892097.2844998625,
            "samples_per_second_per_gpu": 111512.16056248281,
            "loss_sequences_lower_95": 6.154717518199574,
            "loss_sequences_upper_95": 6.269583721738873,
            "loss_tokens_lower_95": 6.155449191006746,
            "loss_tokens_upper_95": 6.268677442146069,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.8955321607589721,
            "data_time": 0.0040794208328774634,
            "batch_time": 0.03968920574543324,
            "samples_per_second": 890540.2914666608,
            "samples_per_second_per_gpu": 111317.5364333326,
            "loss_sequences_lower_95": 0.9288570780436197,
            "loss_sequences_upper_95": 0.9772115987141927,
            "loss_tokens_lower_95": 0.8312575557566776,
            "loss_tokens_upper_95": 0.886767095510079,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.01672001793271,
            "data_time": 0.024432156767163957,
            "batch_time": 0.05924084569726672,
            "samples_per_second": 809631.8188837172,
            "samples_per_second_per_gpu": 101203.97736046465,
            "loss_sequences_lower_95": 5.66852784656343,
            "loss_sequences_upper_95": 6.361525326683408,
            "loss_tokens_lower_95": 5.6684208025251115,
            "loss_tokens_upper_95": 6.366078040713356,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5163774453103542,
            "data_time": 0.15785440802574158,
            "batch_time": 0.1967129111289978,
            "samples_per_second": 471323.1045428945,
            "samples_per_second_per_gpu": 58915.388067861815,
            "loss_sequences_lower_95": 2.2770395934581757,
            "loss_sequences_upper_95": 3.4693345248699186,
            "loss_tokens_lower_95": 1.9225287022541477,
            "loss_tokens_upper_95": 2.456247445332635,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.725755707025528,
            "data_time": 0.0059638747147151405,
            "batch_time": 0.0412666244166238,
            "samples_per_second": 890077.5742307593,
            "samples_per_second_per_gpu": 111259.69677884491,
            "loss_sequences_lower_95": 7.6539107421875,
            "loss_sequences_upper_95": 7.999919274902344,
            "loss_tokens_lower_95": 7.443959357815144,
            "loss_tokens_upper_95": 7.747473206495875,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.20427238702774,
            "data_time": 0.006348263176660689,
            "batch_time": 0.041987438996632896,
            "samples_per_second": 881746.4198944679,
            "samples_per_second_per_gpu": 110218.30248680849,
            "loss_sequences_lower_95": 7.295442932128906,
            "loss_sequences_upper_95": 7.511731469726563,
            "loss_tokens_lower_95": 6.974237136955433,
            "loss_tokens_upper_95": 7.163207934460284,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.155152371694608,
            "data_time": 0.003825893689158768,
            "batch_time": 0.039210388572718385,
            "samples_per_second": 895345.5141789044,
            "samples_per_second_per_gpu": 111918.18927236304,
            "loss_sequences_lower_95": 5.105270703910702,
            "loss_sequences_upper_95": 5.203823227455581,
            "loss_tokens_lower_95": 5.105733842686683,
            "loss_tokens_upper_95": 5.20408944958934,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.992994983441635,
            "data_time": 0.008575834176331489,
            "batch_time": 0.04403244836813013,
            "samples_per_second": 873218.854198832,
            "samples_per_second_per_gpu": 109152.356774854,
            "loss_sequences_lower_95": 4.881617523521505,
            "loss_sequences_upper_95": 5.1004093668244765,
            "loss_tokens_lower_95": 4.878984986289122,
            "loss_tokens_upper_95": 5.100676758937571,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.600722500801086,
            "data_time": 0.006070035317587474,
            "batch_time": 0.04142944538404071,
            "samples_per_second": 887016.2929016881,
            "samples_per_second_per_gpu": 110877.03661271102,
            "loss_sequences_lower_95": 6.523629028320313,
            "loss_sequences_upper_95": 6.679121582031249,
            "loss_tokens_lower_95": 6.525518701171875,
            "loss_tokens_upper_95": 6.680495764160156,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.57332446274604,
            "data_time": 0.0016534271222206505,
            "batch_time": 0.03700348587252467,
            "samples_per_second": 903072.5634824347,
            "samples_per_second_per_gpu": 112884.07043530434,
            "loss_sequences_lower_95": 4.223500473036897,
            "loss_sequences_upper_95": 4.338493710087512,
            "loss_tokens_lower_95": 2.803794876135733,
            "loss_tokens_upper_95": 2.8768001775583327,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.331054349443805,
            "data_time": 0.02062762975692749,
            "batch_time": 0.05587880952017648,
            "samples_per_second": 829702.7821844907,
            "samples_per_second_per_gpu": 103712.84777306134,
            "loss_sequences_lower_95": 5.131602591898904,
            "loss_sequences_upper_95": 5.528543557693709,
            "loss_tokens_lower_95": 5.131895833940648,
            "loss_tokens_upper_95": 5.5273526433688485,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.34053202049405,
            "data_time": 0.011592642404139042,
            "batch_time": 0.04725122731178999,
            "samples_per_second": 876462.2186982431,
            "samples_per_second_per_gpu": 109557.7773372804,
            "loss_sequences_lower_95": 5.202928526635263,
            "loss_sequences_upper_95": 5.473903389725031,
            "loss_tokens_lower_95": 5.204294313916973,
            "loss_tokens_upper_95": 5.473789050532322,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7623493866440825,
            "data_time": 0.001921366560187868,
            "batch_time": 0.037336585103764953,
            "samples_per_second": 900291.403017583,
            "samples_per_second_per_gpu": 112536.42537719787,
            "loss_sequences_lower_95": 4.226229835079231,
            "loss_sequences_upper_95": 4.331686413217932,
            "loss_tokens_lower_95": 3.0692770294112255,
            "loss_tokens_upper_95": 3.1469900522876415,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.270594336999157,
            "data_time": 0.028875527282555897,
            "batch_time": 0.06470293800036113,
            "samples_per_second": 825096.8916733538,
            "samples_per_second_per_gpu": 103137.11145916923,
            "loss_sequences_lower_95": 5.096082196916853,
            "loss_sequences_upper_95": 5.4373034603381285,
            "loss_tokens_lower_95": 5.09771990094866,
            "loss_tokens_upper_95": 5.434501316938451,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.847106483514885,
            "data_time": 0.0033343120663448422,
            "batch_time": 0.03875482548141945,
            "samples_per_second": 895922.3036502874,
            "samples_per_second_per_gpu": 111990.28795628593,
            "loss_sequences_lower_95": 5.800508768157493,
            "loss_sequences_upper_95": 5.893273679400802,
            "loss_tokens_lower_95": 5.799800805141437,
            "loss_tokens_upper_95": 5.894095126744075,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.440197041891153,
            "data_time": 0.02507198290391402,
            "batch_time": 0.060028576850891115,
            "samples_per_second": 801002.4920965803,
            "samples_per_second_per_gpu": 100125.31151207254,
            "loss_sequences_lower_95": 5.247654946336469,
            "loss_sequences_upper_95": 5.631634521484376,
            "loss_tokens_lower_95": 5.244513902386415,
            "loss_tokens_upper_95": 5.6332403979255155,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0971621414025625,
            "data_time": 0.08140866458415985,
            "batch_time": 0.11999410390853882,
            "samples_per_second": 624813.7098591729,
            "samples_per_second_per_gpu": 78101.71373239662,
            "loss_sequences_lower_95": 1.8867080847422282,
            "loss_sequences_upper_95": 2.421251316070556,
            "loss_tokens_lower_95": 1.6989077779981825,
            "loss_tokens_upper_95": 2.305012861887614,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.2388037999471027,
            "data_time": 0.08378928899765015,
            "batch_time": 0.12124528735876083,
            "samples_per_second": 626294.5376035564,
            "samples_per_second_per_gpu": 78286.81720044455,
            "loss_sequences_lower_95": 2.0824600982666017,
            "loss_sequences_upper_95": 2.7897484016418455,
            "loss_tokens_lower_95": 1.7025278027138013,
            "loss_tokens_upper_95": 2.486802763349554,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7799230874725924,
            "data_time": 0.003469994658929871,
            "batch_time": 0.03921475867933214,
            "samples_per_second": 887822.7862768096,
            "samples_per_second_per_gpu": 110977.8482846012,
            "loss_sequences_lower_95": 3.7664543374332657,
            "loss_sequences_upper_95": 3.7931248561763624,
            "loss_tokens_lower_95": 3.7663954919919,
            "loss_tokens_upper_95": 3.7933978909701764,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.5783903168910739,
            "data_time": 0.0010762809647381698,
            "batch_time": 0.0364830975850771,
            "samples_per_second": 903424.964791737,
            "samples_per_second_per_gpu": 112928.12059896713,
            "loss_sequences_lower_95": 0.6775762854722654,
            "loss_sequences_upper_95": 0.6946370170012566,
            "loss_tokens_lower_95": 0.4638180240345511,
            "loss_tokens_upper_95": 0.4728680787941613,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.467769619048111,
            "data_time": 0.04044768214225769,
            "batch_time": 0.07692519202828407,
            "samples_per_second": 797984.5557573074,
            "samples_per_second_per_gpu": 99748.06946966343,
            "loss_sequences_lower_95": 4.483691910871371,
            "loss_sequences_upper_95": 4.853332471472072,
            "loss_tokens_lower_95": 4.1605071744119995,
            "loss_tokens_upper_95": 4.372477297500613,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.701581168819118,
            "data_time": 0.12425420397803896,
            "batch_time": 0.16128139268784297,
            "samples_per_second": 475501.52016427193,
            "samples_per_second_per_gpu": 59437.69002053399,
            "loss_sequences_lower_95": 6.248918244645402,
            "loss_sequences_upper_95": 7.376567572516364,
            "loss_tokens_lower_95": 5.980326975127797,
            "loss_tokens_upper_95": 7.1284813774956595,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.3502186391411755,
            "data_time": 0.03204946858542306,
            "batch_time": 0.06872379212152391,
            "samples_per_second": 803212.0102222741,
            "samples_per_second_per_gpu": 100401.50127778426,
            "loss_sequences_lower_95": 4.313661780008456,
            "loss_sequences_upper_95": 4.641894456816883,
            "loss_tokens_lower_95": 3.9912269863067706,
            "loss_tokens_upper_95": 4.170318224676724,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.4335875351254534,
            "data_time": 0.031597864060174854,
            "batch_time": 0.06754716237386067,
            "samples_per_second": 805948.988817163,
            "samples_per_second_per_gpu": 100743.62360214538,
            "loss_sequences_lower_95": 4.3994848856111854,
            "loss_sequences_upper_95": 4.699205389255431,
            "loss_tokens_lower_95": 4.098491762379794,
            "loss_tokens_upper_95": 4.247177291159699,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.570210719253959,
            "data_time": 0.03307246026538667,
            "batch_time": 0.06866730110985893,
            "samples_per_second": 812810.0737565407,
            "samples_per_second_per_gpu": 101601.25921956758,
            "loss_sequences_lower_95": 4.530717859035585,
            "loss_sequences_upper_95": 4.9311948450600225,
            "loss_tokens_lower_95": 4.187159494725776,
            "loss_tokens_upper_95": 4.424295270326427,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.4839522315234674,
            "data_time": 0.03232201791944958,
            "batch_time": 0.0679095018477667,
            "samples_per_second": 810122.2397005262,
            "samples_per_second_per_gpu": 101265.27996256578,
            "loss_sequences_lower_95": 4.430110931396484,
            "loss_sequences_upper_95": 4.71442759444074,
            "loss_tokens_lower_95": 4.178441232461424,
            "loss_tokens_upper_95": 4.317291193216389,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.723612569133688,
            "data_time": 0.031187351839042005,
            "batch_time": 0.0676612471356804,
            "samples_per_second": 818001.3158876846,
            "samples_per_second_per_gpu": 102250.16448596057,
            "loss_sequences_lower_95": 4.702839461616848,
            "loss_sequences_upper_95": 4.989749666770792,
            "loss_tokens_lower_95": 4.457043931887979,
            "loss_tokens_upper_95": 4.573430667775665,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.771834754362339,
            "data_time": 0.0310182599794297,
            "batch_time": 0.0677586283002581,
            "samples_per_second": 791951.5644087642,
            "samples_per_second_per_gpu": 98993.94555109553,
            "loss_sequences_lower_95": 4.81932800106886,
            "loss_sequences_upper_95": 5.140681141178782,
            "loss_tokens_lower_95": 4.421883205715039,
            "loss_tokens_upper_95": 4.5591917547279985,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/params.txt",
    "uuid": "3de0077a-6fe5-46cc-a473-9cdcff47681d",
    "creation_date": "2023_12_14-04_59_54"
}