{
    "name": "c4_original-d=1024_l=24_h=8-1.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 8232325120,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 1.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "1646465024",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=1024_l=24_h=8-1.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.69924139380455,
            "data_time": 0.04333341494202614,
            "batch_time": 0.4377455413341522,
            "samples_per_second": 692401.0673799812,
            "samples_per_second_per_gpu": 86550.13342249765,
            "loss_sequences_lower_95": 3.583230775197347,
            "loss_sequences_upper_95": 3.8144257672627764,
            "loss_tokens_lower_95": 3.6843507385253904,
            "loss_tokens_upper_95": 3.7141172663370767,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0417367343404416,
            "data_time": 0.0011635957301494597,
            "batch_time": 0.03699264402286088,
            "samples_per_second": 893426.684443212,
            "samples_per_second_per_gpu": 111678.3355554015,
            "loss_sequences_lower_95": 3.0389081461620138,
            "loss_sequences_upper_95": 3.0445410479093873,
            "loss_tokens_lower_95": 3.031372177083333,
            "loss_tokens_upper_95": 3.0519926875,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.457938329540953,
            "data_time": 0.009468859672546386,
            "batch_time": 0.04520375919342041,
            "samples_per_second": 859912.1869233846,
            "samples_per_second_per_gpu": 107489.02336542307,
            "loss_sequences_lower_95": 3.436031039490992,
            "loss_sequences_upper_95": 3.481202753806601,
            "loss_tokens_lower_95": 3.4432649010416667,
            "loss_tokens_upper_95": 3.472813328125,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.049127952339723,
            "data_time": 0.0016224859772544157,
            "batch_time": 0.037001096300388635,
            "samples_per_second": 903700.0559995386,
            "samples_per_second_per_gpu": 112962.50699994233,
            "loss_sequences_lower_95": 3.036985316325709,
            "loss_sequences_upper_95": 3.0615098763692012,
            "loss_tokens_lower_95": 3.03837465625,
            "loss_tokens_upper_95": 3.0596086927083332,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0651125990446135,
            "data_time": 0.009645284409542008,
            "batch_time": 0.04532130780922939,
            "samples_per_second": 861366.3142865942,
            "samples_per_second_per_gpu": 107670.78928582427,
            "loss_sequences_lower_95": 3.0296329723599,
            "loss_sequences_upper_95": 3.10065292700245,
            "loss_tokens_lower_95": 3.0547820208333336,
            "loss_tokens_upper_95": 3.0753801770833333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5005208890021535,
            "data_time": 0.0037124833982923756,
            "batch_time": 0.03927243630523267,
            "samples_per_second": 896371.4363047453,
            "samples_per_second_per_gpu": 112046.42953809316,
            "loss_sequences_lower_95": 3.4638311472982877,
            "loss_sequences_upper_95": 3.537568105442195,
            "loss_tokens_lower_95": 3.488716838541667,
            "loss_tokens_upper_95": 3.51229959375,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.193902102830459,
            "data_time": 0.0017401682999939165,
            "batch_time": 0.0371595918646061,
            "samples_per_second": 904622.8392072826,
            "samples_per_second_per_gpu": 113077.85490091033,
            "loss_sequences_lower_95": 3.161590127750319,
            "loss_sequences_upper_95": 3.2251937230947068,
            "loss_tokens_lower_95": 3.179711114583333,
            "loss_tokens_upper_95": 3.2083229218750002,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.742004859235274,
            "data_time": 0.0016229133733592239,
            "batch_time": 0.03713259553370841,
            "samples_per_second": 902118.9183991835,
            "samples_per_second_per_gpu": 112764.86479989794,
            "loss_sequences_lower_95": 3.7332574750490837,
            "loss_sequences_upper_95": 3.75110546875,
            "loss_tokens_lower_95": 3.730471145833333,
            "loss_tokens_upper_95": 3.753776375,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.45608144659337,
            "data_time": 0.00928036750309051,
            "batch_time": 0.04457650014332363,
            "samples_per_second": 865724.9024068887,
            "samples_per_second_per_gpu": 108215.61280086108,
            "loss_sequences_lower_95": 3.413906469577696,
            "loss_sequences_upper_95": 3.501611433571916,
            "loss_tokens_lower_95": 3.4449253333333334,
            "loss_tokens_upper_95": 3.467096515625,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.512105774973692,
            "data_time": 0.009254215285182,
            "batch_time": 0.0450412891805172,
            "samples_per_second": 870016.1334395885,
            "samples_per_second_per_gpu": 108752.01667994856,
            "loss_sequences_lower_95": 4.488249067826704,
            "loss_sequences_upper_95": 4.539687857043601,
            "loss_tokens_lower_95": 4.4986891770833335,
            "loss_tokens_upper_95": 4.525762677083334,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4222159450374696,
            "data_time": 0.0013035184620308138,
            "batch_time": 0.03667680424642942,
            "samples_per_second": 907306.6353257515,
            "samples_per_second_per_gpu": 113413.32941571894,
            "loss_sequences_lower_95": 3.4151211943906743,
            "loss_sequences_upper_95": 3.4295224800752875,
            "loss_tokens_lower_95": 3.4110635260416666,
            "loss_tokens_upper_95": 3.4331369583333333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2170075224320724,
            "data_time": 0.00248770848797521,
            "batch_time": 0.0378972848785807,
            "samples_per_second": 902933.5111962392,
            "samples_per_second_per_gpu": 112866.6888995299,
            "loss_sequences_lower_95": 3.2084899876923028,
            "loss_sequences_upper_95": 3.2253414664055993,
            "loss_tokens_lower_95": 3.2061243541666666,
            "loss_tokens_upper_95": 3.22801034375,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.888833413984906,
            "data_time": 0.009862197717659086,
            "batch_time": 0.04508801693972863,
            "samples_per_second": 865559.2263017301,
            "samples_per_second_per_gpu": 108194.90328771627,
            "loss_sequences_lower_95": 3.8515384821088996,
            "loss_sequences_upper_95": 3.9281693959574673,
            "loss_tokens_lower_95": 3.87575890625,
            "loss_tokens_upper_95": 3.9017562916666666,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.043807846951145,
            "data_time": 0.009610613028841664,
            "batch_time": 0.04509415949483317,
            "samples_per_second": 863428.8733640899,
            "samples_per_second_per_gpu": 107928.60917051123,
            "loss_sequences_lower_95": 2.9874081364959904,
            "loss_sequences_upper_95": 3.0993896422221074,
            "loss_tokens_lower_95": 3.032191098958333,
            "loss_tokens_upper_95": 3.0550275781250003,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.4807352369481865,
            "data_time": 0.07997197764260429,
            "batch_time": 0.1155109064919608,
            "samples_per_second": 514854.9307355489,
            "samples_per_second_per_gpu": 64356.86634194361,
            "loss_sequences_lower_95": 4.413262731378729,
            "loss_sequences_upper_95": 4.550915371287952,
            "loss_tokens_lower_95": 4.451592063903809,
            "loss_tokens_upper_95": 4.50998013236306,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.574653681096113,
            "data_time": 0.013608364896340803,
            "batch_time": 0.04901890456676483,
            "samples_per_second": 852293.5608826048,
            "samples_per_second_per_gpu": 106536.6951103256,
            "loss_sequences_lower_95": 3.494027024872449,
            "loss_sequences_upper_95": 3.652995046721255,
            "loss_tokens_lower_95": 3.5614590625,
            "loss_tokens_upper_95": 3.5874579479166666,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.745965612909725,
            "data_time": 0.01185397058725357,
            "batch_time": 0.047458370526631675,
            "samples_per_second": 863711.3192116339,
            "samples_per_second_per_gpu": 107963.91490145423,
            "loss_sequences_lower_95": 5.686304644984745,
            "loss_sequences_upper_95": 5.803808964148046,
            "loss_tokens_lower_95": 5.733424125,
            "loss_tokens_upper_95": 5.758759208333333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5754682099232906,
            "data_time": 0.03519006818532944,
            "batch_time": 0.07096501439809799,
            "samples_per_second": 770674.0952458747,
            "samples_per_second_per_gpu": 96334.26190573434,
            "loss_sequences_lower_95": 3.50805245071161,
            "loss_sequences_upper_95": 3.6719012025926934,
            "loss_tokens_lower_95": 3.5617831058189515,
            "loss_tokens_upper_95": 3.5893038952936895,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.31474069568646,
            "data_time": 0.0017137815306018317,
            "batch_time": 0.03723311587183785,
            "samples_per_second": 898661.4205623599,
            "samples_per_second_per_gpu": 112332.67757029498,
            "loss_sequences_lower_95": 5.291279370593577,
            "loss_sequences_upper_95": 5.338657020011395,
            "loss_tokens_lower_95": 5.291145745241953,
            "loss_tokens_upper_95": 5.338301112288136,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7677185620546485,
            "data_time": 0.0023419759740495377,
            "batch_time": 0.0378597741294059,
            "samples_per_second": 897565.612676004,
            "samples_per_second_per_gpu": 112195.7015845005,
            "loss_sequences_lower_95": 2.7695147275972167,
            "loss_sequences_upper_95": 2.794835985734913,
            "loss_tokens_lower_95": 2.745182301952992,
            "loss_tokens_upper_95": 2.7633273318438016,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.953401216539393,
            "data_time": 0.003351902310068488,
            "batch_time": 0.040402045236456297,
            "samples_per_second": 891766.594388611,
            "samples_per_second_per_gpu": 111470.82429857638,
            "loss_sequences_lower_95": 4.218806289579151,
            "loss_sequences_upper_95": 4.524357830582782,
            "loss_tokens_lower_95": 3.3987455173989605,
            "loss_tokens_upper_95": 3.615860926982815,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.08499999239544,
            "data_time": 0.003878372146728191,
            "batch_time": 0.03940670921447429,
            "samples_per_second": 890417.7954275158,
            "samples_per_second_per_gpu": 111302.22442843947,
            "loss_sequences_lower_95": 4.184972583007812,
            "loss_sequences_upper_95": 4.394087752278646,
            "loss_tokens_lower_95": 3.79689938949489,
            "loss_tokens_upper_95": 3.941815091882862,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0095170645030542,
            "data_time": 0.004346168239012383,
            "batch_time": 0.039929780845009186,
            "samples_per_second": 887902.6201114941,
            "samples_per_second_per_gpu": 110987.82751393676,
            "loss_sequences_lower_95": 3.050966039835458,
            "loss_sequences_upper_95": 3.113893804195887,
            "loss_tokens_lower_95": 2.9171427318265413,
            "loss_tokens_upper_95": 2.948707489140693,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.367706888372248,
            "data_time": 0.02218196221760341,
            "batch_time": 0.057791712028639655,
            "samples_per_second": 832793.8327891304,
            "samples_per_second_per_gpu": 104099.2290986413,
            "loss_sequences_lower_95": 3.290520817149769,
            "loss_sequences_upper_95": 3.509973380348899,
            "loss_tokens_lower_95": 3.258886173536229,
            "loss_tokens_upper_95": 3.3333353619747053,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.499366654181967,
            "data_time": 0.01990397274494171,
            "batch_time": 0.0554370041936636,
            "samples_per_second": 816138.9922721818,
            "samples_per_second_per_gpu": 102017.37403402272,
            "loss_sequences_lower_95": 3.489096374511719,
            "loss_sequences_upper_95": 3.698873141541773,
            "loss_tokens_lower_95": 3.383308358370007,
            "loss_tokens_upper_95": 3.482652798641672,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.645876384576162,
            "data_time": 0.016259586199735984,
            "batch_time": 0.05148795781991421,
            "samples_per_second": 833534.0273811595,
            "samples_per_second_per_gpu": 104191.75342264494,
            "loss_sequences_lower_95": 3.5969447224934896,
            "loss_sequences_upper_95": 3.689210469563802,
            "loss_tokens_lower_95": 3.5377385501366465,
            "loss_tokens_upper_95": 3.7528691351929466,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.77330625555882,
            "data_time": 0.001445546369362084,
            "batch_time": 0.0369953269646893,
            "samples_per_second": 898305.7099039842,
            "samples_per_second_per_gpu": 112288.21373799803,
            "loss_sequences_lower_95": 4.775044596722602,
            "loss_sequences_upper_95": 4.856532814345382,
            "loss_tokens_lower_95": 4.6451350305646475,
            "loss_tokens_upper_95": 4.728291998766664,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.053169369848088,
            "data_time": 0.002862780086146105,
            "batch_time": 0.03839460375324992,
            "samples_per_second": 894201.6650707937,
            "samples_per_second_per_gpu": 111775.20813384921,
            "loss_sequences_lower_95": 4.554437420263836,
            "loss_sequences_upper_95": 4.854734427198417,
            "loss_tokens_lower_95": 3.3615849549504,
            "loss_tokens_upper_95": 3.4950005332953706,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7764088304986725,
            "data_time": 0.004949191937575469,
            "batch_time": 0.04043173467790758,
            "samples_per_second": 885978.204648664,
            "samples_per_second_per_gpu": 110747.275581083,
            "loss_sequences_lower_95": 4.167929535432887,
            "loss_sequences_upper_95": 4.513002186016825,
            "loss_tokens_lower_95": 3.3918874016189697,
            "loss_tokens_upper_95": 3.5512640638865363,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.359295457465463,
            "data_time": 0.02149570201124464,
            "batch_time": 0.056937492319515774,
            "samples_per_second": 831605.5904194638,
            "samples_per_second_per_gpu": 103950.69880243298,
            "loss_sequences_lower_95": 6.280296137561536,
            "loss_sequences_upper_95": 6.4348929487951265,
            "loss_tokens_lower_95": 6.279889696373788,
            "loss_tokens_upper_95": 6.436811626556257,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.058205635547638,
            "data_time": 0.04708905403430645,
            "batch_time": 0.08278901301897489,
            "samples_per_second": 752204.3033432645,
            "samples_per_second_per_gpu": 94025.53791790806,
            "loss_sequences_lower_95": 2.930422721862793,
            "loss_sequences_upper_95": 3.274210594177246,
            "loss_tokens_lower_95": 2.772905751673608,
            "loss_tokens_upper_95": 3.1976071270719197,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.992826056944067,
            "data_time": 0.0033182468882367653,
            "batch_time": 0.03903599287590854,
            "samples_per_second": 891336.8266318435,
            "samples_per_second_per_gpu": 111417.10332898043,
            "loss_sequences_lower_95": 4.932584439671027,
            "loss_sequences_upper_95": 5.053759928052233,
            "loss_tokens_lower_95": 4.93128920743507,
            "loss_tokens_upper_95": 5.053552508476203,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.650854676595777,
            "data_time": 0.004763843573714936,
            "batch_time": 0.04051943743209466,
            "samples_per_second": 885155.1188167476,
            "samples_per_second_per_gpu": 110644.38985209345,
            "loss_sequences_lower_95": 5.5922023463893575,
            "loss_sequences_upper_95": 5.709263227183149,
            "loss_tokens_lower_95": 5.5898528377898495,
            "loss_tokens_upper_95": 5.711556759396115,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.106949046075798,
            "data_time": 0.0034025217492946024,
            "batch_time": 0.03894001773315166,
            "samples_per_second": 887806.127242255,
            "samples_per_second_per_gpu": 110975.76590528188,
            "loss_sequences_lower_95": 3.262711962678098,
            "loss_sequences_upper_95": 3.3892568627690594,
            "loss_tokens_lower_95": 2.9287273891399033,
            "loss_tokens_upper_95": 2.9807133538621113,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.165446553945541,
            "data_time": 0.01060788705945015,
            "batch_time": 0.046035369858145714,
            "samples_per_second": 859016.6802463714,
            "samples_per_second_per_gpu": 107377.08503079643,
            "loss_sequences_lower_95": 5.341050109863281,
            "loss_sequences_upper_95": 5.910034252929687,
            "loss_tokens_lower_95": 4.558736913997869,
            "loss_tokens_upper_95": 4.920951086886271,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6834916174411774,
            "data_time": 0.14683136343955994,
            "batch_time": 0.1858941912651062,
            "samples_per_second": 483840.5577063613,
            "samples_per_second_per_gpu": 60480.06971329516,
            "loss_sequences_lower_95": 3.434827446937561,
            "loss_sequences_upper_95": 3.979179453849792,
            "loss_tokens_lower_95": 3.226610679187994,
            "loss_tokens_upper_95": 4.013392025300826,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.5805108040228655,
            "data_time": 0.02566603143164452,
            "batch_time": 0.06097194235375587,
            "samples_per_second": 776225.466406377,
            "samples_per_second_per_gpu": 97028.18330079713,
            "loss_sequences_lower_95": 5.018256965724901,
            "loss_sequences_upper_95": 5.812011008427061,
            "loss_tokens_lower_95": 3.232482628032078,
            "loss_tokens_upper_95": 3.660898298502237,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.2468760844243567,
            "data_time": 0.002876448548502392,
            "batch_time": 0.0383759666648176,
            "samples_per_second": 891079.8552369827,
            "samples_per_second_per_gpu": 111384.98190462284,
            "loss_sequences_lower_95": 2.2229518532075963,
            "loss_sequences_upper_95": 2.270649027959808,
            "loss_tokens_lower_95": 2.222566625112631,
            "loss_tokens_upper_95": 2.270932280147581,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6087741678109637,
            "data_time": 0.0024838122833038792,
            "batch_time": 0.03804024893228148,
            "samples_per_second": 895550.7826569492,
            "samples_per_second_per_gpu": 111943.84783211866,
            "loss_sequences_lower_95": 2.5793597555428875,
            "loss_sequences_upper_95": 2.721468920183024,
            "loss_tokens_lower_95": 2.4651488367762826,
            "loss_tokens_upper_95": 2.6052124601969786,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0419750091357107,
            "data_time": 0.017999488446447585,
            "batch_time": 0.05330860449208154,
            "samples_per_second": 818047.4446791318,
            "samples_per_second_per_gpu": 102255.93058489148,
            "loss_sequences_lower_95": 2.9059052184387877,
            "loss_sequences_upper_95": 3.2938317099770344,
            "loss_tokens_lower_95": 2.791831231977408,
            "loss_tokens_upper_95": 3.0752454844147494,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.447350882787637,
            "data_time": 0.004623159766197205,
            "batch_time": 0.04008250012993812,
            "samples_per_second": 884029.176637254,
            "samples_per_second_per_gpu": 110503.64707965674,
            "loss_sequences_lower_95": 3.4826702163082084,
            "loss_sequences_upper_95": 3.6350140626541534,
            "loss_tokens_lower_95": 3.3031928619831064,
            "loss_tokens_upper_95": 3.4449782132501263,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.681178523999889,
            "data_time": 0.030449225789024717,
            "batch_time": 0.06620979309082031,
            "samples_per_second": 807743.2231282943,
            "samples_per_second_per_gpu": 100967.90289103679,
            "loss_sequences_lower_95": 2.5444242058730704,
            "loss_sequences_upper_95": 2.97017566401784,
            "loss_tokens_lower_95": 2.4240484489702285,
            "loss_tokens_upper_95": 2.760033552409385,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.724111063190879,
            "data_time": 0.0017200544077636622,
            "batch_time": 0.03725269847615357,
            "samples_per_second": 896289.5035676116,
            "samples_per_second_per_gpu": 112036.18794595145,
            "loss_sequences_lower_95": 4.712492430126651,
            "loss_sequences_upper_95": 4.735761644516404,
            "loss_tokens_lower_95": 4.712653468193639,
            "loss_tokens_upper_95": 4.735730114772955,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.7689095356510681,
            "data_time": 0.04491691155867143,
            "batch_time": 0.08050772060047497,
            "samples_per_second": 740237.9190429839,
            "samples_per_second_per_gpu": 92529.73988037299,
            "loss_sequences_lower_95": 0.7337156517991742,
            "loss_sequences_upper_95": 0.8360742217128716,
            "loss_tokens_lower_95": 0.6524372362431631,
            "loss_tokens_upper_95": 0.8244170408531977,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.519486969668785,
            "data_time": 0.001203872157262337,
            "batch_time": 0.03674513609940928,
            "samples_per_second": 898336.5941779476,
            "samples_per_second_per_gpu": 112292.07427224345,
            "loss_sequences_lower_95": 4.894191346878276,
            "loss_sequences_upper_95": 4.9414406548905925,
            "loss_tokens_lower_95": 3.9429952973887814,
            "loss_tokens_upper_95": 3.9903541344294005,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.908123627901078,
            "data_time": 0.00535546550675044,
            "batch_time": 0.04095195588611421,
            "samples_per_second": 880840.617955939,
            "samples_per_second_per_gpu": 110105.07724449238,
            "loss_sequences_lower_95": 5.901420751953125,
            "loss_sequences_upper_95": 6.133726989746093,
            "loss_tokens_lower_95": 5.67623875404824,
            "loss_tokens_upper_95": 5.894345208423966,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.2057220531546555,
            "data_time": 0.02086371688519494,
            "batch_time": 0.05663648500280865,
            "samples_per_second": 824296.5455014242,
            "samples_per_second_per_gpu": 103037.06818767803,
            "loss_sequences_lower_95": 5.005575561523437,
            "loss_sequences_upper_95": 5.408106795601223,
            "loss_tokens_lower_95": 5.004571944527004,
            "loss_tokens_upper_95": 5.402925083326257,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.866851957639058,
            "data_time": 0.00430955154350005,
            "batch_time": 0.03974510460014803,
            "samples_per_second": 889894.6172314356,
            "samples_per_second_per_gpu": 111236.82715392945,
            "loss_sequences_lower_95": 7.750552775065104,
            "loss_sequences_upper_95": 7.981155229048295,
            "loss_tokens_lower_95": 7.750081380208333,
            "loss_tokens_upper_95": 7.980828579989346,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.7570239873329798,
            "data_time": 0.0038657337427139282,
            "batch_time": 0.03942249620214422,
            "samples_per_second": 891957.3110484424,
            "samples_per_second_per_gpu": 111494.6638810553,
            "loss_sequences_lower_95": 0.7741488301595052,
            "loss_sequences_upper_95": 0.806640565999349,
            "loss_tokens_lower_95": 0.7116940369897959,
            "loss_tokens_upper_95": 0.758061693427371,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.5366855848403205,
            "data_time": 0.022013953753880093,
            "batch_time": 0.057326710649899075,
            "samples_per_second": 798710.1760327317,
            "samples_per_second_per_gpu": 99838.77200409146,
            "loss_sequences_lower_95": 5.217880132765997,
            "loss_sequences_upper_95": 5.855601733979724,
            "loss_tokens_lower_95": 5.218156592959449,
            "loss_tokens_upper_95": 5.855079926990327,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.35152567923069,
            "data_time": 0.14697273075580597,
            "batch_time": 0.18585152924060822,
            "samples_per_second": 488567.06736354704,
            "samples_per_second_per_gpu": 61070.88342044338,
            "loss_sequences_lower_95": 2.1526519417762757,
            "loss_sequences_upper_95": 3.087966203689575,
            "loss_tokens_lower_95": 1.8175809816970039,
            "loss_tokens_upper_95": 2.3036563393504346,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.682238189697266,
            "data_time": 0.005707460263418773,
            "batch_time": 0.041190612410742136,
            "samples_per_second": 882801.6328660384,
            "samples_per_second_per_gpu": 110350.2041082548,
            "loss_sequences_lower_95": 7.611957836914063,
            "loss_sequences_upper_95": 7.950056005859375,
            "loss_tokens_lower_95": 7.399243040133249,
            "loss_tokens_upper_95": 7.699741029174597,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.143120550632477,
            "data_time": 0.005384501483705308,
            "batch_time": 0.04114184351194473,
            "samples_per_second": 876976.9483490618,
            "samples_per_second_per_gpu": 109622.11854363272,
            "loss_sequences_lower_95": 7.234973425292968,
            "loss_sequences_upper_95": 7.455791625976563,
            "loss_tokens_lower_95": 6.90489914559339,
            "loss_tokens_upper_95": 7.096065863932617,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.087076687069495,
            "data_time": 0.0031780842554609114,
            "batch_time": 0.0386434953746987,
            "samples_per_second": 893083.522186055,
            "samples_per_second_per_gpu": 111635.44027325687,
            "loss_sequences_lower_95": 5.0565851756961315,
            "loss_sequences_upper_95": 5.117215236928533,
            "loss_tokens_lower_95": 5.057100498200742,
            "loss_tokens_upper_95": 5.117206913394653,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.929218982221894,
            "data_time": 0.0081562880662993,
            "batch_time": 0.04370434766809746,
            "samples_per_second": 869246.4263908718,
            "samples_per_second_per_gpu": 108655.80329885897,
            "loss_sequences_lower_95": 4.820700677968389,
            "loss_sequences_upper_95": 5.036326596777194,
            "loss_tokens_lower_95": 4.8189130979382675,
            "loss_tokens_upper_95": 5.034906540268577,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.853845219373703,
            "data_time": 0.005529371991990105,
            "batch_time": 0.040974462316149755,
            "samples_per_second": 884693.3874804721,
            "samples_per_second_per_gpu": 110586.67343505901,
            "loss_sequences_lower_95": 5.770485241699219,
            "loss_sequences_upper_95": 5.940541296386719,
            "loss_tokens_lower_95": 5.770770446777344,
            "loss_tokens_upper_95": 5.941206811523437,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.962226819473274,
            "data_time": 0.0016624943295138095,
            "batch_time": 0.03712806548423262,
            "samples_per_second": 898853.173358122,
            "samples_per_second_per_gpu": 112356.64666976525,
            "loss_sequences_lower_95": 3.474185332530156,
            "loss_sequences_upper_95": 3.5695924953435427,
            "loss_tokens_lower_95": 2.3419067367191166,
            "loss_tokens_upper_95": 2.4049989519126176,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.4729618625854375,
            "data_time": 0.017416884217943462,
            "batch_time": 0.052755090168544225,
            "samples_per_second": 827169.9715568146,
            "samples_per_second_per_gpu": 103396.24644460183,
            "loss_sequences_lower_95": 5.2562010921649085,
            "loss_sequences_upper_95": 5.685049871188491,
            "loss_tokens_lower_95": 5.257397187645756,
            "loss_tokens_upper_95": 5.685986066220412,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.396161344004613,
            "data_time": 0.010029810480773449,
            "batch_time": 0.045697533525526524,
            "samples_per_second": 874201.1456997815,
            "samples_per_second_per_gpu": 109275.14321247269,
            "loss_sequences_lower_95": 5.2407116938572305,
            "loss_sequences_upper_95": 5.547434740253523,
            "loss_tokens_lower_95": 5.244467498180914,
            "loss_tokens_upper_95": 5.544366024241728,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2450143981813175,
            "data_time": 0.001840258574718295,
            "batch_time": 0.03730618175669711,
            "samples_per_second": 897983.7296916014,
            "samples_per_second_per_gpu": 112247.96621145017,
            "loss_sequences_lower_95": 3.649086333605396,
            "loss_sequences_upper_95": 3.7433437289591946,
            "loss_tokens_lower_95": 2.6585534373052386,
            "loss_tokens_upper_95": 2.7279858245385307,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.352584652169041,
            "data_time": 0.025783173739910126,
            "batch_time": 0.061743093033631645,
            "samples_per_second": 817835.7395390146,
            "samples_per_second_per_gpu": 102229.46744237683,
            "loss_sequences_lower_95": 5.17441900344122,
            "loss_sequences_upper_95": 5.522544336066675,
            "loss_tokens_lower_95": 5.175481031306837,
            "loss_tokens_upper_95": 5.52055016573144,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.745196385631503,
            "data_time": 0.003187240698398688,
            "batch_time": 0.038785503169701356,
            "samples_per_second": 890789.1363701732,
            "samples_per_second_per_gpu": 111348.64204627165,
            "loss_sequences_lower_95": 3.7159298622061354,
            "loss_sequences_upper_95": 3.7744828623733753,
            "loss_tokens_lower_95": 3.715914399966552,
            "loss_tokens_upper_95": 3.7734844168100157,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.667592884267418,
            "data_time": 0.021793968027288262,
            "batch_time": 0.056978849931196736,
            "samples_per_second": 798502.7838302929,
            "samples_per_second_per_gpu": 99812.84797878661,
            "loss_sequences_lower_95": 5.4431246414925285,
            "loss_sequences_upper_95": 5.889576824891914,
            "loss_tokens_lower_95": 5.438805604212492,
            "loss_tokens_upper_95": 5.894380765748256,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.3867344612876575,
            "data_time": 0.07224363833665848,
            "batch_time": 0.11123339086771011,
            "samples_per_second": 605623.1586066564,
            "samples_per_second_per_gpu": 75702.89482583205,
            "loss_sequences_lower_95": 2.1348647054036456,
            "loss_sequences_upper_95": 2.8001300938924154,
            "loss_tokens_lower_95": 1.9287235260009765,
            "loss_tokens_upper_95": 2.667266167534722,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.44524418314298,
            "data_time": 0.07407093048095703,
            "batch_time": 0.10999457538127899,
            "samples_per_second": 653115.892959859,
            "samples_per_second_per_gpu": 81639.48661998237,
            "loss_sequences_lower_95": 2.255629533131917,
            "loss_sequences_upper_95": 3.001857096354166,
            "loss_tokens_lower_95": 1.840903310025676,
            "loss_tokens_upper_95": 2.7115798564439406,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.863083328940907,
            "data_time": 0.0032675455709707743,
            "batch_time": 0.03885034563402487,
            "samples_per_second": 892352.4239469338,
            "samples_per_second_per_gpu": 111544.05299336673,
            "loss_sequences_lower_95": 3.847429454505707,
            "loss_sequences_upper_95": 3.878756759710972,
            "loss_tokens_lower_95": 3.8470031037141017,
            "loss_tokens_upper_95": 3.87863731417986,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.6963893999965612,
            "data_time": 0.0011355591289845806,
            "batch_time": 0.036658440241732794,
            "samples_per_second": 899212.5016778752,
            "samples_per_second_per_gpu": 112401.5627097344,
            "loss_sequences_lower_95": 0.8159669037272191,
            "loss_sequences_upper_95": 0.8380305162591679,
            "loss_tokens_lower_95": 0.5584243516152704,
            "loss_tokens_upper_95": 0.5694422407676735,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.296223664846946,
            "data_time": 0.03780258446931839,
            "batch_time": 0.07442300021648407,
            "samples_per_second": 790842.0488916363,
            "samples_per_second_per_gpu": 98855.25611145454,
            "loss_sequences_lower_95": 4.324905647818498,
            "loss_sequences_upper_95": 4.685445470134105,
            "loss_tokens_lower_95": 3.9981449690359403,
            "loss_tokens_upper_95": 4.208102506198569,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.801506944604822,
            "data_time": 0.10920843623933338,
            "batch_time": 0.1451583362760998,
            "samples_per_second": 507740.5901244213,
            "samples_per_second_per_gpu": 63467.57376555266,
            "loss_sequences_lower_95": 6.3575931136672565,
            "loss_sequences_upper_95": 7.461606164880701,
            "loss_tokens_lower_95": 6.065498625201943,
            "loss_tokens_upper_95": 7.2540997163749035,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.166218600622037,
            "data_time": 0.028304554167247954,
            "batch_time": 0.0650270524479094,
            "samples_per_second": 789142.7984120639,
            "samples_per_second_per_gpu": 98642.84980150798,
            "loss_sequences_lower_95": 4.115607898991282,
            "loss_sequences_upper_95": 4.447907145430402,
            "loss_tokens_lower_95": 3.8142252631544364,
            "loss_tokens_upper_95": 3.9882648398076115,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.2670400404348605,
            "data_time": 0.02813615117754255,
            "batch_time": 0.06427021537508283,
            "samples_per_second": 802679.7799324633,
            "samples_per_second_per_gpu": 100334.97249155791,
            "loss_sequences_lower_95": 4.221773687223109,
            "loss_sequences_upper_95": 4.520178762296351,
            "loss_tokens_lower_95": 3.9351693517649817,
            "loss_tokens_upper_95": 4.080326680599126,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.369720283077984,
            "data_time": 0.028107847486223494,
            "batch_time": 0.06406079303650629,
            "samples_per_second": 807129.009121467,
            "samples_per_second_per_gpu": 100891.12614018338,
            "loss_sequences_lower_95": 4.327370862263005,
            "loss_sequences_upper_95": 4.732553565792921,
            "loss_tokens_lower_95": 3.9968863717024874,
            "loss_tokens_upper_95": 4.226436435552491,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.400250940788083,
            "data_time": 0.029354345230829148,
            "batch_time": 0.06571432238533384,
            "samples_per_second": 794763.8656148309,
            "samples_per_second_per_gpu": 99345.48320185386,
            "loss_sequences_lower_95": 4.358548066674209,
            "loss_sequences_upper_95": 4.648445594601515,
            "loss_tokens_lower_95": 4.078697684695045,
            "loss_tokens_upper_95": 4.214425516574182,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.787648263184921,
            "data_time": 0.03091761800977919,
            "batch_time": 0.0669620861241847,
            "samples_per_second": 814186.8709592356,
            "samples_per_second_per_gpu": 101773.35886990445,
            "loss_sequences_lower_95": 4.755702550662971,
            "loss_sequences_upper_95": 5.063745031889922,
            "loss_tokens_lower_95": 4.490903624506435,
            "loss_tokens_upper_95": 4.608875893444721,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.4686974534174295,
            "data_time": 0.029076698280516126,
            "batch_time": 0.0656755765279134,
            "samples_per_second": 795688.4122388326,
            "samples_per_second_per_gpu": 99461.05152985407,
            "loss_sequences_lower_95": 4.4931294324921405,
            "loss_sequences_upper_95": 4.808281614722276,
            "loss_tokens_lower_95": 4.122347576194244,
            "loss_tokens_upper_95": 4.250240685277276,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/params.txt",
    "uuid": "f33ff600-22ea-49c3-8b9c-7e8d3ccf4f06",
    "creation_date": "2023_12_13-16_18_39"
}