{
    "name": "c4_original-d=96_l=8_h=4-0.5",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 105693120,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.5
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "21138624",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=96_l=8_h=4-0.5",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 6.8724460681279504,
            "data_time": 0.13330470025539398,
            "batch_time": 1.2880945056676865,
            "samples_per_second": 376918.1580734811,
            "samples_per_second_per_gpu": 47114.76975918514,
            "loss_sequences_lower_95": 6.675377248128255,
            "loss_sequences_upper_95": 7.070577824910482,
            "loss_tokens_lower_95": 6.856499214172363,
            "loss_tokens_upper_95": 6.887767219543457,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.552642267517968,
            "data_time": 0.019148251687164795,
            "batch_time": 0.06423423213603735,
            "samples_per_second": 4666957.648094154,
            "samples_per_second_per_gpu": 583369.7060117692,
            "loss_sequences_lower_95": 5.55028583975988,
            "loss_sequences_upper_95": 5.554982632148613,
            "loss_tokens_lower_95": 5.540982958333333,
            "loss_tokens_upper_95": 5.564042197916667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.8864708676630135,
            "data_time": 0.0921299159526825,
            "batch_time": 0.13640163093805313,
            "samples_per_second": 4146956.8361779936,
            "samples_per_second_per_gpu": 518369.6045222492,
            "loss_sequences_lower_95": 6.840415674326371,
            "loss_sequences_upper_95": 6.944137697803731,
            "loss_tokens_lower_95": 6.872628708333333,
            "loss_tokens_upper_95": 6.900465270833333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.771168206008439,
            "data_time": 0.013466177018065201,
            "batch_time": 0.057451368162506504,
            "samples_per_second": 5361738.897617114,
            "samples_per_second_per_gpu": 670217.3622021392,
            "loss_sequences_lower_95": 5.735475978576031,
            "loss_sequences_upper_95": 5.807617862032861,
            "loss_tokens_lower_95": 5.7584011875000005,
            "loss_tokens_upper_95": 5.78417596875,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.579847275603813,
            "data_time": 0.0914904996752739,
            "batch_time": 0.13710293918848038,
            "samples_per_second": 4077300.122671066,
            "samples_per_second_per_gpu": 509662.51533388323,
            "loss_sequences_lower_95": 5.523907507995482,
            "loss_sequences_upper_95": 5.647346723443865,
            "loss_tokens_lower_95": 5.567768729166667,
            "loss_tokens_upper_95": 5.5919679375,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.559752050344025,
            "data_time": 0.03280680378278097,
            "batch_time": 0.07624935358762741,
            "samples_per_second": 4988302.282881014,
            "samples_per_second_per_gpu": 623537.7853601268,
            "loss_sequences_lower_95": 6.492495588539897,
            "loss_sequences_upper_95": 6.629980418959254,
            "loss_tokens_lower_95": 6.546144052083334,
            "loss_tokens_upper_95": 6.57323084375,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.674767761230468,
            "data_time": 0.012525753676891327,
            "batch_time": 0.05493971630930901,
            "samples_per_second": 5249778.477705921,
            "samples_per_second_per_gpu": 656222.3097132401,
            "loss_sequences_lower_95": 8.64469732541454,
            "loss_sequences_upper_95": 8.704883769132653,
            "loss_tokens_lower_95": 8.660236541666666,
            "loss_tokens_upper_95": 8.689998354166667,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.727559515968043,
            "data_time": 0.013103211396618894,
            "batch_time": 0.05641867690964749,
            "samples_per_second": 5365554.004857373,
            "samples_per_second_per_gpu": 670694.2506071717,
            "loss_sequences_lower_95": 5.704313604384817,
            "loss_sequences_upper_95": 5.752341019715314,
            "loss_tokens_lower_95": 5.7148090625000005,
            "loss_tokens_upper_95": 5.740254625,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.929205004761859,
            "data_time": 0.0964667871594429,
            "batch_time": 0.14101610332727432,
            "samples_per_second": 4184614.162749439,
            "samples_per_second_per_gpu": 523076.7703436799,
            "loss_sequences_lower_95": 5.847137984609216,
            "loss_sequences_upper_95": 6.024251575004763,
            "loss_tokens_lower_95": 5.916739020833334,
            "loss_tokens_upper_95": 5.941707989583333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.691515387282541,
            "data_time": 0.0955861285328865,
            "batch_time": 0.14198967069387436,
            "samples_per_second": 4106609.71320549,
            "samples_per_second_per_gpu": 513326.2141506862,
            "loss_sequences_lower_95": 6.591939101388803,
            "loss_sequences_upper_95": 6.8115111339704795,
            "loss_tokens_lower_95": 6.677857875,
            "loss_tokens_upper_95": 6.7051458749999995,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.378839562915197,
            "data_time": 0.010589170044866102,
            "batch_time": 0.05416460643554556,
            "samples_per_second": 5334047.159580634,
            "samples_per_second_per_gpu": 666755.8949475792,
            "loss_sequences_lower_95": 6.365383448998733,
            "loss_sequences_upper_95": 6.392575408237119,
            "loss_tokens_lower_95": 6.3659220520833335,
            "loss_tokens_upper_95": 6.392281072916667,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.076617325310506,
            "data_time": 0.023074495792388915,
            "batch_time": 0.06669298857450486,
            "samples_per_second": 5110618.889362651,
            "samples_per_second_per_gpu": 638827.3611703314,
            "loss_sequences_lower_95": 6.051668789127578,
            "loss_sequences_upper_95": 6.102108394776916,
            "loss_tokens_lower_95": 6.06375821875,
            "loss_tokens_upper_95": 6.089221885416667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.978078368227574,
            "data_time": 0.09245166927576065,
            "batch_time": 0.14096098393201828,
            "samples_per_second": 4198836.764844743,
            "samples_per_second_per_gpu": 524854.5956055928,
            "loss_sequences_lower_95": 5.886733977838172,
            "loss_sequences_upper_95": 6.084372969621577,
            "loss_tokens_lower_95": 5.964638322916667,
            "loss_tokens_upper_95": 5.991664854166666,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.397951671887804,
            "data_time": 0.09375167638063431,
            "batch_time": 0.14023375511169434,
            "samples_per_second": 4142418.18614034,
            "samples_per_second_per_gpu": 517802.2732675425,
            "loss_sequences_lower_95": 6.313582323622072,
            "loss_sequences_upper_95": 6.4924682617187495,
            "loss_tokens_lower_95": 6.385564958333334,
            "loss_tokens_upper_95": 6.411014687500001,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.6212391528216274,
            "data_time": 0.1376870721578598,
            "batch_time": 0.16184522211551666,
            "samples_per_second": 1020877.4790572609,
            "samples_per_second_per_gpu": 127609.68488215761,
            "loss_sequences_lower_95": 7.552428349581631,
            "loss_sequences_upper_95": 7.706816083734686,
            "loss_tokens_lower_95": 7.59424372586337,
            "loss_tokens_upper_95": 7.6486778606068,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.263297735775873,
            "data_time": 0.09403964877128601,
            "batch_time": 0.12920928746461868,
            "samples_per_second": 3269429.1202846994,
            "samples_per_second_per_gpu": 408678.6400355874,
            "loss_sequences_lower_95": 7.092789256816007,
            "loss_sequences_upper_95": 7.434529313957725,
            "loss_tokens_lower_95": 7.248148208333333,
            "loss_tokens_upper_95": 7.278520020833334,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.207426315247226,
            "data_time": 0.09287393093109131,
            "batch_time": 0.1292751431465149,
            "samples_per_second": 3767717.8079672023,
            "samples_per_second_per_gpu": 470964.7259959003,
            "loss_sequences_lower_95": 7.122171817668824,
            "loss_sequences_upper_95": 7.313481361570333,
            "loss_tokens_lower_95": 7.1955162291666666,
            "loss_tokens_upper_95": 7.219275083333333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.100345345794177,
            "data_time": 0.16083717346191406,
            "batch_time": 0.19072043895721436,
            "samples_per_second": 2264334.8852588483,
            "samples_per_second_per_gpu": 283041.86065735604,
            "loss_sequences_lower_95": 6.9759368896484375,
            "loss_sequences_upper_95": 7.315434152571882,
            "loss_tokens_lower_95": 7.085750379718718,
            "loss_tokens_upper_95": 7.1146745650494685,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.67896254289152,
            "data_time": 0.03347304111177271,
            "batch_time": 0.07786840959028764,
            "samples_per_second": 4364163.15550875,
            "samples_per_second_per_gpu": 545520.3944385938,
            "loss_sequences_lower_95": 5.660802734096817,
            "loss_sequences_upper_95": 5.696482121715212,
            "loss_tokens_lower_95": 5.660963983050848,
            "loss_tokens_upper_95": 5.696749483691781,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.178972867921253,
            "data_time": 0.02817639671266079,
            "batch_time": 0.07221630960702896,
            "samples_per_second": 4462765.882535221,
            "samples_per_second_per_gpu": 557845.7353169026,
            "loss_sequences_lower_95": 5.194193304138244,
            "loss_sequences_upper_95": 5.220184064818388,
            "loss_tokens_lower_95": 5.166164146570586,
            "loss_tokens_upper_95": 5.188617867766012,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.8179745761977655,
            "data_time": 0.047387904591030545,
            "batch_time": 0.08909204767809974,
            "samples_per_second": 4392107.743348669,
            "samples_per_second_per_gpu": 549013.4679185837,
            "loss_sequences_lower_95": 8.264029056078765,
            "loss_sequences_upper_95": 8.492044982766002,
            "loss_tokens_lower_95": 7.6925725639221465,
            "loss_tokens_upper_95": 7.870135220283834,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.509287654399872,
            "data_time": 0.0393102690577507,
            "batch_time": 0.08304662878314654,
            "samples_per_second": 4577050.260603118,
            "samples_per_second_per_gpu": 572131.2825753897,
            "loss_sequences_lower_95": 7.888058040364584,
            "loss_sequences_upper_95": 8.045557698567707,
            "loss_tokens_lower_95": 7.413886387087264,
            "loss_tokens_upper_95": 7.531074783805032,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.4807431392366,
            "data_time": 0.07476217051347096,
            "batch_time": 0.11547898004452388,
            "samples_per_second": 4000295.456992515,
            "samples_per_second_per_gpu": 500036.93212406436,
            "loss_sequences_lower_95": 6.526760033642912,
            "loss_sequences_upper_95": 6.597717368449109,
            "loss_tokens_lower_95": 6.45942365009841,
            "loss_tokens_upper_95": 6.496627098760204,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.629267072677612,
            "data_time": 0.3412935733795166,
            "batch_time": 0.384662389755249,
            "samples_per_second": 2270380.8440340986,
            "samples_per_second_per_gpu": 283797.6055042623,
            "loss_sequences_lower_95": 7.47983173717152,
            "loss_sequences_upper_95": 7.818859419389204,
            "loss_tokens_lower_95": 7.583737812405523,
            "loss_tokens_upper_95": 7.672907701803889,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.190436968511465,
            "data_time": 0.3765443116426468,
            "batch_time": 0.422719269990921,
            "samples_per_second": 2009369.4258682537,
            "samples_per_second_per_gpu": 251171.1782335317,
            "loss_sequences_lower_95": 6.131810688875159,
            "loss_sequences_upper_95": 6.331966291155134,
            "loss_tokens_lower_95": 6.153051437140318,
            "loss_tokens_upper_95": 6.253267629697021,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.63912760257721,
            "data_time": 0.17525745183229446,
            "batch_time": 0.20697954297065735,
            "samples_per_second": 2653008.870874857,
            "samples_per_second_per_gpu": 331626.10885935713,
            "loss_sequences_lower_95": 5.5587693786621095,
            "loss_sequences_upper_95": 5.684975229899089,
            "loss_tokens_lower_95": 5.545142340925333,
            "loss_tokens_upper_95": 5.739125119505176,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.95518693153821,
            "data_time": 0.024378476850688457,
            "batch_time": 0.0684812905266881,
            "samples_per_second": 4530448.852114491,
            "samples_per_second_per_gpu": 566306.1065143114,
            "loss_sequences_lower_95": 10.027997514886078,
            "loss_sequences_upper_95": 10.09782129631908,
            "loss_tokens_lower_95": 9.902255001284104,
            "loss_tokens_upper_95": 9.97546703661785,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.580881769809658,
            "data_time": 0.04978475868701935,
            "batch_time": 0.09189047515392304,
            "samples_per_second": 4345361.304545144,
            "samples_per_second_per_gpu": 543170.163068143,
            "loss_sequences_lower_95": 7.719388526377052,
            "loss_sequences_upper_95": 7.985459843388311,
            "loss_tokens_lower_95": 6.43432803452827,
            "loss_tokens_upper_95": 6.58565222578196,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.265518215329167,
            "data_time": 0.07363949120044708,
            "batch_time": 0.11558646559715272,
            "samples_per_second": 4349745.485979702,
            "samples_per_second_per_gpu": 543718.1857474628,
            "loss_sequences_lower_95": 6.931120982511866,
            "loss_sequences_upper_95": 7.237022259292342,
            "loss_tokens_lower_95": 6.165472087140005,
            "loss_tokens_upper_95": 6.340762692476404,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.92498373223222,
            "data_time": 0.3528608977794647,
            "batch_time": 0.3946155309677124,
            "samples_per_second": 1839631.3726762433,
            "samples_per_second_per_gpu": 229953.9215845304,
            "loss_sequences_lower_95": 5.894549253977597,
            "loss_sequences_upper_95": 5.954513368650114,
            "loss_tokens_lower_95": 5.894459610333726,
            "loss_tokens_upper_95": 5.95393862092876,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.532941541671753,
            "data_time": 0.31998077034950256,
            "batch_time": 0.3451509475708008,
            "samples_per_second": 1269883.4121690465,
            "samples_per_second_per_gpu": 158735.4265211308,
            "loss_sequences_lower_95": 5.439388809204101,
            "loss_sequences_upper_95": 5.887236679077149,
            "loss_tokens_lower_95": 5.269013499360605,
            "loss_tokens_upper_95": 5.793182646012689,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.419194058727705,
            "data_time": 0.053275587037205696,
            "batch_time": 0.09639099054038525,
            "samples_per_second": 4464269.099119368,
            "samples_per_second_per_gpu": 558033.637389921,
            "loss_sequences_lower_95": 5.366606860126663,
            "loss_sequences_upper_95": 5.472690197291613,
            "loss_tokens_lower_95": 5.36613300991156,
            "loss_tokens_upper_95": 5.4724047876551305,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.742744225932379,
            "data_time": 0.07898750901222229,
            "batch_time": 0.1220181405544281,
            "samples_per_second": 4423671.96832175,
            "samples_per_second_per_gpu": 552958.9960402187,
            "loss_sequences_lower_95": 5.689467311549832,
            "loss_sequences_upper_95": 5.79391096085432,
            "loss_tokens_lower_95": 5.689043154704775,
            "loss_tokens_upper_95": 5.794973772378557,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.625833928520194,
            "data_time": 0.060017179697752,
            "batch_time": 0.10154393687844276,
            "samples_per_second": 4114119.0762983235,
            "samples_per_second_per_gpu": 514264.88453729043,
            "loss_sequences_lower_95": 5.749818342372824,
            "loss_sequences_upper_95": 5.857350253332426,
            "loss_tokens_lower_95": 5.607095472508698,
            "loss_tokens_upper_95": 5.666663472489924,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.1601079635620115,
            "data_time": 0.1783265918493271,
            "batch_time": 0.22321096062660217,
            "samples_per_second": 3830920.1991376895,
            "samples_per_second_per_gpu": 478865.0248922112,
            "loss_sequences_lower_95": 7.869148779296875,
            "loss_sequences_upper_95": 8.38839189453125,
            "loss_tokens_lower_95": 6.9200188203176705,
            "loss_tokens_upper_95": 7.260942670416942,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.297780215740204,
            "data_time": 0.1484365612268448,
            "batch_time": 0.1703384965658188,
            "samples_per_second": 926982.1179159153,
            "samples_per_second_per_gpu": 115872.76473948942,
            "loss_sequences_lower_95": 5.019027876853943,
            "loss_sequences_upper_95": 5.711372256278992,
            "loss_tokens_lower_95": 4.728932461793395,
            "loss_tokens_upper_95": 5.705000541950095,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.24211511392703,
            "data_time": 0.30837005376815796,
            "batch_time": 0.34351174533367157,
            "samples_per_second": 2310907.835898556,
            "samples_per_second_per_gpu": 288863.4794873195,
            "loss_sequences_lower_95": 7.115972795157597,
            "loss_sequences_upper_95": 7.655789061798447,
            "loss_tokens_lower_95": 5.963679265277093,
            "loss_tokens_upper_95": 6.3707202313583355,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.443017056958026,
            "data_time": 0.053169507119390697,
            "batch_time": 0.09794274634785122,
            "samples_per_second": 4437513.562126261,
            "samples_per_second_per_gpu": 554689.1952657826,
            "loss_sequences_lower_95": 5.433991158461665,
            "loss_sequences_upper_95": 5.452371513130052,
            "loss_tokens_lower_95": 5.433925158366098,
            "loss_tokens_upper_95": 5.452069851736566,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.66204589319627,
            "data_time": 0.037928747988882516,
            "batch_time": 0.08111093157813662,
            "samples_per_second": 4333261.008775504,
            "samples_per_second_per_gpu": 541657.626096938,
            "loss_sequences_lower_95": 8.69110336969969,
            "loss_sequences_upper_95": 8.83941692794246,
            "loss_tokens_lower_95": 8.577010782417746,
            "loss_tokens_upper_95": 8.72443011899608,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.813009450723837,
            "data_time": 0.1752774864435196,
            "batch_time": 0.2046489119529724,
            "samples_per_second": 1784735.8333532654,
            "samples_per_second_per_gpu": 223091.97916915818,
            "loss_sequences_lower_95": 4.672485496884301,
            "loss_sequences_upper_95": 5.062861180567479,
            "loss_tokens_lower_95": 4.600459756771797,
            "loss_tokens_upper_95": 4.944262661435447,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.202609793821845,
            "data_time": 0.07986585795879364,
            "batch_time": 0.12474762201309204,
            "samples_per_second": 4322012.2333739605,
            "samples_per_second_per_gpu": 540251.5291717451,
            "loss_sequences_lower_95": 5.24115813992206,
            "loss_sequences_upper_95": 5.382186578932765,
            "loss_tokens_lower_95": 5.118601997184401,
            "loss_tokens_upper_95": 5.277805511890128,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.2042431511530065,
            "data_time": 0.32297323644161224,
            "batch_time": 0.3573075234889984,
            "samples_per_second": 2513923.5978151234,
            "samples_per_second_per_gpu": 314240.4497268904,
            "loss_sequences_lower_95": 6.983565967838939,
            "loss_sequences_upper_95": 7.533072904261147,
            "loss_tokens_lower_95": 7.038185206750867,
            "loss_tokens_upper_95": 7.412285670020646,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.96542846772594,
            "data_time": 0.03141488475859352,
            "batch_time": 0.07566930881488286,
            "samples_per_second": 4336504.516314463,
            "samples_per_second_per_gpu": 542063.0645393078,
            "loss_sequences_lower_95": 4.957084649351745,
            "loss_sequences_upper_95": 4.973827236150355,
            "loss_tokens_lower_95": 4.956918268419309,
            "loss_tokens_upper_95": 4.973750882989098,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.448767527793217,
            "data_time": 0.3445223867893219,
            "batch_time": 0.37130920588970184,
            "samples_per_second": 1674104.214641966,
            "samples_per_second_per_gpu": 209263.02683024574,
            "loss_sequences_lower_95": 6.316996705879285,
            "loss_sequences_upper_95": 6.673772511898893,
            "loss_tokens_lower_95": 6.194681207818897,
            "loss_tokens_upper_95": 6.6179890658984775,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.4586354678531865,
            "data_time": 0.024759328265984853,
            "batch_time": 0.06908540219068528,
            "samples_per_second": 4455078.980558525,
            "samples_per_second_per_gpu": 556884.8725698156,
            "loss_sequences_lower_95": 7.9463533477463315,
            "loss_sequences_upper_95": 7.9855723843684485,
            "loss_tokens_lower_95": 7.390529968568665,
            "loss_tokens_upper_95": 7.4293481745647965,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 10.349258622169495,
            "data_time": 0.09403550252318382,
            "batch_time": 0.13879096508026123,
            "samples_per_second": 4348057.885808474,
            "samples_per_second_per_gpu": 543507.2357260593,
            "loss_sequences_lower_95": 10.031098657226563,
            "loss_sequences_upper_95": 10.641998364257812,
            "loss_tokens_lower_95": 10.025222351121423,
            "loss_tokens_upper_95": 10.644032262815399,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.513032911134803,
            "data_time": 0.35229913890361786,
            "batch_time": 0.3964061439037323,
            "samples_per_second": 2437778.1243639197,
            "samples_per_second_per_gpu": 304722.26554548997,
            "loss_sequences_lower_95": 5.379119474991508,
            "loss_sequences_upper_95": 5.644835324494735,
            "loss_tokens_lower_95": 5.382779740043309,
            "loss_tokens_upper_95": 5.640852687669836,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 12.104023680542454,
            "data_time": 0.06335823982954025,
            "batch_time": 0.10367379585901897,
            "samples_per_second": 3982263.547191384,
            "samples_per_second_per_gpu": 497782.943398923,
            "loss_sequences_lower_95": 11.915682188091855,
            "loss_sequences_upper_95": 12.291186837861032,
            "loss_tokens_lower_95": 11.91496992631392,
            "loss_tokens_upper_95": 12.295054543235084,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.888332382837931,
            "data_time": 0.06621447205543518,
            "batch_time": 0.11065173149108887,
            "samples_per_second": 4451362.645899446,
            "samples_per_second_per_gpu": 556420.3307374307,
            "loss_sequences_lower_95": 5.006295458984375,
            "loss_sequences_upper_95": 5.097774096679688,
            "loss_tokens_lower_95": 4.832344672243898,
            "loss_tokens_upper_95": 4.928667943739996,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.902072529565721,
            "data_time": 0.35297757387161255,
            "batch_time": 0.3942625820636749,
            "samples_per_second": 2273281.0713018095,
            "samples_per_second_per_gpu": 284160.1339127262,
            "loss_sequences_lower_95": 5.544791550409227,
            "loss_sequences_upper_95": 6.266021931966145,
            "loss_tokens_lower_95": 5.5459115600585935,
            "loss_tokens_upper_95": 6.260764567057292,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.208824098110199,
            "data_time": 0.1567952036857605,
            "batch_time": 0.17650707066059113,
            "samples_per_second": 947123.7202752833,
            "samples_per_second_per_gpu": 118390.46503441042,
            "loss_sequences_lower_95": 6.968178391456604,
            "loss_sequences_upper_95": 8.495132279396056,
            "loss_tokens_lower_95": 6.8162796114892075,
            "loss_tokens_upper_95": 7.408969198010631,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.082481589317322,
            "data_time": 0.09501681849360466,
            "batch_time": 0.1394253447651863,
            "samples_per_second": 4348128.620746494,
            "samples_per_second_per_gpu": 543516.0775933118,
            "loss_sequences_lower_95": 8.171475927734374,
            "loss_sequences_upper_95": 8.506909814453126,
            "loss_tokens_lower_95": 7.921145965524535,
            "loss_tokens_upper_95": 8.2147483451314,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.144860269546509,
            "data_time": 0.10037058591842651,
            "batch_time": 0.14523999392986298,
            "samples_per_second": 4265206.775305345,
            "samples_per_second_per_gpu": 533150.8469131681,
            "loss_sequences_lower_95": 7.406471936035156,
            "loss_sequences_upper_95": 7.645822827148438,
            "loss_tokens_lower_95": 7.038209646011986,
            "loss_tokens_upper_95": 7.222767830945638,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.117136321290004,
            "data_time": 0.039634816348552704,
            "batch_time": 0.08337724457184474,
            "samples_per_second": 4554265.131174975,
            "samples_per_second_per_gpu": 569283.1413968719,
            "loss_sequences_lower_95": 5.099715011039118,
            "loss_sequences_upper_95": 5.134740196072013,
            "loss_tokens_lower_95": 5.099298155040018,
            "loss_tokens_upper_95": 5.134872349564197,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.690628078126688,
            "data_time": 0.11748439073562622,
            "batch_time": 0.15791881581147513,
            "samples_per_second": 4005909.357581697,
            "samples_per_second_per_gpu": 500738.6696977121,
            "loss_sequences_lower_95": 5.607808740384385,
            "loss_sequences_upper_95": 5.770886614868351,
            "loss_tokens_lower_95": 5.608133998700917,
            "loss_tokens_upper_95": 5.770798878003192,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.070463610649108,
            "data_time": 0.09534016251564026,
            "batch_time": 0.13959409669041634,
            "samples_per_second": 4296029.193318161,
            "samples_per_second_per_gpu": 537003.6491647701,
            "loss_sequences_lower_95": 9.015810278320313,
            "loss_sequences_upper_95": 9.126535473632812,
            "loss_tokens_lower_95": 9.01368037109375,
            "loss_tokens_upper_95": 9.125219311523438,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.385562770684711,
            "data_time": 0.029099097209317342,
            "batch_time": 0.07329609599851426,
            "samples_per_second": 4433874.847261948,
            "samples_per_second_per_gpu": 554234.3559077434,
            "loss_sequences_lower_95": 7.975836610099337,
            "loss_sequences_upper_95": 8.047386785270813,
            "loss_tokens_lower_95": 7.297022182127927,
            "loss_tokens_upper_95": 7.353906849859233,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.54933892257178,
            "data_time": 0.20676695449011667,
            "batch_time": 0.23893217529569352,
            "samples_per_second": 1909214.9462260273,
            "samples_per_second_per_gpu": 238651.86827825342,
            "loss_sequences_lower_95": 5.404491413173391,
            "loss_sequences_upper_95": 5.6898605802165925,
            "loss_tokens_lower_95": 5.403545288541423,
            "loss_tokens_upper_95": 5.689725346351738,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.60100853396397,
            "data_time": 0.19125168770551682,
            "batch_time": 0.23672351986169815,
            "samples_per_second": 3662467.1459993063,
            "samples_per_second_per_gpu": 457808.3932499133,
            "loss_sequences_lower_95": 5.494946336933211,
            "loss_sequences_upper_95": 5.705071470971201,
            "loss_tokens_lower_95": 5.495330595128676,
            "loss_tokens_upper_95": 5.707265218098958,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.106730494092434,
            "data_time": 0.03484365018084645,
            "batch_time": 0.07884867256507277,
            "samples_per_second": 4314468.567806119,
            "samples_per_second_per_gpu": 539308.5709757649,
            "loss_sequences_lower_95": 7.49876666627521,
            "loss_sequences_upper_95": 7.579067469055337,
            "loss_tokens_lower_95": 7.0262620236371305,
            "loss_tokens_upper_95": 7.096362950012831,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.219203484752191,
            "data_time": 0.31954333186149597,
            "batch_time": 0.35733577609062195,
            "samples_per_second": 2433870.4321247065,
            "samples_per_second_per_gpu": 304233.8040155883,
            "loss_sequences_lower_95": 5.147191664276931,
            "loss_sequences_upper_95": 5.29358979886171,
            "loss_tokens_lower_95": 5.147385450898025,
            "loss_tokens_upper_95": 5.291276509925802,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.833494000916087,
            "data_time": 0.05401896513425387,
            "batch_time": 0.09816920069547799,
            "samples_per_second": 4328916.154978354,
            "samples_per_second_per_gpu": 541114.5193722943,
            "loss_sequences_lower_95": 8.814491948585626,
            "loss_sequences_upper_95": 8.853294927847859,
            "loss_tokens_lower_95": 8.81376980002867,
            "loss_tokens_upper_95": 8.852779932984518,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.393864154815674,
            "data_time": 0.3370158523321152,
            "batch_time": 0.37722837924957275,
            "samples_per_second": 2343514.0296245967,
            "samples_per_second_per_gpu": 292939.2537030746,
            "loss_sequences_lower_95": 5.229903582230355,
            "loss_sequences_upper_95": 5.555590657354558,
            "loss_tokens_lower_95": 5.230670743775599,
            "loss_tokens_upper_95": 5.555768903713782,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.083488464355469,
            "data_time": 0.29996567964553833,
            "batch_time": 0.3203079551458359,
            "samples_per_second": 1001734.7015930641,
            "samples_per_second_per_gpu": 125216.83769913302,
            "loss_sequences_lower_95": 7.916064631144206,
            "loss_sequences_upper_95": 8.561225077311198,
            "loss_tokens_lower_95": 7.467768245273167,
            "loss_tokens_upper_95": 8.617645348442924,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.83676639397939,
            "data_time": 0.29859134554862976,
            "batch_time": 0.31871363520622253,
            "samples_per_second": 1247077.6315930183,
            "samples_per_second_per_gpu": 155884.7039491273,
            "loss_sequences_lower_95": 7.671597061157227,
            "loss_sequences_upper_95": 8.507924105326335,
            "loss_tokens_lower_95": 7.083422542957778,
            "loss_tokens_upper_95": 8.306443563739906,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.12365884794929,
            "data_time": 0.04369554136480604,
            "batch_time": 0.0873015405876296,
            "samples_per_second": 4261106.951324822,
            "samples_per_second_per_gpu": 532638.3689156028,
            "loss_sequences_lower_95": 9.10482211892489,
            "loss_sequences_upper_95": 9.14193306160254,
            "loss_tokens_lower_95": 9.104875017258836,
            "loss_tokens_upper_95": 9.142554796805964,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.993524299717582,
            "data_time": 0.023202336055743122,
            "batch_time": 0.0676709510483596,
            "samples_per_second": 4466341.546917477,
            "samples_per_second_per_gpu": 558292.6933646846,
            "loss_sequences_lower_95": 7.564403799344355,
            "loss_sequences_upper_95": 7.595418871384121,
            "loss_tokens_lower_95": 6.929418608763306,
            "loss_tokens_upper_95": 6.95905596863175,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.4908366991779,
            "data_time": 0.32006846368312836,
            "batch_time": 0.35070985555648804,
            "samples_per_second": 1922992.4029218333,
            "samples_per_second_per_gpu": 240374.05036522917,
            "loss_sequences_lower_95": 8.453744795191007,
            "loss_sequences_upper_95": 8.825086194136011,
            "loss_tokens_lower_95": 8.37856528072034,
            "loss_tokens_upper_95": 8.611888994757123,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 10.684158015895534,
            "data_time": 0.21602189540863037,
            "batch_time": 0.23366624116897583,
            "samples_per_second": 1036102.7279851235,
            "samples_per_second_per_gpu": 129512.84099814044,
            "loss_sequences_lower_95": 10.299531740755649,
            "loss_sequences_upper_95": 11.288272919525971,
            "loss_tokens_lower_95": 10.128988911193094,
            "loss_tokens_upper_95": 11.007442898220486,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.647934146043731,
            "data_time": 0.332304984331131,
            "batch_time": 0.36614474654197693,
            "samples_per_second": 2244975.4143872084,
            "samples_per_second_per_gpu": 280621.92679840105,
            "loss_sequences_lower_95": 8.575186194443122,
            "loss_sequences_upper_95": 8.847266667063643,
            "loss_tokens_lower_95": 8.513806852528386,
            "loss_tokens_upper_95": 8.713635440759042,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.590153990722284,
            "data_time": 0.316719651222229,
            "batch_time": 0.3511328101158142,
            "samples_per_second": 2373284.2614214644,
            "samples_per_second_per_gpu": 296660.53267768305,
            "loss_sequences_lower_95": 8.499435387588129,
            "loss_sequences_upper_95": 8.753844340254622,
            "loss_tokens_lower_95": 8.489245581131993,
            "loss_tokens_upper_95": 8.655445478333528,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.732345400786981,
            "data_time": 0.33995427191257477,
            "batch_time": 0.37471945583820343,
            "samples_per_second": 2353294.7654308164,
            "samples_per_second_per_gpu": 294161.84567885206,
            "loss_sequences_lower_95": 8.786474125559739,
            "loss_sequences_upper_95": 9.145254702684356,
            "loss_tokens_lower_95": 8.56859503598512,
            "loss_tokens_upper_95": 8.823511135283388,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.56051847411365,
            "data_time": 0.32028159499168396,
            "batch_time": 0.3551910072565079,
            "samples_per_second": 2048995.700282971,
            "samples_per_second_per_gpu": 256124.46253537136,
            "loss_sequences_lower_95": 8.459435551340986,
            "loss_sequences_upper_95": 8.701798304115853,
            "loss_tokens_lower_95": 8.465020390685845,
            "loss_tokens_upper_95": 8.61583561882423,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.771963492683742,
            "data_time": 0.3303847461938858,
            "batch_time": 0.36413203179836273,
            "samples_per_second": 2304254.6863230485,
            "samples_per_second_per_gpu": 288031.83579038107,
            "loss_sequences_lower_95": 8.641945633977096,
            "loss_sequences_upper_95": 8.816032343325407,
            "loss_tokens_lower_95": 8.720648444355074,
            "loss_tokens_upper_95": 8.834875146241592,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.776138166101967,
            "data_time": 0.3534531891345978,
            "batch_time": 0.3884619325399399,
            "samples_per_second": 2052379.9659870612,
            "samples_per_second_per_gpu": 256547.49574838264,
            "loss_sequences_lower_95": 8.763208454411204,
            "loss_sequences_upper_95": 8.98309694615806,
            "loss_tokens_lower_95": 8.682113354242013,
            "loss_tokens_upper_95": 8.809131983687461,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-0.5/params.txt",
    "uuid": "230e21fa-05a6-4109-9aa3-da65a49b327e",
    "creation_date": "2023_12_14-04_59_08"
}