{
    "name": "c4_original-d=96_l=8_h=4-2.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 422772480,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 2.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "84554496",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=96_l=8_h=4-2.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 6.316089781125386,
            "data_time": 0.12096802890300751,
            "batch_time": 1.2785929143428802,
            "samples_per_second": 379072.3169344869,
            "samples_per_second_per_gpu": 47384.039616810864,
            "loss_sequences_lower_95": 6.124076957702637,
            "loss_sequences_upper_95": 6.5098813501993815,
            "loss_tokens_lower_95": 6.300252265930175,
            "loss_tokens_upper_95": 6.331711006164551,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.003809142081036,
            "data_time": 0.018821441986032347,
            "batch_time": 0.06379744643039062,
            "samples_per_second": 4708759.904511585,
            "samples_per_second_per_gpu": 588594.9880639481,
            "loss_sequences_lower_95": 5.001546975920305,
            "loss_sequences_upper_95": 5.006083187375734,
            "loss_tokens_lower_95": 4.992348010416666,
            "loss_tokens_upper_95": 5.015141552083334,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.1786014897482735,
            "data_time": 0.08804140985012054,
            "batch_time": 0.13299183547496796,
            "samples_per_second": 4003263.766242877,
            "samples_per_second_per_gpu": 500407.97078035964,
            "loss_sequences_lower_95": 6.135092275191327,
            "loss_sequences_upper_95": 6.234403736348055,
            "loss_tokens_lower_95": 6.16416659375,
            "loss_tokens_upper_95": 6.193018885416667,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.2082412431657925,
            "data_time": 0.012641276183881257,
            "batch_time": 0.05654376980505491,
            "samples_per_second": 5375644.8625025125,
            "samples_per_second_per_gpu": 671955.6078128141,
            "loss_sequences_lower_95": 5.173142698936855,
            "loss_sequences_upper_95": 5.244528108891752,
            "loss_tokens_lower_95": 5.195757145833333,
            "loss_tokens_upper_95": 5.220864145833333,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.032739325599127,
            "data_time": 0.08836415410041809,
            "batch_time": 0.1326054334640503,
            "samples_per_second": 4169204.331060968,
            "samples_per_second_per_gpu": 521150.541382621,
            "loss_sequences_lower_95": 4.9786135547272785,
            "loss_sequences_upper_95": 5.098103807997072,
            "loss_tokens_lower_95": 5.020891958333333,
            "loss_tokens_upper_95": 5.044638895833333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.959113204989767,
            "data_time": 0.03276104231675466,
            "batch_time": 0.07603434224923451,
            "samples_per_second": 4977302.654374457,
            "samples_per_second_per_gpu": 622162.8317968071,
            "loss_sequences_lower_95": 5.89657723474146,
            "loss_sequences_upper_95": 6.024546655256841,
            "loss_tokens_lower_95": 5.94579821875,
            "loss_tokens_upper_95": 5.97252990625,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.809325046052738,
            "data_time": 0.012085069715976716,
            "batch_time": 0.0547953300178051,
            "samples_per_second": 5213430.714028641,
            "samples_per_second_per_gpu": 651678.8392535801,
            "loss_sequences_lower_95": 7.779230548469388,
            "loss_sequences_upper_95": 7.8388473573022965,
            "loss_tokens_lower_95": 7.794381458333333,
            "loss_tokens_upper_95": 7.824723479166666,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.323171633775321,
            "data_time": 0.01229319995955417,
            "batch_time": 0.05574879442390643,
            "samples_per_second": 5363765.512858694,
            "samples_per_second_per_gpu": 670470.6891073368,
            "loss_sequences_lower_95": 5.300147455824607,
            "loss_sequences_upper_95": 5.347869038367146,
            "loss_tokens_lower_95": 5.310638895833333,
            "loss_tokens_upper_95": 5.335600958333334,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.412842222345554,
            "data_time": 0.08562307804822922,
            "batch_time": 0.13043830543756485,
            "samples_per_second": 4191970.5484326957,
            "samples_per_second_per_gpu": 523996.31855408696,
            "loss_sequences_lower_95": 5.330935805406027,
            "loss_sequences_upper_95": 5.509416285569106,
            "loss_tokens_lower_95": 5.400530385416667,
            "loss_tokens_upper_95": 5.425508364583333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.279074601033931,
            "data_time": 0.08977439254522324,
            "batch_time": 0.13500794768333435,
            "samples_per_second": 4300126.439582938,
            "samples_per_second_per_gpu": 537515.8049478673,
            "loss_sequences_lower_95": 6.178465940452847,
            "loss_sequences_upper_95": 6.400172888714334,
            "loss_tokens_lower_95": 6.265082875,
            "loss_tokens_upper_95": 6.292740458333333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.8328717776613885,
            "data_time": 0.009275963594173563,
            "batch_time": 0.05266118306538154,
            "samples_per_second": 5405129.008491437,
            "samples_per_second_per_gpu": 675641.1260614296,
            "loss_sequences_lower_95": 5.820529091578731,
            "loss_sequences_upper_95": 5.845425051605112,
            "loss_tokens_lower_95": 5.820389135416667,
            "loss_tokens_upper_95": 5.8457,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.501814797539256,
            "data_time": 0.021341289579868316,
            "batch_time": 0.06371652781963348,
            "samples_per_second": 5054075.978112637,
            "samples_per_second_per_gpu": 631759.4972640796,
            "loss_sequences_lower_95": 5.478127379379686,
            "loss_sequences_upper_95": 5.526040744741775,
            "loss_tokens_lower_95": 5.4891248854166665,
            "loss_tokens_upper_95": 5.514246885416667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.615875941502878,
            "data_time": 0.08754809200763702,
            "batch_time": 0.13205471634864807,
            "samples_per_second": 4143832.764667295,
            "samples_per_second_per_gpu": 517979.09558341186,
            "loss_sequences_lower_95": 5.524935919276116,
            "loss_sequences_upper_95": 5.721474037402542,
            "loss_tokens_lower_95": 5.6023437395833335,
            "loss_tokens_upper_95": 5.629301302083333,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.770193164071823,
            "data_time": 0.08517958968877792,
            "batch_time": 0.12912040948867798,
            "samples_per_second": 4215190.414809024,
            "samples_per_second_per_gpu": 526898.801851128,
            "loss_sequences_lower_95": 5.689141249025426,
            "loss_sequences_upper_95": 5.863254648119272,
            "loss_tokens_lower_95": 5.75799646875,
            "loss_tokens_upper_95": 5.783230114583333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.2000368508425625,
            "data_time": 0.13641001284122467,
            "batch_time": 0.15681466460227966,
            "samples_per_second": 960694.2926677389,
            "samples_per_second_per_gpu": 120086.78658346736,
            "loss_sequences_lower_95": 7.133785178444602,
            "loss_sequences_upper_95": 7.27823108326305,
            "loss_tokens_lower_95": 7.170928851040927,
            "loss_tokens_upper_95": 7.229537998546253,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.637542342305531,
            "data_time": 0.08519858866930008,
            "batch_time": 0.11968652158975601,
            "samples_per_second": 3365109.0434941687,
            "samples_per_second_per_gpu": 420638.6304367711,
            "loss_sequences_lower_95": 6.473580799158391,
            "loss_sequences_upper_95": 6.802732524371356,
            "loss_tokens_lower_95": 6.622822177083333,
            "loss_tokens_upper_95": 6.652510895833333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.9755714876985175,
            "data_time": 0.08238472789525986,
            "batch_time": 0.11942890286445618,
            "samples_per_second": 3729109.8450171137,
            "samples_per_second_per_gpu": 466138.7306271392,
            "loss_sequences_lower_95": 6.887503172539784,
            "loss_sequences_upper_95": 7.0850045156353065,
            "loss_tokens_lower_95": 6.9630257291666675,
            "loss_tokens_upper_95": 6.988006385416666,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.446598232769575,
            "data_time": 0.14449289441108704,
            "batch_time": 0.17474688589572906,
            "samples_per_second": 2310723.6120856586,
            "samples_per_second_per_gpu": 288840.4515107073,
            "loss_sequences_lower_95": 6.3092830845567045,
            "loss_sequences_upper_95": 6.684489465932377,
            "loss_tokens_lower_95": 6.432046821469166,
            "loss_tokens_upper_95": 6.461259923215772,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.105661675353078,
            "data_time": 0.026429815183986318,
            "batch_time": 0.07078476683659987,
            "samples_per_second": 4533123.035761554,
            "samples_per_second_per_gpu": 566640.3794701942,
            "loss_sequences_lower_95": 5.086890472555548,
            "loss_sequences_upper_95": 5.123831908292978,
            "loss_tokens_lower_95": 5.086409243252386,
            "loss_tokens_upper_95": 5.124292232120602,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.627556432601679,
            "data_time": 0.027296946570277213,
            "batch_time": 0.07107300646603107,
            "samples_per_second": 4516024.035423087,
            "samples_per_second_per_gpu": 564503.0044278859,
            "loss_sequences_lower_95": 4.644016245052032,
            "loss_sequences_upper_95": 4.670052430183852,
            "loss_tokens_lower_95": 4.615766017472513,
            "loss_tokens_upper_95": 4.637065934952927,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.045205490398812,
            "data_time": 0.04665196935335795,
            "batch_time": 0.08871933652295007,
            "samples_per_second": 4388727.681162766,
            "samples_per_second_per_gpu": 548590.9601453458,
            "loss_sequences_lower_95": 7.473476248819083,
            "loss_sequences_upper_95": 7.739326499394781,
            "loss_tokens_lower_95": 6.921751446345604,
            "loss_tokens_upper_95": 7.1201274603697575,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.87041442044576,
            "data_time": 0.03764656682809194,
            "batch_time": 0.08164967224001884,
            "samples_per_second": 4577653.246198928,
            "samples_per_second_per_gpu": 572206.655774866,
            "loss_sequences_lower_95": 7.286519108072917,
            "loss_sequences_upper_95": 7.470424300130208,
            "loss_tokens_lower_95": 6.772773597189466,
            "loss_tokens_upper_95": 6.898798938679245,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.751674565697728,
            "data_time": 0.07036272933085759,
            "batch_time": 0.11081949373086293,
            "samples_per_second": 3829572.687326174,
            "samples_per_second_per_gpu": 478696.5859157718,
            "loss_sequences_lower_95": 5.815361674253103,
            "loss_sequences_upper_95": 5.88648733467293,
            "loss_tokens_lower_95": 5.728546428326558,
            "loss_tokens_upper_95": 5.765774857427483,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.045315525748513,
            "data_time": 0.35189180076122284,
            "batch_time": 0.395511731505394,
            "samples_per_second": 2524803.8334154086,
            "samples_per_second_per_gpu": 315600.4791769261,
            "loss_sequences_lower_95": 6.904273099032316,
            "loss_sequences_upper_95": 7.248385162353515,
            "loss_tokens_lower_95": 6.996480273102892,
            "loss_tokens_upper_95": 7.08486423586617,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.669759281314149,
            "data_time": 0.3447471708059311,
            "batch_time": 0.3903689831495285,
            "samples_per_second": 2763985.267814101,
            "samples_per_second_per_gpu": 345498.15847676265,
            "loss_sequences_lower_95": 5.634308758171237,
            "loss_sequences_upper_95": 5.844729626713967,
            "loss_tokens_lower_95": 5.631091449094448,
            "loss_tokens_upper_95": 5.731902650537407,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.760157167116801,
            "data_time": 0.18266691267490387,
            "batch_time": 0.2138439640402794,
            "samples_per_second": 2407548.7728282344,
            "samples_per_second_per_gpu": 300943.5966035293,
            "loss_sequences_lower_95": 5.683680674235026,
            "loss_sequences_upper_95": 5.797153086344401,
            "loss_tokens_lower_95": 5.670807224268967,
            "loss_tokens_upper_95": 5.859653151315281,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.938483940229824,
            "data_time": 0.024140872806310654,
            "batch_time": 0.06846630349755287,
            "samples_per_second": 4492813.359645219,
            "samples_per_second_per_gpu": 561601.6699556523,
            "loss_sequences_lower_95": 9.019075902084051,
            "loss_sequences_upper_95": 9.095256850210372,
            "loss_tokens_lower_95": 8.881253583544364,
            "loss_tokens_upper_95": 8.961132597860624,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.9485640162570705,
            "data_time": 0.047139979898929596,
            "batch_time": 0.08915453553199768,
            "samples_per_second": 4331896.900542267,
            "samples_per_second_per_gpu": 541487.1125677833,
            "loss_sequences_lower_95": 7.156548846209491,
            "loss_sequences_upper_95": 7.453671048867582,
            "loss_tokens_lower_95": 5.8026049760338605,
            "loss_tokens_upper_95": 5.951697963676487,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.687769657515829,
            "data_time": 0.06995189487934113,
            "batch_time": 0.11173134446144103,
            "samples_per_second": 4290870.774042199,
            "samples_per_second_per_gpu": 536358.8467552749,
            "loss_sequences_lower_95": 6.372639824668703,
            "loss_sequences_upper_95": 6.708724694365935,
            "loss_tokens_lower_95": 5.588558462659326,
            "loss_tokens_upper_95": 5.761984917639762,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.727991705071436,
            "data_time": 0.33085446059703827,
            "batch_time": 0.37287600338459015,
            "samples_per_second": 2525380.589534651,
            "samples_per_second_per_gpu": 315672.5736918314,
            "loss_sequences_lower_95": 5.683321432871361,
            "loss_sequences_upper_95": 5.7731585498269835,
            "loss_tokens_lower_95": 5.682959667188392,
            "loss_tokens_upper_95": 5.773100441013841,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.997462115287781,
            "data_time": 0.3043103963136673,
            "batch_time": 0.3290846347808838,
            "samples_per_second": 1543701.1409589767,
            "samples_per_second_per_gpu": 192962.6426198721,
            "loss_sequences_lower_95": 4.927349731445313,
            "loss_sequences_upper_95": 5.365154251098632,
            "loss_tokens_lower_95": 4.739712901106888,
            "loss_tokens_upper_95": 5.246308716038685,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.1283682044233245,
            "data_time": 0.04889507591724396,
            "batch_time": 0.09231094270944595,
            "samples_per_second": 4548420.557970447,
            "samples_per_second_per_gpu": 568552.5697463058,
            "loss_sequences_lower_95": 5.069248601626471,
            "loss_sequences_upper_95": 5.188582165189995,
            "loss_tokens_lower_95": 5.068966465523446,
            "loss_tokens_upper_95": 5.188685144055463,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.39263865168616,
            "data_time": 0.07484453618526458,
            "batch_time": 0.11793633699417114,
            "samples_per_second": 4351196.009698303,
            "samples_per_second_per_gpu": 543899.5012122879,
            "loss_sequences_lower_95": 5.330850627127483,
            "loss_sequences_upper_95": 5.452870751833154,
            "loss_tokens_lower_95": 5.329499365754249,
            "loss_tokens_upper_95": 5.454283978136518,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.044726282260882,
            "data_time": 0.053548483178019524,
            "batch_time": 0.09475547634065151,
            "samples_per_second": 4211808.825133529,
            "samples_per_second_per_gpu": 526476.1031416911,
            "loss_sequences_lower_95": 5.26514034499541,
            "loss_sequences_upper_95": 5.378580897417369,
            "loss_tokens_lower_95": 5.010879662585421,
            "loss_tokens_upper_95": 5.0708334963015345,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.610323367118835,
            "data_time": 0.16992278397083282,
            "batch_time": 0.21508505195379257,
            "samples_per_second": 3929840.362244672,
            "samples_per_second_per_gpu": 491230.045280584,
            "loss_sequences_lower_95": 7.246314367675781,
            "loss_sequences_upper_95": 7.776241381835938,
            "loss_tokens_lower_95": 6.371314042076853,
            "loss_tokens_upper_95": 6.723076346376572,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.123042464256287,
            "data_time": 0.13165327906608582,
            "batch_time": 0.1491704136133194,
            "samples_per_second": 774294.0692233038,
            "samples_per_second_per_gpu": 96786.75865291298,
            "loss_sequences_lower_95": 4.802392303943634,
            "loss_sequences_upper_95": 5.55391104221344,
            "loss_tokens_lower_95": 4.5618034450487155,
            "loss_tokens_upper_95": 5.502007284931753,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.6087248544583375,
            "data_time": 0.2917083203792572,
            "batch_time": 0.32743681967258453,
            "samples_per_second": 2213885.967865533,
            "samples_per_second_per_gpu": 276735.74598319165,
            "loss_sequences_lower_95": 6.442618499405083,
            "loss_sequences_upper_95": 7.033290398257902,
            "loss_tokens_lower_95": 5.331730904735062,
            "loss_tokens_upper_95": 5.746267717398006,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.985041074848633,
            "data_time": 0.04938373631901211,
            "batch_time": 0.09359348482555813,
            "samples_per_second": 4517200.045466485,
            "samples_per_second_per_gpu": 564650.0056833107,
            "loss_sequences_lower_95": 4.9373474147758305,
            "loss_sequences_upper_95": 5.032052602972095,
            "loss_tokens_lower_95": 4.937787454520943,
            "loss_tokens_upper_95": 5.032361922424503,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.814989891233616,
            "data_time": 0.031322567945434934,
            "batch_time": 0.074174498518308,
            "samples_per_second": 4427976.8724226765,
            "samples_per_second_per_gpu": 553497.1090528346,
            "loss_sequences_lower_95": 7.834616819085969,
            "loss_sequences_upper_95": 8.007650105975888,
            "loss_tokens_lower_95": 7.718648925959259,
            "loss_tokens_upper_95": 7.888576052252324,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.419202176642505,
            "data_time": 0.15810221433639526,
            "batch_time": 0.1878942921757698,
            "samples_per_second": 1932875.3683683344,
            "samples_per_second_per_gpu": 241609.4210460418,
            "loss_sequences_lower_95": 4.2887717893272095,
            "loss_sequences_upper_95": 4.6415796859797105,
            "loss_tokens_lower_95": 4.220183891416368,
            "loss_tokens_upper_95": 4.554388410795849,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.704015281318959,
            "data_time": 0.07925377786159515,
            "batch_time": 0.1237289160490036,
            "samples_per_second": 4425207.211695747,
            "samples_per_second_per_gpu": 553150.9014619684,
            "loss_sequences_lower_95": 4.756327934235029,
            "loss_sequences_upper_95": 4.890522208511124,
            "loss_tokens_lower_95": 4.62284287866133,
            "loss_tokens_upper_95": 4.776291519911434,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.536943828187337,
            "data_time": 0.30321137607097626,
            "batch_time": 0.3380602300167084,
            "samples_per_second": 2272506.8398969932,
            "samples_per_second_per_gpu": 284063.35498712416,
            "loss_sequences_lower_95": 6.290420513618283,
            "loss_sequences_upper_95": 6.866752196521294,
            "loss_tokens_lower_95": 6.360705407097523,
            "loss_tokens_upper_95": 6.750982795453965,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.340935663357571,
            "data_time": 0.027439910042622247,
            "batch_time": 0.07135967326388463,
            "samples_per_second": 4443517.543772151,
            "samples_per_second_per_gpu": 555439.6929715188,
            "loss_sequences_lower_95": 4.328225928388802,
            "loss_sequences_upper_95": 4.353557918614973,
            "loss_tokens_lower_95": 4.328450690138028,
            "loss_tokens_upper_95": 4.353535179692188,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.6619198854687145,
            "data_time": 0.3220175802707672,
            "batch_time": 0.3474452793598175,
            "samples_per_second": 1081766.3341931733,
            "samples_per_second_per_gpu": 135220.79177414667,
            "loss_sequences_lower_95": 5.508903088615936,
            "loss_sequences_upper_95": 5.868085664915807,
            "loss_tokens_lower_95": 5.41204116805671,
            "loss_tokens_upper_95": 5.8279881113261265,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.63783496117192,
            "data_time": 0.023058237433433534,
            "batch_time": 0.06740776588519415,
            "samples_per_second": 4473875.251331266,
            "samples_per_second_per_gpu": 559234.4064164083,
            "loss_sequences_lower_95": 6.066466962706368,
            "loss_sequences_upper_95": 6.107258746069182,
            "loss_tokens_lower_95": 5.581627550773694,
            "loss_tokens_upper_95": 5.621018882978723,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.966018466949462,
            "data_time": 0.08875469490885735,
            "batch_time": 0.13376165553927422,
            "samples_per_second": 4337547.3450577045,
            "samples_per_second_per_gpu": 542193.4181322131,
            "loss_sequences_lower_95": 8.692363916015625,
            "loss_sequences_upper_95": 9.15452822265625,
            "loss_tokens_lower_95": 8.727319770217033,
            "loss_tokens_upper_95": 9.187620446484676,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.690021479648093,
            "data_time": 0.31993208825588226,
            "batch_time": 0.36231690645217896,
            "samples_per_second": 2516413.514425279,
            "samples_per_second_per_gpu": 314551.6893031599,
            "loss_sequences_lower_95": 4.545134104853091,
            "loss_sequences_upper_95": 4.833394377335258,
            "loss_tokens_lower_95": 4.544605474057406,
            "loss_tokens_upper_95": 4.835556561013926,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 10.457239540417989,
            "data_time": 0.0692003642519315,
            "batch_time": 0.10937462498744328,
            "samples_per_second": 4025494.80278234,
            "samples_per_second_per_gpu": 503186.8503477925,
            "loss_sequences_lower_95": 10.335148999763257,
            "loss_sequences_upper_95": 10.57796610514323,
            "loss_tokens_lower_95": 10.333697639234138,
            "loss_tokens_upper_95": 10.580952370383523,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.117180546124776,
            "data_time": 0.061889740327994026,
            "batch_time": 0.1066553145647049,
            "samples_per_second": 4496039.565761332,
            "samples_per_second_per_gpu": 562004.9457201666,
            "loss_sequences_lower_95": 4.225357877604166,
            "loss_sequences_upper_95": 4.320791064453124,
            "loss_tokens_lower_95": 4.062964420143057,
            "loss_tokens_upper_95": 4.155491602891156,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.462431106113252,
            "data_time": 0.32086339592933655,
            "batch_time": 0.3618709444999695,
            "samples_per_second": 2174624.5722913016,
            "samples_per_second_per_gpu": 271828.0715364127,
            "loss_sequences_lower_95": 6.131203308105469,
            "loss_sequences_upper_95": 6.793944702148438,
            "loss_tokens_lower_95": 6.135395827520461,
            "loss_tokens_upper_95": 6.787484363374256,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.997336849570274,
            "data_time": 0.1397172212600708,
            "batch_time": 0.15677523612976074,
            "samples_per_second": 876707.45229898,
            "samples_per_second_per_gpu": 109588.4315373725,
            "loss_sequences_lower_95": 5.705481934547424,
            "loss_sequences_upper_95": 7.209061932563782,
            "loss_tokens_lower_95": 5.549380574963756,
            "loss_tokens_upper_95": 6.160009501349065,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.705568502426147,
            "data_time": 0.08732181787490845,
            "batch_time": 0.13181958720088005,
            "samples_per_second": 4451643.104615815,
            "samples_per_second_per_gpu": 556455.3880769769,
            "loss_sequences_lower_95": 7.765931982421875,
            "loss_sequences_upper_95": 8.104658276367187,
            "loss_tokens_lower_95": 7.550215657552084,
            "loss_tokens_upper_95": 7.844467219886844,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.200847085952759,
            "data_time": 0.08761290088295937,
            "batch_time": 0.13139360398054123,
            "samples_per_second": 4449971.122058797,
            "samples_per_second_per_gpu": 556246.3902573496,
            "loss_sequences_lower_95": 7.44009462890625,
            "loss_sequences_upper_95": 7.6593626953125,
            "loss_tokens_lower_95": 7.105619733310904,
            "loss_tokens_upper_95": 7.2716424490251,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.33205827881816,
            "data_time": 0.03653271868824959,
            "batch_time": 0.08008861045042674,
            "samples_per_second": 4634324.452955821,
            "samples_per_second_per_gpu": 579290.5566194776,
            "loss_sequences_lower_95": 5.299360192785157,
            "loss_sequences_upper_95": 5.365104365820377,
            "loss_tokens_lower_95": 5.299030050494468,
            "loss_tokens_upper_95": 5.365519044769318,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.925402267737323,
            "data_time": 0.11759952207406361,
            "batch_time": 0.15822241206963858,
            "samples_per_second": 4053836.9996100985,
            "samples_per_second_per_gpu": 506729.6249512623,
            "loss_sequences_lower_95": 4.83327394828269,
            "loss_sequences_upper_95": 5.0162643127910185,
            "loss_tokens_lower_95": 4.832786098310292,
            "loss_tokens_upper_95": 5.015392672631049,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.19819398355484,
            "data_time": 0.09283298626542091,
            "batch_time": 0.13766663521528244,
            "samples_per_second": 4262401.912008224,
            "samples_per_second_per_gpu": 532800.239001028,
            "loss_sequences_lower_95": 7.1606011352539065,
            "loss_sequences_upper_95": 7.237277819824219,
            "loss_tokens_lower_95": 7.160188867187499,
            "loss_tokens_upper_95": 7.236203125,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.681297820496356,
            "data_time": 0.026925742271400634,
            "batch_time": 0.07077214760439736,
            "samples_per_second": 4499081.906566988,
            "samples_per_second_per_gpu": 562385.2383208735,
            "loss_sequences_lower_95": 7.3500440146050146,
            "loss_sequences_upper_95": 7.428859981078524,
            "loss_tokens_lower_95": 6.588777383573988,
            "loss_tokens_upper_95": 6.648582149388944,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.042104540476158,
            "data_time": 0.19531133345195226,
            "batch_time": 0.22689407212393625,
            "samples_per_second": 1931077.961035175,
            "samples_per_second_per_gpu": 241384.74512939688,
            "loss_sequences_lower_95": 4.879710160440474,
            "loss_sequences_upper_95": 5.203487157109958,
            "loss_tokens_lower_95": 4.874940149107976,
            "loss_tokens_upper_95": 5.204313363601912,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.117658529094621,
            "data_time": 0.17228373885154724,
            "batch_time": 0.21740945428609848,
            "samples_per_second": 3985623.562257875,
            "samples_per_second_per_gpu": 498202.9452822344,
            "loss_sequences_lower_95": 5.010299371457567,
            "loss_sequences_upper_95": 5.223051901424632,
            "loss_tokens_lower_95": 5.011620351753983,
            "loss_tokens_upper_95": 5.223837016984528,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.562003076487162,
            "data_time": 0.028251757379621267,
            "batch_time": 0.07175802439451218,
            "samples_per_second": 4507227.742050617,
            "samples_per_second_per_gpu": 563403.4677563271,
            "loss_sequences_lower_95": 7.01482379793389,
            "loss_sequences_upper_95": 7.102041973470969,
            "loss_tokens_lower_95": 6.4761341543470685,
            "loss_tokens_upper_95": 6.551081354025369,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.155552513384945,
            "data_time": 0.33001483976840973,
            "batch_time": 0.36807018518447876,
            "samples_per_second": 1890872.0760475146,
            "samples_per_second_per_gpu": 236359.00950593932,
            "loss_sequences_lower_95": 5.045044510452835,
            "loss_sequences_upper_95": 5.268764411078559,
            "loss_tokens_lower_95": 5.044796777149988,
            "loss_tokens_upper_95": 5.267375199252336,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.324018904193096,
            "data_time": 0.04008505092217372,
            "batch_time": 0.08401764470797318,
            "samples_per_second": 4516349.789757569,
            "samples_per_second_per_gpu": 564543.7237196962,
            "loss_sequences_lower_95": 8.298289268563648,
            "loss_sequences_upper_95": 8.349932342435972,
            "loss_tokens_lower_95": 8.29807856101873,
            "loss_tokens_upper_95": 8.349602879515482,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.912312269210815,
            "data_time": 0.3157426118850708,
            "batch_time": 0.3553396612405777,
            "samples_per_second": 2715519.417076517,
            "samples_per_second_per_gpu": 339439.9271345646,
            "loss_sequences_lower_95": 4.74193852989419,
            "loss_sequences_upper_95": 5.079729483891459,
            "loss_tokens_lower_95": 4.738970325062576,
            "loss_tokens_upper_95": 5.081656106004437,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.51345903078715,
            "data_time": 0.27304790914058685,
            "batch_time": 0.29384586215019226,
            "samples_per_second": 1142857.9607874383,
            "samples_per_second_per_gpu": 142857.2450984298,
            "loss_sequences_lower_95": 6.159935302734375,
            "loss_sequences_upper_95": 7.008249816894531,
            "loss_tokens_lower_95": 5.82396191491021,
            "loss_tokens_upper_95": 7.077386972639295,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.147006090482076,
            "data_time": 0.3047667145729065,
            "batch_time": 0.32405032217502594,
            "samples_per_second": 1571949.364407323,
            "samples_per_second_per_gpu": 196493.67055091538,
            "loss_sequences_lower_95": 5.89139269510905,
            "loss_sequences_upper_95": 6.907939771016438,
            "loss_tokens_lower_95": 5.37568319084939,
            "loss_tokens_upper_95": 6.719081990102704,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.930800270507543,
            "data_time": 0.042624903576714654,
            "batch_time": 0.08636728141989027,
            "samples_per_second": 4325234.371789484,
            "samples_per_second_per_gpu": 540654.2964736855,
            "loss_sequences_lower_95": 8.902908962513807,
            "loss_sequences_upper_95": 8.958506722316825,
            "loss_tokens_lower_95": 8.903966281986376,
            "loss_tokens_upper_95": 8.957455414672312,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.973063657737534,
            "data_time": 0.022622932708419568,
            "batch_time": 0.06708334171615833,
            "samples_per_second": 4483054.758638556,
            "samples_per_second_per_gpu": 560381.8448298195,
            "loss_sequences_lower_95": 6.5774740829942555,
            "loss_sequences_upper_95": 6.609799831066642,
            "loss_tokens_lower_95": 5.908606807210807,
            "loss_tokens_upper_95": 5.9395738254588535,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.4966854974040835,
            "data_time": 0.29122525453567505,
            "batch_time": 0.32444244623184204,
            "samples_per_second": 2073193.9613602688,
            "samples_per_second_per_gpu": 259149.2451700336,
            "loss_sequences_lower_95": 7.505598714032511,
            "loss_sequences_upper_95": 7.887679525059977,
            "loss_tokens_lower_95": 7.370264103663873,
            "loss_tokens_upper_95": 7.589034445717422,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.446668006278372,
            "data_time": 0.19004404544830322,
            "batch_time": 0.20803315937519073,
            "samples_per_second": 1121347.0121600707,
            "samples_per_second_per_gpu": 140168.37652000884,
            "loss_sequences_lower_95": 9.065466762233426,
            "loss_sequences_upper_95": 10.027408867913323,
            "loss_tokens_lower_95": 8.912822431399498,
            "loss_tokens_upper_95": 9.817402856143904,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.605610562533867,
            "data_time": 0.30892637372016907,
            "batch_time": 0.3429350256919861,
            "samples_per_second": 2164034.0072164573,
            "samples_per_second_per_gpu": 270504.25090205716,
            "loss_sequences_lower_95": 7.541100683444884,
            "loss_sequences_upper_95": 7.833284071015148,
            "loss_tokens_lower_95": 7.478778582317073,
            "loss_tokens_upper_95": 7.666559576061816,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.567511352097116,
            "data_time": 0.3027243912220001,
            "batch_time": 0.3374345004558563,
            "samples_per_second": 2418999.253594976,
            "samples_per_second_per_gpu": 302374.906699372,
            "loss_sequences_lower_95": 7.502527599799923,
            "loss_sequences_upper_95": 7.78284713000786,
            "loss_tokens_lower_95": 7.471623924830685,
            "loss_tokens_upper_95": 7.6308719384931685,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.744235399292736,
            "data_time": 0.3164532333612442,
            "batch_time": 0.3514566421508789,
            "samples_per_second": 2076384.187936285,
            "samples_per_second_per_gpu": 259548.02349203563,
            "loss_sequences_lower_95": 7.758276069455031,
            "loss_sequences_upper_95": 8.113775113733803,
            "loss_tokens_lower_95": 7.587105584432504,
            "loss_tokens_upper_95": 7.831997050657705,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.549975738292787,
            "data_time": 0.2840394079685211,
            "batch_time": 0.3201790452003479,
            "samples_per_second": 2462983.4269234873,
            "samples_per_second_per_gpu": 307872.9283654359,
            "loss_sequences_lower_95": 7.486917709722752,
            "loss_sequences_upper_95": 7.753446811582984,
            "loss_tokens_lower_95": 7.4581292363342095,
            "loss_tokens_upper_95": 7.605077079226295,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.747426521704064,
            "data_time": 0.2915261685848236,
            "batch_time": 0.32497115433216095,
            "samples_per_second": 2277998.1454553166,
            "samples_per_second_per_gpu": 284749.7681819146,
            "loss_sequences_lower_95": 7.6230216079617135,
            "loss_sequences_upper_95": 7.803769072538577,
            "loss_tokens_lower_95": 7.698358506690552,
            "loss_tokens_upper_95": 7.805867920992981,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.581464276081178,
            "data_time": 0.31858550012111664,
            "batch_time": 0.35399167239665985,
            "samples_per_second": 2321441.499809332,
            "samples_per_second_per_gpu": 290180.1874761665,
            "loss_sequences_lower_95": 7.591493429788729,
            "loss_sequences_upper_95": 7.81094914878287,
            "loss_tokens_lower_95": 7.484038400862718,
            "loss_tokens_upper_95": 7.610726611946203,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/checkpoints/epoch_2.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-2.0/params.txt",
    "uuid": "eda945b4-cd91-470c-a90d-168c8033827d",
    "creation_date": "2023_12_14-04_59_10"
}