{
    "name": "c4_original-d=1024_l=24_h=8-2.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 16464650240,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 2.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "3292930048",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=1024_l=24_h=8-2.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.610610395669937,
            "data_time": 0.041619714349508286,
            "batch_time": 0.4455937296152115,
            "samples_per_second": 690697.6833040905,
            "samples_per_second_per_gpu": 86337.21041301131,
            "loss_sequences_lower_95": 3.5003023783365887,
            "loss_sequences_upper_95": 3.7203263092041015,
            "loss_tokens_lower_95": 3.5956730206807457,
            "loss_tokens_upper_95": 3.6254426129659016,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.943318389885835,
            "data_time": 0.0010078749880943175,
            "batch_time": 0.03660115842731931,
            "samples_per_second": 900813.6706468903,
            "samples_per_second_per_gpu": 112601.70883086129,
            "loss_sequences_lower_95": 2.9404648372701443,
            "loss_sequences_upper_95": 2.9461385391298722,
            "loss_tokens_lower_95": 2.933057479166667,
            "loss_tokens_upper_95": 2.9534208854166666,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3163454041189078,
            "data_time": 0.008394585609436036,
            "batch_time": 0.043646520614624026,
            "samples_per_second": 873135.1743823987,
            "samples_per_second_per_gpu": 109141.89679779984,
            "loss_sequences_lower_95": 3.295179667570153,
            "loss_sequences_upper_95": 3.3389459539919484,
            "loss_tokens_lower_95": 3.30197396875,
            "loss_tokens_upper_95": 3.331128442708333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9321493572058137,
            "data_time": 0.0015029428820861014,
            "batch_time": 0.03678710190089125,
            "samples_per_second": 906300.074595399,
            "samples_per_second_per_gpu": 113287.50932442487,
            "loss_sequences_lower_95": 2.921769657095683,
            "loss_sequences_upper_95": 2.9427785317332473,
            "loss_tokens_lower_95": 2.92174078125,
            "loss_tokens_upper_95": 2.9424208750000003,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9611260453940895,
            "data_time": 0.008421784853080354,
            "batch_time": 0.043656391926495675,
            "samples_per_second": 870089.0576920771,
            "samples_per_second_per_gpu": 108761.13221150964,
            "loss_sequences_lower_95": 2.9262049346737364,
            "loss_sequences_upper_95": 2.995059794563868,
            "loss_tokens_lower_95": 2.950890182291667,
            "loss_tokens_upper_95": 2.9711919375,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4113796196528474,
            "data_time": 0.0034176937263944874,
            "batch_time": 0.03878405495830204,
            "samples_per_second": 903015.7226398801,
            "samples_per_second_per_gpu": 112876.96532998502,
            "loss_sequences_lower_95": 3.3727676899882097,
            "loss_sequences_upper_95": 3.450798452702774,
            "loss_tokens_lower_95": 3.3995708645833336,
            "loss_tokens_upper_95": 3.4231433645833333,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0807016411606147,
            "data_time": 0.0015188549898965907,
            "batch_time": 0.03680422129094309,
            "samples_per_second": 909385.6138427518,
            "samples_per_second_per_gpu": 113673.20173034398,
            "loss_sequences_lower_95": 3.04857823959662,
            "loss_sequences_upper_95": 3.1118213339046554,
            "loss_tokens_lower_95": 3.066784234375,
            "loss_tokens_upper_95": 3.094734458333333,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.66573893791718,
            "data_time": 0.001540011015460861,
            "batch_time": 0.03743910709717741,
            "samples_per_second": 908457.1229398828,
            "samples_per_second_per_gpu": 113557.14036748535,
            "loss_sequences_lower_95": 3.655787927437827,
            "loss_sequences_upper_95": 3.6762895226603405,
            "loss_tokens_lower_95": 3.6540220833333334,
            "loss_tokens_upper_95": 3.677588604166667,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3742835671921085,
            "data_time": 0.009759540595705546,
            "batch_time": 0.04731935451901148,
            "samples_per_second": 862925.081786855,
            "samples_per_second_per_gpu": 107865.63522335688,
            "loss_sequences_lower_95": 3.3286645904789127,
            "loss_sequences_upper_95": 3.424682790864774,
            "loss_tokens_lower_95": 3.3632794791666667,
            "loss_tokens_upper_95": 3.385395963541667,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.423457760113501,
            "data_time": 0.009075751528143883,
            "batch_time": 0.044749013148248196,
            "samples_per_second": 874286.8525447067,
            "samples_per_second_per_gpu": 109285.85656808833,
            "loss_sequences_lower_95": 4.392231635896585,
            "loss_sequences_upper_95": 4.461580190545485,
            "loss_tokens_lower_95": 4.410100333333333,
            "loss_tokens_upper_95": 4.4375622083333335,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3192384281892946,
            "data_time": 0.0012005318285962202,
            "batch_time": 0.0364724896515386,
            "samples_per_second": 910703.3060871544,
            "samples_per_second_per_gpu": 113837.9132608943,
            "loss_sequences_lower_95": 3.3118238070868165,
            "loss_sequences_upper_95": 3.3269294570435624,
            "loss_tokens_lower_95": 3.308080229166667,
            "loss_tokens_upper_95": 3.330126,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.095464792910937,
            "data_time": 0.002281802381504386,
            "batch_time": 0.03762517027017179,
            "samples_per_second": 906586.9080170426,
            "samples_per_second_per_gpu": 113323.36350213032,
            "loss_sequences_lower_95": 3.0869306825687866,
            "loss_sequences_upper_95": 3.103799716833481,
            "loss_tokens_lower_95": 3.0846127083333332,
            "loss_tokens_upper_95": 3.106360838541667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.806960063822129,
            "data_time": 0.00848117458961698,
            "batch_time": 0.04376888557856262,
            "samples_per_second": 866221.108239544,
            "samples_per_second_per_gpu": 108277.638529943,
            "loss_sequences_lower_95": 3.7680579380863337,
            "loss_sequences_upper_95": 3.849598158397249,
            "loss_tokens_lower_95": 3.7938845520833335,
            "loss_tokens_upper_95": 3.819998791666667,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.949543412494077,
            "data_time": 0.008879635913438532,
            "batch_time": 0.04428597085504418,
            "samples_per_second": 866829.1615729955,
            "samples_per_second_per_gpu": 108353.64519662443,
            "loss_sequences_lower_95": 2.8923928627899853,
            "loss_sequences_upper_95": 3.0048789527411373,
            "loss_tokens_lower_95": 2.9380213906250003,
            "loss_tokens_upper_95": 2.9607896145833332,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.362397920001637,
            "data_time": 0.07164487668446132,
            "batch_time": 0.1059618251664298,
            "samples_per_second": 515797.8462189449,
            "samples_per_second_per_gpu": 64474.73077736811,
            "loss_sequences_lower_95": 4.292886092446067,
            "loss_sequences_upper_95": 4.4328669461337,
            "loss_tokens_lower_95": 4.334216074510055,
            "loss_tokens_upper_95": 4.39119176864624,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.424372277523964,
            "data_time": 0.012310246174985712,
            "batch_time": 0.0478761995380575,
            "samples_per_second": 854640.7119083062,
            "samples_per_second_per_gpu": 106830.08898853828,
            "loss_sequences_lower_95": 3.3483688487950984,
            "loss_sequences_upper_95": 3.4978590806788676,
            "loss_tokens_lower_95": 3.4116158697916665,
            "loss_tokens_upper_95": 3.436784765625,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.729969953169609,
            "data_time": 0.011200300107399622,
            "batch_time": 0.04680316150188446,
            "samples_per_second": 869882.3135038023,
            "samples_per_second_per_gpu": 108735.28918797529,
            "loss_sequences_lower_95": 5.666514084144129,
            "loss_sequences_upper_95": 5.793441184655343,
            "loss_tokens_lower_95": 5.7171784895833335,
            "loss_tokens_upper_95": 5.743100510416666,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4636628217384464,
            "data_time": 0.03320198506116867,
            "batch_time": 0.07155609875917435,
            "samples_per_second": 775945.2330067643,
            "samples_per_second_per_gpu": 96993.15412584554,
            "loss_sequences_lower_95": 3.380640267544105,
            "loss_sequences_upper_95": 3.594241139146148,
            "loss_tokens_lower_95": 3.4500285727078794,
            "loss_tokens_upper_95": 3.477245962424356,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.532401166093297,
            "data_time": 0.0015213388041646172,
            "batch_time": 0.03691125503297819,
            "samples_per_second": 903107.1989724786,
            "samples_per_second_per_gpu": 112888.39987155983,
            "loss_sequences_lower_95": 4.5139829534833,
            "loss_sequences_upper_95": 4.551301299783685,
            "loss_tokens_lower_95": 4.513694512166626,
            "loss_tokens_upper_95": 4.5512020300705025,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6769978690137903,
            "data_time": 0.0020331696245321045,
            "batch_time": 0.03746678533068128,
            "samples_per_second": 900060.9400894151,
            "samples_per_second_per_gpu": 112507.61751117688,
            "loss_sequences_lower_95": 2.6779233712547925,
            "loss_sequences_upper_95": 2.7027670454015014,
            "loss_tokens_lower_95": 2.6543667705818104,
            "loss_tokens_upper_95": 2.6723304892269435,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8200028537014603,
            "data_time": 0.00288533580179601,
            "batch_time": 0.03846483315981075,
            "samples_per_second": 894558.0177180828,
            "samples_per_second_per_gpu": 111819.75221476034,
            "loss_sequences_lower_95": 4.088569536601027,
            "loss_sequences_upper_95": 4.3902462879553905,
            "loss_tokens_lower_95": 3.2564773585019804,
            "loss_tokens_upper_95": 3.4720402093850518,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.741534526762863,
            "data_time": 0.003178671160910992,
            "batch_time": 0.03851879974629017,
            "samples_per_second": 896714.0895248993,
            "samples_per_second_per_gpu": 112089.26119061241,
            "loss_sequences_lower_95": 3.8219498697916667,
            "loss_sequences_upper_95": 4.022304060872395,
            "loss_tokens_lower_95": 3.502881823653695,
            "loss_tokens_upper_95": 3.6448653940644653,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.842645740762093,
            "data_time": 0.0041414544233549415,
            "batch_time": 0.03960879032428448,
            "samples_per_second": 891702.9314738386,
            "samples_per_second_per_gpu": 111462.86643422983,
            "loss_sequences_lower_95": 2.8817754946485263,
            "loss_sequences_upper_95": 2.94422581508541,
            "loss_tokens_lower_95": 2.751617741808634,
            "loss_tokens_upper_95": 2.783802794179331,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1149165055968546,
            "data_time": 0.019343222890581404,
            "batch_time": 0.05496032110282353,
            "samples_per_second": 836109.1352337545,
            "samples_per_second_per_gpu": 104513.64190421932,
            "loss_sequences_lower_95": 3.0501624020663174,
            "loss_sequences_upper_95": 3.245617800625888,
            "loss_tokens_lower_95": 3.017496055565353,
            "loss_tokens_upper_95": 3.08449177416104,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3861472382837414,
            "data_time": 0.017800375819206238,
            "batch_time": 0.05350575968623161,
            "samples_per_second": 819815.039734653,
            "samples_per_second_per_gpu": 102476.87996683162,
            "loss_sequences_lower_95": 3.3637788733657525,
            "loss_sequences_upper_95": 3.569145644830198,
            "loss_tokens_lower_95": 3.2759451630733754,
            "loss_tokens_upper_95": 3.3736037139789268,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.591398727099101,
            "data_time": 0.016038339871626634,
            "batch_time": 0.05139398269164257,
            "samples_per_second": 835353.9597155679,
            "samples_per_second_per_gpu": 104419.24496444598,
            "loss_sequences_lower_95": 3.5495147399902343,
            "loss_sequences_upper_95": 3.6412299194335938,
            "loss_tokens_lower_95": 3.48001286491328,
            "loss_tokens_upper_95": 3.6824238516638985,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.801908399055321,
            "data_time": 0.0012945747788456197,
            "batch_time": 0.03674804745503221,
            "samples_per_second": 901754.1827162433,
            "samples_per_second_per_gpu": 112719.27283953041,
            "loss_sequences_lower_95": 4.807031692122681,
            "loss_sequences_upper_95": 4.88432836566852,
            "loss_tokens_lower_95": 4.673122138763797,
            "loss_tokens_upper_95": 4.753165865470758,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.995945577042472,
            "data_time": 0.00277365094063266,
            "batch_time": 0.03811212434064622,
            "samples_per_second": 899200.822268165,
            "samples_per_second_per_gpu": 112400.10278352062,
            "loss_sequences_lower_95": 4.541995074853351,
            "loss_sequences_upper_95": 4.863060691139915,
            "loss_tokens_lower_95": 3.2637930860528073,
            "loss_tokens_upper_95": 3.399003779855671,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6668533411754276,
            "data_time": 0.004877302694965053,
            "batch_time": 0.040164913277368285,
            "samples_per_second": 889088.2840164376,
            "samples_per_second_per_gpu": 111136.0355020547,
            "loss_sequences_lower_95": 4.064250448702139,
            "loss_sequences_upper_95": 4.425046745013866,
            "loss_tokens_lower_95": 3.2749740339625686,
            "loss_tokens_upper_95": 3.433530954100617,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.798374483030137,
            "data_time": 0.020232290029525757,
            "batch_time": 0.05626844082559858,
            "samples_per_second": 823917.5199301223,
            "samples_per_second_per_gpu": 102989.68999126529,
            "loss_sequences_lower_95": 5.721137923845961,
            "loss_sequences_upper_95": 5.871516355749679,
            "loss_tokens_lower_95": 5.723016468901612,
            "loss_tokens_upper_95": 5.871779754499322,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9922677540779112,
            "data_time": 0.04344991078743568,
            "batch_time": 0.07931699661108163,
            "samples_per_second": 757206.1065729451,
            "samples_per_second_per_gpu": 94650.76332161814,
            "loss_sequences_lower_95": 2.8663021697998046,
            "loss_sequences_upper_95": 3.2013422698974607,
            "loss_tokens_lower_95": 2.7104674474071304,
            "loss_tokens_upper_95": 3.1276599153849647,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.714072363891524,
            "data_time": 0.0030244406015595042,
            "batch_time": 0.03852358210550008,
            "samples_per_second": 897486.692158984,
            "samples_per_second_per_gpu": 112185.836519873,
            "loss_sequences_lower_95": 4.662962792668885,
            "loss_sequences_upper_95": 4.765781967178704,
            "loss_tokens_lower_95": 4.661831037195336,
            "loss_tokens_upper_95": 4.766082107715744,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.878146002079794,
            "data_time": 0.004599726802956611,
            "batch_time": 0.04010115206533311,
            "samples_per_second": 890980.3472352701,
            "samples_per_second_per_gpu": 111372.54340440876,
            "loss_sequences_lower_95": 4.818202123243627,
            "loss_sequences_upper_95": 4.937059997008728,
            "loss_tokens_lower_95": 4.81641135875755,
            "loss_tokens_upper_95": 4.938708786023239,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0003726981799153,
            "data_time": 0.003321388885217918,
            "batch_time": 0.038732480281887506,
            "samples_per_second": 892695.7866667358,
            "samples_per_second_per_gpu": 111586.97333334197,
            "loss_sequences_lower_95": 3.151314500677962,
            "loss_sequences_upper_95": 3.277670484009453,
            "loss_tokens_lower_95": 2.827751398658289,
            "loss_tokens_upper_95": 2.87987519536628,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.006567001581192,
            "data_time": 0.009903443977236748,
            "batch_time": 0.0450810045003891,
            "samples_per_second": 866479.0645101217,
            "samples_per_second_per_gpu": 108309.88306376521,
            "loss_sequences_lower_95": 5.17099296875,
            "loss_sequences_upper_95": 5.7399626708984375,
            "loss_tokens_lower_95": 4.43157418514229,
            "loss_tokens_upper_95": 4.789989087188741,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5135800391435623,
            "data_time": 0.1347445398569107,
            "batch_time": 0.1757924109697342,
            "samples_per_second": 477894.3231506931,
            "samples_per_second_per_gpu": 59736.790393836636,
            "loss_sequences_lower_95": 3.272219240665436,
            "loss_sequences_upper_95": 3.7954709053039553,
            "loss_tokens_lower_95": 3.067743077771417,
            "loss_tokens_upper_95": 3.8550900777180988,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.439154410499266,
            "data_time": 0.02484378662515194,
            "batch_time": 0.059782794181336744,
            "samples_per_second": 785688.236665855,
            "samples_per_second_per_gpu": 98211.02958323188,
            "loss_sequences_lower_95": 4.885781728810277,
            "loss_sequences_upper_95": 5.667693898869657,
            "loss_tokens_lower_95": 3.088914873366157,
            "loss_tokens_upper_95": 3.5055120457387967,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.206992117793032,
            "data_time": 0.00262150623732143,
            "batch_time": 0.03800893409384622,
            "samples_per_second": 895434.3038554857,
            "samples_per_second_per_gpu": 111929.28798193572,
            "loss_sequences_lower_95": 2.183043159028233,
            "loss_sequences_upper_95": 2.2308090803247187,
            "loss_tokens_lower_95": 2.1828822971275668,
            "loss_tokens_upper_95": 2.231487666472122,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.4046689260250558,
            "data_time": 0.0023101000624846523,
            "batch_time": 0.03775692790989548,
            "samples_per_second": 899693.4089827195,
            "samples_per_second_per_gpu": 112461.67612283994,
            "loss_sequences_lower_95": 2.3771322009402898,
            "loss_sequences_upper_95": 2.511480370582064,
            "loss_tokens_lower_95": 2.273418573093727,
            "loss_tokens_upper_95": 2.404702506237468,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9662495519652037,
            "data_time": 0.01714596814579434,
            "batch_time": 0.05207868913809458,
            "samples_per_second": 827741.5518827611,
            "samples_per_second_per_gpu": 103467.69398534513,
            "loss_sequences_lower_95": 2.8399081219683637,
            "loss_sequences_upper_95": 3.232840038481213,
            "loss_tokens_lower_95": 2.725299704637272,
            "loss_tokens_upper_95": 3.0108997101479384,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4027196102097172,
            "data_time": 0.004386353865265846,
            "batch_time": 0.03984516300261021,
            "samples_per_second": 885256.1246160716,
            "samples_per_second_per_gpu": 110657.01557700895,
            "loss_sequences_lower_95": 3.4480802588699935,
            "loss_sequences_upper_95": 3.6014759813412587,
            "loss_tokens_lower_95": 3.250803722920869,
            "loss_tokens_upper_95": 3.39115174293999,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.569345662506615,
            "data_time": 0.028097530206044514,
            "batch_time": 0.0637178137188866,
            "samples_per_second": 814822.4536711458,
            "samples_per_second_per_gpu": 101852.80670889323,
            "loss_sequences_lower_95": 2.4394692955947503,
            "loss_sequences_upper_95": 2.857095629994462,
            "loss_tokens_lower_95": 2.300734144400617,
            "loss_tokens_upper_95": 2.6352308214196953,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.606754483163631,
            "data_time": 0.0016892733594869262,
            "batch_time": 0.03710792510451386,
            "samples_per_second": 899244.9586706787,
            "samples_per_second_per_gpu": 112405.61983383483,
            "loss_sequences_lower_95": 4.589705128525705,
            "loss_sequences_upper_95": 4.62378824397692,
            "loss_tokens_lower_95": 4.589807580656756,
            "loss_tokens_upper_95": 4.623474773079616,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.703932744496077,
            "data_time": 0.0434218016537753,
            "batch_time": 0.07972126440568404,
            "samples_per_second": 728595.5847504918,
            "samples_per_second_per_gpu": 91074.44809381147,
            "loss_sequences_lower_95": 0.665811372034758,
            "loss_sequences_upper_95": 0.771854546926554,
            "loss_tokens_lower_95": 0.5936260742687276,
            "loss_tokens_upper_95": 0.7571111204680303,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.878541333058095,
            "data_time": 0.0011717393956859872,
            "batch_time": 0.036561950654016254,
            "samples_per_second": 902439.9884734665,
            "samples_per_second_per_gpu": 112804.99855918331,
            "loss_sequences_lower_95": 4.1666844372379455,
            "loss_sequences_upper_95": 4.204931098090277,
            "loss_tokens_lower_95": 3.4304297388781433,
            "loss_tokens_upper_95": 3.4704038261605414,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.468328731060028,
            "data_time": 0.005246371976912968,
            "batch_time": 0.040729857626415435,
            "samples_per_second": 886307.1270725799,
            "samples_per_second_per_gpu": 110788.39088407249,
            "loss_sequences_lower_95": 6.4738001586914065,
            "loss_sequences_upper_95": 6.7013804077148444,
            "loss_tokens_lower_95": 6.215264566868474,
            "loss_tokens_upper_95": 6.431499616323473,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.342631238439809,
            "data_time": 0.019550733647104038,
            "batch_time": 0.05542617733195677,
            "samples_per_second": 825395.4934656706,
            "samples_per_second_per_gpu": 103174.43668320883,
            "loss_sequences_lower_95": 4.211522289773693,
            "loss_sequences_upper_95": 4.474664107612941,
            "loss_tokens_lower_95": 4.212495574951172,
            "loss_tokens_upper_95": 4.473953738005265,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.212227882760944,
            "data_time": 0.004181454698723483,
            "batch_time": 0.03962919259645853,
            "samples_per_second": 890480.475495858,
            "samples_per_second_per_gpu": 111310.05943698225,
            "loss_sequences_lower_95": 5.175284312855114,
            "loss_sequences_upper_95": 5.250012290261009,
            "loss_tokens_lower_95": 5.175412653142756,
            "loss_tokens_upper_95": 5.248996128891454,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.9148470306793849,
            "data_time": 0.0036882317446647805,
            "batch_time": 0.03927094758825099,
            "samples_per_second": 891845.0197219397,
            "samples_per_second_per_gpu": 111480.62746524246,
            "loss_sequences_lower_95": 0.9325077311197917,
            "loss_sequences_upper_95": 0.9628716389973958,
            "loss_tokens_lower_95": 0.8669810580482193,
            "loss_tokens_upper_95": 0.9197106068990096,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.0316736096427555,
            "data_time": 0.020182002867971147,
            "batch_time": 0.05512067462716784,
            "samples_per_second": 806109.3970175425,
            "samples_per_second_per_gpu": 100763.67462719281,
            "loss_sequences_lower_95": 5.70157464890253,
            "loss_sequences_upper_95": 6.362707664853051,
            "loss_tokens_lower_95": 5.699551159086681,
            "loss_tokens_upper_95": 6.367249465215774,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.197441153228283,
            "data_time": 0.1418924331665039,
            "batch_time": 0.1805751770734787,
            "samples_per_second": 482928.9437671977,
            "samples_per_second_per_gpu": 60366.117970899715,
            "loss_sequences_lower_95": 2.0249631643295287,
            "loss_sequences_upper_95": 2.948587220907211,
            "loss_tokens_lower_95": 1.727171756705058,
            "loss_tokens_upper_95": 2.188155945453447,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.589835183620453,
            "data_time": 0.005596420121571374,
            "batch_time": 0.04094995913051423,
            "samples_per_second": 886322.6323312516,
            "samples_per_second_per_gpu": 110790.32904140645,
            "loss_sequences_lower_95": 7.48382109375,
            "loss_sequences_upper_95": 7.854268481445312,
            "loss_tokens_lower_95": 7.319098509709444,
            "loss_tokens_upper_95": 7.6425396656223565,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.999400876045227,
            "data_time": 0.005402023357058328,
            "batch_time": 0.04082810642227294,
            "samples_per_second": 887217.909810013,
            "samples_per_second_per_gpu": 110902.23872625163,
            "loss_sequences_lower_95": 7.084140893554688,
            "loss_sequences_upper_95": 7.290617724609374,
            "loss_tokens_lower_95": 6.771723588801171,
            "loss_tokens_upper_95": 6.972698623130007,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.434268998811999,
            "data_time": 0.003349774019375294,
            "batch_time": 0.03889481136232714,
            "samples_per_second": 892751.4673271454,
            "samples_per_second_per_gpu": 111593.93341589317,
            "loss_sequences_lower_95": 4.391436982418601,
            "loss_sequences_upper_95": 4.476336365053009,
            "loss_tokens_lower_95": 4.392813131835609,
            "loss_tokens_upper_95": 4.476116041029375,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.762245138303109,
            "data_time": 0.008007926162996322,
            "batch_time": 0.043585059866084075,
            "samples_per_second": 869780.3647014621,
            "samples_per_second_per_gpu": 108722.54558768276,
            "loss_sequences_lower_95": 3.6759390694754464,
            "loss_sequences_upper_95": 3.8491455734417004,
            "loss_tokens_lower_95": 3.6727994154125865,
            "loss_tokens_upper_95": 3.8478300830003116,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.59695663022995,
            "data_time": 0.005540491096557133,
            "batch_time": 0.04102630369246952,
            "samples_per_second": 885164.2089951953,
            "samples_per_second_per_gpu": 110645.52612439942,
            "loss_sequences_lower_95": 7.525557299804688,
            "loss_sequences_upper_95": 7.671803710937501,
            "loss_tokens_lower_95": 7.5246447143554684,
            "loss_tokens_upper_95": 7.669637268066406,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.843734716094657,
            "data_time": 0.0016854113126296853,
            "batch_time": 0.03711630410192594,
            "samples_per_second": 899788.3037118047,
            "samples_per_second_per_gpu": 112473.53796397558,
            "loss_sequences_lower_95": 3.342733515772824,
            "loss_sequences_upper_95": 3.4355765968690872,
            "loss_tokens_lower_95": 2.240122172163599,
            "loss_tokens_upper_95": 2.300955454953238,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.386888726433711,
            "data_time": 0.017906561919621058,
            "batch_time": 0.05332263197217669,
            "samples_per_second": 826227.7498754498,
            "samples_per_second_per_gpu": 103278.46873443123,
            "loss_sequences_lower_95": 4.240004502481488,
            "loss_sequences_upper_95": 4.533117049487669,
            "loss_tokens_lower_95": 4.239701000612174,
            "loss_tokens_upper_95": 4.533728778896047,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.393119941973219,
            "data_time": 0.009342193603515625,
            "batch_time": 0.04482546169310808,
            "samples_per_second": 878845.5933206499,
            "samples_per_second_per_gpu": 109855.69916508124,
            "loss_sequences_lower_95": 4.288448558134191,
            "loss_sequences_upper_95": 4.496084881950827,
            "loss_tokens_lower_95": 4.2901687921262255,
            "loss_tokens_upper_95": 4.494092167873009,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1063809501979938,
            "data_time": 0.0018641735858158876,
            "batch_time": 0.03725713185889627,
            "samples_per_second": 899511.4496972872,
            "samples_per_second_per_gpu": 112438.9312121609,
            "loss_sequences_lower_95": 3.490737030056057,
            "loss_sequences_upper_95": 3.583995555986158,
            "loss_tokens_lower_95": 2.5368422371640946,
            "loss_tokens_upper_95": 2.6061696681842763,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.635053405055293,
            "data_time": 0.024467999736468,
            "batch_time": 0.05990208685398102,
            "samples_per_second": 830891.376612885,
            "samples_per_second_per_gpu": 103861.42207661063,
            "loss_sequences_lower_95": 4.452902512323289,
            "loss_sequences_upper_95": 4.808901751482928,
            "loss_tokens_lower_95": 4.453167700389075,
            "loss_tokens_upper_95": 4.809582172373615,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.214220272947889,
            "data_time": 0.0031060598999880463,
            "batch_time": 0.038646932486649396,
            "samples_per_second": 893570.2297337584,
            "samples_per_second_per_gpu": 111696.2787167198,
            "loss_sequences_lower_95": 4.172020222608467,
            "loss_sequences_upper_95": 4.255932930762614,
            "loss_tokens_lower_95": 4.172534791905581,
            "loss_tokens_upper_95": 4.255863638128345,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.592081065895488,
            "data_time": 0.021571709892966532,
            "batch_time": 0.056737901947715066,
            "samples_per_second": 794193.0992623686,
            "samples_per_second_per_gpu": 99274.13740779608,
            "loss_sequences_lower_95": 4.426090670094907,
            "loss_sequences_upper_95": 4.755198587954623,
            "loss_tokens_lower_95": 4.4272483529396425,
            "loss_tokens_upper_95": 4.759708582313315,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6506785333156584,
            "data_time": 0.07211083173751831,
            "batch_time": 0.10830745846033096,
            "samples_per_second": 655371.103986696,
            "samples_per_second_per_gpu": 81921.387998337,
            "loss_sequences_lower_95": 2.470063184102376,
            "loss_sequences_upper_95": 2.936624685923258,
            "loss_tokens_lower_95": 2.194445180892944,
            "loss_tokens_upper_95": 2.828548542658488,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.571379800637563,
            "data_time": 0.07117055356502533,
            "batch_time": 0.10767599940299988,
            "samples_per_second": 649967.7731526583,
            "samples_per_second_per_gpu": 81245.97164408228,
            "loss_sequences_lower_95": 2.4376290257771807,
            "loss_sequences_upper_95": 3.055073375701904,
            "loss_tokens_lower_95": 1.9968493515186096,
            "loss_tokens_upper_95": 2.8486282391494577,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.4134764702225295,
            "data_time": 0.0032141194615804706,
            "batch_time": 0.0386821290477014,
            "samples_per_second": 895368.4437795945,
            "samples_per_second_per_gpu": 111921.05547244931,
            "loss_sequences_lower_95": 5.3953485997330635,
            "loss_sequences_upper_95": 5.431994963296208,
            "loss_tokens_lower_95": 5.395442013185751,
            "loss_tokens_upper_95": 5.431674236584131,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.6377706057597458,
            "data_time": 0.0011224638206040571,
            "batch_time": 0.0365576908742312,
            "samples_per_second": 901571.0266683336,
            "samples_per_second_per_gpu": 112696.3783335417,
            "loss_sequences_lower_95": 0.7388825959239084,
            "loss_sequences_upper_95": 0.7583222902677931,
            "loss_tokens_lower_95": 0.5201117667958942,
            "loss_tokens_upper_95": 0.5299870325816918,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.353041235856184,
            "data_time": 0.03631819784641266,
            "batch_time": 0.08467214554548264,
            "samples_per_second": 797509.3270277258,
            "samples_per_second_per_gpu": 99688.66587846572,
            "loss_sequences_lower_95": 4.416414852592888,
            "loss_sequences_upper_95": 4.783716637318529,
            "loss_tokens_lower_95": 4.015968182197633,
            "loss_tokens_upper_95": 4.2234946280666605,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.4632277359833585,
            "data_time": 0.09874296188354492,
            "batch_time": 0.13573431968688965,
            "samples_per_second": 513683.2038604212,
            "samples_per_second_per_gpu": 64210.40048255265,
            "loss_sequences_lower_95": 6.037131283734296,
            "loss_sequences_upper_95": 7.083214012352196,
            "loss_tokens_lower_95": 5.72964294810354,
            "loss_tokens_upper_95": 6.908609253683208,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.149265947865277,
            "data_time": 0.026765959603445872,
            "batch_time": 0.062445118313743955,
            "samples_per_second": 809088.0007329248,
            "samples_per_second_per_gpu": 101136.0000916156,
            "loss_sequences_lower_95": 4.121026127512862,
            "loss_sequences_upper_95": 4.458238490034894,
            "loss_tokens_lower_95": 3.7776895713966567,
            "loss_tokens_upper_95": 3.951057726424516,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.247615644117681,
            "data_time": 0.027960218134380523,
            "batch_time": 0.06382981368473598,
            "samples_per_second": 807824.5149095742,
            "samples_per_second_per_gpu": 100978.06436369677,
            "loss_sequences_lower_95": 4.211325408191216,
            "loss_sequences_upper_95": 4.520732758684856,
            "loss_tokens_lower_95": 3.90065268320482,
            "loss_tokens_upper_95": 4.0471419599140965,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.408817640891889,
            "data_time": 0.02967813469114758,
            "batch_time": 0.06624764771688552,
            "samples_per_second": 797659.866786019,
            "samples_per_second_per_gpu": 99707.48334825237,
            "loss_sequences_lower_95": 4.41068937720322,
            "loss_sequences_upper_95": 4.82721374325636,
            "loss_tokens_lower_95": 3.9999802750098836,
            "loss_tokens_upper_95": 4.231928258877228,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.361578606977695,
            "data_time": 0.029174702508108958,
            "batch_time": 0.0652909789766584,
            "samples_per_second": 800390.4725981043,
            "samples_per_second_per_gpu": 100048.80907476303,
            "loss_sequences_lower_95": 4.318823716698623,
            "loss_sequences_upper_95": 4.608962733571122,
            "loss_tokens_lower_95": 4.034036188333577,
            "loss_tokens_upper_95": 4.168155943493233,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.6584692860242,
            "data_time": 0.030893384674449027,
            "batch_time": 0.06718292354065696,
            "samples_per_second": 821600.3684317834,
            "samples_per_second_per_gpu": 102700.04605397292,
            "loss_sequences_lower_95": 4.631239053477412,
            "loss_sequences_upper_95": 4.940882352272175,
            "loss_tokens_lower_95": 4.362063674502779,
            "loss_tokens_upper_95": 4.4739969197864875,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.37965032530994,
            "data_time": 0.02810038271404448,
            "batch_time": 0.06510511466435023,
            "samples_per_second": 800724.9645022764,
            "samples_per_second_per_gpu": 100090.62056278455,
            "loss_sequences_lower_95": 4.420642145668588,
            "loss_sequences_upper_95": 4.73713521259587,
            "loss_tokens_lower_95": 4.01394645447182,
            "loss_tokens_upper_95": 4.13370466254144,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/params.txt",
    "uuid": "036aa198-4da2-4e15-8a8f-1fe2b8b00c6a",
    "creation_date": "2023_12_14-04_59_50"
}