{
    "name": "c4_original-d=576_l=24_h=8-2.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 6147095040,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 2.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "1229419008",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=576_l=24_h=8-2.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.9767327109972634,
            "data_time": 0.035001080483198166,
            "batch_time": 0.36994775384664536,
            "samples_per_second": 838494.3693112809,
            "samples_per_second_per_gpu": 104811.79616391011,
            "loss_sequences_lower_95": 3.8576171239217123,
            "loss_sequences_upper_95": 4.094598210652669,
            "loss_tokens_lower_95": 3.961446107228597,
            "loss_tokens_upper_95": 3.9917141723632814,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2951036895940184,
            "data_time": 0.0010846584086625366,
            "batch_time": 0.030372161650728893,
            "samples_per_second": 1089487.2643198275,
            "samples_per_second_per_gpu": 136185.90803997844,
            "loss_sequences_lower_95": 3.292319532803327,
            "loss_sequences_upper_95": 3.2978378146360545,
            "loss_tokens_lower_95": 3.2843876197916666,
            "loss_tokens_upper_95": 3.3057097708333334,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.580099577806434,
            "data_time": 0.009457884788513183,
            "batch_time": 0.038565670013427734,
            "samples_per_second": 1065991.0637048401,
            "samples_per_second_per_gpu": 133248.88296310502,
            "loss_sequences_lower_95": 3.5560746547154016,
            "loss_sequences_upper_95": 3.6053497252172355,
            "loss_tokens_lower_95": 3.5657635208333334,
            "loss_tokens_upper_95": 3.594887604166667,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.299614296844325,
            "data_time": 0.001516260990971013,
            "batch_time": 0.03049396988200514,
            "samples_per_second": 1102527.2697290727,
            "samples_per_second_per_gpu": 137815.9087161341,
            "loss_sequences_lower_95": 3.287870227931701,
            "loss_sequences_upper_95": 3.3117189815560564,
            "loss_tokens_lower_95": 3.288590630208333,
            "loss_tokens_upper_95": 3.31045340625,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3143285282274855,
            "data_time": 0.009201450651860331,
            "batch_time": 0.03835619018372312,
            "samples_per_second": 1066912.0407895085,
            "samples_per_second_per_gpu": 133364.00509868856,
            "loss_sequences_lower_95": 3.280048110868683,
            "loss_sequences_upper_95": 3.348818149294727,
            "loss_tokens_lower_95": 3.3035584375,
            "loss_tokens_upper_95": 3.324786140625,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.762416362114134,
            "data_time": 0.003671420981054721,
            "batch_time": 0.03268324065467586,
            "samples_per_second": 1101846.169421166,
            "samples_per_second_per_gpu": 137730.77117764574,
            "loss_sequences_lower_95": 3.7256378360543425,
            "loss_sequences_upper_95": 3.799740366154508,
            "loss_tokens_lower_95": 3.7501337083333333,
            "loss_tokens_upper_95": 3.7745076458333333,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5014701150388134,
            "data_time": 0.001539719143660578,
            "batch_time": 0.029860917837351415,
            "samples_per_second": 1128251.814917381,
            "samples_per_second_per_gpu": 141031.47686467262,
            "loss_sequences_lower_95": 3.4698491310586737,
            "loss_sequences_upper_95": 3.532040377869898,
            "loss_tokens_lower_95": 3.4870837083333335,
            "loss_tokens_upper_95": 3.5160125208333333,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9581866616853243,
            "data_time": 0.0016295483101366555,
            "batch_time": 0.030139600318229032,
            "samples_per_second": 1123372.5142327277,
            "samples_per_second_per_gpu": 140421.56427909096,
            "loss_sequences_lower_95": 3.947437285258508,
            "loss_sequences_upper_95": 3.9696623445680626,
            "loss_tokens_lower_95": 3.946541072916667,
            "loss_tokens_upper_95": 3.9700076145833334,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.693014583936552,
            "data_time": 0.016669701962243943,
            "batch_time": 0.07005601553689866,
            "samples_per_second": 1059707.447588139,
            "samples_per_second_per_gpu": 132463.43094851737,
            "loss_sequences_lower_95": 3.65007580392729,
            "loss_sequences_upper_95": 3.7402852252246888,
            "loss_tokens_lower_95": 3.68158434375,
            "loss_tokens_upper_95": 3.70448603125,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.739289011408689,
            "data_time": 0.009003972634673119,
            "batch_time": 0.03811411093920469,
            "samples_per_second": 1069890.085794916,
            "samples_per_second_per_gpu": 133736.2607243645,
            "loss_sequences_lower_95": 4.710847551549376,
            "loss_sequences_upper_95": 4.773703618106163,
            "loss_tokens_lower_95": 4.72624025,
            "loss_tokens_upper_95": 4.752801864583334,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6739384921197944,
            "data_time": 0.0012527610478776225,
            "batch_time": 0.029786627728437914,
            "samples_per_second": 1121561.095136929,
            "samples_per_second_per_gpu": 140195.13689211613,
            "loss_sequences_lower_95": 3.6665843987147975,
            "loss_sequences_upper_95": 3.6813947646507126,
            "loss_tokens_lower_95": 3.6624652916666665,
            "loss_tokens_upper_95": 3.685296697916667,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.470187543780046,
            "data_time": 0.002427798524486532,
            "batch_time": 0.03086542924377543,
            "samples_per_second": 1124911.6547814708,
            "samples_per_second_per_gpu": 140613.95684768385,
            "loss_sequences_lower_95": 3.461454445657408,
            "loss_sequences_upper_95": 3.478765917439999,
            "loss_tokens_lower_95": 3.4589309635416665,
            "loss_tokens_upper_95": 3.4814459375,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.110410873111315,
            "data_time": 0.009163818811710644,
            "batch_time": 0.03790247487456431,
            "samples_per_second": 1069480.9239119496,
            "samples_per_second_per_gpu": 133685.1154889937,
            "loss_sequences_lower_95": 4.069274085240239,
            "loss_sequences_upper_95": 4.155706019527289,
            "loss_tokens_lower_95": 4.09707475,
            "loss_tokens_upper_95": 4.123327197916667,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.29925649083559,
            "data_time": 0.009202308388820208,
            "batch_time": 0.03806115526602088,
            "samples_per_second": 1068023.4906086733,
            "samples_per_second_per_gpu": 133502.93632608416,
            "loss_sequences_lower_95": 3.2424593364148424,
            "loss_sequences_upper_95": 3.3561211859864275,
            "loss_tokens_lower_95": 3.2874468749999997,
            "loss_tokens_upper_95": 3.31083146875,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.765194567767057,
            "data_time": 0.07438702242715019,
            "batch_time": 0.10955227272851127,
            "samples_per_second": 532922.719558131,
            "samples_per_second_per_gpu": 66615.33994476637,
            "loss_sequences_lower_95": 4.698004107041792,
            "loss_sequences_upper_95": 4.834245456348766,
            "loss_tokens_lower_95": 4.736485732685436,
            "loss_tokens_upper_95": 4.793978162245317,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8509773610284657,
            "data_time": 0.012617518956010992,
            "batch_time": 0.041660266843709076,
            "samples_per_second": 1043254.9282329499,
            "samples_per_second_per_gpu": 130406.86602911874,
            "loss_sequences_lower_95": 3.770544887353658,
            "loss_sequences_upper_95": 3.9294969030441416,
            "loss_tokens_lower_95": 3.8378516770833335,
            "loss_tokens_upper_95": 3.863944291666667,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.8854913296359825,
            "data_time": 0.011482556660970053,
            "batch_time": 0.04050266742706299,
            "samples_per_second": 1064279.6010981717,
            "samples_per_second_per_gpu": 133034.95013727146,
            "loss_sequences_lower_95": 5.826822213447189,
            "loss_sequences_upper_95": 5.942889307671298,
            "loss_tokens_lower_95": 5.873190624999999,
            "loss_tokens_upper_95": 5.898011364583333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.953700634299732,
            "data_time": 0.03456134349107742,
            "batch_time": 0.06513412669301033,
            "samples_per_second": 932041.6010393533,
            "samples_per_second_per_gpu": 116505.20012991916,
            "loss_sequences_lower_95": 3.867909697235608,
            "loss_sequences_upper_95": 4.088174576055808,
            "loss_tokens_lower_95": 3.9395328521728517,
            "loss_tokens_upper_95": 3.9680652055584016,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.28880928254097,
            "data_time": 0.0017370421753385888,
            "batch_time": 0.030809085264564374,
            "samples_per_second": 1094010.734124366,
            "samples_per_second_per_gpu": 136751.34176554575,
            "loss_sequences_lower_95": 5.2666524839543865,
            "loss_sequences_upper_95": 5.311291571713431,
            "loss_tokens_lower_95": 5.26642282972511,
            "loss_tokens_upper_95": 5.311205307069862,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0041337922093785,
            "data_time": 0.0018992279745211266,
            "batch_time": 0.030660651814026437,
            "samples_per_second": 1103570.3543515166,
            "samples_per_second_per_gpu": 137946.29429393957,
            "loss_sequences_lower_95": 3.011441540257481,
            "loss_sequences_upper_95": 3.037463780053463,
            "loss_tokens_lower_95": 2.978118329094294,
            "loss_tokens_upper_95": 2.9969071953043693,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.601861165867248,
            "data_time": 0.002896695177471941,
            "batch_time": 0.03170295664996723,
            "samples_per_second": 1102542.7085201642,
            "samples_per_second_per_gpu": 137817.83856502053,
            "loss_sequences_lower_95": 4.849332643850378,
            "loss_sequences_upper_95": 5.148472673779227,
            "loss_tokens_lower_95": 4.078208323095366,
            "loss_tokens_upper_95": 4.295773452856952,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.744596679190795,
            "data_time": 0.0041776356544900445,
            "batch_time": 0.03304016574266109,
            "samples_per_second": 1092751.1686646664,
            "samples_per_second_per_gpu": 136593.8960830833,
            "loss_sequences_lower_95": 4.858293391927083,
            "loss_sequences_upper_95": 5.062993693033854,
            "loss_tokens_lower_95": 4.4370008475825475,
            "loss_tokens_upper_95": 4.58205886399371,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3614270952816168,
            "data_time": 0.004113170356232656,
            "batch_time": 0.03376419547816028,
            "samples_per_second": 1066716.3932710777,
            "samples_per_second_per_gpu": 133339.5491588847,
            "loss_sequences_lower_95": 3.4050109446816954,
            "loss_sequences_upper_95": 3.472368093726308,
            "loss_tokens_lower_95": 3.262223967407963,
            "loss_tokens_upper_95": 3.2952374641552127,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5751346859064967,
            "data_time": 0.021168068051338196,
            "batch_time": 0.05158330712999616,
            "samples_per_second": 998287.313670784,
            "samples_per_second_per_gpu": 124785.914208848,
            "loss_sequences_lower_95": 3.499786370017312,
            "loss_sequences_upper_95": 3.714809209650213,
            "loss_tokens_lower_95": 3.4675264773128087,
            "loss_tokens_upper_95": 3.5403089413483824,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.78222165788923,
            "data_time": 0.01888064481317997,
            "batch_time": 0.047907598316669464,
            "samples_per_second": 1007876.3662958617,
            "samples_per_second_per_gpu": 125984.54578698271,
            "loss_sequences_lower_95": 3.769687450175383,
            "loss_sequences_upper_95": 3.9838807896205357,
            "loss_tokens_lower_95": 3.6547382786052807,
            "loss_tokens_upper_95": 3.75549251280044,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.226992588837941,
            "data_time": 0.015337746876936693,
            "batch_time": 0.04524291325838138,
            "samples_per_second": 998354.7712460426,
            "samples_per_second_per_gpu": 124794.34640575532,
            "loss_sequences_lower_95": 4.177641042073567,
            "loss_sequences_upper_95": 4.300573048909505,
            "loss_tokens_lower_95": 4.086311247911677,
            "loss_tokens_upper_95": 4.325198503831409,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.688132227864231,
            "data_time": 0.0015087385321212513,
            "batch_time": 0.030303491803208578,
            "samples_per_second": 1104180.8865691002,
            "samples_per_second_per_gpu": 138022.61082113752,
            "loss_sequences_lower_95": 5.696032631345037,
            "loss_sequences_upper_95": 5.775881102446361,
            "loss_tokens_lower_95": 5.550192568848846,
            "loss_tokens_upper_95": 5.632484129751183,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.5991772262737,
            "data_time": 0.002770763915657197,
            "batch_time": 0.03178212706674666,
            "samples_per_second": 1094050.612542186,
            "samples_per_second_per_gpu": 136756.32656777324,
            "loss_sequences_lower_95": 5.178859579683555,
            "loss_sequences_upper_95": 5.5000912753018465,
            "loss_tokens_lower_95": 3.8132681670782245,
            "loss_tokens_upper_95": 3.955883223614186,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.173930432711038,
            "data_time": 0.0048020917016106685,
            "batch_time": 0.033842545506116505,
            "samples_per_second": 1081493.2067779296,
            "samples_per_second_per_gpu": 135186.6508472412,
            "loss_sequences_lower_95": 4.635090163299249,
            "loss_sequences_upper_95": 5.002875651597163,
            "loss_tokens_lower_95": 3.7555762459286264,
            "loss_tokens_upper_95": 3.9196142452076272,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.8265899588528285,
            "data_time": 0.020228415727615356,
            "batch_time": 0.04999705297606332,
            "samples_per_second": 1008783.999200906,
            "samples_per_second_per_gpu": 126097.99990011325,
            "loss_sequences_lower_95": 5.754024850505672,
            "loss_sequences_upper_95": 5.8989222975082045,
            "loss_tokens_lower_95": 5.7561006171518265,
            "loss_tokens_upper_95": 5.897015882518193,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.412634263038635,
            "data_time": 0.041003598616673395,
            "batch_time": 0.07294438893978412,
            "samples_per_second": 879024.1854083134,
            "samples_per_second_per_gpu": 109878.02317603918,
            "loss_sequences_lower_95": 3.2784308395385744,
            "loss_sequences_upper_95": 3.6463972854614255,
            "loss_tokens_lower_95": 3.1083861267417405,
            "loss_tokens_upper_95": 3.553871637934649,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.10626009176795,
            "data_time": 0.0030927453304360982,
            "batch_time": 0.032785490490419976,
            "samples_per_second": 1072887.1728958224,
            "samples_per_second_per_gpu": 134110.8966119778,
            "loss_sequences_lower_95": 5.04868986194185,
            "loss_sequences_upper_95": 5.1637600604928995,
            "loss_tokens_lower_95": 5.047709850986759,
            "loss_tokens_upper_95": 5.164658033201126,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.150574147945345,
            "data_time": 0.0044329154744233825,
            "batch_time": 0.03381389544799705,
            "samples_per_second": 1076892.079156204,
            "samples_per_second_per_gpu": 134611.5098945255,
            "loss_sequences_lower_95": 5.098989685708435,
            "loss_sequences_upper_95": 5.200470045684889,
            "loss_tokens_lower_95": 5.098155968468468,
            "loss_tokens_upper_95": 5.20236965370022,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3527626553479424,
            "data_time": 0.0032508759467699613,
            "batch_time": 0.0318953259717311,
            "samples_per_second": 1099655.388790068,
            "samples_per_second_per_gpu": 137456.9235987585,
            "loss_sequences_lower_95": 3.4934506222265203,
            "loss_sequences_upper_95": 3.6161186342789375,
            "loss_tokens_lower_95": 3.1757404264656186,
            "loss_tokens_upper_95": 3.2305620818887832,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.4866411204338075,
            "data_time": 0.01023394986987114,
            "batch_time": 0.039731922559440136,
            "samples_per_second": 1040318.8887752133,
            "samples_per_second_per_gpu": 130039.86109690166,
            "loss_sequences_lower_95": 5.661586376953125,
            "loss_sequences_upper_95": 6.2243455078125,
            "loss_tokens_lower_95": 4.85848418563348,
            "loss_tokens_upper_95": 5.221000617864824,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9225582629442215,
            "data_time": 0.139612078666687,
            "batch_time": 0.17390918731689453,
            "samples_per_second": 549695.9520464571,
            "samples_per_second_per_gpu": 68711.99400580714,
            "loss_sequences_lower_95": 3.672338056564331,
            "loss_sequences_upper_95": 4.253269529342651,
            "loss_tokens_lower_95": 3.470268784446278,
            "loss_tokens_upper_95": 4.235255344434716,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.789090432654852,
            "data_time": 0.024909042297525608,
            "batch_time": 0.05396260606481674,
            "samples_per_second": 951282.6400788695,
            "samples_per_second_per_gpu": 118910.33000985869,
            "loss_sequences_lower_95": 5.262301653281026,
            "loss_sequences_upper_95": 6.0669455166520745,
            "loss_tokens_lower_95": 3.4577110729196026,
            "loss_tokens_upper_95": 3.8836399357912907,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.326319319080088,
            "data_time": 0.0026714327848619884,
            "batch_time": 0.03132559214201239,
            "samples_per_second": 1101172.541265221,
            "samples_per_second_per_gpu": 137646.56765815263,
            "loss_sequences_lower_95": 2.296100117494676,
            "loss_sequences_upper_95": 2.355627153429104,
            "loss_tokens_lower_95": 2.296386772079101,
            "loss_tokens_upper_95": 2.355859375,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1191086111476496,
            "data_time": 0.002442345676562746,
            "batch_time": 0.031509396916079714,
            "samples_per_second": 1093679.5932528647,
            "samples_per_second_per_gpu": 136709.94915660808,
            "loss_sequences_lower_95": 3.088141493350354,
            "loss_sequences_upper_95": 3.2493687877510675,
            "loss_tokens_lower_95": 2.9471892186859163,
            "loss_tokens_upper_95": 3.1053041620146735,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2345737373872554,
            "data_time": 0.016806984941164654,
            "batch_time": 0.04704335331916809,
            "samples_per_second": 972426.9977047425,
            "samples_per_second_per_gpu": 121553.37471309281,
            "loss_sequences_lower_95": 3.0629638392409997,
            "loss_sequences_upper_95": 3.4576365097101793,
            "loss_tokens_lower_95": 2.981164934721178,
            "loss_tokens_upper_95": 3.278933063687052,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6143171895462975,
            "data_time": 0.004447972774505616,
            "batch_time": 0.03336468301713467,
            "samples_per_second": 1084442.5337147065,
            "samples_per_second_per_gpu": 135555.31671433832,
            "loss_sequences_lower_95": 3.6399170692642557,
            "loss_sequences_upper_95": 3.788862851932468,
            "loss_tokens_lower_95": 3.4725091172098255,
            "loss_tokens_upper_95": 3.617016178861468,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0046081005073173,
            "data_time": 0.029838607424781435,
            "batch_time": 0.0592813804036095,
            "samples_per_second": 991390.5434397436,
            "samples_per_second_per_gpu": 123923.81792996795,
            "loss_sequences_lower_95": 2.8577811776137936,
            "loss_sequences_upper_95": 3.318090485363472,
            "loss_tokens_lower_95": 2.7073021943184137,
            "loss_tokens_upper_95": 3.0704293230812856,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.855508756604188,
            "data_time": 0.001919162354372141,
            "batch_time": 0.030816481154392682,
            "samples_per_second": 1098488.363615026,
            "samples_per_second_per_gpu": 137311.04545187825,
            "loss_sequences_lower_95": 4.84209787269954,
            "loss_sequences_upper_95": 4.868748007413982,
            "loss_tokens_lower_95": 4.84208517484747,
            "loss_tokens_upper_95": 4.868580034366248,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.8364175416312172,
            "data_time": 0.04893606792796742,
            "batch_time": 0.0864510796286843,
            "samples_per_second": 904283.1715742473,
            "samples_per_second_per_gpu": 113035.39644678091,
            "loss_sequences_lower_95": 0.7978558882926274,
            "loss_sequences_upper_95": 0.9092419837285014,
            "loss_tokens_lower_95": 0.7089352407621385,
            "loss_tokens_upper_95": 0.8940684417131125,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.374328090869031,
            "data_time": 0.0013865468912732171,
            "batch_time": 0.03033365935012261,
            "samples_per_second": 1097675.4006230286,
            "samples_per_second_per_gpu": 137209.42507787858,
            "loss_sequences_lower_95": 5.823608940972223,
            "loss_sequences_upper_95": 5.877330155922432,
            "loss_tokens_lower_95": 4.688961702127659,
            "loss_tokens_upper_95": 4.744170176499034,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.600205251693725,
            "data_time": 0.005579797048417348,
            "batch_time": 0.03465090289948478,
            "samples_per_second": 1078271.1820367742,
            "samples_per_second_per_gpu": 134783.89775459678,
            "loss_sequences_lower_95": 6.597463720703125,
            "loss_sequences_upper_95": 6.818079870605469,
            "loss_tokens_lower_95": 6.367575236097013,
            "loss_tokens_upper_95": 6.570689775624759,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.33409615703251,
            "data_time": 0.021092952308008225,
            "batch_time": 0.051757044711355435,
            "samples_per_second": 983796.0229604865,
            "samples_per_second_per_gpu": 122974.50287006081,
            "loss_sequences_lower_95": 5.175022317637568,
            "loss_sequences_upper_95": 5.49404314124066,
            "loss_tokens_lower_95": 5.177004739512568,
            "loss_tokens_upper_95": 5.4915628847868545,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.02621278220957,
            "data_time": 0.004323927752942924,
            "batch_time": 0.033076262258621585,
            "samples_per_second": 1093323.5460719485,
            "samples_per_second_per_gpu": 136665.44325899356,
            "loss_sequences_lower_95": 5.969389417243726,
            "loss_sequences_upper_95": 6.083103064334754,
            "loss_tokens_lower_95": 5.968738153631037,
            "loss_tokens_upper_95": 6.0840451512192235,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.9044128475387891,
            "data_time": 0.0038708185261868418,
            "batch_time": 0.0326369417474625,
            "samples_per_second": 1099237.7309596515,
            "samples_per_second_per_gpu": 137404.71636995644,
            "loss_sequences_lower_95": 0.9370619649251303,
            "loss_sequences_upper_95": 0.9878746256510417,
            "loss_tokens_lower_95": 0.8418726103722739,
            "loss_tokens_upper_95": 0.8917207019526561,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.564069486799694,
            "data_time": 0.02188956524644579,
            "batch_time": 0.05174035685403006,
            "samples_per_second": 947998.3033655264,
            "samples_per_second_per_gpu": 118499.7879206908,
            "loss_sequences_lower_95": 6.235321451822917,
            "loss_sequences_upper_95": 6.888989780970982,
            "loss_tokens_lower_95": 6.232984720865885,
            "loss_tokens_upper_95": 6.8955177234468,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.3970713429152966,
            "data_time": 0.14249959588050842,
            "batch_time": 0.1779787838459015,
            "samples_per_second": 533600.5796170292,
            "samples_per_second_per_gpu": 66700.07245212865,
            "loss_sequences_lower_95": 2.191310876607895,
            "loss_sequences_upper_95": 3.2433484911918637,
            "loss_tokens_lower_95": 1.8418356480549292,
            "loss_tokens_upper_95": 2.360651490516269,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.467900729417801,
            "data_time": 0.005511749831456986,
            "batch_time": 0.0342741674847073,
            "samples_per_second": 1087636.7088304358,
            "samples_per_second_per_gpu": 135954.58860380447,
            "loss_sequences_lower_95": 7.374032006835938,
            "loss_sequences_upper_95": 7.751754443359375,
            "loss_tokens_lower_95": 7.181031987792143,
            "loss_tokens_upper_95": 7.517900774805679,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.675722054481506,
            "data_time": 0.005554805199305217,
            "batch_time": 0.03442231672150748,
            "samples_per_second": 1085448.6074467194,
            "samples_per_second_per_gpu": 135681.07593083993,
            "loss_sequences_lower_95": 6.762188061523437,
            "loss_sequences_upper_95": 6.972706579589843,
            "loss_tokens_lower_95": 6.45167455773504,
            "loss_tokens_upper_95": 6.6452687331778915,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.057135587405168,
            "data_time": 0.003470978210602317,
            "batch_time": 0.032341370534737374,
            "samples_per_second": 1094583.350525547,
            "samples_per_second_per_gpu": 136822.91881569338,
            "loss_sequences_lower_95": 5.021493345101407,
            "loss_sequences_upper_95": 5.0931695542317925,
            "loss_tokens_lower_95": 5.021744483387634,
            "loss_tokens_upper_95": 5.092541982693597,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.110138780693488,
            "data_time": 0.007742609502325606,
            "batch_time": 0.03661238390873566,
            "samples_per_second": 1069221.0547119551,
            "samples_per_second_per_gpu": 133652.6318389944,
            "loss_sequences_lower_95": 4.9993546211957565,
            "loss_sequences_upper_95": 5.222894887147778,
            "loss_tokens_lower_95": 4.994141262540802,
            "loss_tokens_upper_95": 5.220472400858655,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.36733513212204,
            "data_time": 0.005161664788685147,
            "batch_time": 0.03425184554523892,
            "samples_per_second": 1078263.791917709,
            "samples_per_second_per_gpu": 134782.97398971362,
            "loss_sequences_lower_95": 7.294052221679687,
            "loss_sequences_upper_95": 7.443634375,
            "loss_tokens_lower_95": 7.292571887207031,
            "loss_tokens_upper_95": 7.442058959960938,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5915743487503806,
            "data_time": 0.0019750825398811094,
            "batch_time": 0.030589756397759307,
            "samples_per_second": 1108138.6647531185,
            "samples_per_second_per_gpu": 138517.33309413982,
            "loss_sequences_lower_95": 4.2276581717123936,
            "loss_sequences_upper_95": 4.338850067629494,
            "loss_tokens_lower_95": 2.836024096345391,
            "loss_tokens_upper_95": 2.907077772416073,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.669436275069393,
            "data_time": 0.017776691913604738,
            "batch_time": 0.04697236674172538,
            "samples_per_second": 1008003.1886955295,
            "samples_per_second_per_gpu": 126000.39858694118,
            "loss_sequences_lower_95": 5.472039533017287,
            "loss_sequences_upper_95": 5.866901340769298,
            "loss_tokens_lower_95": 5.474453211542386,
            "loss_tokens_upper_95": 5.863863315866954,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.647087498272167,
            "data_time": 0.009668620303273201,
            "batch_time": 0.03868706617504358,
            "samples_per_second": 1073917.559565889,
            "samples_per_second_per_gpu": 134239.69494573612,
            "loss_sequences_lower_95": 5.508025548598345,
            "loss_sequences_upper_95": 5.781181844075521,
            "loss_tokens_lower_95": 5.511170893650429,
            "loss_tokens_upper_95": 5.778611222809436,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.014894200792949,
            "data_time": 0.0020530741940322115,
            "batch_time": 0.030919974373113696,
            "samples_per_second": 1098405.59650519,
            "samples_per_second_per_gpu": 137300.69956314875,
            "loss_sequences_lower_95": 4.501318108597959,
            "loss_sequences_upper_95": 4.608369971247495,
            "loss_tokens_lower_95": 3.2986367506851013,
            "loss_tokens_upper_95": 3.37953758403371,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.270626238414219,
            "data_time": 0.02442605545123418,
            "batch_time": 0.05399187157551447,
            "samples_per_second": 1001176.2805554173,
            "samples_per_second_per_gpu": 125147.03506942716,
            "loss_sequences_lower_95": 5.13788415010644,
            "loss_sequences_upper_95": 5.4006115383572055,
            "loss_tokens_lower_95": 5.137568034822979,
            "loss_tokens_upper_95": 5.39979250468905,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5820170148067882,
            "data_time": 0.0032768652698204775,
            "batch_time": 0.03199286877162873,
            "samples_per_second": 1100906.3027344432,
            "samples_per_second_per_gpu": 137613.2878418054,
            "loss_sequences_lower_95": 3.5501311117283065,
            "loss_sequences_upper_95": 3.6136428618358183,
            "loss_tokens_lower_95": 3.5510018874235474,
            "loss_tokens_upper_95": 3.6131037291547208,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.7119230367604965,
            "data_time": 0.022282786802812055,
            "batch_time": 0.053348083929582076,
            "samples_per_second": 935551.898914328,
            "samples_per_second_per_gpu": 116943.987364291,
            "loss_sequences_lower_95": 5.518720519426957,
            "loss_sequences_upper_95": 5.904246239523286,
            "loss_tokens_lower_95": 5.518995296144948,
            "loss_tokens_upper_95": 5.908496419665882,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.4057252888878184,
            "data_time": 0.07040317356586456,
            "batch_time": 0.10168665647506714,
            "samples_per_second": 774495.3330177685,
            "samples_per_second_per_gpu": 96811.91662722106,
            "loss_sequences_lower_95": 2.1648272895812988,
            "loss_sequences_upper_95": 2.779712670644124,
            "loss_tokens_lower_95": 1.9606316142612032,
            "loss_tokens_upper_95": 2.714665200975206,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.3761230130990345,
            "data_time": 0.06828342378139496,
            "batch_time": 0.10072264075279236,
            "samples_per_second": 733781.4961392202,
            "samples_per_second_per_gpu": 91722.68701740252,
            "loss_sequences_lower_95": 2.1463203684488934,
            "loss_sequences_upper_95": 2.786746196746826,
            "loss_tokens_lower_95": 1.7855554644981129,
            "loss_tokens_upper_95": 2.636521658736668,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0327700010982284,
            "data_time": 0.0033473078821461785,
            "batch_time": 0.03210123599873633,
            "samples_per_second": 1098970.000022874,
            "samples_per_second_per_gpu": 137371.25000285925,
            "loss_sequences_lower_95": 3.0039521656963366,
            "loss_sequences_upper_95": 3.0630953076099963,
            "loss_tokens_lower_95": 3.003874838917526,
            "loss_tokens_upper_95": 3.06167991761782,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.6124646690269415,
            "data_time": 0.001248180328234541,
            "batch_time": 0.03008505593602193,
            "samples_per_second": 1101772.8290396968,
            "samples_per_second_per_gpu": 137721.6036299621,
            "loss_sequences_lower_95": 0.7168930653711192,
            "loss_sequences_upper_95": 0.7362054185828831,
            "loss_tokens_lower_95": 0.49686972710816213,
            "loss_tokens_upper_95": 0.5068759711404145,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.53378649959414,
            "data_time": 0.0351279191672802,
            "batch_time": 0.0661587342619896,
            "samples_per_second": 953739.4298020669,
            "samples_per_second_per_gpu": 119217.42872525836,
            "loss_sequences_lower_95": 4.547196119413601,
            "loss_sequences_upper_95": 4.902570048655112,
            "loss_tokens_lower_95": 4.186608739790592,
            "loss_tokens_upper_95": 4.388384063728199,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.9658436259707885,
            "data_time": 0.11140236400422596,
            "batch_time": 0.14393048059372676,
            "samples_per_second": 565273.0044930845,
            "samples_per_second_per_gpu": 70659.12556163556,
            "loss_sequences_lower_95": 6.538157457918735,
            "loss_sequences_upper_95": 7.619805681383287,
            "loss_tokens_lower_95": 6.2398605064109525,
            "loss_tokens_upper_95": 7.35681644015842,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.366514810701696,
            "data_time": 0.027562672183627172,
            "batch_time": 0.057304586683000834,
            "samples_per_second": 980992.2708869668,
            "samples_per_second_per_gpu": 122624.03386087086,
            "loss_sequences_lower_95": 4.314603917191668,
            "loss_sequences_upper_95": 4.648078806807355,
            "loss_tokens_lower_95": 4.019764039831266,
            "loss_tokens_upper_95": 4.191001112082107,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.4538297609585085,
            "data_time": 0.029050739038558232,
            "batch_time": 0.060885330041249595,
            "samples_per_second": 926123.9639109304,
            "samples_per_second_per_gpu": 115765.4954888663,
            "loss_sequences_lower_95": 4.420086493143221,
            "loss_sequences_upper_95": 4.7234562385373,
            "loss_tokens_lower_95": 4.120336413011611,
            "loss_tokens_upper_95": 4.263403815637094,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.5827029769013565,
            "data_time": 0.028168947923751103,
            "batch_time": 0.05890194858823504,
            "samples_per_second": 962335.630879504,
            "samples_per_second_per_gpu": 120291.953859938,
            "loss_sequences_lower_95": 4.514392210797566,
            "loss_sequences_upper_95": 4.8890821689512665,
            "loss_tokens_lower_95": 4.225359837168451,
            "loss_tokens_upper_95": 4.454210386884974,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.523443368876853,
            "data_time": 0.02772256306239537,
            "batch_time": 0.058211627460661386,
            "samples_per_second": 963798.9806085575,
            "samples_per_second_per_gpu": 120474.87257606968,
            "loss_sequences_lower_95": 4.4754540234077265,
            "loss_sequences_upper_95": 4.756603892256574,
            "loss_tokens_lower_95": 4.216064785871179,
            "loss_tokens_upper_95": 4.350457516489,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.760715555700457,
            "data_time": 0.028256330961062583,
            "batch_time": 0.05809809249124409,
            "samples_per_second": 997880.5436401278,
            "samples_per_second_per_gpu": 124735.06795501597,
            "loss_sequences_lower_95": 4.733233244522758,
            "loss_sequences_upper_95": 5.015715680803572,
            "loss_tokens_lower_95": 4.503761493675051,
            "loss_tokens_upper_95": 4.614048184319246,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.7629379571937935,
            "data_time": 0.027496891362326487,
            "batch_time": 0.05814672935576666,
            "samples_per_second": 971636.1119386561,
            "samples_per_second_per_gpu": 121454.51399233201,
            "loss_sequences_lower_95": 4.788691218306379,
            "loss_sequences_upper_95": 5.094093099454554,
            "loss_tokens_lower_95": 4.430399230993822,
            "loss_tokens_upper_95": 4.5530573057941535,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/params.txt",
    "uuid": "1d14144e-e426-4241-b6b4-c1b7102fd847",
    "creation_date": "2023_12_14-04_59_41"
}