{
    "name": "c4_original-d=1024_l=24_h=8-4.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 32929300480,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 2,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp",
            "--fsdp-limit-all-gathers"
        ],
        "chinchilla_multiplier": 4.0,
        "seed": 124
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--workers",
        "2",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--accum-freq",
        "2",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--logs",
        "logs/25614",
        "--train-num-samples",
        "6585860096",
        "--dataset-manifest",
        "<scrub>/openlm/scrub/datasets/original_c4/manifest.jsonl",
        "--data-key",
        "txt",
        "--name",
        "c4_original-d=1024_l=24_h=8-4.0",
        "--fsdp",
        "--fsdp-amp",
        "--fsdp-limit-all-gathers",
        "--val-data",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/paloma_val/00000001.tar",
        "--val-frequency",
        "5",
        "--val-data-key",
        "json",
        "txt",
        "json.gz",
        "--val-tok-ci",
        "--val-seq-ci",
        "--val-num-samples",
        "245760",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/openlm/scrub/experiments/411m_4x_c4_original/"
    ],
    "results": [
        {
            "loss": 3.5588855842749276,
            "data_time": 0.140150785446167,
            "batch_time": 1.661707118153572,
            "samples_per_second": 254785.22431652577,
            "samples_per_second_per_gpu": 31848.15303956572,
            "loss_sequences_lower_95": 3.4321963500976564,
            "loss_sequences_upper_95": 3.6898443349202474,
            "loss_tokens_lower_95": 3.543254820505778,
            "loss_tokens_upper_95": 3.57495449701945,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7869397413267945,
            "data_time": 0.002531208578006937,
            "batch_time": 0.11770198006764952,
            "samples_per_second": 1120121.2233163028,
            "samples_per_second_per_gpu": 140015.15291453784,
            "loss_sequences_lower_95": 2.7841103783206274,
            "loss_sequences_upper_95": 2.7898332719355525,
            "loss_tokens_lower_95": 2.776763932291667,
            "loss_tokens_upper_95": 2.7971373749999997,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5130356141499113,
            "data_time": 0.03716522082686424,
            "batch_time": 0.17005276679992676,
            "samples_per_second": 943228.2828174387,
            "samples_per_second_per_gpu": 117903.53535217984,
            "loss_sequences_lower_95": 3.4917940848214286,
            "loss_sequences_upper_95": 3.5336144023038902,
            "loss_tokens_lower_95": 3.4965938333333333,
            "loss_tokens_upper_95": 3.529867020833333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7243482828631844,
            "data_time": 0.005545249895045632,
            "batch_time": 0.11952587099451768,
            "samples_per_second": 1117418.6545060244,
            "samples_per_second_per_gpu": 139677.33181325305,
            "loss_sequences_lower_95": 2.713246013208763,
            "loss_sequences_upper_95": 2.735272601884665,
            "loss_tokens_lower_95": 2.7145280104166667,
            "loss_tokens_upper_95": 2.7342283541666665,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7928951374139417,
            "data_time": 0.03445090353488922,
            "batch_time": 0.14571724832057953,
            "samples_per_second": 1014481.6831249293,
            "samples_per_second_per_gpu": 126810.21039061616,
            "loss_sequences_lower_95": 2.7572331818932185,
            "loss_sequences_upper_95": 2.827860684210317,
            "loss_tokens_lower_95": 2.7829187708333336,
            "loss_tokens_upper_95": 2.8029722916666664,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.317420338709123,
            "data_time": 0.013448846836884817,
            "batch_time": 0.1257749435802301,
            "samples_per_second": 1073303.348130435,
            "samples_per_second_per_gpu": 134162.91851630437,
            "loss_sequences_lower_95": 3.2821882236255098,
            "loss_sequences_upper_95": 3.3533284615854435,
            "loss_tokens_lower_95": 3.3048416041666666,
            "loss_tokens_upper_95": 3.329764916666667,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3349116431936925,
            "data_time": 0.0052638405408614725,
            "batch_time": 0.11760014792283376,
            "samples_per_second": 1112654.8110796565,
            "samples_per_second_per_gpu": 139081.85138495706,
            "loss_sequences_lower_95": 3.3021528569435588,
            "loss_sequences_upper_95": 3.367334193638393,
            "loss_tokens_lower_95": 3.318268317708333,
            "loss_tokens_upper_95": 3.3520411458333332,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5600317289941596,
            "data_time": 0.005989398611219306,
            "batch_time": 0.11838598157230176,
            "samples_per_second": 1111529.9209134143,
            "samples_per_second_per_gpu": 138941.2401141768,
            "loss_sequences_lower_95": 3.5513117023887437,
            "loss_sequences_upper_95": 3.568691160831152,
            "loss_tokens_lower_95": 3.5479107708333335,
            "loss_tokens_upper_95": 3.5723771145833334,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.206782040557241,
            "data_time": 0.03586173057556152,
            "batch_time": 0.14818881079554558,
            "samples_per_second": 1013713.9584553766,
            "samples_per_second_per_gpu": 126714.24480692207,
            "loss_sequences_lower_95": 3.162079986324155,
            "loss_sequences_upper_95": 3.253066694058054,
            "loss_tokens_lower_95": 3.1954957916666666,
            "loss_tokens_upper_95": 3.217925625,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.266483707390284,
            "data_time": 0.034583840519189835,
            "batch_time": 0.14830487966537476,
            "samples_per_second": 1026118.8761838856,
            "samples_per_second_per_gpu": 128264.8595229857,
            "loss_sequences_lower_95": 4.230054434674531,
            "loss_sequences_upper_95": 4.298188582710598,
            "loss_tokens_lower_95": 4.252530760416667,
            "loss_tokens_upper_95": 4.280262489583333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1578844212760364,
            "data_time": 0.004100262400615646,
            "batch_time": 0.11832566433642284,
            "samples_per_second": 1120523.2628067578,
            "samples_per_second_per_gpu": 140065.40785084473,
            "loss_sequences_lower_95": 3.150638519575682,
            "loss_sequences_upper_95": 3.1650545600931888,
            "loss_tokens_lower_95": 3.1468689114583333,
            "loss_tokens_upper_95": 3.168589770833333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9490784523339135,
            "data_time": 0.008959712950806869,
            "batch_time": 0.12232907037985952,
            "samples_per_second": 1100679.9093522802,
            "samples_per_second_per_gpu": 137584.98866903502,
            "loss_sequences_lower_95": 2.939575688474529,
            "loss_sequences_upper_95": 2.9583866931259113,
            "loss_tokens_lower_95": 2.937820359375,
            "loss_tokens_upper_95": 2.960345901041667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7500631349807336,
            "data_time": 0.034092068672180176,
            "batch_time": 0.1454659067094326,
            "samples_per_second": 1018979.486342822,
            "samples_per_second_per_gpu": 127372.43579285275,
            "loss_sequences_lower_95": 3.7119854476340644,
            "loss_sequences_upper_95": 3.787427432541915,
            "loss_tokens_lower_95": 3.7362794479166666,
            "loss_tokens_upper_95": 3.7640305520833333,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.801087077426328,
            "data_time": 0.03498731553554535,
            "batch_time": 0.1460067331790924,
            "samples_per_second": 1016953.328514392,
            "samples_per_second_per_gpu": 127119.166064299,
            "loss_sequences_lower_95": 2.745461054100767,
            "loss_sequences_upper_95": 2.8558947530152112,
            "loss_tokens_lower_95": 2.7897804114583336,
            "loss_tokens_upper_95": 2.8125932239583333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.292686755006963,
            "data_time": 0.11439669132232666,
            "batch_time": 0.16583985090255737,
            "samples_per_second": 510379.7502252697,
            "samples_per_second_per_gpu": 63797.46877815871,
            "loss_sequences_lower_95": 4.219176656549627,
            "loss_sequences_upper_95": 4.363629878651012,
            "loss_tokens_lower_95": 4.2593035004355695,
            "loss_tokens_upper_95": 4.326009316877885,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.451141159318974,
            "data_time": 0.04631916185220083,
            "batch_time": 0.15173344810803732,
            "samples_per_second": 958777.8986244854,
            "samples_per_second_per_gpu": 119847.23732806067,
            "loss_sequences_lower_95": 3.3648433229318515,
            "loss_sequences_upper_95": 3.5395677750034165,
            "loss_tokens_lower_95": 3.437261036458333,
            "loss_tokens_upper_95": 3.4653115104166665,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.625951909138219,
            "data_time": 0.04597380260626475,
            "batch_time": 0.1596567134062449,
            "samples_per_second": 992690.2756941154,
            "samples_per_second_per_gpu": 124086.28446176443,
            "loss_sequences_lower_95": 5.560271959128669,
            "loss_sequences_upper_95": 5.686942067813119,
            "loss_tokens_lower_95": 5.612864291666667,
            "loss_tokens_upper_95": 5.6389526354166675,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2679155048776845,
            "data_time": 0.13225430250167847,
            "batch_time": 0.24453584849834442,
            "samples_per_second": 713397.655361298,
            "samples_per_second_per_gpu": 89174.70692016225,
            "loss_sequences_lower_95": 3.211289559036005,
            "loss_sequences_upper_95": 3.3184124368136048,
            "loss_tokens_lower_95": 3.25314573694448,
            "loss_tokens_upper_95": 3.2828933778356335,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.273690546294119,
            "data_time": 0.003868473795327273,
            "batch_time": 0.117370513623411,
            "samples_per_second": 1125486.3102557804,
            "samples_per_second_per_gpu": 140685.78878197254,
            "loss_sequences_lower_95": 4.252337741240564,
            "loss_sequences_upper_95": 4.295031590496368,
            "loss_tokens_lower_95": 4.2522318020825915,
            "loss_tokens_upper_95": 4.2948772265791915,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5222282096034285,
            "data_time": 0.00422395784643632,
            "batch_time": 0.1174613337350797,
            "samples_per_second": 1121473.080574448,
            "samples_per_second_per_gpu": 140184.135071806,
            "loss_sequences_lower_95": 2.537249601478478,
            "loss_sequences_upper_95": 2.56211909491853,
            "loss_tokens_lower_95": 2.5113340780990785,
            "loss_tokens_upper_95": 2.529186790297208,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.896147731795421,
            "data_time": 0.010979297406533185,
            "batch_time": 0.12191406418295468,
            "samples_per_second": 1102444.9595503982,
            "samples_per_second_per_gpu": 137805.61994379977,
            "loss_sequences_lower_95": 3.482245648599876,
            "loss_sequences_upper_95": 3.7787703581221215,
            "loss_tokens_lower_95": 2.7012972668349415,
            "loss_tokens_upper_95": 2.905450781203464,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2522781458298367,
            "data_time": 0.009484201048811277,
            "batch_time": 0.12079787688950698,
            "samples_per_second": 1103485.1029022976,
            "samples_per_second_per_gpu": 137935.6378627872,
            "loss_sequences_lower_95": 3.4718664225260416,
            "loss_sequences_upper_95": 3.6753655761718753,
            "loss_tokens_lower_95": 3.1548217374213836,
            "loss_tokens_upper_95": 3.2968671445066824,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.4648346812368978,
            "data_time": 0.014660107818516817,
            "batch_time": 0.1220248206095262,
            "samples_per_second": 1075463.0581107994,
            "samples_per_second_per_gpu": 134432.88226384993,
            "loss_sequences_lower_95": 2.546636740776334,
            "loss_sequences_upper_95": 2.6070454236682026,
            "loss_tokens_lower_95": 2.4343035120833734,
            "loss_tokens_upper_95": 2.4657655244228533,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1191523508592085,
            "data_time": 0.07425317913293839,
            "batch_time": 0.17673426121473312,
            "samples_per_second": 876937.2686370838,
            "samples_per_second_per_gpu": 109617.15857963548,
            "loss_sequences_lower_95": 3.0484406766024503,
            "loss_sequences_upper_95": 3.3052658427845345,
            "loss_tokens_lower_95": 3.053698100372871,
            "loss_tokens_upper_95": 3.1306985192230172,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1744761301546682,
            "data_time": 0.07897976785898209,
            "batch_time": 0.18925190716981888,
            "samples_per_second": 917407.4557445296,
            "samples_per_second_per_gpu": 114675.9319680662,
            "loss_sequences_lower_95": 3.1815718296595983,
            "loss_sequences_upper_95": 3.3818954405492665,
            "loss_tokens_lower_95": 3.116228657117468,
            "loss_tokens_upper_95": 3.2170528441308393,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3209503134091696,
            "data_time": 0.052743529280026756,
            "batch_time": 0.1443816920121511,
            "samples_per_second": 928679.5847888506,
            "samples_per_second_per_gpu": 116084.94809860633,
            "loss_sequences_lower_95": 3.328282608032227,
            "loss_sequences_upper_95": 3.437743469238281,
            "loss_tokens_lower_95": 3.206576956806725,
            "loss_tokens_upper_95": 3.4173407367192326,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.913026928649421,
            "data_time": 0.003894307797060073,
            "batch_time": 0.11751408650065368,
            "samples_per_second": 1127445.1623765687,
            "samples_per_second_per_gpu": 140930.64529707108,
            "loss_sequences_lower_95": 3.9584834980592736,
            "loss_sequences_upper_95": 4.038562391968284,
            "loss_tokens_lower_95": 3.8464413164747477,
            "loss_tokens_upper_95": 3.9257639761963063,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1723764056107813,
            "data_time": 0.010194368268314161,
            "batch_time": 0.12196688824578335,
            "samples_per_second": 1101299.259259003,
            "samples_per_second_per_gpu": 137662.40740737537,
            "loss_sequences_lower_95": 4.215070309622922,
            "loss_sequences_upper_95": 4.527564812348748,
            "loss_tokens_lower_95": 2.9908104892214467,
            "loss_tokens_upper_95": 3.1240304446242053,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1838266073436867,
            "data_time": 0.017237943410873414,
            "batch_time": 0.12303010821342468,
            "samples_per_second": 1038792.6603865111,
            "samples_per_second_per_gpu": 129849.08254831389,
            "loss_sequences_lower_95": 3.806509128609615,
            "loss_sequences_upper_95": 4.157870847942886,
            "loss_tokens_lower_95": 3.0611279990142988,
            "loss_tokens_upper_95": 3.2242859576047715,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.788974333027182,
            "data_time": 0.07553057372570038,
            "batch_time": 0.17722752690315247,
            "samples_per_second": 876714.1723277475,
            "samples_per_second_per_gpu": 109589.27154096843,
            "loss_sequences_lower_95": 5.681516157873145,
            "loss_sequences_upper_95": 5.893422179897082,
            "loss_tokens_lower_95": 5.680352671723388,
            "loss_tokens_upper_95": 5.898309312236908,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8001028180122374,
            "data_time": 0.14279483258724213,
            "batch_time": 0.23937539756298065,
            "samples_per_second": 651150.245135726,
            "samples_per_second_per_gpu": 81393.78064196576,
            "loss_sequences_lower_95": 2.7286264724731444,
            "loss_sequences_upper_95": 3.0673031387329104,
            "loss_tokens_lower_95": 2.5781720483878856,
            "loss_tokens_upper_95": 2.9830792640317534,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.7160065114315115,
            "data_time": 0.011235814541578293,
            "batch_time": 0.1204126700758934,
            "samples_per_second": 1089522.1762653333,
            "samples_per_second_per_gpu": 136190.27203316666,
            "loss_sequences_lower_95": 4.66804264189643,
            "loss_sequences_upper_95": 4.764463707758924,
            "loss_tokens_lower_95": 4.666772948219197,
            "loss_tokens_upper_95": 4.764959598100051,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.827159987034903,
            "data_time": 0.017654496431350707,
            "batch_time": 0.1270654618740082,
            "samples_per_second": 1077682.6914789702,
            "samples_per_second_per_gpu": 134710.33643487128,
            "loss_sequences_lower_95": 4.7656605513539105,
            "loss_sequences_upper_95": 4.887777232577037,
            "loss_tokens_lower_95": 4.765426768209972,
            "loss_tokens_upper_95": 4.886909133698684,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.722225361083096,
            "data_time": 0.01245814561843872,
            "batch_time": 0.12193805873394012,
            "samples_per_second": 1092006.391825808,
            "samples_per_second_per_gpu": 136500.798978226,
            "loss_sequences_lower_95": 2.988849082838853,
            "loss_sequences_upper_95": 3.1157403757055904,
            "loss_tokens_lower_95": 2.661136841073932,
            "loss_tokens_upper_95": 2.7127719091842097,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.514660259723663,
            "data_time": 0.04368235170841217,
            "batch_time": 0.15540099889039993,
            "samples_per_second": 1024341.2363245063,
            "samples_per_second_per_gpu": 128042.6545405633,
            "loss_sequences_lower_95": 4.949001708984375,
            "loss_sequences_upper_95": 5.542286315917968,
            "loss_tokens_lower_95": 4.22003782967871,
            "loss_tokens_upper_95": 4.586535862658215,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.297869563102722,
            "data_time": 0.1278541535139084,
            "batch_time": 0.16739939153194427,
            "samples_per_second": 463384.0927498996,
            "samples_per_second_per_gpu": 57923.01159373745,
            "loss_sequences_lower_95": 3.031186008453369,
            "loss_sequences_upper_95": 3.5423144578933714,
            "loss_tokens_lower_95": 2.837196205402243,
            "loss_tokens_upper_95": 3.659617351663524,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.576142332334628,
            "data_time": 0.0793125256896019,
            "batch_time": 0.1611449420452118,
            "samples_per_second": 829865.6054848083,
            "samples_per_second_per_gpu": 103733.20068560104,
            "loss_sequences_lower_95": 4.805231747682067,
            "loss_sequences_upper_95": 5.645334072770743,
            "loss_tokens_lower_95": 2.9023643519456384,
            "loss_tokens_upper_95": 3.3231492516690744,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0915145661019507,
            "data_time": 0.010620127121607462,
            "batch_time": 0.12368784927659565,
            "samples_per_second": 1107956.402502814,
            "samples_per_second_per_gpu": 138494.55031285176,
            "loss_sequences_lower_95": 2.068561950923575,
            "loss_sequences_upper_95": 2.114048613102064,
            "loss_tokens_lower_95": 2.069329783321729,
            "loss_tokens_upper_95": 2.114212642751338,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0640202564368626,
            "data_time": 0.006117817832202446,
            "batch_time": 0.11815967647040762,
            "samples_per_second": 1111567.8211850296,
            "samples_per_second_per_gpu": 138945.9776481287,
            "loss_sequences_lower_95": 2.076769211730424,
            "loss_sequences_upper_95": 2.199008276428779,
            "loss_tokens_lower_95": 1.9866155665415377,
            "loss_tokens_upper_95": 2.1089348259347656,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7409736939838956,
            "data_time": 0.053935274481773376,
            "batch_time": 0.13990814487139383,
            "samples_per_second": 808161.1551519561,
            "samples_per_second_per_gpu": 101020.14439399452,
            "loss_sequences_lower_95": 2.689100797883757,
            "loss_sequences_upper_95": 3.103825451078869,
            "loss_tokens_lower_95": 2.565411743954527,
            "loss_tokens_upper_95": 2.8431385417871184,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2376457978725055,
            "data_time": 0.017514774203300477,
            "batch_time": 0.13037543445825578,
            "samples_per_second": 1088060.3244286615,
            "samples_per_second_per_gpu": 136007.5405535827,
            "loss_sequences_lower_95": 3.3442167768350433,
            "loss_sequences_upper_95": 3.496876705323229,
            "loss_tokens_lower_95": 3.150857007874223,
            "loss_tokens_upper_95": 3.290937183179421,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.3062830669123953,
            "data_time": 0.07566459476947784,
            "batch_time": 0.15352077782154083,
            "samples_per_second": 798323.5827237691,
            "samples_per_second_per_gpu": 99790.44784047114,
            "loss_sequences_lower_95": 2.2512812079452886,
            "loss_sequences_upper_95": 2.6593467293716055,
            "loss_tokens_lower_95": 2.1159584132532117,
            "loss_tokens_upper_95": 2.4269540260899904,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.003150007800595,
            "data_time": 0.004467061884618599,
            "batch_time": 0.11748235709315813,
            "samples_per_second": 1115269.6971994364,
            "samples_per_second_per_gpu": 139408.71214992955,
            "loss_sequences_lower_95": 3.9877436229433387,
            "loss_sequences_upper_95": 4.0182756570845415,
            "loss_tokens_lower_95": 3.987672173106496,
            "loss_tokens_upper_95": 4.01835950197852,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.6064578789530448,
            "data_time": 0.14024440944194794,
            "batch_time": 0.2390199601650238,
            "samples_per_second": 659056.90948104,
            "samples_per_second_per_gpu": 82382.11368513,
            "loss_sequences_lower_95": 0.5825339048811533,
            "loss_sequences_upper_95": 0.6857070811743875,
            "loss_tokens_lower_95": 0.5220654607673263,
            "loss_tokens_upper_95": 0.6659607121009787,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.839291308860859,
            "data_time": 0.002731346067377556,
            "batch_time": 0.11666073637662525,
            "samples_per_second": 1123092.163394652,
            "samples_per_second_per_gpu": 140386.5204243315,
            "loss_sequences_lower_95": 4.551056939530922,
            "loss_sequences_upper_95": 4.5944827596796385,
            "loss_tokens_lower_95": 3.684510692698259,
            "loss_tokens_upper_95": 3.728199643375242,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.760188080310821,
            "data_time": 0.020953131839632988,
            "batch_time": 0.1331107895821333,
            "samples_per_second": 1069546.954231253,
            "samples_per_second_per_gpu": 133693.3692789066,
            "loss_sequences_lower_95": 5.841750134277344,
            "loss_sequences_upper_95": 6.105556445312501,
            "loss_tokens_lower_95": 5.597455121930588,
            "loss_tokens_upper_95": 5.845459947342662,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3613080055817313,
            "data_time": 0.07650124281644821,
            "batch_time": 0.18271875381469727,
            "samples_per_second": 886268.6608895459,
            "samples_per_second_per_gpu": 110783.58261119324,
            "loss_sequences_lower_95": 3.237874981424083,
            "loss_sequences_upper_95": 3.4892075381071668,
            "loss_tokens_lower_95": 3.2397786281419836,
            "loss_tokens_upper_95": 3.484323896325153,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.516394794709755,
            "data_time": 0.016554558818990536,
            "batch_time": 0.12437563592737372,
            "samples_per_second": 1069595.065220553,
            "samples_per_second_per_gpu": 133699.38315256912,
            "loss_sequences_lower_95": 5.467896941213897,
            "loss_sequences_upper_95": 5.565232432972301,
            "loss_tokens_lower_95": 5.468818664550781,
            "loss_tokens_upper_95": 5.5655830614494555,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.5531173895200093,
            "data_time": 0.015500713139772415,
            "batch_time": 0.1278805434703827,
            "samples_per_second": 1082729.1072112839,
            "samples_per_second_per_gpu": 135341.13840141048,
            "loss_sequences_lower_95": 0.5741845021565755,
            "loss_sequences_upper_95": 0.5963570373535156,
            "loss_tokens_lower_95": 0.5336676467461985,
            "loss_tokens_upper_95": 0.5644453924147784,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.211733976999919,
            "data_time": 0.07966488599777222,
            "batch_time": 0.17732730507850647,
            "samples_per_second": 871213.0954034815,
            "samples_per_second_per_gpu": 108901.63692543519,
            "loss_sequences_lower_95": 4.891882963634673,
            "loss_sequences_upper_95": 5.539610697428386,
            "loss_tokens_lower_95": 4.887345174153645,
            "loss_tokens_upper_95": 5.534308137439546,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.7627174146473408,
            "data_time": 0.12855146825313568,
            "batch_time": 0.16937881708145142,
            "samples_per_second": 464293.54853506363,
            "samples_per_second_per_gpu": 58036.693566882954,
            "loss_sequences_lower_95": 1.6220091164112092,
            "loss_sequences_upper_95": 2.3512360751628876,
            "loss_tokens_lower_95": 1.36373139371577,
            "loss_tokens_upper_95": 1.7844093778944505,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.150889307022095,
            "data_time": 0.023053260520100594,
            "batch_time": 0.13511516898870468,
            "samples_per_second": 1067206.7963902894,
            "samples_per_second_per_gpu": 133400.84954878618,
            "loss_sequences_lower_95": 7.169514379882812,
            "loss_sequences_upper_95": 7.4650474609375,
            "loss_tokens_lower_95": 6.992710811918359,
            "loss_tokens_upper_95": 7.254734262637479,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.5312525186538695,
            "data_time": 0.02062336355447769,
            "batch_time": 0.1326276920735836,
            "samples_per_second": 1069455.551895961,
            "samples_per_second_per_gpu": 133681.94398699512,
            "loss_sequences_lower_95": 6.741850561523438,
            "loss_sequences_upper_95": 6.9497092285156254,
            "loss_tokens_lower_95": 6.410101932684182,
            "loss_tokens_upper_95": 6.586425328911336,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.940476203660137,
            "data_time": 0.009884068121512731,
            "batch_time": 0.12103473084668319,
            "samples_per_second": 1094803.6572568275,
            "samples_per_second_per_gpu": 136850.45715710343,
            "loss_sequences_lower_95": 3.905940155019066,
            "loss_sequences_upper_95": 3.975123952724292,
            "loss_tokens_lower_95": 3.9056773457797314,
            "loss_tokens_upper_95": 3.975567113382082,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.4455535529029717,
            "data_time": 0.02913002129439469,
            "batch_time": 0.13315186395749942,
            "samples_per_second": 986238.6865898488,
            "samples_per_second_per_gpu": 123279.8358237311,
            "loss_sequences_lower_95": 2.3750777659145186,
            "loss_sequences_upper_95": 2.5156402072232624,
            "loss_tokens_lower_95": 2.37451660812542,
            "loss_tokens_upper_95": 2.5148549913444462,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.7665198068618775,
            "data_time": 0.02052817866206169,
            "batch_time": 0.13284321315586567,
            "samples_per_second": 1067254.8413113134,
            "samples_per_second_per_gpu": 133406.85516391418,
            "loss_sequences_lower_95": 5.691144763183594,
            "loss_sequences_upper_95": 5.846373474121094,
            "loss_tokens_lower_95": 5.688870935058594,
            "loss_tokens_upper_95": 5.845497680664062,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.135427466054626,
            "data_time": 0.004350546792329076,
            "batch_time": 0.11809431231883635,
            "samples_per_second": 1118191.8262904566,
            "samples_per_second_per_gpu": 139773.97828630707,
            "loss_sequences_lower_95": 2.9984779853062915,
            "loss_sequences_upper_95": 3.087306133403796,
            "loss_tokens_lower_95": 1.9643854242204495,
            "loss_tokens_upper_95": 2.0222820711406393,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9706645274340215,
            "data_time": 0.053020070899616585,
            "batch_time": 0.144538169557398,
            "samples_per_second": 822876.6607265874,
            "samples_per_second_per_gpu": 102859.58259082343,
            "loss_sequences_lower_95": 2.8381031719606318,
            "loss_sequences_upper_95": 3.1026832808309526,
            "loss_tokens_lower_95": 2.8369224377532505,
            "loss_tokens_upper_95": 3.100741423421831,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.814585935835745,
            "data_time": 0.04113620147109032,
            "batch_time": 0.1548617258667946,
            "samples_per_second": 1025579.4802711612,
            "samples_per_second_per_gpu": 128197.43503389515,
            "loss_sequences_lower_95": 2.7278565470377605,
            "loss_sequences_upper_95": 2.901886082069547,
            "loss_tokens_lower_95": 2.7291617120481004,
            "loss_tokens_upper_95": 2.9017237165862437,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.2771648370083244,
            "data_time": 0.0052594660766541014,
            "batch_time": 0.11865569390947857,
            "samples_per_second": 1112367.1295469962,
            "samples_per_second_per_gpu": 139045.89119337453,
            "loss_sequences_lower_95": 2.9337898943457974,
            "loss_sequences_upper_95": 3.0234281233852407,
            "loss_tokens_lower_95": 2.1238391913001338,
            "loss_tokens_upper_95": 2.1887228888177312,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.370050379838893,
            "data_time": 0.07792680710554123,
            "batch_time": 0.16662918031215668,
            "samples_per_second": 847532.8370552249,
            "samples_per_second_per_gpu": 105941.60463190312,
            "loss_sequences_lower_95": 4.225065895363137,
            "loss_sequences_upper_95": 4.51423090374659,
            "loss_tokens_lower_95": 4.221921438388724,
            "loss_tokens_upper_95": 4.514567735460069,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.981486327422139,
            "data_time": 0.009697644183268914,
            "batch_time": 0.1220735231271157,
            "samples_per_second": 1099635.2997792684,
            "samples_per_second_per_gpu": 137454.41247240856,
            "loss_sequences_lower_95": 3.9479520857224775,
            "loss_sequences_upper_95": 4.014843518551701,
            "loss_tokens_lower_95": 3.9487247198728976,
            "loss_tokens_upper_95": 4.014864117450307,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.194462992612598,
            "data_time": 0.07755009084939957,
            "batch_time": 0.1735353320837021,
            "samples_per_second": 868082.805303905,
            "samples_per_second_per_gpu": 108510.35066298813,
            "loss_sequences_lower_95": 3.061907062715697,
            "loss_sequences_upper_95": 3.327578187220305,
            "loss_tokens_lower_95": 3.0601039479079755,
            "loss_tokens_upper_95": 3.3260654708714164,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.2957873125871022,
            "data_time": 0.1270892322063446,
            "batch_time": 0.19038017094135284,
            "samples_per_second": 580362.2051197098,
            "samples_per_second_per_gpu": 72545.27563996373,
            "loss_sequences_lower_95": 1.1891760603586832,
            "loss_sequences_upper_95": 1.5845765495300292,
            "loss_tokens_lower_95": 1.0486510435740153,
            "loss_tokens_upper_95": 1.4527868323855928,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.3700878262519836,
            "data_time": 0.12597590684890747,
            "batch_time": 0.18919958174228668,
            "samples_per_second": 582046.4631432481,
            "samples_per_second_per_gpu": 72755.80789290601,
            "loss_sequences_lower_95": 1.3351788743336994,
            "loss_sequences_upper_95": 1.7759701855977377,
            "loss_tokens_lower_95": 1.0730305232358783,
            "loss_tokens_upper_95": 1.5674468094043514,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.081463242981676,
            "data_time": 0.009608048531744216,
            "batch_time": 0.12192116015487248,
            "samples_per_second": 1101045.9895167511,
            "samples_per_second_per_gpu": 137630.7486895939,
            "loss_sequences_lower_95": 5.059456576767304,
            "loss_sequences_upper_95": 5.103337614483616,
            "loss_tokens_lower_95": 5.0596832428203244,
            "loss_tokens_upper_95": 5.103717625299153,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.4835288514957165,
            "data_time": 0.002578486466303435,
            "batch_time": 0.11699574977094288,
            "samples_per_second": 1123204.818185242,
            "samples_per_second_per_gpu": 140400.60227315524,
            "loss_sequences_lower_95": 0.6447126850680435,
            "loss_sequences_upper_95": 0.6617067224470867,
            "loss_tokens_lower_95": 0.45712749374491635,
            "loss_tokens_upper_95": 0.4661022181233865,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.364145346513883,
            "data_time": 0.14729425311088562,
            "batch_time": 0.26186060905456543,
            "samples_per_second": 721107.6375356298,
            "samples_per_second_per_gpu": 90138.45469195372,
            "loss_sequences_lower_95": 4.662190967469703,
            "loss_sequences_upper_95": 5.093556934266578,
            "loss_tokens_lower_95": 4.168607185830417,
            "loss_tokens_upper_95": 4.422115626343342,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.056854815096469,
            "data_time": 0.12877194583415985,
            "batch_time": 0.17472979426383972,
            "samples_per_second": 470998.1132500401,
            "samples_per_second_per_gpu": 58874.76415625501,
            "loss_sequences_lower_95": 6.591368742246885,
            "loss_sequences_upper_95": 7.825025115141997,
            "loss_tokens_lower_95": 6.19403734560366,
            "loss_tokens_upper_95": 7.627976405767747,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.1929913526628075,
            "data_time": 0.07680956274271011,
            "batch_time": 0.15480565279722214,
            "samples_per_second": 794755.0666193658,
            "samples_per_second_per_gpu": 99344.38332742073,
            "loss_sequences_lower_95": 4.33927372257884,
            "loss_sequences_upper_95": 4.6948837466356235,
            "loss_tokens_lower_95": 3.944945176159062,
            "loss_tokens_upper_95": 4.148976087770711,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.264497162365332,
            "data_time": 0.0764467716217041,
            "batch_time": 0.15438178181648254,
            "samples_per_second": 798340.8356090739,
            "samples_per_second_per_gpu": 99792.60445113423,
            "loss_sequences_lower_95": 4.39280408533608,
            "loss_sequences_upper_95": 4.710326236631812,
            "loss_tokens_lower_95": 4.052346636053126,
            "loss_tokens_upper_95": 4.2246211139065615,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.500392880381607,
            "data_time": 0.07644112408161163,
            "batch_time": 0.15441866219043732,
            "samples_per_second": 797519.9726796885,
            "samples_per_second_per_gpu": 99689.99658496106,
            "loss_sequences_lower_95": 4.777394680860566,
            "loss_sequences_upper_95": 5.261777003218488,
            "loss_tokens_lower_95": 4.234699163716576,
            "loss_tokens_upper_95": 4.513786504568897,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.340662940246303,
            "data_time": 0.07553664594888687,
            "batch_time": 0.1533612534403801,
            "samples_per_second": 798083.9619479321,
            "samples_per_second_per_gpu": 99760.49524349152,
            "loss_sequences_lower_95": 4.429072626625619,
            "loss_sequences_upper_95": 4.724248244122761,
            "loss_tokens_lower_95": 4.1493333572910585,
            "loss_tokens_upper_95": 4.305752382842922,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.5830833600915,
            "data_time": 0.08010201901197433,
            "batch_time": 0.15680388361215591,
            "samples_per_second": 787151.4516717943,
            "samples_per_second_per_gpu": 98393.93145897429,
            "loss_sequences_lower_95": 4.647094849770114,
            "loss_sequences_upper_95": 4.9693158641365,
            "loss_tokens_lower_95": 4.402204763362826,
            "loss_tokens_upper_95": 4.534024637138052,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.452523753410432,
            "data_time": 0.08176952600479126,
            "batch_time": 0.15989714860916138,
            "samples_per_second": 792308.9712641067,
            "samples_per_second_per_gpu": 99038.62140801334,
            "loss_sequences_lower_95": 4.627296261671113,
            "loss_sequences_upper_95": 4.9485187344434785,
            "loss_tokens_lower_95": 4.254754946974834,
            "loss_tokens_upper_95": 4.40335277040009,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/params.txt",
    "uuid": "c8a9f069-7d20-452e-8d34-6cce4a43e094",
    "creation_date": "2024_01_25-08_05_47"
}