{
    "name": "c4_original-d=1024_l=24_h=8-8.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 65858600960,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 8.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "13171720192",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=1024_l=24_h=8-8.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.523845394452413,
            "data_time": 0.05273163691163063,
            "batch_time": 0.49693165719509125,
            "samples_per_second": 689376.0421384198,
            "samples_per_second_per_gpu": 86172.00526730248,
            "loss_sequences_lower_95": 3.410353520711263,
            "loss_sequences_upper_95": 3.6359366162618,
            "loss_tokens_lower_95": 3.508958549499512,
            "loss_tokens_upper_95": 3.5387035433451337,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8026299347967147,
            "data_time": 0.0011085640741000974,
            "batch_time": 0.03682241656792976,
            "samples_per_second": 896711.211610272,
            "samples_per_second_per_gpu": 112088.901451284,
            "loss_sequences_lower_95": 2.799732345196708,
            "loss_sequences_upper_95": 2.80546683361716,
            "loss_tokens_lower_95": 2.7925239635416665,
            "loss_tokens_upper_95": 2.8124952291666667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4030349658460035,
            "data_time": 0.010539603233337403,
            "batch_time": 0.04606712818145752,
            "samples_per_second": 862391.4111577405,
            "samples_per_second_per_gpu": 107798.92639471756,
            "loss_sequences_lower_95": 3.384532906668527,
            "loss_sequences_upper_95": 3.4219493632413904,
            "loss_tokens_lower_95": 3.3882107135416666,
            "loss_tokens_upper_95": 3.4180061927083334,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7640864528823146,
            "data_time": 0.0016803205209343058,
            "batch_time": 0.03702873315073942,
            "samples_per_second": 904485.2682662248,
            "samples_per_second_per_gpu": 113060.6585332781,
            "loss_sequences_lower_95": 2.754260505597616,
            "loss_sequences_upper_95": 2.7737321752174613,
            "loss_tokens_lower_95": 2.7539289270833334,
            "loss_tokens_upper_95": 2.7740245,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.810066745140402,
            "data_time": 0.010959164554854314,
            "batch_time": 0.046659921744905145,
            "samples_per_second": 856637.9552859159,
            "samples_per_second_per_gpu": 107079.74441073948,
            "loss_sequences_lower_95": 2.7753329461558045,
            "loss_sequences_upper_95": 2.8441911064194563,
            "loss_tokens_lower_95": 2.8000347604166667,
            "loss_tokens_upper_95": 2.820225182291667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.310983024593278,
            "data_time": 0.004138959166796311,
            "batch_time": 0.03957502349563267,
            "samples_per_second": 899440.8384504704,
            "samples_per_second_per_gpu": 112430.1048063088,
            "loss_sequences_lower_95": 3.274830719761748,
            "loss_sequences_upper_95": 3.348403019908332,
            "loss_tokens_lower_95": 3.2991044166666668,
            "loss_tokens_upper_95": 3.322735838541667,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.099494258651928,
            "data_time": 0.0019112245877065239,
            "batch_time": 0.03737073864477883,
            "samples_per_second": 904816.1190950688,
            "samples_per_second_per_gpu": 113102.0148868836,
            "loss_sequences_lower_95": 3.0677902732382014,
            "loss_sequences_upper_95": 3.1302463827327807,
            "loss_tokens_lower_95": 3.0855352239583334,
            "loss_tokens_upper_95": 3.1137624010416665,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5616273902473647,
            "data_time": 0.0017655169998736923,
            "batch_time": 0.037390557057385865,
            "samples_per_second": 904860.9856489055,
            "samples_per_second_per_gpu": 113107.62320611319,
            "loss_sequences_lower_95": 3.5537833974149216,
            "loss_sequences_upper_95": 3.5693873322971204,
            "loss_tokens_lower_95": 3.5498001354166666,
            "loss_tokens_upper_95": 3.5735519166666663,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2436966295164775,
            "data_time": 0.011088618210383825,
            "batch_time": 0.04640561436849927,
            "samples_per_second": 866379.2243563535,
            "samples_per_second_per_gpu": 108297.40304454419,
            "loss_sequences_lower_95": 3.202555996034204,
            "loss_sequences_upper_95": 3.288428137554386,
            "loss_tokens_lower_95": 3.2326351927083334,
            "loss_tokens_upper_95": 3.254838671875,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.289495117579524,
            "data_time": 0.010670747607946396,
            "batch_time": 0.0460827462375164,
            "samples_per_second": 873622.0719511005,
            "samples_per_second_per_gpu": 109202.75899388756,
            "loss_sequences_lower_95": 4.262203026571764,
            "loss_sequences_upper_95": 4.314053477411685,
            "loss_tokens_lower_95": 4.275843510416666,
            "loss_tokens_upper_95": 4.30350765625,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.181407305056287,
            "data_time": 0.0013697837339293046,
            "batch_time": 0.03671435402922187,
            "samples_per_second": 908130.717898242,
            "samples_per_second_per_gpu": 113516.33973728024,
            "loss_sequences_lower_95": 3.1740750426116895,
            "loss_sequences_upper_95": 3.188897205722386,
            "loss_tokens_lower_95": 3.1704438437499998,
            "loss_tokens_upper_95": 3.1923072916666664,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9656206474161206,
            "data_time": 0.002783287574012115,
            "batch_time": 0.03826913825677496,
            "samples_per_second": 901096.7416734447,
            "samples_per_second_per_gpu": 112637.0927091806,
            "loss_sequences_lower_95": 2.956896581177114,
            "loss_sequences_upper_95": 2.9741485786358552,
            "loss_tokens_lower_95": 2.9548227604166666,
            "loss_tokens_upper_95": 2.976530791666667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7131974958261176,
            "data_time": 0.010394454473563334,
            "batch_time": 0.04600066724030868,
            "samples_per_second": 857615.9973742377,
            "samples_per_second_per_gpu": 107201.99967177972,
            "loss_sequences_lower_95": 3.680920998223171,
            "loss_sequences_upper_95": 3.7452606943993247,
            "loss_tokens_lower_95": 3.699958322916667,
            "loss_tokens_upper_95": 3.7262837083333333,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8269845868807946,
            "data_time": 0.010651448808343287,
            "batch_time": 0.04620404927379106,
            "samples_per_second": 859798.2589031317,
            "samples_per_second_per_gpu": 107474.78236289146,
            "loss_sequences_lower_95": 2.772862290267789,
            "loss_sequences_upper_95": 2.880855142317576,
            "loss_tokens_lower_95": 2.815726161458333,
            "loss_tokens_upper_95": 2.838137145833333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.178933468731967,
            "data_time": 0.08738097974232265,
            "batch_time": 0.12291995968137469,
            "samples_per_second": 506884.6923671469,
            "samples_per_second_per_gpu": 63360.58654589336,
            "loss_sequences_lower_95": 4.112731976942582,
            "loss_sequences_upper_95": 4.245532035827637,
            "loss_tokens_lower_95": 4.151303308660333,
            "loss_tokens_upper_95": 4.206337226520885,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3587168229217084,
            "data_time": 0.015021627599542791,
            "batch_time": 0.050841149958697235,
            "samples_per_second": 843892.5278345197,
            "samples_per_second_per_gpu": 105486.56597931497,
            "loss_sequences_lower_95": 3.2823538999863335,
            "loss_sequences_upper_95": 3.4326466374077533,
            "loss_tokens_lower_95": 3.3455300260416667,
            "loss_tokens_upper_95": 3.3713191874999997,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.6733039702463275,
            "data_time": 0.013603433966636658,
            "batch_time": 0.04921814426779747,
            "samples_per_second": 864357.3569161446,
            "samples_per_second_per_gpu": 108044.66961451808,
            "loss_sequences_lower_95": 5.60920786996001,
            "loss_sequences_upper_95": 5.734356528410497,
            "loss_tokens_lower_95": 5.6601311875,
            "loss_tokens_upper_95": 5.686665135416666,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2405075518811337,
            "data_time": 0.04046781361103058,
            "batch_time": 0.07691571861505508,
            "samples_per_second": 756422.5686784309,
            "samples_per_second_per_gpu": 94552.82108480386,
            "loss_sequences_lower_95": 3.1987623558669793,
            "loss_sequences_upper_95": 3.2799478061863634,
            "loss_tokens_lower_95": 3.2269522244813014,
            "loss_tokens_upper_95": 3.2540090779789157,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.137507599611735,
            "data_time": 0.0016173922323148722,
            "batch_time": 0.037139161878131786,
            "samples_per_second": 899165.8410702333,
            "samples_per_second_per_gpu": 112395.73013377916,
            "loss_sequences_lower_95": 4.117175371207805,
            "loss_sequences_upper_95": 4.1581166517189505,
            "loss_tokens_lower_95": 4.116459160465034,
            "loss_tokens_upper_95": 4.157869242440091,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5594270334754707,
            "data_time": 0.0020417508900545207,
            "batch_time": 0.03756240973616861,
            "samples_per_second": 897512.9843971079,
            "samples_per_second_per_gpu": 112189.12304963848,
            "loss_sequences_lower_95": 2.5623996402609044,
            "loss_sequences_upper_95": 2.5871253285030997,
            "loss_tokens_lower_95": 2.5360764351624456,
            "loss_tokens_upper_95": 2.55371691273704,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.309408628911019,
            "data_time": 0.0035377613898159533,
            "batch_time": 0.039046495843899916,
            "samples_per_second": 895220.366368371,
            "samples_per_second_per_gpu": 111902.54579604637,
            "loss_sequences_lower_95": 3.5503214479787877,
            "loss_sequences_upper_95": 3.8416518264071358,
            "loss_tokens_lower_95": 2.7821170046126698,
            "loss_tokens_upper_95": 2.985596133585001,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4018865947897234,
            "data_time": 0.003524989682309171,
            "batch_time": 0.0390227901491713,
            "samples_per_second": 891763.8242984475,
            "samples_per_second_per_gpu": 111470.47803730593,
            "loss_sequences_lower_95": 3.4708142008463545,
            "loss_sequences_upper_95": 3.6729992350260416,
            "loss_tokens_lower_95": 3.176721083922956,
            "loss_tokens_upper_95": 3.3190321590015723,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.599830427795219,
            "data_time": 0.004617844232067263,
            "batch_time": 0.040020174929818776,
            "samples_per_second": 892081.5112067704,
            "samples_per_second_per_gpu": 111510.1889008463,
            "loss_sequences_lower_95": 2.6368311861051223,
            "loss_sequences_upper_95": 2.6966533391922147,
            "loss_tokens_lower_95": 2.511680680667812,
            "loss_tokens_upper_95": 2.5436957727853873,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9298970341682433,
            "data_time": 0.025057224290711538,
            "batch_time": 0.06091959987367902,
            "samples_per_second": 828271.8039827736,
            "samples_per_second_per_gpu": 103533.9754978467,
            "loss_sequences_lower_95": 2.867608774358576,
            "loss_sequences_upper_95": 3.056023219715465,
            "loss_tokens_lower_95": 2.8371006200871713,
            "loss_tokens_upper_95": 2.901228585105311,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2831797006178873,
            "data_time": 0.022316228598356247,
            "batch_time": 0.0578143410384655,
            "samples_per_second": 817865.2044915901,
            "samples_per_second_per_gpu": 102233.15056144877,
            "loss_sequences_lower_95": 3.266051947146046,
            "loss_sequences_upper_95": 3.4786071528220663,
            "loss_tokens_lower_95": 3.174204739600965,
            "loss_tokens_upper_95": 3.277091576040961,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.09179861386617,
            "data_time": 0.017204637710864727,
            "batch_time": 0.05258988875609178,
            "samples_per_second": 831900.977169902,
            "samples_per_second_per_gpu": 103987.62214623774,
            "loss_sequences_lower_95": 3.0676102498372395,
            "loss_sequences_upper_95": 3.15849723815918,
            "loss_tokens_lower_95": 2.9704090019386395,
            "loss_tokens_upper_95": 3.1516736458494283,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.195034575412579,
            "data_time": 0.0014299002820940283,
            "batch_time": 0.036914817485127926,
            "samples_per_second": 899840.7668740451,
            "samples_per_second_per_gpu": 112480.09585925564,
            "loss_sequences_lower_95": 4.194520475662492,
            "loss_sequences_upper_95": 4.273745207774592,
            "loss_tokens_lower_95": 4.076066804359382,
            "loss_tokens_upper_95": 4.155989763754838,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6616987181444864,
            "data_time": 0.003026828269830486,
            "batch_time": 0.03852330878276953,
            "samples_per_second": 895802.5605648898,
            "samples_per_second_per_gpu": 111975.32007061123,
            "loss_sequences_lower_95": 4.184894353693182,
            "loss_sequences_upper_95": 4.500824786356403,
            "loss_tokens_lower_95": 2.964815433235411,
            "loss_tokens_upper_95": 3.0975169002522565,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4079310368148947,
            "data_time": 0.005199201606415413,
            "batch_time": 0.040640550690728264,
            "samples_per_second": 884917.3306883306,
            "samples_per_second_per_gpu": 110614.66633604132,
            "loss_sequences_lower_95": 3.7768265759985598,
            "loss_sequences_upper_95": 4.138672999880012,
            "loss_tokens_lower_95": 3.032562923523717,
            "loss_tokens_upper_95": 3.18815625693268,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.4190152747446,
            "data_time": 0.023561843803950717,
            "batch_time": 0.059224922742162435,
            "samples_per_second": 827592.9261735055,
            "samples_per_second_per_gpu": 103449.11577168819,
            "loss_sequences_lower_95": 5.336478678385417,
            "loss_sequences_upper_95": 5.498974442155394,
            "loss_tokens_lower_95": 5.337440358340468,
            "loss_tokens_upper_95": 5.498011591663099,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8615171790122984,
            "data_time": 0.051235309013953574,
            "batch_time": 0.08776573034433219,
            "samples_per_second": 733453.4344097129,
            "samples_per_second_per_gpu": 91681.67930121411,
            "loss_sequences_lower_95": 2.73740852355957,
            "loss_sequences_upper_95": 3.082559959411621,
            "loss_tokens_lower_95": 2.5872294963365805,
            "loss_tokens_upper_95": 2.993826834417626,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.7401143130650905,
            "data_time": 0.0034185967318607013,
            "batch_time": 0.03896852578122192,
            "samples_per_second": 895867.7661342802,
            "samples_per_second_per_gpu": 111983.47076678503,
            "loss_sequences_lower_95": 4.690750581239604,
            "loss_sequences_upper_95": 4.789954188023765,
            "loss_tokens_lower_95": 4.689361241163958,
            "loss_tokens_upper_95": 4.791101249140385,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.992256964161004,
            "data_time": 0.00529941571283885,
            "batch_time": 0.04078978932117755,
            "samples_per_second": 889858.8624974306,
            "samples_per_second_per_gpu": 111232.35781217883,
            "loss_sequences_lower_95": 4.929590123681921,
            "loss_sequences_upper_95": 5.055469299866273,
            "loss_tokens_lower_95": 4.927360955815545,
            "loss_tokens_upper_95": 5.05514362906941,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.881331494311124,
            "data_time": 0.0037108572203963675,
            "batch_time": 0.03919301434675509,
            "samples_per_second": 889611.463903545,
            "samples_per_second_per_gpu": 111201.43298794313,
            "loss_sequences_lower_95": 3.031163009959025,
            "loss_sequences_upper_95": 3.1552236530025843,
            "loss_tokens_lower_95": 2.706196856069614,
            "loss_tokens_upper_95": 2.7568874973794713,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.870456043481827,
            "data_time": 0.0111786387860775,
            "batch_time": 0.04679557587951422,
            "samples_per_second": 852487.6314506024,
            "samples_per_second_per_gpu": 106560.9539313253,
            "loss_sequences_lower_95": 5.047720654296875,
            "loss_sequences_upper_95": 5.623903930664062,
            "loss_tokens_lower_95": 4.275265016183405,
            "loss_tokens_upper_95": 4.638969214368071,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2590829133987427,
            "data_time": 0.16677886247634888,
            "batch_time": 0.20678368210792542,
            "samples_per_second": 464308.26484927174,
            "samples_per_second_per_gpu": 58038.53310615897,
            "loss_sequences_lower_95": 3.034359335899353,
            "loss_sequences_upper_95": 3.498857599496841,
            "loss_tokens_lower_95": 2.8519151139533383,
            "loss_tokens_upper_95": 3.580306042199847,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.499809430248436,
            "data_time": 0.030535548291307814,
            "batch_time": 0.06600160801664312,
            "samples_per_second": 774940.781139348,
            "samples_per_second_per_gpu": 96867.5976424185,
            "loss_sequences_lower_95": 4.990211355275122,
            "loss_sequences_upper_95": 5.829484207328709,
            "loss_tokens_lower_95": 3.0210283797650086,
            "loss_tokens_upper_95": 3.465459055938205,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.301441083625187,
            "data_time": 0.003152898202339808,
            "batch_time": 0.038533059466216296,
            "samples_per_second": 894203.4613666316,
            "samples_per_second_per_gpu": 111775.43267082894,
            "loss_sequences_lower_95": 2.2784858406970154,
            "loss_sequences_upper_95": 2.324580977253645,
            "loss_tokens_lower_95": 2.2782013566070063,
            "loss_tokens_upper_95": 2.3248624162519795,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.1807398096224846,
            "data_time": 0.002572451907162709,
            "batch_time": 0.03811875516516227,
            "samples_per_second": 896642.5378521259,
            "samples_per_second_per_gpu": 112080.31723151574,
            "loss_sequences_lower_95": 2.1531467513993667,
            "loss_sequences_upper_95": 2.2790874437524256,
            "loss_tokens_lower_95": 2.0594381312232275,
            "loss_tokens_upper_95": 2.1836118180083397,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.861201981048444,
            "data_time": 0.019246737162272137,
            "batch_time": 0.05446666810247633,
            "samples_per_second": 817810.5465468172,
            "samples_per_second_per_gpu": 102226.31831835215,
            "loss_sequences_lower_95": 2.7405362363263364,
            "loss_sequences_upper_95": 3.149343754695012,
            "loss_tokens_lower_95": 2.614931085041339,
            "loss_tokens_upper_95": 2.896188379899977,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.327755230378239,
            "data_time": 0.005142204090952873,
            "batch_time": 0.0405267521739006,
            "samples_per_second": 886801.2920961507,
            "samples_per_second_per_gpu": 110850.16151201884,
            "loss_sequences_lower_95": 3.3739467364930693,
            "loss_sequences_upper_95": 3.531495980832552,
            "loss_tokens_lower_95": 3.1831262098278073,
            "loss_tokens_upper_95": 3.323158385994495,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.3896014966615815,
            "data_time": 0.03141426756268456,
            "batch_time": 0.06818717150461107,
            "samples_per_second": 792185.707828129,
            "samples_per_second_per_gpu": 99023.21347851612,
            "loss_sequences_lower_95": 2.275763204621106,
            "loss_sequences_upper_95": 2.670728720688238,
            "loss_tokens_lower_95": 2.1269933554321088,
            "loss_tokens_upper_95": 2.4376637643935433,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.041793728547231,
            "data_time": 0.0018502591468789699,
            "batch_time": 0.03737107787282763,
            "samples_per_second": 896450.5813606017,
            "samples_per_second_per_gpu": 112056.32267007521,
            "loss_sequences_lower_95": 4.025326862247449,
            "loss_sequences_upper_95": 4.058426958829266,
            "loss_tokens_lower_95": 4.025306184283732,
            "loss_tokens_upper_95": 4.058223763893404,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.7291314344382981,
            "data_time": 0.046713516928932886,
            "batch_time": 0.08245556571266868,
            "samples_per_second": 739081.762749053,
            "samples_per_second_per_gpu": 92385.22034363162,
            "loss_sequences_lower_95": 0.6950940845082106,
            "loss_sequences_upper_95": 0.8039477061299444,
            "loss_tokens_lower_95": 0.6158601712057998,
            "loss_tokens_upper_95": 0.779268150368635,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8636031443686605,
            "data_time": 0.0012882873728209349,
            "batch_time": 0.03675565786803358,
            "samples_per_second": 899989.5502091234,
            "samples_per_second_per_gpu": 112498.69377614043,
            "loss_sequences_lower_95": 4.155753457891117,
            "loss_sequences_upper_95": 4.195141816529088,
            "loss_tokens_lower_95": 3.404879962524178,
            "loss_tokens_upper_95": 3.4443531431334624,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.458329388618469,
            "data_time": 0.005819966868748741,
            "batch_time": 0.04123110856328692,
            "samples_per_second": 886199.7259917795,
            "samples_per_second_per_gpu": 110774.96574897243,
            "loss_sequences_lower_95": 7.43111142578125,
            "loss_sequences_upper_95": 7.825981213378906,
            "loss_tokens_lower_95": 7.078849661488544,
            "loss_tokens_upper_95": 7.4279256845756,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7728802826093593,
            "data_time": 0.023877978324890137,
            "batch_time": 0.05961846295049635,
            "samples_per_second": 825030.2240985199,
            "samples_per_second_per_gpu": 103128.77801231499,
            "loss_sequences_lower_95": 3.627047052798064,
            "loss_sequences_upper_95": 3.9180708644701085,
            "loss_tokens_lower_95": 3.6278663635253907,
            "loss_tokens_upper_95": 3.9156266718325403,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.817197886741522,
            "data_time": 0.004799764199429248,
            "batch_time": 0.040223089327295146,
            "samples_per_second": 890379.7682433393,
            "samples_per_second_per_gpu": 111297.47103041741,
            "loss_sequences_lower_95": 5.766940511067708,
            "loss_sequences_upper_95": 5.865352718468869,
            "loss_tokens_lower_95": 5.7672298731948395,
            "loss_tokens_upper_95": 5.8664682191790956,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.7554709971547127,
            "data_time": 0.004237273589093635,
            "batch_time": 0.039899006802985006,
            "samples_per_second": 889127.2648271488,
            "samples_per_second_per_gpu": 111140.9081033936,
            "loss_sequences_lower_95": 0.7800178548177084,
            "loss_sequences_upper_95": 0.8199630310058594,
            "loss_tokens_lower_95": 0.7042531700180071,
            "loss_tokens_upper_95": 0.7449967037596289,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.782574772267115,
            "data_time": 0.023744585258620127,
            "batch_time": 0.059051879814692905,
            "samples_per_second": 795578.2926856837,
            "samples_per_second_per_gpu": 99447.28658571046,
            "loss_sequences_lower_95": 5.434079023088728,
            "loss_sequences_upper_95": 6.133633669898623,
            "loss_tokens_lower_95": 5.435963919503348,
            "loss_tokens_upper_95": 6.142806788853236,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.8968234732747078,
            "data_time": 0.15899401903152466,
            "batch_time": 0.1964094042778015,
            "samples_per_second": 484639.561361492,
            "samples_per_second_per_gpu": 60579.9451701865,
            "loss_sequences_lower_95": 1.734066304564476,
            "loss_sequences_upper_95": 2.5162163496017453,
            "loss_tokens_lower_95": 1.4889958710031412,
            "loss_tokens_upper_95": 1.90421286671432,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.625553064346313,
            "data_time": 0.0062003906757112535,
            "batch_time": 0.04167982652073815,
            "samples_per_second": 884229.9141706232,
            "samples_per_second_per_gpu": 110528.7392713279,
            "loss_sequences_lower_95": 7.5283255859375,
            "loss_sequences_upper_95": 7.8896678466796875,
            "loss_tokens_lower_95": 7.3486338039340104,
            "loss_tokens_upper_95": 7.666195301759465,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.7264874029159545,
            "data_time": 0.006238936904876951,
            "batch_time": 0.04169114619966537,
            "samples_per_second": 885041.654029389,
            "samples_per_second_per_gpu": 110630.20675367363,
            "loss_sequences_lower_95": 6.83227890625,
            "loss_sequences_upper_95": 7.053524389648437,
            "loss_tokens_lower_95": 6.477763544731159,
            "loss_tokens_upper_95": 6.691272541135922,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.704628440065349,
            "data_time": 0.00363695246718799,
            "batch_time": 0.039111153975777004,
            "samples_per_second": 891820.5433725287,
            "samples_per_second_per_gpu": 111477.5679215661,
            "loss_sequences_lower_95": 4.649420061664537,
            "loss_sequences_upper_95": 4.759386900116284,
            "loss_tokens_lower_95": 4.650293991799886,
            "loss_tokens_upper_95": 4.759191006523948,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1798762586625857,
            "data_time": 0.008983944838140667,
            "batch_time": 0.04452897702819271,
            "samples_per_second": 868059.6861864685,
            "samples_per_second_per_gpu": 108507.46077330856,
            "loss_sequences_lower_95": 3.097781704904114,
            "loss_sequences_upper_95": 3.2612892256354407,
            "loss_tokens_lower_95": 3.09530760593678,
            "loss_tokens_upper_95": 3.261299876317084,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.712485935688019,
            "data_time": 0.006207366784413655,
            "batch_time": 0.04173457527917529,
            "samples_per_second": 883487.5963271895,
            "samples_per_second_per_gpu": 110435.94954089868,
            "loss_sequences_lower_95": 5.632785375976562,
            "loss_sequences_upper_95": 5.796659118652344,
            "loss_tokens_lower_95": 5.632281896972656,
            "loss_tokens_upper_95": 5.79509326171875,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7058186196277134,
            "data_time": 0.0016795728968312025,
            "batch_time": 0.037115711030076715,
            "samples_per_second": 899947.1788010476,
            "samples_per_second_per_gpu": 112493.39735013095,
            "loss_sequences_lower_95": 3.1881579370417454,
            "loss_sequences_upper_95": 3.280646397528382,
            "loss_tokens_lower_95": 2.127420057093269,
            "loss_tokens_upper_95": 2.187795322364352,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5769133590050597,
            "data_time": 0.0205256564276559,
            "batch_time": 0.05598522424697876,
            "samples_per_second": 821038.5303828939,
            "samples_per_second_per_gpu": 102629.81629786173,
            "loss_sequences_lower_95": 3.428984252018715,
            "loss_sequences_upper_95": 3.7238904981470817,
            "loss_tokens_lower_95": 3.4286963106980965,
            "loss_tokens_upper_95": 3.7258966246647622,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.483486923049478,
            "data_time": 0.011761260218918324,
            "batch_time": 0.047350614331662655,
            "samples_per_second": 875348.2473678034,
            "samples_per_second_per_gpu": 109418.53092097542,
            "loss_sequences_lower_95": 3.389496436025582,
            "loss_sequences_upper_95": 3.5749950094784007,
            "loss_tokens_lower_95": 3.388132108800552,
            "loss_tokens_upper_95": 3.5761496450386794,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7078602451228653,
            "data_time": 0.002123264349469783,
            "batch_time": 0.03756127556612206,
            "samples_per_second": 897862.7872033395,
            "samples_per_second_per_gpu": 112232.84840041744,
            "loss_sequences_lower_95": 3.059446043300999,
            "loss_sequences_upper_95": 3.149796142731038,
            "loss_tokens_lower_95": 2.200280577787605,
            "loss_tokens_upper_95": 2.2639951713554463,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.049363936065997,
            "data_time": 0.026899037261803944,
            "batch_time": 0.06316117693980534,
            "samples_per_second": 813145.4334019675,
            "samples_per_second_per_gpu": 101643.17917524594,
            "loss_sequences_lower_95": 3.861935481318721,
            "loss_sequences_upper_95": 4.229683382548983,
            "loss_tokens_lower_95": 3.8608803885323657,
            "loss_tokens_upper_95": 4.228154492252087,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1640384333578453,
            "data_time": 0.0035150679913195934,
            "batch_time": 0.03899292456798065,
            "samples_per_second": 892565.5319437475,
            "samples_per_second_per_gpu": 111570.69149296844,
            "loss_sequences_lower_95": 3.1377735046946675,
            "loss_sequences_upper_95": 3.1901598038512997,
            "loss_tokens_lower_95": 3.1389650005375573,
            "loss_tokens_upper_95": 3.189817223038513,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.858060553814601,
            "data_time": 0.024324482137506657,
            "batch_time": 0.05953133539720015,
            "samples_per_second": 789588.3516869572,
            "samples_per_second_per_gpu": 98698.54396086965,
            "loss_sequences_lower_95": 3.6954322592726028,
            "loss_sequences_upper_95": 4.023957165468087,
            "loss_tokens_lower_95": 3.6928369022110132,
            "loss_tokens_upper_95": 4.0228688582633305,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.5357099244991939,
            "data_time": 0.08696319907903671,
            "batch_time": 0.1251143291592598,
            "samples_per_second": 623433.6089737552,
            "samples_per_second_per_gpu": 77929.2011217194,
            "loss_sequences_lower_95": 1.3877620792388916,
            "loss_sequences_upper_95": 1.7862079811096192,
            "loss_tokens_lower_95": 1.2358665572272407,
            "loss_tokens_upper_95": 1.6721319039662677,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.6176975200573602,
            "data_time": 0.08388398587703705,
            "batch_time": 0.12070249021053314,
            "samples_per_second": 626801.3221534989,
            "samples_per_second_per_gpu": 78350.16526918736,
            "loss_sequences_lower_95": 1.4950753211975096,
            "loss_sequences_upper_95": 1.9838164361317954,
            "loss_tokens_lower_95": 1.2146412795849062,
            "loss_tokens_upper_95": 1.7972426296619886,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.352152097137406,
            "data_time": 0.003356361894343726,
            "batch_time": 0.03883682958794875,
            "samples_per_second": 894577.8028876046,
            "samples_per_second_per_gpu": 111822.22536095057,
            "loss_sequences_lower_95": 5.3340825777798235,
            "loss_sequences_upper_95": 5.37055980474503,
            "loss_tokens_lower_95": 5.333580662048969,
            "loss_tokens_upper_95": 5.3706961207888435,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.5402918006319907,
            "data_time": 0.0011677628044748202,
            "batch_time": 0.03664731468263549,
            "samples_per_second": 900088.1838188671,
            "samples_per_second_per_gpu": 112511.02297735839,
            "loss_sequences_lower_95": 0.6145287730319958,
            "loss_sequences_upper_95": 0.6269812889075642,
            "loss_tokens_lower_95": 0.4573562905585812,
            "loss_tokens_upper_95": 0.464862885472292,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.556934874827467,
            "data_time": 0.043647415935993195,
            "batch_time": 0.09205334633588791,
            "samples_per_second": 789982.9869904188,
            "samples_per_second_per_gpu": 98747.87337380234,
            "loss_sequences_lower_95": 4.665272678164985,
            "loss_sequences_upper_95": 5.077340674212598,
            "loss_tokens_lower_95": 4.169976112480041,
            "loss_tokens_upper_95": 4.3971820378707624,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.3418073654174805,
            "data_time": 0.12360328719729469,
            "batch_time": 0.16069760776701428,
            "samples_per_second": 490269.19512230344,
            "samples_per_second_per_gpu": 61283.64939028793,
            "loss_sequences_lower_95": 6.887813093855574,
            "loss_sequences_upper_95": 7.992473684774863,
            "loss_tokens_lower_95": 6.532531549901138,
            "loss_tokens_upper_95": 7.804918397503134,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.363485003389964,
            "data_time": 0.033188433874221074,
            "batch_time": 0.06991070508956909,
            "samples_per_second": 792459.5703381246,
            "samples_per_second_per_gpu": 99057.44629226558,
            "loss_sequences_lower_95": 4.345481509697146,
            "loss_sequences_upper_95": 4.698814875905107,
            "loss_tokens_lower_95": 3.970927487975715,
            "loss_tokens_upper_95": 4.163511600478343,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.398357607969424,
            "data_time": 0.031733864829653784,
            "batch_time": 0.06752822796503703,
            "samples_per_second": 801963.3199396466,
            "samples_per_second_per_gpu": 100245.41499245583,
            "loss_sequences_lower_95": 4.364139761575839,
            "loss_sequences_upper_95": 4.663473306051115,
            "loss_tokens_lower_95": 4.059574721747831,
            "loss_tokens_upper_95": 4.2206969520530375,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.696534713593925,
            "data_time": 0.03278103612718128,
            "batch_time": 0.06849272762026105,
            "samples_per_second": 807817.3869079748,
            "samples_per_second_per_gpu": 100977.17336349686,
            "loss_sequences_lower_95": 4.725093804336176,
            "loss_sequences_upper_95": 5.174130323456555,
            "loss_tokens_lower_95": 4.238890141492057,
            "loss_tokens_upper_95": 4.498875486034987,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.468307070615815,
            "data_time": 0.03286683275586083,
            "batch_time": 0.06876940102804274,
            "samples_per_second": 799627.6638776836,
            "samples_per_second_per_gpu": 99953.45798471045,
            "loss_sequences_lower_95": 4.433392054860184,
            "loss_sequences_upper_95": 4.718980565885218,
            "loss_tokens_lower_95": 4.1548403921157036,
            "loss_tokens_upper_95": 4.303082769756376,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.850519180297852,
            "data_time": 0.032099376490086685,
            "batch_time": 0.0682188316627785,
            "samples_per_second": 816550.1318545021,
            "samples_per_second_per_gpu": 102068.76648181277,
            "loss_sequences_lower_95": 4.826737454811239,
            "loss_sequences_upper_95": 5.15360354784853,
            "loss_tokens_lower_95": 4.527316798131764,
            "loss_tokens_upper_95": 4.65428660381325,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.531128974949441,
            "data_time": 0.031246633756728398,
            "batch_time": 0.06734923805509295,
            "samples_per_second": 803585.1638422987,
            "samples_per_second_per_gpu": 100448.14548028733,
            "loss_sequences_lower_95": 4.564115059085009,
            "loss_sequences_upper_95": 4.895552314200053,
            "loss_tokens_lower_95": 4.162120158981314,
            "loss_tokens_upper_95": 4.296649819639089,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/params.txt",
    "uuid": "06ca6ffb-ba45-4139-a432-2a66ba95b592",
    "creation_date": "2023_12_14-13_35_23"
}