{
    "name": "rw_original-d=576_l=24_h=8-1.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 3073547520,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 1.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "614709504",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=576_l=24_h=8-1.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.655301344394684,
            "data_time": 0.039143484085798264,
            "batch_time": 0.3821641653776169,
            "samples_per_second": 837790.8750910365,
            "samples_per_second_per_gpu": 104723.85938637957,
            "loss_sequences_lower_95": 3.5799492263793944,
            "loss_sequences_upper_95": 3.7316492780049644,
            "loss_tokens_lower_95": 3.641495501200358,
            "loss_tokens_upper_95": 3.6692027982076008,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.517893769129325,
            "data_time": 0.0011180493505332644,
            "batch_time": 0.03034664078257806,
            "samples_per_second": 1091830.7279207748,
            "samples_per_second_per_gpu": 136478.84099009685,
            "loss_sequences_lower_95": 3.5155129766232545,
            "loss_sequences_upper_95": 3.5202329953881812,
            "loss_tokens_lower_95": 3.507221875,
            "loss_tokens_upper_95": 3.5287297604166667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0957851974331603,
            "data_time": 0.009731298446655273,
            "batch_time": 0.03981110858917236,
            "samples_per_second": 1032744.5138753797,
            "samples_per_second_per_gpu": 129093.06423442246,
            "loss_sequences_lower_95": 3.0448288150709506,
            "loss_sequences_upper_95": 3.1596743587571745,
            "loss_tokens_lower_95": 3.082823697916667,
            "loss_tokens_upper_95": 3.108779734375,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6046021947172497,
            "data_time": 0.0016677823701971455,
            "batch_time": 0.0299488733473577,
            "samples_per_second": 1127357.0554734338,
            "samples_per_second_per_gpu": 140919.63193417923,
            "loss_sequences_lower_95": 3.5695812157699742,
            "loss_sequences_upper_95": 3.641011648276417,
            "loss_tokens_lower_95": 3.592231927083333,
            "loss_tokens_upper_95": 3.6167165,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5702322471408885,
            "data_time": 0.009830483402388979,
            "batch_time": 0.03842470275453362,
            "samples_per_second": 1074252.4967204349,
            "samples_per_second_per_gpu": 134281.56209005436,
            "loss_sequences_lower_95": 3.51783165708336,
            "loss_sequences_upper_95": 3.637646503021178,
            "loss_tokens_lower_95": 3.5590056875,
            "loss_tokens_upper_95": 3.5814259791666667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7201238297670574,
            "data_time": 0.0038241044334743333,
            "batch_time": 0.03268130933460982,
            "samples_per_second": 1104376.3227271063,
            "samples_per_second_per_gpu": 138047.04034088828,
            "loss_sequences_lower_95": 3.6728363348301536,
            "loss_sequences_upper_95": 3.7722110709391994,
            "loss_tokens_lower_95": 3.7077814270833334,
            "loss_tokens_upper_95": 3.73248265625,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3835209160921527,
            "data_time": 0.001632732532152733,
            "batch_time": 0.02994775830435325,
            "samples_per_second": 1129055.8196951316,
            "samples_per_second_per_gpu": 141131.97746189145,
            "loss_sequences_lower_95": 3.353044104751276,
            "loss_sequences_upper_95": 3.41279196229273,
            "loss_tokens_lower_95": 3.368756552083333,
            "loss_tokens_upper_95": 3.3987809947916667,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.007742920121597,
            "data_time": 0.0018341353826072115,
            "batch_time": 0.030645076623276056,
            "samples_per_second": 1117454.8817656224,
            "samples_per_second_per_gpu": 139681.8602207028,
            "loss_sequences_lower_95": 3.98468089414267,
            "loss_sequences_upper_95": 4.033339608556937,
            "loss_tokens_lower_95": 3.9960949270833335,
            "loss_tokens_upper_95": 4.019206291666666,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.682243589947863,
            "data_time": 0.012337150081755623,
            "batch_time": 0.05105552692261953,
            "samples_per_second": 1060630.600872476,
            "samples_per_second_per_gpu": 132578.8251090595,
            "loss_sequences_lower_95": 3.6004364323809868,
            "loss_sequences_upper_95": 3.7826910189496794,
            "loss_tokens_lower_95": 3.6703452083333334,
            "loss_tokens_upper_95": 3.6940634270833335,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.785953173995489,
            "data_time": 0.009936586953699589,
            "batch_time": 0.03900077659636736,
            "samples_per_second": 1074556.2601311938,
            "samples_per_second_per_gpu": 134319.53251639922,
            "loss_sequences_lower_95": 4.676261172087297,
            "loss_sequences_upper_95": 4.922145918895134,
            "loss_tokens_lower_95": 4.7724673125,
            "loss_tokens_upper_95": 4.799324947916666,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7423490785941436,
            "data_time": 0.0013443850007374733,
            "batch_time": 0.02992211802099201,
            "samples_per_second": 1119752.6112791814,
            "samples_per_second_per_gpu": 139969.07640989768,
            "loss_sequences_lower_95": 3.7327779583005,
            "loss_sequences_upper_95": 3.7521584480351513,
            "loss_tokens_lower_95": 3.7310543125,
            "loss_tokens_upper_95": 3.7538414687499997,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5903599247342592,
            "data_time": 0.0027043876997338645,
            "batch_time": 0.03199548447360405,
            "samples_per_second": 1092720.8898822092,
            "samples_per_second_per_gpu": 136590.11123527616,
            "loss_sequences_lower_95": 3.5724775931577986,
            "loss_sequences_upper_95": 3.609208372243987,
            "loss_tokens_lower_95": 3.57905371875,
            "loss_tokens_upper_95": 3.601811625,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.048932311017421,
            "data_time": 0.00998261229322833,
            "batch_time": 0.03932224055052746,
            "samples_per_second": 1050733.3192727212,
            "samples_per_second_per_gpu": 131341.66490909015,
            "loss_sequences_lower_95": 3.967237368074924,
            "loss_sequences_upper_95": 4.152107025208386,
            "loss_tokens_lower_95": 4.036028083333333,
            "loss_tokens_upper_95": 4.061425104166666,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.349906285280123,
            "data_time": 0.01001763438798517,
            "batch_time": 0.03931657441583763,
            "samples_per_second": 1053098.5984010305,
            "samples_per_second_per_gpu": 131637.32480012882,
            "loss_sequences_lower_95": 3.26864486663259,
            "loss_sequences_upper_95": 3.4450215698988034,
            "loss_tokens_lower_95": 3.337990359375,
            "loss_tokens_upper_95": 3.3617015677083333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.402376695112749,
            "data_time": 0.08200904301234654,
            "batch_time": 0.11785901444298881,
            "samples_per_second": 510926.8102398568,
            "samples_per_second_per_gpu": 63865.8512799821,
            "loss_sequences_lower_95": 4.33587517304854,
            "loss_sequences_upper_95": 4.472738430716775,
            "loss_tokens_lower_95": 4.378992314772172,
            "loss_tokens_upper_95": 4.426240262118252,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6543887050784365,
            "data_time": 0.014270855621858076,
            "batch_time": 0.043317163532430474,
            "samples_per_second": 1046412.5037590835,
            "samples_per_second_per_gpu": 130801.56296988543,
            "loss_sequences_lower_95": 3.5916906151062546,
            "loss_sequences_upper_95": 3.7148080898095843,
            "loss_tokens_lower_95": 3.6416528645833335,
            "loss_tokens_upper_95": 3.6667300312499997,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.511038479515619,
            "data_time": 0.013225995004177094,
            "batch_time": 0.043282099068164825,
            "samples_per_second": 1045588.8009686399,
            "samples_per_second_per_gpu": 130698.60012107999,
            "loss_sequences_lower_95": 5.421606960648911,
            "loss_sequences_upper_95": 5.628415150503999,
            "loss_tokens_lower_95": 5.499203333333333,
            "loss_tokens_upper_95": 5.522876708333333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.023009726258575,
            "data_time": 0.036134254187345505,
            "batch_time": 0.06736293062567711,
            "samples_per_second": 912317.072025193,
            "samples_per_second_per_gpu": 114039.63400314913,
            "loss_sequences_lower_95": 3.864922520371734,
            "loss_sequences_upper_95": 4.3025089576596125,
            "loss_tokens_lower_95": 4.0091434666367824,
            "loss_tokens_upper_95": 4.037544538153977,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.083705330638698,
            "data_time": 0.0016482363121927736,
            "batch_time": 0.030492904483593136,
            "samples_per_second": 1102324.0505388365,
            "samples_per_second_per_gpu": 137790.50631735456,
            "loss_sequences_lower_95": 5.062755706095998,
            "loss_sequences_upper_95": 5.105322564672055,
            "loss_tokens_lower_95": 5.06262617003899,
            "loss_tokens_upper_95": 5.105290754410874,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.172268949260609,
            "data_time": 0.0018467855206720388,
            "batch_time": 0.03063065829170737,
            "samples_per_second": 1103835.8222880282,
            "samples_per_second_per_gpu": 137979.47778600352,
            "loss_sequences_lower_95": 3.1696464546171703,
            "loss_sequences_upper_95": 3.194832129859279,
            "loss_tokens_lower_95": 3.152471192206235,
            "loss_tokens_upper_95": 3.171779906051322,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.887593307308448,
            "data_time": 0.0030899984897250392,
            "batch_time": 0.0321286747075835,
            "samples_per_second": 1093792.132254682,
            "samples_per_second_per_gpu": 136724.01653183525,
            "loss_sequences_lower_95": 5.127181258672355,
            "loss_sequences_upper_95": 5.417524199560108,
            "loss_tokens_lower_95": 4.366814535494103,
            "loss_tokens_upper_95": 4.585570329253038,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.999860327442487,
            "data_time": 0.003901623823541276,
            "batch_time": 0.03285146472935981,
            "samples_per_second": 1091699.453657379,
            "samples_per_second_per_gpu": 136462.43170717236,
            "loss_sequences_lower_95": 5.129793131510417,
            "loss_sequences_upper_95": 5.330579182942708,
            "loss_tokens_lower_95": 4.687930313974057,
            "loss_tokens_upper_95": 4.83074619202044,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.308945583519925,
            "data_time": 0.004800034144705058,
            "batch_time": 0.03419670152448421,
            "samples_per_second": 1077264.6685304977,
            "samples_per_second_per_gpu": 134658.0835663122,
            "loss_sequences_lower_95": 3.3531723566178213,
            "loss_sequences_upper_95": 3.4147419568713273,
            "loss_tokens_lower_95": 3.214052738471187,
            "loss_tokens_upper_95": 3.24678376846183,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.6886593265966936,
            "data_time": 0.023688967738832747,
            "batch_time": 0.05472715411867414,
            "samples_per_second": 980446.1730165143,
            "samples_per_second_per_gpu": 122555.77162706428,
            "loss_sequences_lower_95": 2.663251578591087,
            "loss_sequences_upper_95": 2.7762986477938565,
            "loss_tokens_lower_95": 2.617638307152575,
            "loss_tokens_upper_95": 2.6671899327950217,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.561575329060457,
            "data_time": 0.02078968472778797,
            "batch_time": 0.0522406417876482,
            "samples_per_second": 950121.4905282703,
            "samples_per_second_per_gpu": 118765.18631603378,
            "loss_sequences_lower_95": 3.5468345050422516,
            "loss_sequences_upper_95": 3.7399856255978956,
            "loss_tokens_lower_95": 3.438403984798155,
            "loss_tokens_upper_95": 3.5318110639175693,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.927048668861389,
            "data_time": 0.01656233347379244,
            "batch_time": 0.04597754509021074,
            "samples_per_second": 1012459.5398638436,
            "samples_per_second_per_gpu": 126557.44248298045,
            "loss_sequences_lower_95": 3.8878733520507813,
            "loss_sequences_upper_95": 3.9867269897460935,
            "loss_tokens_lower_95": 3.7991249414545347,
            "loss_tokens_upper_95": 4.016551949745056,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.481855047185202,
            "data_time": 0.0016176052513982282,
            "batch_time": 0.03044366665260023,
            "samples_per_second": 1103807.1777890618,
            "samples_per_second_per_gpu": 137975.89722363272,
            "loss_sequences_lower_95": 6.4939163150189465,
            "loss_sequences_upper_95": 6.577258477990748,
            "loss_tokens_lower_95": 6.331401012799225,
            "loss_tokens_upper_95": 6.416897008934971,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.689536456457693,
            "data_time": 0.003010504597785489,
            "batch_time": 0.031582233689775405,
            "samples_per_second": 1108447.1571222113,
            "samples_per_second_per_gpu": 138555.8946402764,
            "loss_sequences_lower_95": 5.235108716560133,
            "loss_sequences_upper_95": 5.545610359782723,
            "loss_tokens_lower_95": 3.948025299266292,
            "loss_tokens_upper_95": 4.087988846698314,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.249501640377598,
            "data_time": 0.004981016790544665,
            "batch_time": 0.033982006681931985,
            "samples_per_second": 1082587.1898549313,
            "samples_per_second_per_gpu": 135323.39873186641,
            "loss_sequences_lower_95": 4.6990967317652785,
            "loss_sequences_upper_95": 5.041813987757972,
            "loss_tokens_lower_95": 3.827246976091078,
            "loss_tokens_upper_95": 3.9841077136777994,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.721284069427072,
            "data_time": 0.023043049233300344,
            "batch_time": 0.053322285413742065,
            "samples_per_second": 987970.2330878628,
            "samples_per_second_per_gpu": 123496.27913598286,
            "loss_sequences_lower_95": 5.589896256956336,
            "loss_sequences_upper_95": 5.845670879485945,
            "loss_tokens_lower_95": 5.59590061135488,
            "loss_tokens_upper_95": 5.844221712574023,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6040167665481566,
            "data_time": 0.046798499730917126,
            "batch_time": 0.0783853530883789,
            "samples_per_second": 882056.0177277946,
            "samples_per_second_per_gpu": 110257.00221597432,
            "loss_sequences_lower_95": 3.467142669677734,
            "loss_sequences_upper_95": 3.8214924392700196,
            "loss_tokens_lower_95": 3.3087066104458995,
            "loss_tokens_upper_95": 3.7618963524779176,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.133363460689143,
            "data_time": 0.003414237426102527,
            "batch_time": 0.0321640273544686,
            "samples_per_second": 1104887.7669408494,
            "samples_per_second_per_gpu": 138110.97086760617,
            "loss_sequences_lower_95": 5.088603130797402,
            "loss_sequences_upper_95": 5.177870876347396,
            "loss_tokens_lower_95": 5.088652033888819,
            "loss_tokens_upper_95": 5.17823789942106,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.422316095459959,
            "data_time": 0.004925382078957986,
            "batch_time": 0.03393826554688894,
            "samples_per_second": 1088589.4750502624,
            "samples_per_second_per_gpu": 136073.6843812828,
            "loss_sequences_lower_95": 5.371199524275696,
            "loss_sequences_upper_95": 5.473492606598075,
            "loss_tokens_lower_95": 5.369732381084921,
            "loss_tokens_upper_95": 5.474318245802621,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6223956377643756,
            "data_time": 0.0035198543293172286,
            "batch_time": 0.03302864637024717,
            "samples_per_second": 1071507.0933930292,
            "samples_per_second_per_gpu": 133938.38667412865,
            "loss_sequences_lower_95": 3.7544270279877074,
            "loss_sequences_upper_95": 3.88064698062602,
            "loss_tokens_lower_95": 3.470057546027435,
            "loss_tokens_upper_95": 3.5299551068237003,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.581786576986313,
            "data_time": 0.010812446475028992,
            "batch_time": 0.041171811521053314,
            "samples_per_second": 1010328.9685219165,
            "samples_per_second_per_gpu": 126291.12106523957,
            "loss_sequences_lower_95": 5.7820830078125,
            "loss_sequences_upper_95": 6.319417065429688,
            "loss_tokens_lower_95": 4.993808910438038,
            "loss_tokens_upper_95": 5.351338693655381,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.042763382196426,
            "data_time": 0.16038617491722107,
            "batch_time": 0.19637086987495422,
            "samples_per_second": 516412.50576905493,
            "samples_per_second_per_gpu": 64551.563221131866,
            "loss_sequences_lower_95": 3.8043474316596986,
            "loss_sequences_upper_95": 4.357803368568421,
            "loss_tokens_lower_95": 3.579003108232871,
            "loss_tokens_upper_95": 4.3751812638907595,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.260340209664969,
            "data_time": 0.02761057843553259,
            "batch_time": 0.05759407357966646,
            "samples_per_second": 922920.7129226073,
            "samples_per_second_per_gpu": 115365.08911532591,
            "loss_sequences_lower_95": 4.46584736615762,
            "loss_sequences_upper_95": 4.958045696390086,
            "loss_tokens_lower_95": 3.486802176182004,
            "loss_tokens_upper_95": 3.8584524264330256,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.3241591537399757,
            "data_time": 0.0028591559578975043,
            "batch_time": 0.03156125317845079,
            "samples_per_second": 1101133.996436285,
            "samples_per_second_per_gpu": 137641.74955453564,
            "loss_sequences_lower_95": 2.3027588157270507,
            "loss_sequences_upper_95": 2.345358544345921,
            "loss_tokens_lower_95": 2.302526626153615,
            "loss_tokens_upper_95": 2.345344572121423,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3603182316052154,
            "data_time": 0.0024667536513861823,
            "batch_time": 0.031170005154415394,
            "samples_per_second": 1105813.0347970494,
            "samples_per_second_per_gpu": 138226.62934963117,
            "loss_sequences_lower_95": 3.3331196254002524,
            "loss_sequences_upper_95": 3.499929207273069,
            "loss_tokens_lower_95": 3.172914667021737,
            "loss_tokens_upper_95": 3.337061178810267,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3264394092472482,
            "data_time": 0.018578835659556918,
            "batch_time": 0.047620034880108304,
            "samples_per_second": 998067.4511954538,
            "samples_per_second_per_gpu": 124758.43139943172,
            "loss_sequences_lower_95": 3.1759733598310866,
            "loss_sequences_upper_95": 3.5881116538694053,
            "loss_tokens_lower_95": 3.06197536867267,
            "loss_tokens_upper_95": 3.358510039803278,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.710145460025664,
            "data_time": 0.004613589122891426,
            "batch_time": 0.03326274156570434,
            "samples_per_second": 1092810.9352085958,
            "samples_per_second_per_gpu": 136601.36690107448,
            "loss_sequences_lower_95": 3.7461234245511053,
            "loss_sequences_upper_95": 3.895534693018696,
            "loss_tokens_lower_95": 3.566532026128341,
            "loss_tokens_upper_95": 3.711398276955686,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0742509902977364,
            "data_time": 0.030715028444925945,
            "batch_time": 0.06072370495115008,
            "samples_per_second": 974215.6721334847,
            "samples_per_second_per_gpu": 121776.95901668559,
            "loss_sequences_lower_95": 2.9187340899211605,
            "loss_sequences_upper_95": 3.3771036194591986,
            "loss_tokens_lower_95": 2.8088046050577335,
            "loss_tokens_upper_95": 3.1816534443741715,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.088683018376289,
            "data_time": 0.0020663496784053227,
            "batch_time": 0.031050275337015015,
            "samples_per_second": 1095635.075454449,
            "samples_per_second_per_gpu": 136954.38443180613,
            "loss_sequences_lower_95": 5.077009034619424,
            "loss_sequences_upper_95": 5.100153810058887,
            "loss_tokens_lower_95": 5.077273853208141,
            "loss_tokens_upper_95": 5.100090984993874,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.32161700783424,
            "data_time": 0.04586336395957253,
            "batch_time": 0.07793016000227494,
            "samples_per_second": 848303.2295962274,
            "samples_per_second_per_gpu": 106037.90369952843,
            "loss_sequences_lower_95": 1.263525172113215,
            "loss_sequences_upper_95": 1.4336253786549986,
            "loss_tokens_lower_95": 1.1334730290829709,
            "loss_tokens_upper_95": 1.392607661767528,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.943632149265247,
            "data_time": 0.001398728971509302,
            "batch_time": 0.030453510713597254,
            "samples_per_second": 1094375.038497785,
            "samples_per_second_per_gpu": 136796.87981222314,
            "loss_sequences_lower_95": 5.302477428508255,
            "loss_sequences_upper_95": 5.347447916666667,
            "loss_tokens_lower_95": 4.385115413442939,
            "loss_tokens_upper_95": 4.43025749516441,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.940195317268372,
            "data_time": 0.00567085449657743,
            "batch_time": 0.03475512304003277,
            "samples_per_second": 1080331.0767214552,
            "samples_per_second_per_gpu": 135041.3845901819,
            "loss_sequences_lower_95": 5.938436291503906,
            "loss_sequences_upper_95": 6.154746081542968,
            "loss_tokens_lower_95": 5.717906563058899,
            "loss_tokens_upper_95": 5.919334190941004,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.590117589287136,
            "data_time": 0.022627883038278354,
            "batch_time": 0.052834672442937304,
            "samples_per_second": 993226.1899795668,
            "samples_per_second_per_gpu": 124153.27374744586,
            "loss_sequences_lower_95": 5.4127141537873635,
            "loss_sequences_upper_95": 5.771754707668139,
            "loss_tokens_lower_95": 5.4129324871560796,
            "loss_tokens_upper_95": 5.765197674295177,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.472574722405636,
            "data_time": 0.004603282514824925,
            "batch_time": 0.03376016846622329,
            "samples_per_second": 1083414.199157966,
            "samples_per_second_per_gpu": 135426.77489474576,
            "loss_sequences_lower_95": 5.39945146040483,
            "loss_sequences_upper_95": 5.545364657315341,
            "loss_tokens_lower_95": 5.400896939364347,
            "loss_tokens_upper_95": 5.5443652898615055,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.591647382179896,
            "data_time": 0.004144388627498708,
            "batch_time": 0.033846145931710585,
            "samples_per_second": 1070833.4047192885,
            "samples_per_second_per_gpu": 133854.17558991106,
            "loss_sequences_lower_95": 1.6566848795572917,
            "loss_sequences_upper_95": 1.7378573323567708,
            "loss_tokens_lower_95": 1.4818280476253,
            "loss_tokens_upper_95": 1.5560702405962383,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.405728622845241,
            "data_time": 0.023404811109815325,
            "batch_time": 0.05335941059248788,
            "samples_per_second": 952968.6188053793,
            "samples_per_second_per_gpu": 119121.0773506724,
            "loss_sequences_lower_95": 6.056238592238654,
            "loss_sequences_upper_95": 6.752660580589658,
            "loss_tokens_lower_95": 6.054203404017857,
            "loss_tokens_upper_95": 6.7530990455264135,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.380569666624069,
            "data_time": 0.15054629743099213,
            "batch_time": 0.1861794888973236,
            "samples_per_second": 542781.0067666089,
            "samples_per_second_per_gpu": 67847.62584582611,
            "loss_sequences_lower_95": 2.16716411113739,
            "loss_sequences_upper_95": 3.214460164308548,
            "loss_tokens_lower_95": 1.8306181319718509,
            "loss_tokens_upper_95": 2.3555188177049775,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.2386953911781315,
            "data_time": 0.005569140116373698,
            "batch_time": 0.034728712032711696,
            "samples_per_second": 1079824.8995553965,
            "samples_per_second_per_gpu": 134978.11244442457,
            "loss_sequences_lower_95": 7.191239636230469,
            "loss_sequences_upper_95": 7.5081294921875,
            "loss_tokens_lower_95": 6.954923626533418,
            "loss_tokens_upper_95": 7.238314752207593,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.055966429710388,
            "data_time": 0.006055721687892127,
            "batch_time": 0.03515190217230055,
            "samples_per_second": 1079847.461832563,
            "samples_per_second_per_gpu": 134980.93272907037,
            "loss_sequences_lower_95": 7.159186889648438,
            "loss_sequences_upper_95": 7.383941662597656,
            "loss_tokens_lower_95": 6.815973264095361,
            "loss_tokens_upper_95": 6.994517777643027,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.649760264499932,
            "data_time": 0.003676358991642062,
            "batch_time": 0.032394109840775814,
            "samples_per_second": 1098988.8635583478,
            "samples_per_second_per_gpu": 137373.60794479348,
            "loss_sequences_lower_95": 5.623808842555733,
            "loss_sequences_upper_95": 5.6762659137456,
            "loss_tokens_lower_95": 5.623890162745139,
            "loss_tokens_upper_95": 5.67597574406009,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.304457671814434,
            "data_time": 0.008123719079977075,
            "batch_time": 0.037302241944834544,
            "samples_per_second": 1059226.9292345652,
            "samples_per_second_per_gpu": 132403.36615432065,
            "loss_sequences_lower_95": 5.203041538408458,
            "loss_sequences_upper_95": 5.403478844521049,
            "loss_tokens_lower_95": 5.200065216673867,
            "loss_tokens_upper_95": 5.40307376234579,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.513668880462647,
            "data_time": 0.005927436881595188,
            "batch_time": 0.03477815408555288,
            "samples_per_second": 1087174.9769067827,
            "samples_per_second_per_gpu": 135896.87211334784,
            "loss_sequences_lower_95": 6.4224736083984375,
            "loss_sequences_upper_95": 6.608716345214844,
            "loss_tokens_lower_95": 6.422671472167969,
            "loss_tokens_upper_95": 6.606594604492187,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2193439765030636,
            "data_time": 0.002053412417608308,
            "batch_time": 0.030943141407236485,
            "samples_per_second": 1098200.781810396,
            "samples_per_second_per_gpu": 137275.0977262995,
            "loss_sequences_lower_95": 3.6751879582544937,
            "loss_sequences_upper_95": 3.754054203376301,
            "loss_tokens_lower_95": 2.6453814629833534,
            "loss_tokens_upper_95": 2.702666303474118,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.8208238927286065,
            "data_time": 0.02009103468486241,
            "batch_time": 0.049523356982639856,
            "samples_per_second": 1001258.4320084393,
            "samples_per_second_per_gpu": 125157.30400105492,
            "loss_sequences_lower_95": 5.619302436486999,
            "loss_sequences_upper_95": 6.021505999209276,
            "loss_tokens_lower_95": 5.62129746622114,
            "loss_tokens_upper_95": 6.020490025762302,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.849185413472792,
            "data_time": 0.010784708894789219,
            "batch_time": 0.04106646869331598,
            "samples_per_second": 1042705.7011249175,
            "samples_per_second_per_gpu": 130338.21264061469,
            "loss_sequences_lower_95": 5.711554038851869,
            "loss_sequences_upper_95": 5.979833972407322,
            "loss_tokens_lower_95": 5.713854250440411,
            "loss_tokens_upper_95": 5.979648054534314,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9785307282928417,
            "data_time": 0.0022050381660699905,
            "batch_time": 0.03131266950696491,
            "samples_per_second": 1088885.9363948412,
            "samples_per_second_per_gpu": 136110.74204935515,
            "loss_sequences_lower_95": 4.441171057833834,
            "loss_sequences_upper_95": 4.530520128711011,
            "loss_tokens_lower_95": 3.311480969543938,
            "loss_tokens_upper_95": 3.3879605528467205,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.1007825836302745,
            "data_time": 0.026861762007077534,
            "batch_time": 0.057449904580911,
            "samples_per_second": 983849.0032671186,
            "samples_per_second_per_gpu": 122981.12540838982,
            "loss_sequences_lower_95": 6.011573476640005,
            "loss_sequences_upper_95": 6.188194234535176,
            "loss_tokens_lower_95": 6.011704992869544,
            "loss_tokens_upper_95": 6.187146674262153,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.024493781798476,
            "data_time": 0.0035729727028927087,
            "batch_time": 0.03281856965960455,
            "samples_per_second": 1081535.6797093146,
            "samples_per_second_per_gpu": 135191.95996366433,
            "loss_sequences_lower_95": 4.985149022840214,
            "loss_sequences_upper_95": 5.063650611620795,
            "loss_tokens_lower_95": 4.985885633182818,
            "loss_tokens_upper_95": 5.06281267918578,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.950672522331905,
            "data_time": 0.02294377847151323,
            "batch_time": 0.05291052731600675,
            "samples_per_second": 951354.8641223902,
            "samples_per_second_per_gpu": 118919.35801529877,
            "loss_sequences_lower_95": 5.738588159061172,
            "loss_sequences_upper_95": 6.160317889463554,
            "loss_tokens_lower_95": 5.73887123181982,
            "loss_tokens_upper_95": 6.16276714732346,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.553884259859721,
            "data_time": 0.07790223509073257,
            "batch_time": 0.11120155453681946,
            "samples_per_second": 713437.1678639502,
            "samples_per_second_per_gpu": 89179.64598299378,
            "loss_sequences_lower_95": 3.2875237782796223,
            "loss_sequences_upper_95": 3.9724903424580886,
            "loss_tokens_lower_95": 2.9520918157365585,
            "loss_tokens_upper_95": 3.8869586838616264,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.159783188501994,
            "data_time": 0.08407320827245712,
            "batch_time": 0.11705642193555832,
            "samples_per_second": 716274.2399690979,
            "samples_per_second_per_gpu": 89534.27999613724,
            "loss_sequences_lower_95": 2.9591683832804363,
            "loss_sequences_upper_95": 3.671417026519775,
            "loss_tokens_lower_95": 2.425995091641887,
            "loss_tokens_upper_95": 3.4524568279137773,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.492813885018822,
            "data_time": 0.0032042647334250933,
            "batch_time": 0.03229464805427897,
            "samples_per_second": 1089293.8257554239,
            "samples_per_second_per_gpu": 136161.72821942798,
            "loss_sequences_lower_95": 4.467869770572533,
            "loss_sequences_upper_95": 4.518042977379418,
            "loss_tokens_lower_95": 4.467490809669551,
            "loss_tokens_upper_95": 4.517988985985824,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.7436679144606979,
            "data_time": 0.001334219315054641,
            "batch_time": 0.030453750134101322,
            "samples_per_second": 1091580.0799487752,
            "samples_per_second_per_gpu": 136447.5099935969,
            "loss_sequences_lower_95": 0.870183026832195,
            "loss_sequences_upper_95": 0.8918656360725612,
            "loss_tokens_lower_95": 0.6165350373094741,
            "loss_tokens_upper_95": 0.6277812494474309,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.766898695878156,
            "data_time": 0.039439354091882706,
            "batch_time": 0.072382602840662,
            "samples_per_second": 898499.8611885731,
            "samples_per_second_per_gpu": 112312.48264857163,
            "loss_sequences_lower_95": 4.804006753756306,
            "loss_sequences_upper_95": 5.1834615752452935,
            "loss_tokens_lower_95": 4.4393199421211005,
            "loss_tokens_upper_95": 4.701592557303181,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 8.33377389650087,
            "data_time": 0.11757448741367885,
            "batch_time": 0.15099220048813594,
            "samples_per_second": 559161.6306907239,
            "samples_per_second_per_gpu": 69895.20383634049,
            "loss_sequences_lower_95": 7.824167344376848,
            "loss_sequences_upper_95": 9.083743698532516,
            "loss_tokens_lower_95": 7.259308916256752,
            "loss_tokens_upper_95": 9.081359373492958,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.695974594209252,
            "data_time": 0.030900007202511744,
            "batch_time": 0.061759741533370244,
            "samples_per_second": 946538.5902248414,
            "samples_per_second_per_gpu": 118317.32377810517,
            "loss_sequences_lower_95": 4.68368062275212,
            "loss_sequences_upper_95": 5.031692318800019,
            "loss_tokens_lower_95": 4.3141360583999155,
            "loss_tokens_upper_95": 4.536781105445752,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.840292235700096,
            "data_time": 0.030740933758871897,
            "batch_time": 0.061523306937444774,
            "samples_per_second": 952448.1383144216,
            "samples_per_second_per_gpu": 119056.0172893027,
            "loss_sequences_lower_95": 4.821827558191811,
            "loss_sequences_upper_95": 5.136457601407679,
            "loss_tokens_lower_95": 4.490897972524775,
            "loss_tokens_upper_95": 4.676378330871789,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.805500707975248,
            "data_time": 0.03071232069106329,
            "batch_time": 0.06115788789022537,
            "samples_per_second": 960872.5771126816,
            "samples_per_second_per_gpu": 120109.0721390852,
            "loss_sequences_lower_95": 4.785550782738663,
            "loss_sequences_upper_95": 5.184911727905273,
            "loss_tokens_lower_95": 4.372389787144192,
            "loss_tokens_upper_95": 4.664015426767359,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.9806123521269825,
            "data_time": 0.030765743482680546,
            "batch_time": 0.06087963637851533,
            "samples_per_second": 977624.4721382809,
            "samples_per_second_per_gpu": 122203.05901728511,
            "loss_sequences_lower_95": 4.9347809396139,
            "loss_sequences_upper_95": 5.238872137302306,
            "loss_tokens_lower_95": 4.659236244843385,
            "loss_tokens_upper_95": 4.831879764925282,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.263954864525647,
            "data_time": 0.03320781978560083,
            "batch_time": 0.06462027114114643,
            "samples_per_second": 970377.8784402057,
            "samples_per_second_per_gpu": 121297.23480502571,
            "loss_sequences_lower_95": 4.190173851629222,
            "loss_sequences_upper_95": 4.408512015964673,
            "loss_tokens_lower_95": 4.020304845166716,
            "loss_tokens_upper_95": 4.160008945872331,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8465226001855806,
            "data_time": 0.032518051919483,
            "batch_time": 0.06270869572957356,
            "samples_per_second": 977345.9068460106,
            "samples_per_second_per_gpu": 122168.23835575132,
            "loss_sequences_lower_95": 3.8585210288443217,
            "loss_sequences_upper_95": 4.1184052630168635,
            "loss_tokens_lower_95": 3.55898672369462,
            "loss_tokens_upper_95": 3.6742788506818864,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-1.0/params.txt",
    "uuid": "b020e1a2-9fa1-467c-baa3-6241b307c52d",
    "creation_date": "2023_12_13-16_18_09"
}