{
    "name": "rw_original-d=96_l=8_h=4-4.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 845544960,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 4.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "169108992",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=96_l=8_h=4-4.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 5.282645416259766,
            "data_time": 0.13335955142974854,
            "batch_time": 1.2893697172403336,
            "samples_per_second": 376214.37800526805,
            "samples_per_second_per_gpu": 47026.79725065851,
            "loss_sequences_lower_95": 5.155650418599446,
            "loss_sequences_upper_95": 5.412414563496907,
            "loss_tokens_lower_95": 5.266810124715169,
            "loss_tokens_upper_95": 5.29822208404541,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.7386848524060845,
            "data_time": 0.019093342619879976,
            "batch_time": 0.06402847098551157,
            "samples_per_second": 4665328.211930374,
            "samples_per_second_per_gpu": 583166.0264912967,
            "loss_sequences_lower_95": 4.736472258129508,
            "loss_sequences_upper_95": 4.740895867157261,
            "loss_tokens_lower_95": 4.727131135416667,
            "loss_tokens_upper_95": 4.750278604166667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.2449602389822205,
            "data_time": 0.09501946717500687,
            "batch_time": 0.13953574746847153,
            "samples_per_second": 4115603.6776304124,
            "samples_per_second_per_gpu": 514450.45970380155,
            "loss_sequences_lower_95": 4.181406387017697,
            "loss_sequences_upper_95": 4.325634927555006,
            "loss_tokens_lower_95": 4.2305743125,
            "loss_tokens_upper_95": 4.259292125,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.930886481176946,
            "data_time": 0.01567366876100239,
            "batch_time": 0.059580056290877495,
            "samples_per_second": 5185725.343749117,
            "samples_per_second_per_gpu": 648215.6679686396,
            "loss_sequences_lower_95": 4.886973119362113,
            "loss_sequences_upper_95": 4.9758969072164945,
            "loss_tokens_lower_95": 4.9172633125,
            "loss_tokens_upper_95": 4.944085604166667,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.797851341076635,
            "data_time": 0.0922495648264885,
            "batch_time": 0.137858085334301,
            "samples_per_second": 4059492.0101707117,
            "samples_per_second_per_gpu": 507436.50127133896,
            "loss_sequences_lower_95": 4.735185984438646,
            "loss_sequences_upper_95": 4.876374670832804,
            "loss_tokens_lower_95": 4.785599145833333,
            "loss_tokens_upper_95": 4.810076541666667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.215247435605421,
            "data_time": 0.03348046044508616,
            "batch_time": 0.07634808371464412,
            "samples_per_second": 4964052.1603494845,
            "samples_per_second_per_gpu": 620506.5200436856,
            "loss_sequences_lower_95": 5.160802334721278,
            "loss_sequences_upper_95": 5.275808601716519,
            "loss_tokens_lower_95": 5.20165625,
            "loss_tokens_upper_95": 5.228964760416667,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.806585446085249,
            "data_time": 0.012426165491342544,
            "batch_time": 0.05493051931262016,
            "samples_per_second": 5223591.669550851,
            "samples_per_second_per_gpu": 652948.9586938564,
            "loss_sequences_lower_95": 5.770631766183035,
            "loss_sequences_upper_95": 5.841712123325893,
            "loss_tokens_lower_95": 5.788585604166666,
            "loss_tokens_upper_95": 5.825116520833333,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.0538981676351336,
            "data_time": 0.013908450540743376,
            "batch_time": 0.05716129431599065,
            "samples_per_second": 5298533.224634984,
            "samples_per_second_per_gpu": 662316.653079373,
            "loss_sequences_lower_95": 5.026023856757199,
            "loss_sequences_upper_95": 5.084231695844241,
            "loss_tokens_lower_95": 5.041626479166666,
            "loss_tokens_upper_95": 5.066109770833334,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.924557170247644,
            "data_time": 0.09758581966161728,
            "batch_time": 0.1423216164112091,
            "samples_per_second": 4131175.9018718665,
            "samples_per_second_per_gpu": 516396.9877339833,
            "loss_sequences_lower_95": 4.829843189270516,
            "loss_sequences_upper_95": 5.042185527522389,
            "loss_tokens_lower_95": 4.911728864583333,
            "loss_tokens_upper_95": 4.937140187500001,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.124183442752823,
            "data_time": 0.09665894508361816,
            "batch_time": 0.14167331159114838,
            "samples_per_second": 4111509.29774736,
            "samples_per_second_per_gpu": 513938.66221842,
            "loss_sequences_lower_95": 5.996455823574141,
            "loss_sequences_upper_95": 6.2783051140223565,
            "loss_tokens_lower_95": 6.110534833333333,
            "loss_tokens_upper_95": 6.137451125,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.163166966555265,
            "data_time": 0.010123470220072516,
            "batch_time": 0.05350757056269152,
            "samples_per_second": 5391285.753777492,
            "samples_per_second_per_gpu": 673910.7192221865,
            "loss_sequences_lower_95": 5.151630472604324,
            "loss_sequences_upper_95": 5.175172641582157,
            "loss_tokens_lower_95": 5.150471479166666,
            "loss_tokens_upper_95": 5.176070447916667,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.997775250402702,
            "data_time": 0.021975921094417573,
            "batch_time": 0.07170225977897644,
            "samples_per_second": 5039783.757027639,
            "samples_per_second_per_gpu": 629972.9696284549,
            "loss_sequences_lower_95": 4.972279697912327,
            "loss_sequences_upper_95": 5.024460938313463,
            "loss_tokens_lower_95": 4.984794395833333,
            "loss_tokens_upper_95": 5.0106541770833335,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.221447043196425,
            "data_time": 0.08767660707235336,
            "batch_time": 0.13165682554244995,
            "samples_per_second": 4245674.46872078,
            "samples_per_second_per_gpu": 530709.3085900975,
            "loss_sequences_lower_95": 5.118808484802865,
            "loss_sequences_upper_95": 5.347228062999176,
            "loss_tokens_lower_95": 5.207632041666667,
            "loss_tokens_upper_95": 5.23514515625,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.862640588686568,
            "data_time": 0.09617786109447479,
            "batch_time": 0.14098159223794937,
            "samples_per_second": 4145127.675899853,
            "samples_per_second_per_gpu": 518140.9594874816,
            "loss_sequences_lower_95": 4.777693987378277,
            "loss_sequences_upper_95": 4.967939612239053,
            "loss_tokens_lower_95": 4.8497385,
            "loss_tokens_upper_95": 4.875862083333333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.993410002101552,
            "data_time": 0.13764911890029907,
            "batch_time": 0.1591866910457611,
            "samples_per_second": 1060534.8029607153,
            "samples_per_second_per_gpu": 132566.8503700894,
            "loss_sequences_lower_95": 5.912815041975541,
            "loss_sequences_upper_95": 6.082747598127885,
            "loss_tokens_lower_95": 5.966443408619274,
            "loss_tokens_upper_95": 6.020015473799273,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.404853970941927,
            "data_time": 0.09260047972202301,
            "batch_time": 0.1270156279206276,
            "samples_per_second": 3401615.6134807463,
            "samples_per_second_per_gpu": 425201.9516850933,
            "loss_sequences_lower_95": 5.321974044221483,
            "loss_sequences_upper_95": 5.4894770052273145,
            "loss_tokens_lower_95": 5.3905220104166665,
            "loss_tokens_upper_95": 5.4191722916666665,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.41072040874914,
            "data_time": 0.09097079932689667,
            "batch_time": 0.12747172266244888,
            "samples_per_second": 3760400.8408650877,
            "samples_per_second_per_gpu": 470050.10510813596,
            "loss_sequences_lower_95": 6.309670287763852,
            "loss_sequences_upper_95": 6.54593415675503,
            "loss_tokens_lower_95": 6.3991887187500005,
            "loss_tokens_upper_95": 6.4226303125,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.713233314576696,
            "data_time": 0.16017204523086548,
            "batch_time": 0.18916642665863037,
            "samples_per_second": 2273565.995167139,
            "samples_per_second_per_gpu": 284195.7493958924,
            "loss_sequences_lower_95": 5.539688973348649,
            "loss_sequences_upper_95": 6.023566736940477,
            "loss_tokens_lower_95": 5.69785436411373,
            "loss_tokens_upper_95": 5.72828016437468,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.568941030079883,
            "data_time": 0.026672060381282458,
            "batch_time": 0.07138158570636402,
            "samples_per_second": 4511199.391068197,
            "samples_per_second_per_gpu": 563899.9238835246,
            "loss_sequences_lower_95": 5.5523770485418735,
            "loss_sequences_upper_95": 5.584941089121029,
            "loss_tokens_lower_95": 5.552293440548889,
            "loss_tokens_upper_95": 5.585278789746831,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.421401456683975,
            "data_time": 0.034173978865146636,
            "batch_time": 0.07801318354904652,
            "samples_per_second": 4370056.983111369,
            "samples_per_second_per_gpu": 546257.1228889212,
            "loss_sequences_lower_95": 4.4400994339399515,
            "loss_sequences_upper_95": 4.465421051894543,
            "loss_tokens_lower_95": 4.409315933235558,
            "loss_tokens_upper_95": 4.430600072387109,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.880776753558491,
            "data_time": 0.04844898316595289,
            "batch_time": 0.0902267297108968,
            "samples_per_second": 4300621.834501855,
            "samples_per_second_per_gpu": 537577.7293127319,
            "loss_sequences_lower_95": 7.299950837435787,
            "loss_sequences_upper_95": 7.552600714638935,
            "loss_tokens_lower_95": 6.751406975964975,
            "loss_tokens_upper_95": 6.941622864453032,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.582188836415609,
            "data_time": 0.04519666483004888,
            "batch_time": 0.0889882892370224,
            "samples_per_second": 4513120.981504971,
            "samples_per_second_per_gpu": 564140.1226881214,
            "loss_sequences_lower_95": 6.921005582682291,
            "loss_sequences_upper_95": 7.089835367838542,
            "loss_tokens_lower_95": 6.48950961821934,
            "loss_tokens_upper_95": 6.615724118022799,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.668628825664158,
            "data_time": 0.06641499946514766,
            "batch_time": 0.10665408770243327,
            "samples_per_second": 3933424.0642579873,
            "samples_per_second_per_gpu": 491678.0080322484,
            "loss_sequences_lower_95": 4.761677751403762,
            "loss_sequences_upper_95": 4.826906952991731,
            "loss_tokens_lower_95": 4.643853438175556,
            "loss_tokens_upper_95": 4.6784406332679955,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.738475190509449,
            "data_time": 0.3282133638858795,
            "batch_time": 0.3699609190225601,
            "samples_per_second": 2719478.3716589385,
            "samples_per_second_per_gpu": 339934.7964573673,
            "loss_sequences_lower_95": 4.7358339483087715,
            "loss_sequences_upper_95": 4.867960427024148,
            "loss_tokens_lower_95": 4.697457744948604,
            "loss_tokens_upper_95": 4.760940578391112,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.720495299903714,
            "data_time": 0.35291169583797455,
            "batch_time": 0.39889849722385406,
            "samples_per_second": 2480951.5915975776,
            "samples_per_second_per_gpu": 310118.9489496972,
            "loss_sequences_lower_95": 4.753080481704401,
            "loss_sequences_upper_95": 4.953021900410555,
            "loss_tokens_lower_95": 4.6734060146200065,
            "loss_tokens_upper_95": 4.778551460942789,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.773499546051025,
            "data_time": 0.16818882524967194,
            "batch_time": 0.19923819601535797,
            "samples_per_second": 2648732.665871424,
            "samples_per_second_per_gpu": 331091.583233928,
            "loss_sequences_lower_95": 4.760399597167969,
            "loss_sequences_upper_95": 4.858582082112631,
            "loss_tokens_lower_95": 4.664610503659997,
            "loss_tokens_upper_95": 4.877080499109143,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.959421287928208,
            "data_time": 0.02531564701348543,
            "batch_time": 0.0696392321959138,
            "samples_per_second": 4497951.674378047,
            "samples_per_second_per_gpu": 562243.9592972558,
            "loss_sequences_lower_95": 9.036832087188131,
            "loss_sequences_upper_95": 9.108867135598642,
            "loss_tokens_lower_95": 8.905123068618902,
            "loss_tokens_upper_95": 8.980913557444216,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.713528558461353,
            "data_time": 0.04998396784067154,
            "batch_time": 0.09196818619966507,
            "samples_per_second": 4368194.056456434,
            "samples_per_second_per_gpu": 546024.2570570542,
            "loss_sequences_lower_95": 6.848883765634864,
            "loss_sequences_upper_95": 7.125576073232323,
            "loss_tokens_lower_95": 5.5713830045306825,
            "loss_tokens_upper_95": 5.715618192445787,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.406683822133842,
            "data_time": 0.07800761759281158,
            "batch_time": 0.11971836388111115,
            "samples_per_second": 4324460.064277484,
            "samples_per_second_per_gpu": 540557.5080346855,
            "loss_sequences_lower_95": 6.118221660522877,
            "loss_sequences_upper_95": 6.420585043845323,
            "loss_tokens_lower_95": 5.302602338960149,
            "loss_tokens_upper_95": 5.467218901888715,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.671574157122608,
            "data_time": 0.36051730811595917,
            "batch_time": 0.40183983743190765,
            "samples_per_second": 2760599.7395196487,
            "samples_per_second_per_gpu": 345074.9674399561,
            "loss_sequences_lower_95": 5.579947066633669,
            "loss_sequences_upper_95": 5.7636542054616156,
            "loss_tokens_lower_95": 5.580437953722532,
            "loss_tokens_upper_95": 5.764364470738799,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.878637528419494,
            "data_time": 0.2977241724729538,
            "batch_time": 0.3243711441755295,
            "samples_per_second": 1581227.4275270123,
            "samples_per_second_per_gpu": 197653.42844087654,
            "loss_sequences_lower_95": 4.81196004486084,
            "loss_sequences_upper_95": 5.23080485534668,
            "loss_tokens_lower_95": 4.619280874835784,
            "loss_tokens_upper_95": 5.107103804313645,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.11174104084378,
            "data_time": 0.05411624535918236,
            "batch_time": 0.09727787412703037,
            "samples_per_second": 4444930.038855897,
            "samples_per_second_per_gpu": 555616.2548569872,
            "loss_sequences_lower_95": 6.059022857759724,
            "loss_sequences_upper_95": 6.164791710813555,
            "loss_tokens_lower_95": 6.057855443261419,
            "loss_tokens_upper_95": 6.164215331531474,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.216815387685215,
            "data_time": 0.09245906472206115,
            "batch_time": 0.13574437499046327,
            "samples_per_second": 4199932.103642298,
            "samples_per_second_per_gpu": 524991.5129552872,
            "loss_sequences_lower_95": 6.169715305237766,
            "loss_sequences_upper_95": 6.261229928881296,
            "loss_tokens_lower_95": 6.170534675966805,
            "loss_tokens_upper_95": 6.263867717371135,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.939317465087924,
            "data_time": 0.06814901903271675,
            "batch_time": 0.10925192385911942,
            "samples_per_second": 4002355.1103214948,
            "samples_per_second_per_gpu": 500294.38879018684,
            "loss_sequences_lower_95": 5.16073732594022,
            "loss_sequences_upper_95": 5.276485379191036,
            "loss_tokens_lower_95": 4.906931083224862,
            "loss_tokens_upper_95": 4.970871161512178,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.493725005149841,
            "data_time": 0.17206639051437378,
            "batch_time": 0.21669261157512665,
            "samples_per_second": 3933451.134416171,
            "samples_per_second_per_gpu": 491681.39180202136,
            "loss_sequences_lower_95": 7.132545092773437,
            "loss_sequences_upper_95": 7.633497741699219,
            "loss_tokens_lower_95": 6.258329444748925,
            "loss_tokens_upper_95": 6.600239067153375,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.214831441640854,
            "data_time": 0.14763487875461578,
            "batch_time": 0.1646835207939148,
            "samples_per_second": 866166.60472102,
            "samples_per_second_per_gpu": 108270.8255901275,
            "loss_sequences_lower_95": 4.861698055267334,
            "loss_sequences_upper_95": 5.720683646202088,
            "loss_tokens_lower_95": 4.6656284639205055,
            "loss_tokens_upper_95": 5.5576352788114,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.043548011231697,
            "data_time": 0.34237954020500183,
            "batch_time": 0.3780725449323654,
            "samples_per_second": 2386730.532962242,
            "samples_per_second_per_gpu": 298341.31662028027,
            "loss_sequences_lower_95": 5.791317784101113,
            "loss_sequences_upper_95": 6.325430631089485,
            "loss_tokens_lower_95": 4.795529708088184,
            "loss_tokens_upper_95": 5.193399092578566,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.280119022046077,
            "data_time": 0.052971482276916504,
            "batch_time": 0.09757623738712734,
            "samples_per_second": 4413703.970672923,
            "samples_per_second_per_gpu": 551712.9963341154,
            "loss_sequences_lower_95": 5.248971942919943,
            "loss_sequences_upper_95": 5.311861192029134,
            "loss_tokens_lower_95": 5.249711404236293,
            "loss_tokens_upper_95": 5.312002556810425,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.112633338543991,
            "data_time": 0.03419394720168341,
            "batch_time": 0.07727492849032085,
            "samples_per_second": 4357202.202627371,
            "samples_per_second_per_gpu": 544650.2753284214,
            "loss_sequences_lower_95": 6.247304636331385,
            "loss_sequences_upper_95": 6.4736444107194835,
            "loss_tokens_lower_95": 5.968892568156444,
            "loss_tokens_upper_95": 6.192469823755924,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.2829097871815325,
            "data_time": 0.18056365102529526,
            "batch_time": 0.21030966937541962,
            "samples_per_second": 1993257.7862790283,
            "samples_per_second_per_gpu": 249157.22328487854,
            "loss_sequences_lower_95": 4.189899240570627,
            "loss_sequences_upper_95": 4.561647452888908,
            "loss_tokens_lower_95": 4.082140119554377,
            "loss_tokens_upper_95": 4.411543718562095,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.59499635372749,
            "data_time": 0.07554509937763214,
            "batch_time": 0.12025034725666046,
            "samples_per_second": 4475386.607640458,
            "samples_per_second_per_gpu": 559423.3259550573,
            "loss_sequences_lower_95": 4.657002750869426,
            "loss_sequences_upper_95": 4.794677252645275,
            "loss_tokens_lower_95": 4.514924079611594,
            "loss_tokens_upper_95": 4.666426687389687,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.833563336511937,
            "data_time": 0.3277527540922165,
            "batch_time": 0.3623996376991272,
            "samples_per_second": 1756868.1027198483,
            "samples_per_second_per_gpu": 219608.51283998103,
            "loss_sequences_lower_95": 4.628910250780059,
            "loss_sequences_upper_95": 5.1057971116973135,
            "loss_tokens_lower_95": 4.643254788820351,
            "loss_tokens_upper_95": 5.040033655960058,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.977625138021798,
            "data_time": 0.028696792252758827,
            "batch_time": 0.07241015000776811,
            "samples_per_second": 4426278.126962058,
            "samples_per_second_per_gpu": 553284.7658702573,
            "loss_sequences_lower_95": 4.966280668243023,
            "loss_sequences_upper_95": 4.988940356430661,
            "loss_tokens_lower_95": 4.966278011461667,
            "loss_tokens_upper_95": 4.988908289470394,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.6188149174440256,
            "data_time": 0.30848929286003113,
            "batch_time": 0.33492685854434967,
            "samples_per_second": 1889126.0877625234,
            "samples_per_second_per_gpu": 236140.76097031543,
            "loss_sequences_lower_95": 3.5008528774224437,
            "loss_sequences_upper_95": 3.8731518939860816,
            "loss_tokens_lower_95": 3.3645947303108206,
            "loss_tokens_upper_95": 3.76548932953037,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.864919598347486,
            "data_time": 0.02780380129814148,
            "batch_time": 0.07207074125607808,
            "samples_per_second": 4404021.502513132,
            "samples_per_second_per_gpu": 550502.6878141415,
            "loss_sequences_lower_95": 6.655485793779481,
            "loss_sequences_upper_95": 6.701690374410377,
            "loss_tokens_lower_95": 5.7693117867504835,
            "loss_tokens_upper_95": 5.81616167794971,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.321557132720947,
            "data_time": 0.09339695051312447,
            "batch_time": 0.1379196159541607,
            "samples_per_second": 4317299.060788447,
            "samples_per_second_per_gpu": 539662.3825985559,
            "loss_sequences_lower_95": 5.3303895019531256,
            "loss_sequences_upper_95": 5.567953637695313,
            "loss_tokens_lower_95": 5.200256489269141,
            "loss_tokens_upper_95": 5.417005876746181,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.961526512063068,
            "data_time": 0.3450774699449539,
            "batch_time": 0.38820911943912506,
            "samples_per_second": 2149209.7259591864,
            "samples_per_second_per_gpu": 268651.2157448983,
            "loss_sequences_lower_95": 5.829012942106827,
            "loss_sequences_upper_95": 6.095140579887059,
            "loss_tokens_lower_95": 5.82754954462466,
            "loss_tokens_upper_95": 6.0922168499490486,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.579258268529719,
            "data_time": 0.06572559972604115,
            "batch_time": 0.10584848870833714,
            "samples_per_second": 4056878.170305804,
            "samples_per_second_per_gpu": 507109.7712882255,
            "loss_sequences_lower_95": 8.466918408942945,
            "loss_sequences_upper_95": 8.691114409475617,
            "loss_tokens_lower_95": 8.466847534179687,
            "loss_tokens_upper_95": 8.692716082948627,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.6985579977035523,
            "data_time": 0.06610243519147237,
            "batch_time": 0.11061129967371623,
            "samples_per_second": 4505322.778549552,
            "samples_per_second_per_gpu": 563165.347318694,
            "loss_sequences_lower_95": 2.8175565185546874,
            "loss_sequences_upper_95": 2.9064219726562497,
            "loss_tokens_lower_95": 2.647697110094038,
            "loss_tokens_upper_95": 2.7321004651860745,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.56612396694365,
            "data_time": 0.3405357450246811,
            "batch_time": 0.3811456114053726,
            "samples_per_second": 2729668.5260508074,
            "samples_per_second_per_gpu": 341208.56575635093,
            "loss_sequences_lower_95": 6.21289794921875,
            "loss_sequences_upper_95": 6.920564386276971,
            "loss_tokens_lower_95": 6.214570893787203,
            "loss_tokens_upper_95": 6.915665457589285,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.759932592511177,
            "data_time": 0.14754949510097504,
            "batch_time": 0.1655309647321701,
            "samples_per_second": 730965.435487265,
            "samples_per_second_per_gpu": 91370.67943590813,
            "loss_sequences_lower_95": 4.422454011440277,
            "loss_sequences_upper_95": 5.652339589595795,
            "loss_tokens_lower_95": 4.1708108992429125,
            "loss_tokens_upper_95": 4.740348504449903,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.854590882778168,
            "data_time": 0.09015636891126633,
            "batch_time": 0.13484610617160797,
            "samples_per_second": 4373226.032695467,
            "samples_per_second_per_gpu": 546653.2540869333,
            "loss_sequences_lower_95": 7.979890478515625,
            "loss_sequences_upper_95": 8.314478735351562,
            "loss_tokens_lower_95": 7.687670526649746,
            "loss_tokens_upper_95": 7.981209268586083,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.32142819070816,
            "data_time": 0.10477030649781227,
            "batch_time": 0.14869026467204094,
            "samples_per_second": 4260997.899394759,
            "samples_per_second_per_gpu": 532624.7374243449,
            "loss_sequences_lower_95": 8.58231201171875,
            "loss_sequences_upper_95": 8.825714575195313,
            "loss_tokens_lower_95": 8.18276210702773,
            "loss_tokens_upper_95": 8.423621173948423,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.7401791758638,
            "data_time": 0.04209815586606661,
            "batch_time": 0.08574409782886505,
            "samples_per_second": 4527592.888755124,
            "samples_per_second_per_gpu": 565949.1110943905,
            "loss_sequences_lower_95": 4.714230083755867,
            "loss_sequences_upper_95": 4.7662427502592815,
            "loss_tokens_lower_95": 4.714483243588669,
            "loss_tokens_upper_95": 4.766112307306508,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.741972904967089,
            "data_time": 0.1141926646232605,
            "batch_time": 0.15476304292678833,
            "samples_per_second": 4008839.876573132,
            "samples_per_second_per_gpu": 501104.9845716415,
            "loss_sequences_lower_95": 5.6472814135104645,
            "loss_sequences_upper_95": 5.836212336339526,
            "loss_tokens_lower_95": 5.649104620820733,
            "loss_tokens_upper_95": 5.834090581797235,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.676294724464416,
            "data_time": 0.0905003622174263,
            "batch_time": 0.1347971148788929,
            "samples_per_second": 4322244.1082136035,
            "samples_per_second_per_gpu": 540280.5135267004,
            "loss_sequences_lower_95": 8.60060859375,
            "loss_sequences_upper_95": 8.751770092773437,
            "loss_tokens_lower_95": 8.599582641601563,
            "loss_tokens_upper_95": 8.749664379882812,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.046726798244321,
            "data_time": 0.028336677522886367,
            "batch_time": 0.07220218437058586,
            "samples_per_second": 4446789.598412367,
            "samples_per_second_per_gpu": 555848.6998015458,
            "loss_sequences_lower_95": 5.878059735690634,
            "loss_sequences_upper_95": 5.968604402569182,
            "loss_tokens_lower_95": 4.952324721132108,
            "loss_tokens_upper_95": 5.016146469295205,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.834143053239851,
            "data_time": 0.2048483235495431,
            "batch_time": 0.23695075511932373,
            "samples_per_second": 2022204.8660426794,
            "samples_per_second_per_gpu": 252775.60825533493,
            "loss_sequences_lower_95": 5.704943255524137,
            "loss_sequences_upper_95": 5.9633382313287076,
            "loss_tokens_lower_95": 5.702936895569758,
            "loss_tokens_upper_95": 5.96244399796671,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.938164134119071,
            "data_time": 0.18384630978107452,
            "batch_time": 0.2291417196393013,
            "samples_per_second": 3751480.071164388,
            "samples_per_second_per_gpu": 468935.0088955485,
            "loss_sequences_lower_95": 5.842120756261489,
            "loss_sequences_upper_95": 6.033910630170037,
            "loss_tokens_lower_95": 5.845073553347119,
            "loss_tokens_upper_95": 6.0330801690793505,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.594970442456409,
            "data_time": 0.028185201808810234,
            "batch_time": 0.07172202551737428,
            "samples_per_second": 4488807.049673526,
            "samples_per_second_per_gpu": 561100.8812091907,
            "loss_sequences_lower_95": 6.205779419939246,
            "loss_sequences_upper_95": 6.295157213962483,
            "loss_tokens_lower_95": 5.500364160396121,
            "loss_tokens_upper_95": 5.578868013802838,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.849485551238691,
            "data_time": 0.32525891065597534,
            "batch_time": 0.36282770335674286,
            "samples_per_second": 2216266.8124835105,
            "samples_per_second_per_gpu": 277033.3515604388,
            "loss_sequences_lower_95": 4.772562791935351,
            "loss_sequences_upper_95": 4.929793318491134,
            "loss_tokens_lower_95": 4.7718556883473875,
            "loss_tokens_upper_95": 4.927785043867807,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.155899256814146,
            "data_time": 0.05112391710281372,
            "batch_time": 0.09500626761179704,
            "samples_per_second": 4369412.011170326,
            "samples_per_second_per_gpu": 546176.5013962907,
            "loss_sequences_lower_95": 7.120593970995795,
            "loss_sequences_upper_95": 7.191729560875382,
            "loss_tokens_lower_95": 7.1208894931431574,
            "loss_tokens_upper_95": 7.191093973982225,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.890558064562603,
            "data_time": 0.3265542984008789,
            "batch_time": 0.3668387234210968,
            "samples_per_second": 2559298.060709774,
            "samples_per_second_per_gpu": 319912.25758872175,
            "loss_sequences_lower_95": 5.7494805493401095,
            "loss_sequences_upper_95": 6.031862870003414,
            "loss_tokens_lower_95": 5.750961333339654,
            "loss_tokens_upper_95": 6.0315454575621965,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.609221410751343,
            "data_time": 0.2765186131000519,
            "batch_time": 0.29676327109336853,
            "samples_per_second": 1133659.0774373533,
            "samples_per_second_per_gpu": 141707.38467966917,
            "loss_sequences_lower_95": 5.3043568166097,
            "loss_sequences_upper_95": 6.1860182698567705,
            "loss_tokens_lower_95": 4.768525208367242,
            "loss_tokens_upper_95": 6.1617410977681475,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.2072547992070515,
            "data_time": 0.2771286368370056,
            "batch_time": 0.2970985174179077,
            "samples_per_second": 1427283.8641419702,
            "samples_per_second_per_gpu": 178410.48301774627,
            "loss_sequences_lower_95": 5.100425033569336,
            "loss_sequences_upper_95": 6.196558825174967,
            "loss_tokens_lower_95": 4.284105613794219,
            "loss_tokens_upper_95": 5.826635450727483,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.670376930742446,
            "data_time": 0.04524958665881838,
            "batch_time": 0.08793639604534421,
            "samples_per_second": 4322360.170112123,
            "samples_per_second_per_gpu": 540295.0212640153,
            "loss_sequences_lower_95": 8.648262754280191,
            "loss_sequences_upper_95": 8.692438874953977,
            "loss_tokens_lower_95": 8.64862655617176,
            "loss_tokens_upper_95": 8.692357283804307,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.332653077928767,
            "data_time": 0.022664514626478005,
            "batch_time": 0.06718641246250102,
            "samples_per_second": 4490758.354693654,
            "samples_per_second_per_gpu": 561344.7943367067,
            "loss_sequences_lower_95": 4.060332449309307,
            "loss_sequences_upper_95": 4.095607144994187,
            "loss_tokens_lower_95": 3.263025471231036,
            "loss_tokens_upper_95": 3.2900676676274,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.737568254545917,
            "data_time": 0.33042244613170624,
            "batch_time": 0.36074477434158325,
            "samples_per_second": 2283935.700940609,
            "samples_per_second_per_gpu": 285491.9626175761,
            "loss_sequences_lower_95": 7.020230499027282,
            "loss_sequences_upper_95": 7.4838626531165415,
            "loss_tokens_lower_95": 6.5382275162736425,
            "loss_tokens_upper_95": 6.854614851522199,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 11.147450008907834,
            "data_time": 0.21355777978897095,
            "batch_time": 0.23058512806892395,
            "samples_per_second": 1025750.2638406478,
            "samples_per_second_per_gpu": 128218.78298008097,
            "loss_sequences_lower_95": 10.606516430829021,
            "loss_sequences_upper_95": 11.854187156058646,
            "loss_tokens_lower_95": 9.984079582308546,
            "loss_tokens_upper_95": 11.959188767421391,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.5855355640737026,
            "data_time": 0.3033272624015808,
            "batch_time": 0.3384963870048523,
            "samples_per_second": 1980670.4654398675,
            "samples_per_second_per_gpu": 247583.80817998343,
            "loss_sequences_lower_95": 6.792161448408917,
            "loss_sequences_upper_95": 7.179411148443455,
            "loss_tokens_lower_95": 6.387330280172414,
            "loss_tokens_upper_95": 6.647353737384357,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.704978937056007,
            "data_time": 0.33158546686172485,
            "batch_time": 0.3658374547958374,
            "samples_per_second": 2185584.5455072764,
            "samples_per_second_per_gpu": 273198.06818840955,
            "loss_sequences_lower_95": 6.895065382050305,
            "loss_sequences_upper_95": 7.244960319705126,
            "loss_tokens_lower_95": 6.534274411886508,
            "loss_tokens_upper_95": 6.750562047393573,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.7699605342818465,
            "data_time": 0.33392922580242157,
            "batch_time": 0.36914029717445374,
            "samples_per_second": 2123034.539895011,
            "samples_per_second_per_gpu": 265379.3174868764,
            "loss_sequences_lower_95": 7.0486658049792785,
            "loss_sequences_upper_95": 7.52532218374857,
            "loss_tokens_lower_95": 6.5429393941606175,
            "loss_tokens_upper_95": 6.884135674799633,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.792129246200004,
            "data_time": 0.3804399222135544,
            "batch_time": 0.41505269706249237,
            "samples_per_second": 1363093.553324695,
            "samples_per_second_per_gpu": 170386.69416558687,
            "loss_sequences_lower_95": 6.912095567656727,
            "loss_sequences_upper_95": 7.2235047317132715,
            "loss_tokens_lower_95": 6.633328846013434,
            "loss_tokens_upper_95": 6.827182263525847,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.307714403045844,
            "data_time": 0.3207576125860214,
            "batch_time": 0.3552016019821167,
            "samples_per_second": 1960526.857955425,
            "samples_per_second_per_gpu": 245065.85724442813,
            "loss_sequences_lower_95": 6.380213444869711,
            "loss_sequences_upper_95": 6.609125722565266,
            "loss_tokens_lower_95": 6.187080252358146,
            "loss_tokens_upper_95": 6.328989733383299,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.774555956445089,
            "data_time": 0.33878089487552643,
            "batch_time": 0.3735736459493637,
            "samples_per_second": 2023332.692445599,
            "samples_per_second_per_gpu": 252916.58655569988,
            "loss_sequences_lower_95": 5.970238336702673,
            "loss_sequences_upper_95": 6.230667951630383,
            "loss_tokens_lower_95": 5.649351289368596,
            "loss_tokens_upper_95": 5.788696315551537,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/checkpoints/epoch_4.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-4.0/params.txt",
    "uuid": "ad053e2a-ec41-4244-8aed-ea3ed07973b6",
    "creation_date": "2023_12_14-05_01_02"
}