{
    "name": "rw_original-d=512_l=8_h=4-1.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 1578280960,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 1.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "315656192",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=512_l=8_h=4-1.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 4.118569912513097,
            "data_time": 0.030678343027830124,
            "batch_time": 0.33859167620539665,
            "samples_per_second": 1740431.014249444,
            "samples_per_second_per_gpu": 217553.8767811805,
            "loss_sequences_lower_95": 4.030680224100749,
            "loss_sequences_upper_95": 4.207416540781657,
            "loss_tokens_lower_95": 4.103818473815918,
            "loss_tokens_upper_95": 4.133272323608399,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8609163607427246,
            "data_time": 0.0014658211074267977,
            "batch_time": 0.015294111406337827,
            "samples_per_second": 2246625.7448380915,
            "samples_per_second_per_gpu": 280828.21810476144,
            "loss_sequences_lower_95": 3.858556424193226,
            "loss_sequences_upper_95": 3.8632436392634033,
            "loss_tokens_lower_95": 3.8498086875,
            "loss_tokens_upper_95": 3.87229009375,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.3871750310975677,
            "data_time": 0.009282376289367676,
            "batch_time": 0.022991680145263673,
            "samples_per_second": 2199479.0075423312,
            "samples_per_second_per_gpu": 274934.8759427914,
            "loss_sequences_lower_95": 3.332336924027423,
            "loss_sequences_upper_95": 3.45586463149713,
            "loss_tokens_lower_95": 3.3737263958333332,
            "loss_tokens_upper_95": 3.4007220052083333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.990357292755363,
            "data_time": 0.0015950578411943034,
            "batch_time": 0.015022407432920054,
            "samples_per_second": 2321508.7120254445,
            "samples_per_second_per_gpu": 290188.58900318056,
            "loss_sequences_lower_95": 3.9511511658344074,
            "loss_sequences_upper_95": 4.030806137242267,
            "loss_tokens_lower_95": 3.9773532916666667,
            "loss_tokens_upper_95": 4.0031339062499995,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9224081034572937,
            "data_time": 0.009976420269544381,
            "batch_time": 0.02420572645635719,
            "samples_per_second": 2134767.3942209044,
            "samples_per_second_per_gpu": 266845.92427761306,
            "loss_sequences_lower_95": 3.864587234528147,
            "loss_sequences_upper_95": 3.996494530951661,
            "loss_tokens_lower_95": 3.910711989583333,
            "loss_tokens_upper_95": 3.93389259375,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.120191503430126,
            "data_time": 0.0038056305569151173,
            "batch_time": 0.01758483667736468,
            "samples_per_second": 2275694.8919813195,
            "samples_per_second_per_gpu": 284461.86149766494,
            "loss_sequences_lower_95": 4.070801934735618,
            "loss_sequences_upper_95": 4.173467839161284,
            "loss_tokens_lower_95": 4.1073826562499995,
            "loss_tokens_upper_95": 4.1330010625,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9631299144394543,
            "data_time": 0.0015642140273363322,
            "batch_time": 0.014905009534191735,
            "samples_per_second": 2341011.939875704,
            "samples_per_second_per_gpu": 292626.492484463,
            "loss_sequences_lower_95": 3.929631666533801,
            "loss_sequences_upper_95": 3.9959582071109696,
            "loss_tokens_lower_95": 3.9468007708333337,
            "loss_tokens_upper_95": 3.9800375,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.322859477397659,
            "data_time": 0.0015637875600421544,
            "batch_time": 0.0149772107376807,
            "samples_per_second": 2329138.2233432303,
            "samples_per_second_per_gpu": 291142.2779179038,
            "loss_sequences_lower_95": 4.2982371461878275,
            "loss_sequences_upper_95": 4.349601184145942,
            "loss_tokens_lower_95": 4.310915989583333,
            "loss_tokens_upper_95": 4.334633041666667,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.0444444049664625,
            "data_time": 0.010082478561098613,
            "batch_time": 0.02444747043034387,
            "samples_per_second": 2150195.498887203,
            "samples_per_second_per_gpu": 268774.4373609004,
            "loss_sequences_lower_95": 3.956633218904821,
            "loss_sequences_upper_95": 4.150220687990266,
            "loss_tokens_lower_95": 4.032225395833334,
            "loss_tokens_upper_95": 4.056431875,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.200752765293649,
            "data_time": 0.0101056769490242,
            "batch_time": 0.02388585451990366,
            "samples_per_second": 2222520.0425022454,
            "samples_per_second_per_gpu": 277815.0053127807,
            "loss_sequences_lower_95": 5.085824331652977,
            "loss_sequences_upper_95": 5.344578690962358,
            "loss_tokens_lower_95": 5.18713375,
            "loss_tokens_upper_95": 5.214208625,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.116158947089248,
            "data_time": 0.001371980268813447,
            "batch_time": 0.014849804629688886,
            "samples_per_second": 2319636.034920875,
            "samples_per_second_per_gpu": 289954.5043651094,
            "loss_sequences_lower_95": 4.104315912414776,
            "loss_sequences_upper_95": 4.128512647267285,
            "loss_tokens_lower_95": 4.104242979166667,
            "loss_tokens_upper_95": 4.1281673125000005,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.006827708235982,
            "data_time": 0.0027032053341575707,
            "batch_time": 0.01619080699155174,
            "samples_per_second": 2314612.7897227234,
            "samples_per_second_per_gpu": 289326.5987153404,
            "loss_sequences_lower_95": 3.98149305261805,
            "loss_sequences_upper_95": 4.033543315285298,
            "loss_tokens_lower_95": 3.994647875,
            "loss_tokens_upper_95": 4.018865427083333,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.399617544777737,
            "data_time": 0.009919844126041698,
            "batch_time": 0.023815813743078663,
            "samples_per_second": 2150959.3722910653,
            "samples_per_second_per_gpu": 268869.92153638316,
            "loss_sequences_lower_95": 4.3107323327364355,
            "loss_sequences_upper_95": 4.508996458227688,
            "loss_tokens_lower_95": 4.386288239583333,
            "loss_tokens_upper_95": 4.41270484375,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7372157097833947,
            "data_time": 0.010314245148009039,
            "batch_time": 0.024168244396072933,
            "samples_per_second": 2188630.5888897213,
            "samples_per_second_per_gpu": 273578.82361121516,
            "loss_sequences_lower_95": 3.6515821219947333,
            "loss_sequences_upper_95": 3.8379317087457037,
            "loss_tokens_lower_95": 3.7247136875,
            "loss_tokens_upper_95": 3.74969125,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.854047948663885,
            "data_time": 0.08375769002096993,
            "batch_time": 0.09971125636781965,
            "samples_per_second": 1055139.8397573119,
            "samples_per_second_per_gpu": 131892.47996966398,
            "loss_sequences_lower_95": 4.771655932339755,
            "loss_sequences_upper_95": 4.950437285683371,
            "loss_tokens_lower_95": 4.828604819557884,
            "loss_tokens_upper_95": 4.879590129852295,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.090346150773607,
            "data_time": 0.013952672481536865,
            "batch_time": 0.027970553799109024,
            "samples_per_second": 2108187.2332283114,
            "samples_per_second_per_gpu": 263523.4041535389,
            "loss_sequences_lower_95": 4.020590841665908,
            "loss_sequences_upper_95": 4.15794666167946,
            "loss_tokens_lower_95": 4.076803322916667,
            "loss_tokens_upper_95": 4.10365571875,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.989197424030555,
            "data_time": 0.0130202683309714,
            "batch_time": 0.027290528019269306,
            "samples_per_second": 2128119.861330421,
            "samples_per_second_per_gpu": 266014.98266630265,
            "loss_sequences_lower_95": 5.899265565092143,
            "loss_sequences_upper_95": 6.107711558480376,
            "loss_tokens_lower_95": 5.97739871875,
            "loss_tokens_upper_95": 6.0006280104166665,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.514844218238455,
            "data_time": 0.03704093396663666,
            "batch_time": 0.05268612131476402,
            "samples_per_second": 1698706.662642463,
            "samples_per_second_per_gpu": 212338.3328303079,
            "loss_sequences_lower_95": 4.35795148005251,
            "loss_sequences_upper_95": 4.792672504362513,
            "loss_tokens_lower_95": 4.500170122990843,
            "loss_tokens_upper_95": 4.529715303514824,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.083880687656438,
            "data_time": 0.002000307473343434,
            "batch_time": 0.015762578415979285,
            "samples_per_second": 2245818.535653639,
            "samples_per_second_per_gpu": 280727.3169567049,
            "loss_sequences_lower_95": 5.062538542297215,
            "loss_sequences_upper_95": 5.105730214490457,
            "loss_tokens_lower_95": 5.0622581474327015,
            "loss_tokens_upper_95": 5.105487624737395,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5261841055359695,
            "data_time": 0.002192678535060518,
            "batch_time": 0.015860010294397923,
            "samples_per_second": 2257571.2513762387,
            "samples_per_second_per_gpu": 282196.40642202983,
            "loss_sequences_lower_95": 3.526767809574786,
            "loss_sequences_upper_95": 3.55275244364295,
            "loss_tokens_lower_95": 3.504770899738273,
            "loss_tokens_upper_95": 3.524666095140532,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.759228298446734,
            "data_time": 0.003115645902096158,
            "batch_time": 0.016869430604911324,
            "samples_per_second": 2246694.5914382227,
            "samples_per_second_per_gpu": 280836.82392977783,
            "loss_sequences_lower_95": 5.994804982729098,
            "loss_sequences_upper_95": 6.296373052871841,
            "loss_tokens_lower_95": 5.236616335421878,
            "loss_tokens_upper_95": 5.451257734319157,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.7525561897357305,
            "data_time": 0.004284435130180197,
            "batch_time": 0.017897356856376567,
            "samples_per_second": 2250601.7907399707,
            "samples_per_second_per_gpu": 281325.22384249634,
            "loss_sequences_lower_95": 5.901283170572917,
            "loss_sequences_upper_95": 6.103791650390625,
            "loss_tokens_lower_95": 5.3950910966981125,
            "loss_tokens_upper_95": 5.536678925904088,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7295990313789535,
            "data_time": 0.004534197069401115,
            "batch_time": 0.0184783975103502,
            "samples_per_second": 2203525.9089656663,
            "samples_per_second_per_gpu": 275440.7386207083,
            "loss_sequences_lower_95": 3.774885176166248,
            "loss_sequences_upper_95": 3.8417139245670255,
            "loss_tokens_lower_95": 3.6280030596626656,
            "loss_tokens_upper_95": 3.66100799852991,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.1408509677106684,
            "data_time": 0.0229492826121194,
            "batch_time": 0.03713571173804147,
            "samples_per_second": 2050486.0434418065,
            "samples_per_second_per_gpu": 256310.7554302258,
            "loss_sequences_lower_95": 3.114972749189897,
            "loss_sequences_upper_95": 3.2372720753062856,
            "loss_tokens_lower_95": 3.0628609905648494,
            "loss_tokens_upper_95": 3.1164628420084655,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.950808171836697,
            "data_time": 0.020567186176776886,
            "batch_time": 0.034686170518398285,
            "samples_per_second": 2013627.0903725454,
            "samples_per_second_per_gpu": 251703.38629656818,
            "loss_sequences_lower_95": 3.9399477713448663,
            "loss_sequences_upper_95": 4.141407408422353,
            "loss_tokens_lower_95": 3.8167170044854433,
            "loss_tokens_upper_95": 3.915319396104858,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.517820530732473,
            "data_time": 0.016454902979043815,
            "batch_time": 0.030787775149712197,
            "samples_per_second": 1974727.851268769,
            "samples_per_second_per_gpu": 246840.98140859613,
            "loss_sequences_lower_95": 4.470696431477864,
            "loss_sequences_upper_95": 4.5766715189615885,
            "loss_tokens_lower_95": 4.3844297656612135,
            "loss_tokens_upper_95": 4.624190842825344,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.469124416817268,
            "data_time": 0.001938155194622198,
            "batch_time": 0.015662079004776334,
            "samples_per_second": 2251918.0579437567,
            "samples_per_second_per_gpu": 281489.7572429696,
            "loss_sequences_lower_95": 7.487753182514394,
            "loss_sequences_upper_95": 7.565828876224103,
            "loss_tokens_lower_95": 7.314566249283291,
            "loss_tokens_upper_95": 7.396827853994457,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.302751145980976,
            "data_time": 0.0029577412861305596,
            "batch_time": 0.016648148930312804,
            "samples_per_second": 2249793.21600383,
            "samples_per_second_per_gpu": 281224.15200047876,
            "loss_sequences_lower_95": 5.9042452417238795,
            "loss_sequences_upper_95": 6.2195868392584694,
            "loss_tokens_lower_95": 4.492745575311507,
            "loss_tokens_upper_95": 4.637708617166212,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.7097854886852435,
            "data_time": 0.005167569663073565,
            "batch_time": 0.01910202930102477,
            "samples_per_second": 2185828.528548841,
            "samples_per_second_per_gpu": 273228.5660686051,
            "loss_sequences_lower_95": 5.1659692966083615,
            "loss_sequences_upper_95": 5.508365982628519,
            "loss_tokens_lower_95": 4.262495060465573,
            "loss_tokens_upper_95": 4.423424193981425,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.87262331078586,
            "data_time": 0.023788054074559892,
            "batch_time": 0.03812144696712494,
            "samples_per_second": 2011486.5489844072,
            "samples_per_second_per_gpu": 251435.8186230509,
            "loss_sequences_lower_95": 5.793277939051798,
            "loss_sequences_upper_95": 5.949747191825414,
            "loss_tokens_lower_95": 5.795549742816245,
            "loss_tokens_upper_95": 5.947972573754995,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.907189040184021,
            "data_time": 0.04920580753913292,
            "batch_time": 0.0644064683180589,
            "samples_per_second": 1734281.700058851,
            "samples_per_second_per_gpu": 216785.21250735637,
            "loss_sequences_lower_95": 3.7685496292114258,
            "loss_sequences_upper_95": 4.15228092956543,
            "loss_tokens_lower_95": 3.5863166385985017,
            "loss_tokens_upper_95": 4.058344237407759,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.136090340121376,
            "data_time": 0.0033678057734951652,
            "batch_time": 0.0172351710879242,
            "samples_per_second": 2227699.8625555905,
            "samples_per_second_per_gpu": 278462.4828194488,
            "loss_sequences_lower_95": 5.094284448067266,
            "loss_sequences_upper_95": 5.177096873100851,
            "loss_tokens_lower_95": 5.094151470141377,
            "loss_tokens_upper_95": 5.1780283433022,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.446214314663049,
            "data_time": 0.004985875445516813,
            "batch_time": 0.018661641763435103,
            "samples_per_second": 2243499.038814384,
            "samples_per_second_per_gpu": 280437.379851798,
            "loss_sequences_lower_95": 5.388949717428721,
            "loss_sequences_upper_95": 5.5035650729742525,
            "loss_tokens_lower_95": 5.386689467121596,
            "loss_tokens_upper_95": 5.504940838388103,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.044657605127619,
            "data_time": 0.0036234456571080514,
            "batch_time": 0.01730553326287486,
            "samples_per_second": 2239660.1170205567,
            "samples_per_second_per_gpu": 279957.5146275696,
            "loss_sequences_lower_95": 4.1771590040868976,
            "loss_sequences_upper_95": 4.298950660215757,
            "loss_tokens_lower_95": 3.891122910226163,
            "loss_tokens_upper_95": 3.9529484663258154,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.04092969751358,
            "data_time": 0.011065579019486904,
            "batch_time": 0.025337004102766514,
            "samples_per_second": 2075095.8526655002,
            "samples_per_second_per_gpu": 259386.98158318753,
            "loss_sequences_lower_95": 6.225847961425782,
            "loss_sequences_upper_95": 6.76153955078125,
            "loss_tokens_lower_95": 5.3987801209101995,
            "loss_tokens_upper_95": 5.759583852529368,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.599930495023727,
            "data_time": 0.159471333026886,
            "batch_time": 0.17649586498737335,
            "samples_per_second": 788406.6196238765,
            "samples_per_second_per_gpu": 98550.82745298457,
            "loss_sequences_lower_95": 4.328902173042297,
            "loss_sequences_upper_95": 4.968046438694,
            "loss_tokens_lower_95": 4.129334513346355,
            "loss_tokens_upper_95": 4.9062409411901715,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.751636203678175,
            "data_time": 0.028024325979516863,
            "batch_time": 0.042705363415657206,
            "samples_per_second": 1825929.7834035694,
            "samples_per_second_per_gpu": 228241.22292544617,
            "loss_sequences_lower_95": 5.041393665883733,
            "loss_sequences_upper_95": 5.61806226708423,
            "loss_tokens_lower_95": 3.808233229713483,
            "loss_tokens_upper_95": 4.191681465570392,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.176979385645122,
            "data_time": 0.0030804663482639524,
            "batch_time": 0.017064835048384137,
            "samples_per_second": 2199186.766865961,
            "samples_per_second_per_gpu": 274898.3458582451,
            "loss_sequences_lower_95": 3.156712656616904,
            "loss_sequences_upper_95": 3.197359852188797,
            "loss_tokens_lower_95": 3.1563684865969037,
            "loss_tokens_upper_95": 3.19758583425486,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.1910230691230055,
            "data_time": 0.002732381276673571,
            "batch_time": 0.016638477041964667,
            "samples_per_second": 2223655.0066895788,
            "samples_per_second_per_gpu": 277956.87583619735,
            "loss_sequences_lower_95": 4.161756823240102,
            "loss_sequences_upper_95": 4.357971549112774,
            "loss_tokens_lower_95": 3.9654551696221065,
            "loss_tokens_upper_95": 4.155704889078449,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.63821118524223,
            "data_time": 0.018198654055595398,
            "batch_time": 0.03239225678973728,
            "samples_per_second": 2000647.3154311404,
            "samples_per_second_per_gpu": 250080.91442889255,
            "loss_sequences_lower_95": 3.4741290585025326,
            "loss_sequences_upper_95": 3.8718030768873053,
            "loss_tokens_lower_95": 3.370557645644224,
            "loss_tokens_upper_95": 3.6810155651505405,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9637264872312357,
            "data_time": 0.004841909185051918,
            "batch_time": 0.018659044057130814,
            "samples_per_second": 2211034.7198630646,
            "samples_per_second_per_gpu": 276379.3399828831,
            "loss_sequences_lower_95": 3.988120535328902,
            "loss_sequences_upper_95": 4.131206451632794,
            "loss_tokens_lower_95": 3.827687245886914,
            "loss_tokens_upper_95": 3.975438394320474,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4263370037078857,
            "data_time": 0.02938646361941383,
            "batch_time": 0.04456380435398647,
            "samples_per_second": 1862716.6315522317,
            "samples_per_second_per_gpu": 232839.57894402897,
            "loss_sequences_lower_95": 3.2301734273026628,
            "loss_sequences_upper_95": 3.755203098204078,
            "loss_tokens_lower_95": 3.1314500618914405,
            "loss_tokens_upper_95": 3.5249973714059952,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.9635332985672145,
            "data_time": 0.0022391609987448655,
            "batch_time": 0.015952751153571768,
            "samples_per_second": 2249372.825514543,
            "samples_per_second_per_gpu": 281171.60318931786,
            "loss_sequences_lower_95": 4.9530009810555855,
            "loss_sequences_upper_95": 4.9739955999012295,
            "loss_tokens_lower_95": 4.953082374287357,
            "loss_tokens_upper_95": 4.973786964424135,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.6366499812857618,
            "data_time": 0.04630014679648659,
            "batch_time": 0.06145505471663042,
            "samples_per_second": 1692592.0693126007,
            "samples_per_second_per_gpu": 211574.00866407508,
            "loss_sequences_lower_95": 1.5605371864096633,
            "loss_sequences_upper_95": 1.774386207802782,
            "loss_tokens_lower_95": 1.4262279842380614,
            "loss_tokens_upper_95": 1.7176225301065122,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.633048477181599,
            "data_time": 0.001677131125733119,
            "batch_time": 0.015371077306012795,
            "samples_per_second": 2254365.902331433,
            "samples_per_second_per_gpu": 281795.7377914291,
            "loss_sequences_lower_95": 6.046176870414047,
            "loss_sequences_upper_95": 6.0984493026893345,
            "loss_tokens_lower_95": 4.992429799323017,
            "loss_tokens_upper_95": 5.044073597678916,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.551418666601181,
            "data_time": 0.005734594095320929,
            "batch_time": 0.019538850065261598,
            "samples_per_second": 2208372.512884165,
            "samples_per_second_per_gpu": 276046.56411052065,
            "loss_sequences_lower_95": 5.52523623046875,
            "loss_sequences_upper_95": 5.7827701171875,
            "loss_tokens_lower_95": 5.31950047279824,
            "loss_tokens_upper_95": 5.5636159851121425,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.253842960233274,
            "data_time": 0.022714667401071323,
            "batch_time": 0.03714169486094329,
            "samples_per_second": 1990908.9254397138,
            "samples_per_second_per_gpu": 248863.61567996422,
            "loss_sequences_lower_95": 5.099034078846807,
            "loss_sequences_upper_95": 5.4073607336956515,
            "loss_tokens_lower_95": 5.100473115340523,
            "loss_tokens_upper_95": 5.405211128566576,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.135937994537931,
            "data_time": 0.004531830549240112,
            "batch_time": 0.01826538019869701,
            "samples_per_second": 2226360.2360032364,
            "samples_per_second_per_gpu": 278295.02950040455,
            "loss_sequences_lower_95": 7.0253369140625,
            "loss_sequences_upper_95": 7.2428377278645835,
            "loss_tokens_lower_95": 7.025635301994555,
            "loss_tokens_upper_95": 7.246930874911222,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.3258112706343332,
            "data_time": 0.00421924223291113,
            "batch_time": 0.018096941899746023,
            "samples_per_second": 2219743.386985956,
            "samples_per_second_per_gpu": 277467.9233732445,
            "loss_sequences_lower_95": 1.3783381876627605,
            "loss_sequences_upper_95": 1.4515059814453126,
            "loss_tokens_lower_95": 1.2304734139749651,
            "loss_tokens_upper_95": 1.2966734213216535,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.431929739316304,
            "data_time": 0.022221667425973073,
            "batch_time": 0.036347819226128716,
            "samples_per_second": 1965171.7143843863,
            "samples_per_second_per_gpu": 245646.4642980483,
            "loss_sequences_lower_95": 6.126576291038877,
            "loss_sequences_upper_95": 6.735479038783482,
            "loss_tokens_lower_95": 6.125793064662388,
            "loss_tokens_upper_95": 6.739723074776785,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.904840961098671,
            "data_time": 0.1470084935426712,
            "batch_time": 0.1643054187297821,
            "samples_per_second": 868291.4363152015,
            "samples_per_second_per_gpu": 108536.42953940018,
            "loss_sequences_lower_95": 2.6600088238716126,
            "loss_sequences_upper_95": 3.680256164073944,
            "loss_tokens_lower_95": 2.383027425549694,
            "loss_tokens_upper_95": 2.904454106596327,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.436962790489197,
            "data_time": 0.005897110416775658,
            "batch_time": 0.019848656559747362,
            "samples_per_second": 2196463.6364125446,
            "samples_per_second_per_gpu": 274557.9545515681,
            "loss_sequences_lower_95": 7.3908783691406255,
            "loss_sequences_upper_95": 7.733980969238281,
            "loss_tokens_lower_95": 7.118587404822335,
            "loss_tokens_upper_95": 7.42652924565091,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.349067360401153,
            "data_time": 0.00562076625369844,
            "batch_time": 0.019535266217731294,
            "samples_per_second": 2196044.7732242923,
            "samples_per_second_per_gpu": 274505.59665303654,
            "loss_sequences_lower_95": 7.450087475585938,
            "loss_sequences_upper_95": 7.6721666870117184,
            "loss_tokens_lower_95": 7.10103135660522,
            "loss_tokens_upper_95": 7.287032648582248,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.305901006608452,
            "data_time": 0.0037092719189698083,
            "batch_time": 0.017497951131201908,
            "samples_per_second": 2228876.2017286695,
            "samples_per_second_per_gpu": 278609.5252160837,
            "loss_sequences_lower_95": 5.2808372362004485,
            "loss_sequences_upper_95": 5.331093808927673,
            "loss_tokens_lower_95": 5.281023693180628,
            "loss_tokens_upper_95": 5.330958242541066,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.379499409788398,
            "data_time": 0.008197466052190775,
            "batch_time": 0.02268730765743198,
            "samples_per_second": 2071816.0568536723,
            "samples_per_second_per_gpu": 258977.00710670903,
            "loss_sequences_lower_95": 5.285438436809956,
            "loss_sequences_upper_95": 5.471053359915035,
            "loss_tokens_lower_95": 5.282515874765985,
            "loss_tokens_upper_95": 5.469948389196909,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.848550235271454,
            "data_time": 0.006341370798292614,
            "batch_time": 0.0204585696023608,
            "samples_per_second": 2157980.707573922,
            "samples_per_second_per_gpu": 269747.58844674024,
            "loss_sequences_lower_95": 7.7891361450195316,
            "loss_sequences_upper_95": 7.909880981445312,
            "loss_tokens_lower_95": 7.789438342285156,
            "loss_tokens_upper_95": 7.906981262207031,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6630402072827097,
            "data_time": 0.0022335897997331528,
            "batch_time": 0.015959798185496338,
            "samples_per_second": 2248751.626491992,
            "samples_per_second_per_gpu": 281093.953311499,
            "loss_sequences_lower_95": 4.142344794007214,
            "loss_sequences_upper_95": 4.228988061361755,
            "loss_tokens_lower_95": 3.0890422964077766,
            "loss_tokens_upper_95": 3.148285543992343,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.712770442464459,
            "data_time": 0.01862733704703195,
            "batch_time": 0.032947850227355954,
            "samples_per_second": 1982409.2185022363,
            "samples_per_second_per_gpu": 247801.15231277954,
            "loss_sequences_lower_95": 5.534449084837045,
            "loss_sequences_upper_95": 5.885315966250292,
            "loss_tokens_lower_95": 5.540336904952776,
            "loss_tokens_upper_95": 5.883909436126253,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.593157833697749,
            "data_time": 0.010994941927492619,
            "batch_time": 0.025010745972394943,
            "samples_per_second": 2159278.307420088,
            "samples_per_second_per_gpu": 269909.788427511,
            "loss_sequences_lower_95": 5.468894066904105,
            "loss_sequences_upper_95": 5.7131322763480386,
            "loss_tokens_lower_95": 5.470475511737899,
            "loss_tokens_upper_95": 5.711610346775429,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.432333175138201,
            "data_time": 0.0023551305612047545,
            "batch_time": 0.016054805441777925,
            "samples_per_second": 2247062.246364127,
            "samples_per_second_per_gpu": 280882.78079551586,
            "loss_sequences_lower_95": 4.91361768269291,
            "loss_sequences_upper_95": 5.008096171649912,
            "loss_tokens_lower_95": 3.7417683195778495,
            "loss_tokens_upper_95": 3.820694188899989,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.473124761430044,
            "data_time": 0.028249748051166534,
            "batch_time": 0.04370534668366114,
            "samples_per_second": 1895645.749224799,
            "samples_per_second_per_gpu": 236955.71865309987,
            "loss_sequences_lower_95": 5.385647534567212,
            "loss_sequences_upper_95": 5.557624518177497,
            "loss_tokens_lower_95": 5.386071543214182,
            "loss_tokens_upper_95": 5.557953655908978,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.469086713236771,
            "data_time": 0.004273337758941092,
            "batch_time": 0.018220645895225516,
            "samples_per_second": 2210103.772414144,
            "samples_per_second_per_gpu": 276262.971551768,
            "loss_sequences_lower_95": 5.438500961630352,
            "loss_sequences_upper_95": 5.499813139095948,
            "loss_tokens_lower_95": 5.437846366112385,
            "loss_tokens_upper_95": 5.500515039659786,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.999492860534816,
            "data_time": 0.023600727861577814,
            "batch_time": 0.037766027450561526,
            "samples_per_second": 1941107.6664813648,
            "samples_per_second_per_gpu": 242638.4583101706,
            "loss_sequences_lower_95": 5.809593126611802,
            "loss_sequences_upper_95": 6.194630891374014,
            "loss_tokens_lower_95": 5.804656567619842,
            "loss_tokens_upper_95": 6.193533295566596,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.457563304901123,
            "data_time": 0.07855114340782166,
            "batch_time": 0.09374162554740906,
            "samples_per_second": 1394593.7019724553,
            "samples_per_second_per_gpu": 174324.2127465569,
            "loss_sequences_lower_95": 3.163068453470866,
            "loss_sequences_upper_95": 3.875244986216227,
            "loss_tokens_lower_95": 2.8653883192274305,
            "loss_tokens_upper_95": 3.8539443545871306,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.946596332391103,
            "data_time": 0.07825098931789398,
            "batch_time": 0.09299804270267487,
            "samples_per_second": 1471826.2862287702,
            "samples_per_second_per_gpu": 183978.28577859627,
            "loss_sequences_lower_95": 2.712679443359375,
            "loss_sequences_upper_95": 3.4279711341857912,
            "loss_tokens_lower_95": 2.269265845652377,
            "loss_tokens_upper_95": 3.300032497791761,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.969241427532528,
            "data_time": 0.0039270448993415676,
            "batch_time": 0.017622951902734173,
            "samples_per_second": 2241718.8741567433,
            "samples_per_second_per_gpu": 280214.8592695929,
            "loss_sequences_lower_95": 6.9479680164994475,
            "loss_sequences_upper_95": 6.9897293958256626,
            "loss_tokens_lower_95": 6.948693793722385,
            "loss_tokens_upper_95": 6.989948453608247,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.1109123420900004,
            "data_time": 0.0015681035392785228,
            "batch_time": 0.015284064399074412,
            "samples_per_second": 2249016.330982822,
            "samples_per_second_per_gpu": 281127.04137285275,
            "loss_sequences_lower_95": 1.3290830737269628,
            "loss_sequences_upper_95": 1.3660043654474117,
            "loss_tokens_lower_95": 0.8871437475686954,
            "loss_tokens_upper_95": 0.9047975565941402,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.509183613334115,
            "data_time": 0.04182526841759682,
            "batch_time": 0.05691206082701683,
            "samples_per_second": 1827917.9431456001,
            "samples_per_second_per_gpu": 228489.74289320002,
            "loss_sequences_lower_95": 5.55881556713675,
            "loss_sequences_upper_95": 5.955658247339444,
            "loss_tokens_lower_95": 5.152003644298084,
            "loss_tokens_upper_95": 5.440974365174404,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 9.187056464117926,
            "data_time": 0.11538334119887579,
            "batch_time": 0.13138389587402344,
            "samples_per_second": 1011616.5043312551,
            "samples_per_second_per_gpu": 126452.06304140689,
            "loss_sequences_lower_95": 8.727169366784997,
            "loss_sequences_upper_95": 9.856019695385083,
            "loss_tokens_lower_95": 8.127008697133006,
            "loss_tokens_upper_95": 9.936071551287615,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.410800587840196,
            "data_time": 0.029912593818846204,
            "batch_time": 0.0448237487248012,
            "samples_per_second": 1899910.3117912973,
            "samples_per_second_per_gpu": 237488.78897391216,
            "loss_sequences_lower_95": 5.387373919603301,
            "loss_sequences_upper_95": 5.749846063009122,
            "loss_tokens_lower_95": 4.994353850728028,
            "loss_tokens_upper_95": 5.239311215438393,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.55072140257533,
            "data_time": 0.030990032922653926,
            "batch_time": 0.045269452390216645,
            "samples_per_second": 1945101.5756487246,
            "samples_per_second_per_gpu": 243137.69695609057,
            "loss_sequences_lower_95": 5.521934704664276,
            "loss_sequences_upper_95": 5.845785903930664,
            "loss_tokens_lower_95": 5.174405289814706,
            "loss_tokens_upper_95": 5.378027355202592,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.569066379128433,
            "data_time": 0.030821709405808223,
            "batch_time": 0.04600648936771211,
            "samples_per_second": 1866824.431810428,
            "samples_per_second_per_gpu": 233353.0539763035,
            "loss_sequences_lower_95": 5.535719317924685,
            "loss_sequences_upper_95": 5.9616754950546635,
            "loss_tokens_lower_95": 5.08764263765095,
            "loss_tokens_upper_95": 5.403389918550531,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.689462900161743,
            "data_time": 0.031338274478912354,
            "batch_time": 0.0456799552554176,
            "samples_per_second": 1967899.9690394923,
            "samples_per_second_per_gpu": 245987.49612993654,
            "loss_sequences_lower_95": 5.645348320937738,
            "loss_sequences_upper_95": 5.955446494497903,
            "loss_tokens_lower_95": 5.34996095461266,
            "loss_tokens_upper_95": 5.5391950850917535,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.035781762614754,
            "data_time": 0.031266477372911244,
            "batch_time": 0.046274532506495346,
            "samples_per_second": 1951492.8490868509,
            "samples_per_second_per_gpu": 243936.60613585636,
            "loss_sequences_lower_95": 4.973215906368279,
            "loss_sequences_upper_95": 5.220159097043624,
            "loss_tokens_lower_95": 4.756868750457005,
            "loss_tokens_upper_95": 4.903082544238081,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.4494068869730325,
            "data_time": 0.03222820588520595,
            "batch_time": 0.046656773203895205,
            "samples_per_second": 1958718.3433628245,
            "samples_per_second_per_gpu": 244839.79292035307,
            "loss_sequences_lower_95": 4.4344756149664155,
            "loss_sequences_upper_95": 4.702056996415301,
            "loss_tokens_lower_95": 4.156005959444696,
            "loss_tokens_upper_95": 4.277948159188517,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/checkpoints/epoch_7.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-1.0/params.txt",
    "uuid": "3925c744-feb7-4dcd-8000-8e2491b87486",
    "creation_date": "2023_12_13-16_17_51"
}