{
    "name": "rw_original-d=512_l=8_h=4-16.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 25252495360,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 16.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "5050499072",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=512_l=8_h=4-16.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.6146242678165437,
            "data_time": 0.02825441211462021,
            "batch_time": 0.32802971452474594,
            "samples_per_second": 1749909.141221806,
            "samples_per_second_per_gpu": 218738.64265272574,
            "loss_sequences_lower_95": 3.5353090794881186,
            "loss_sequences_upper_95": 3.6955061403910316,
            "loss_tokens_lower_95": 3.600191618601481,
            "loss_tokens_upper_95": 3.62901273727417,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4712117426656195,
            "data_time": 0.0014087203261212956,
            "batch_time": 0.015247678486338234,
            "samples_per_second": 2247465.788510077,
            "samples_per_second_per_gpu": 280933.22356375965,
            "loss_sequences_lower_95": 3.4688061219837882,
            "loss_sequences_upper_95": 3.4735762424410876,
            "loss_tokens_lower_95": 3.460478151041667,
            "loss_tokens_upper_95": 3.482014614583333,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.0460060625660175,
            "data_time": 0.00850074863433838,
            "batch_time": 0.022470281600952147,
            "samples_per_second": 2154332.949944828,
            "samples_per_second_per_gpu": 269291.6187431035,
            "loss_sequences_lower_95": 2.9840774972098214,
            "loss_sequences_upper_95": 3.1245991858657525,
            "loss_tokens_lower_95": 3.0331694791666663,
            "loss_tokens_upper_95": 3.058999625,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.612613331362144,
            "data_time": 0.001510175043030789,
            "batch_time": 0.015157610177993774,
            "samples_per_second": 2287077.3966268892,
            "samples_per_second_per_gpu": 285884.67457836115,
            "loss_sequences_lower_95": 3.5673914807506444,
            "loss_sequences_upper_95": 3.6602023195876288,
            "loss_tokens_lower_95": 3.599731541666667,
            "loss_tokens_upper_95": 3.625542520833333,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5474817141010417,
            "data_time": 0.008861939745595255,
            "batch_time": 0.02241486287211992,
            "samples_per_second": 2227811.922524662,
            "samples_per_second_per_gpu": 278476.4903155828,
            "loss_sequences_lower_95": 3.4826394386058426,
            "loss_sequences_upper_95": 3.632550372028545,
            "loss_tokens_lower_95": 3.5358801562499997,
            "loss_tokens_upper_95": 3.5587942187499997,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6871093460526456,
            "data_time": 0.0035124955617863197,
            "batch_time": 0.01704990248317304,
            "samples_per_second": 2297402.178857116,
            "samples_per_second_per_gpu": 287175.2723571395,
            "loss_sequences_lower_95": 3.6349059768147733,
            "loss_sequences_upper_95": 3.744770038119795,
            "loss_tokens_lower_95": 3.67413625,
            "loss_tokens_upper_95": 3.6997438437500003,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.4243315236665763,
            "data_time": 0.0014785117854107456,
            "batch_time": 0.015001406580161115,
            "samples_per_second": 2315344.092053493,
            "samples_per_second_per_gpu": 289418.0115066866,
            "loss_sequences_lower_95": 3.391673758370536,
            "loss_sequences_upper_95": 3.4573287228954084,
            "loss_tokens_lower_95": 3.4075965052083337,
            "loss_tokens_upper_95": 3.4415765364583333,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.008700873989085,
            "data_time": 0.0015305532465814697,
            "batch_time": 0.014895528791739521,
            "samples_per_second": 2337561.524125186,
            "samples_per_second_per_gpu": 292195.1905156482,
            "loss_sequences_lower_95": 3.981118271433246,
            "loss_sequences_upper_95": 4.038664451079843,
            "loss_tokens_lower_95": 3.9965593541666666,
            "loss_tokens_upper_95": 4.020666708333334,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.65684791648291,
            "data_time": 0.009873245443616594,
            "batch_time": 0.023635775323898073,
            "samples_per_second": 2194321.154226221,
            "samples_per_second_per_gpu": 274290.14427827764,
            "loss_sequences_lower_95": 3.5591829005295668,
            "loss_sequences_upper_95": 3.776686220246602,
            "loss_tokens_lower_95": 3.644592479166667,
            "loss_tokens_upper_95": 3.66887915625,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.762660169789914,
            "data_time": 0.008633969351649284,
            "batch_time": 0.022956714034080505,
            "samples_per_second": 2235414.09400325,
            "samples_per_second_per_gpu": 279426.76175040624,
            "loss_sequences_lower_95": 4.630548566132195,
            "loss_sequences_upper_95": 4.926687356700068,
            "loss_tokens_lower_95": 4.748814510416667,
            "loss_tokens_upper_95": 4.776629770833334,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7139185435219435,
            "data_time": 0.001174936362507246,
            "batch_time": 0.014582901662197724,
            "samples_per_second": 2338300.4369834065,
            "samples_per_second_per_gpu": 292287.5546229258,
            "loss_sequences_lower_95": 3.6997415729495,
            "loss_sequences_upper_95": 3.728797097694258,
            "loss_tokens_lower_95": 3.7021927916666666,
            "loss_tokens_upper_95": 3.7258489270833333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5765720961839644,
            "data_time": 0.002411200343123284,
            "batch_time": 0.015781187197250888,
            "samples_per_second": 2339714.8272358207,
            "samples_per_second_per_gpu": 292464.3534044776,
            "loss_sequences_lower_95": 3.5463816244370836,
            "loss_sequences_upper_95": 3.6083999830799667,
            "loss_tokens_lower_95": 3.564860958333333,
            "loss_tokens_upper_95": 3.58855321875,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.058083533757115,
            "data_time": 0.008557121744268969,
            "batch_time": 0.022276162158830364,
            "samples_per_second": 2188711.5251934933,
            "samples_per_second_per_gpu": 273588.94064918667,
            "loss_sequences_lower_95": 3.9579582972652285,
            "loss_sequences_upper_95": 4.179922454400672,
            "loss_tokens_lower_95": 4.044610770833334,
            "loss_tokens_upper_95": 4.071208479166667,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.311060188744073,
            "data_time": 0.008777269804145236,
            "batch_time": 0.022361687929981732,
            "samples_per_second": 2222956.565450441,
            "samples_per_second_per_gpu": 277869.57068130514,
            "loss_sequences_lower_95": 3.2156940965205734,
            "loss_sequences_upper_95": 3.423758755002148,
            "loss_tokens_lower_95": 3.29908109375,
            "loss_tokens_upper_95": 3.3231665833333333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.342503049156883,
            "data_time": 0.07282411200659615,
            "batch_time": 0.089333176612854,
            "samples_per_second": 1042735.4182086819,
            "samples_per_second_per_gpu": 130341.92727608523,
            "loss_sequences_lower_95": 4.255317792025479,
            "loss_sequences_upper_95": 4.4572903546420015,
            "loss_tokens_lower_95": 4.319267524372448,
            "loss_tokens_upper_95": 4.366501894864169,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.652727409284942,
            "data_time": 0.012349418618462303,
            "batch_time": 0.02641911804676056,
            "samples_per_second": 2134858.484514777,
            "samples_per_second_per_gpu": 266857.31056434714,
            "loss_sequences_lower_95": 3.5837315595532298,
            "loss_sequences_upper_95": 3.721670114055667,
            "loss_tokens_lower_95": 3.6389339166666668,
            "loss_tokens_upper_95": 3.6663871145833333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.4986572932442135,
            "data_time": 0.011635983983675638,
            "batch_time": 0.025396255155404408,
            "samples_per_second": 2213726.955811735,
            "samples_per_second_per_gpu": 276715.86947646685,
            "loss_sequences_lower_95": 5.402011366066643,
            "loss_sequences_upper_95": 5.629737471527664,
            "loss_tokens_lower_95": 5.487002052083334,
            "loss_tokens_upper_95": 5.510069479166667,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.972340259395662,
            "data_time": 0.03352004289627075,
            "batch_time": 0.04807599261403084,
            "samples_per_second": 1883444.5561682475,
            "samples_per_second_per_gpu": 235430.56952103093,
            "loss_sequences_lower_95": 3.8041425736224066,
            "loss_sequences_upper_95": 4.273197411709145,
            "loss_tokens_lower_95": 3.958047604170002,
            "loss_tokens_upper_95": 3.987117917420434,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.9221682877914805,
            "data_time": 0.0019072278281822292,
            "batch_time": 0.015602079638044492,
            "samples_per_second": 2260480.510262433,
            "samples_per_second_per_gpu": 282560.0637828041,
            "loss_sequences_lower_95": 4.899350441977639,
            "loss_sequences_upper_95": 4.945703570093292,
            "loss_tokens_lower_95": 4.898985827116864,
            "loss_tokens_upper_95": 4.945440728595464,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.1360636265528106,
            "data_time": 0.0021547626727705547,
            "batch_time": 0.01589447316850067,
            "samples_per_second": 2247176.0161714708,
            "samples_per_second_per_gpu": 280897.00202143384,
            "loss_sequences_lower_95": 3.1343680370568614,
            "loss_sequences_upper_95": 3.1597493748910823,
            "loss_tokens_lower_95": 3.115223420063337,
            "loss_tokens_upper_95": 3.134342729560731,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.791660573870211,
            "data_time": 0.0029607989898163905,
            "batch_time": 0.016714580771386007,
            "samples_per_second": 2241209.1489899526,
            "samples_per_second_per_gpu": 280151.1436237441,
            "loss_sequences_lower_95": 5.047190537630639,
            "loss_sequences_upper_95": 5.3359815075873875,
            "loss_tokens_lower_95": 4.236079737945259,
            "loss_tokens_upper_95": 4.453038512441922,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.793660589873791,
            "data_time": 0.004198549275702619,
            "batch_time": 0.018175113391368947,
            "samples_per_second": 2190693.7323261723,
            "samples_per_second_per_gpu": 273836.71654077154,
            "loss_sequences_lower_95": 4.910478548177084,
            "loss_sequences_upper_95": 5.112593587239583,
            "loss_tokens_lower_95": 4.48806662735849,
            "loss_tokens_upper_95": 4.632541003341196,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.1527427645503256,
            "data_time": 0.004343177398405463,
            "batch_time": 0.01835462374564929,
            "samples_per_second": 2200743.8060673885,
            "samples_per_second_per_gpu": 275092.97575842356,
            "loss_sequences_lower_95": 3.1958010311285774,
            "loss_sequences_upper_95": 3.253835913807809,
            "loss_tokens_lower_95": 3.0631269938978156,
            "loss_tokens_upper_95": 3.093410577338028,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.578143985704942,
            "data_time": 0.02230516288961683,
            "batch_time": 0.03666993762765612,
            "samples_per_second": 2041672.1489460526,
            "samples_per_second_per_gpu": 255209.01861825658,
            "loss_sequences_lower_95": 2.553751817183061,
            "loss_sequences_upper_95": 2.665135761607777,
            "loss_tokens_lower_95": 2.510242220567117,
            "loss_tokens_upper_95": 2.558082370975259,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5075364823244057,
            "data_time": 0.019857916980981827,
            "batch_time": 0.03398416005074978,
            "samples_per_second": 2030316.693800909,
            "samples_per_second_per_gpu": 253789.58672511362,
            "loss_sequences_lower_95": 3.4963237778021363,
            "loss_sequences_upper_95": 3.6882701017418684,
            "loss_tokens_lower_95": 3.379497252136933,
            "loss_tokens_upper_95": 3.473025274257363,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.937607668240865,
            "data_time": 0.01660072650664892,
            "batch_time": 0.03135433563819298,
            "samples_per_second": 1963157.7475916278,
            "samples_per_second_per_gpu": 245394.71844895347,
            "loss_sequences_lower_95": 3.9124385681152343,
            "loss_sequences_upper_95": 4.010070353190104,
            "loss_tokens_lower_95": 3.7950030766547243,
            "loss_tokens_upper_95": 4.004041862723559,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.812669043989016,
            "data_time": 0.001895759087145745,
            "batch_time": 0.015691338884421025,
            "samples_per_second": 2237507.1228986126,
            "samples_per_second_per_gpu": 279688.3903623266,
            "loss_sequences_lower_95": 5.822982642263545,
            "loss_sequences_upper_95": 5.898305343763841,
            "loss_tokens_lower_95": 5.680649868304744,
            "loss_tokens_upper_95": 5.757020611352669,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.609457209756479,
            "data_time": 0.0027243302972524757,
            "batch_time": 0.01658815585526844,
            "samples_per_second": 2219449.395178477,
            "samples_per_second_per_gpu": 277431.1743973096,
            "loss_sequences_lower_95": 5.124038706564341,
            "loss_sequences_upper_95": 5.413714126946549,
            "loss_tokens_lower_95": 3.9002345390908832,
            "loss_tokens_upper_95": 4.036587155763937,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.209073090817741,
            "data_time": 0.004943896387074445,
            "batch_time": 0.018978443902892037,
            "samples_per_second": 2168244.246205177,
            "samples_per_second_per_gpu": 271030.53077564714,
            "loss_sequences_lower_95": 4.623661851069219,
            "loss_sequences_upper_95": 4.950696248650144,
            "loss_tokens_lower_95": 3.795095971923907,
            "loss_tokens_upper_95": 3.951227738219486,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.71341992295496,
            "data_time": 0.021904311009815762,
            "batch_time": 0.03594428513731275,
            "samples_per_second": 2053281.4266388367,
            "samples_per_second_per_gpu": 256660.1783298546,
            "loss_sequences_lower_95": 5.622582338934075,
            "loss_sequences_upper_95": 5.802718530942316,
            "loss_tokens_lower_95": 5.624660014670733,
            "loss_tokens_upper_95": 5.799724575373681,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.53581570148468,
            "data_time": 0.04892840293737558,
            "batch_time": 0.06337736203120305,
            "samples_per_second": 1796748.38655212,
            "samples_per_second_per_gpu": 224593.548319015,
            "loss_sequences_lower_95": 3.3880014495849613,
            "loss_sequences_upper_95": 3.7479858245849607,
            "loss_tokens_lower_95": 3.221347291567671,
            "loss_tokens_upper_95": 3.6776706320228643,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.218276954356089,
            "data_time": 0.0032986307436703175,
            "batch_time": 0.017054881786276226,
            "samples_per_second": 2243748.3740193206,
            "samples_per_second_per_gpu": 280468.5467524151,
            "loss_sequences_lower_95": 5.166155712240916,
            "loss_sequences_upper_95": 5.2709379198119235,
            "loss_tokens_lower_95": 5.164641752994659,
            "loss_tokens_upper_95": 5.2710137108575355,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.622087152154596,
            "data_time": 0.0048739369502853215,
            "batch_time": 0.018822374390544547,
            "samples_per_second": 2202978.633230639,
            "samples_per_second_per_gpu": 275372.3291538299,
            "loss_sequences_lower_95": 5.5575941131116915,
            "loss_sequences_upper_95": 5.686519438472563,
            "loss_tokens_lower_95": 5.555346439745854,
            "loss_tokens_upper_95": 5.686775436213913,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6163787674462835,
            "data_time": 0.003352753801943159,
            "batch_time": 0.016960684990522414,
            "samples_per_second": 2254024.887457764,
            "samples_per_second_per_gpu": 281753.1109322205,
            "loss_sequences_lower_95": 3.7564499216837257,
            "loss_sequences_upper_95": 3.8837864258343817,
            "loss_tokens_lower_95": 3.4508114936387644,
            "loss_tokens_upper_95": 3.5097141531233573,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.588818754673004,
            "data_time": 0.009913683868944645,
            "batch_time": 0.023670315742492676,
            "samples_per_second": 2152611.1619394347,
            "samples_per_second_per_gpu": 269076.39524242934,
            "loss_sequences_lower_95": 5.758817492675782,
            "loss_sequences_upper_95": 6.301506396484375,
            "loss_tokens_lower_95": 4.9650140506080405,
            "loss_tokens_upper_95": 5.325057512809708,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9927085041999817,
            "data_time": 0.14564581215381622,
            "batch_time": 0.1625470072031021,
            "samples_per_second": 817828.0927134086,
            "samples_per_second_per_gpu": 102228.51158917608,
            "loss_sequences_lower_95": 3.7726321637630464,
            "loss_sequences_upper_95": 4.257449209690094,
            "loss_tokens_lower_95": 3.563639340455505,
            "loss_tokens_upper_95": 4.331205995055451,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.130162390484207,
            "data_time": 0.026260545913209307,
            "batch_time": 0.04003744429730354,
            "samples_per_second": 1897754.5623752272,
            "samples_per_second_per_gpu": 237219.3202969034,
            "loss_sequences_lower_95": 4.362088302086139,
            "loss_sequences_upper_95": 4.8720039630758345,
            "loss_tokens_lower_95": 3.329980055885358,
            "loss_tokens_upper_95": 3.689661318876832,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.580535960176617,
            "data_time": 0.0029666595574882296,
            "batch_time": 0.01661557849082682,
            "samples_per_second": 2244556.182207723,
            "samples_per_second_per_gpu": 280569.5227759654,
            "loss_sequences_lower_95": 2.5513360753023973,
            "loss_sequences_upper_95": 2.60966140393765,
            "loss_tokens_lower_95": 2.550854854825388,
            "loss_tokens_upper_95": 2.60984827444094,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.1485000529992324,
            "data_time": 0.0026811989971866983,
            "batch_time": 0.016417620392052943,
            "samples_per_second": 2251686.4974601856,
            "samples_per_second_per_gpu": 281460.8121825232,
            "loss_sequences_lower_95": 3.1220362522059357,
            "loss_sequences_upper_95": 3.280865733174243,
            "loss_tokens_lower_95": 2.9727709284499753,
            "loss_tokens_upper_95": 3.129522716203746,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.316852583116664,
            "data_time": 0.017864118019739788,
            "batch_time": 0.03170014090008206,
            "samples_per_second": 2027384.8756187952,
            "samples_per_second_per_gpu": 253423.1094523494,
            "loss_sequences_lower_95": 3.1647083827427456,
            "loss_sequences_upper_95": 3.566213184398609,
            "loss_tokens_lower_95": 3.0703910665309175,
            "loss_tokens_upper_95": 3.367681258040154,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7156226751648167,
            "data_time": 0.004501324146986008,
            "batch_time": 0.01826729141175747,
            "samples_per_second": 2215295.1087332545,
            "samples_per_second_per_gpu": 276911.8885916568,
            "loss_sequences_lower_95": 3.746691056321527,
            "loss_sequences_upper_95": 3.8934473003095404,
            "loss_tokens_lower_95": 3.573650820384729,
            "loss_tokens_upper_95": 3.7176934772388215,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.956055207223427,
            "data_time": 0.029438220319293794,
            "batch_time": 0.043523269040243964,
            "samples_per_second": 1967503.1897757142,
            "samples_per_second_per_gpu": 245937.89872196427,
            "loss_sequences_lower_95": 2.788581801623833,
            "loss_sequences_upper_95": 3.226860409248166,
            "loss_tokens_lower_95": 2.6848293086638457,
            "loss_tokens_upper_95": 3.0432082208879234,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.127794874074531,
            "data_time": 0.0021836106128517363,
            "batch_time": 0.01588602062229914,
            "samples_per_second": 2250662.990223612,
            "samples_per_second_per_gpu": 281332.8737779515,
            "loss_sequences_lower_95": 5.116446316607072,
            "loss_sequences_upper_95": 5.138742133192263,
            "loss_tokens_lower_95": 5.1166995020097765,
            "loss_tokens_upper_95": 5.138871250812663,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.2367082018296696,
            "data_time": 0.04349770112471147,
            "batch_time": 0.05903794982216575,
            "samples_per_second": 1736238.0080502909,
            "samples_per_second_per_gpu": 217029.75100628636,
            "loss_sequences_lower_95": 1.183341811467143,
            "loss_sequences_upper_95": 1.3500936674840243,
            "loss_tokens_lower_95": 1.056149279497785,
            "loss_tokens_upper_95": 1.2998480087170947,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.569257766895574,
            "data_time": 0.00163191500723312,
            "batch_time": 0.015255013547918922,
            "samples_per_second": 2267742.2412911737,
            "samples_per_second_per_gpu": 283467.7801613967,
            "loss_sequences_lower_95": 4.902764132517689,
            "loss_sequences_upper_95": 4.944582463230477,
            "loss_tokens_lower_95": 4.055995635880077,
            "loss_tokens_upper_95": 4.097778276112186,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.052488733053208,
            "data_time": 0.006003427127050975,
            "batch_time": 0.020097685711724416,
            "samples_per_second": 2172206.682849134,
            "samples_per_second_per_gpu": 271525.83535614173,
            "loss_sequences_lower_95": 5.0348695922851565,
            "loss_sequences_upper_95": 5.246140869140625,
            "loss_tokens_lower_95": 4.848861111362867,
            "loss_tokens_upper_95": 5.046319178841599,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.2730868515761005,
            "data_time": 0.021783054885217697,
            "batch_time": 0.036420373593346544,
            "samples_per_second": 1963057.4261802107,
            "samples_per_second_per_gpu": 245382.17827252633,
            "loss_sequences_lower_95": 5.0943698783542795,
            "loss_sequences_upper_95": 5.456366855787194,
            "loss_tokens_lower_95": 5.0906724481997285,
            "loss_tokens_upper_95": 5.450239974312161,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.742728705839677,
            "data_time": 0.004185540848467724,
            "batch_time": 0.018170933407473278,
            "samples_per_second": 2192018.670242101,
            "samples_per_second_per_gpu": 274002.3337802626,
            "loss_sequences_lower_95": 5.675851967551491,
            "loss_sequences_upper_95": 5.809686667702415,
            "loss_tokens_lower_95": 5.676295073538116,
            "loss_tokens_upper_95": 5.8076623905066285,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.5637489229043324,
            "data_time": 0.004017486217174124,
            "batch_time": 0.017567460207228967,
            "samples_per_second": 2268642.8382713986,
            "samples_per_second_per_gpu": 283580.3547839248,
            "loss_sequences_lower_95": 1.6120879801432293,
            "loss_sequences_upper_95": 1.6718843587239585,
            "loss_tokens_lower_95": 1.468497996854992,
            "loss_tokens_upper_95": 1.5455003720238096,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.040113754499526,
            "data_time": 0.02015910191195352,
            "batch_time": 0.03456287511757442,
            "samples_per_second": 1921368.1916803464,
            "samples_per_second_per_gpu": 240171.0239600433,
            "loss_sequences_lower_95": 5.7118671671549475,
            "loss_sequences_upper_95": 6.365664861769903,
            "loss_tokens_lower_95": 5.711415201822917,
            "loss_tokens_upper_95": 6.372850472586496,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.3793742693960667,
            "data_time": 0.13736529648303986,
            "batch_time": 0.15700174868106842,
            "samples_per_second": 750157.9447968957,
            "samples_per_second_per_gpu": 93769.74309961197,
            "loss_sequences_lower_95": 2.124809604883194,
            "loss_sequences_upper_95": 3.122039985656738,
            "loss_tokens_lower_95": 1.7937080933875644,
            "loss_tokens_upper_95": 2.2967202098099224,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.622598586559295,
            "data_time": 0.005374760854811896,
            "batch_time": 0.01893203882944016,
            "samples_per_second": 2246822.272123526,
            "samples_per_second_per_gpu": 280852.78401544073,
            "loss_sequences_lower_95": 7.571807446289062,
            "loss_sequences_upper_95": 7.8702001953125,
            "loss_tokens_lower_95": 7.354047198868443,
            "loss_tokens_upper_95": 7.621590148615958,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.226306206226349,
            "data_time": 0.005163850765379649,
            "batch_time": 0.018765321326634242,
            "samples_per_second": 2248456.960079084,
            "samples_per_second_per_gpu": 281057.1200098855,
            "loss_sequences_lower_95": 7.328224279785156,
            "loss_sequences_upper_95": 7.5449470703124994,
            "loss_tokens_lower_95": 6.984024620915748,
            "loss_tokens_upper_95": 7.177703860478218,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.78767259849065,
            "data_time": 0.0038524721777160034,
            "batch_time": 0.0176631830209075,
            "samples_per_second": 2228217.0720399437,
            "samples_per_second_per_gpu": 278527.13400499296,
            "loss_sequences_lower_95": 5.7635851858447875,
            "loss_sequences_upper_95": 5.811567289510351,
            "loss_tokens_lower_95": 5.763353812882375,
            "loss_tokens_upper_95": 5.811554063521413,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.743636794170842,
            "data_time": 0.007881602491857062,
            "batch_time": 0.02175289050329848,
            "samples_per_second": 2166819.9166722037,
            "samples_per_second_per_gpu": 270852.48958402546,
            "loss_sequences_lower_95": 4.637432964459725,
            "loss_sequences_upper_95": 4.848067520071285,
            "loss_tokens_lower_95": 4.63284887732815,
            "loss_tokens_upper_95": 4.84675783312632,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.758208999633789,
            "data_time": 0.0053074005104246594,
            "batch_time": 0.01899673919829111,
            "samples_per_second": 2228492.591494409,
            "samples_per_second_per_gpu": 278561.5739368011,
            "loss_sequences_lower_95": 6.673356091308594,
            "loss_sequences_upper_95": 6.843714794921874,
            "loss_tokens_lower_95": 6.674605261230469,
            "loss_tokens_upper_95": 6.842909448242187,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.950456115087475,
            "data_time": 0.002080276557763718,
            "batch_time": 0.015668965332449495,
            "samples_per_second": 2271080.427155545,
            "samples_per_second_per_gpu": 283885.0533944431,
            "loss_sequences_lower_95": 3.4145990735128904,
            "loss_sequences_upper_95": 3.495185408289972,
            "loss_tokens_lower_95": 2.3827034589216396,
            "loss_tokens_upper_95": 2.4379640286079534,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.497525927735798,
            "data_time": 0.018051365443638393,
            "batch_time": 0.03210030283246722,
            "samples_per_second": 2025490.2072109617,
            "samples_per_second_per_gpu": 253186.27590137022,
            "loss_sequences_lower_95": 5.294135762684381,
            "loss_sequences_upper_95": 5.698430906836666,
            "loss_tokens_lower_95": 5.29919382351548,
            "loss_tokens_upper_95": 5.699572013741109,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.407420277128033,
            "data_time": 0.010084416717290878,
            "batch_time": 0.024066098034381866,
            "samples_per_second": 2169500.71999678,
            "samples_per_second_per_gpu": 271187.5899995975,
            "loss_sequences_lower_95": 5.274015933766085,
            "loss_sequences_upper_95": 5.5375553026386335,
            "loss_tokens_lower_95": 5.275364643171722,
            "loss_tokens_upper_95": 5.537687318091299,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.964317832123915,
            "data_time": 0.0022570214053337857,
            "batch_time": 0.015905571508538757,
            "samples_per_second": 2257071.5325901015,
            "samples_per_second_per_gpu": 282133.9415737627,
            "loss_sequences_lower_95": 4.496040059246993,
            "loss_sequences_upper_95": 4.592656783359952,
            "loss_tokens_lower_95": 3.24418799062626,
            "loss_tokens_upper_95": 3.3194555652426954,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.821006530176395,
            "data_time": 0.026155297954877216,
            "batch_time": 0.04037037988503774,
            "samples_per_second": 1995698.1856642228,
            "samples_per_second_per_gpu": 249462.27320802785,
            "loss_sequences_lower_95": 5.723402348271122,
            "loss_sequences_upper_95": 5.913139028397818,
            "loss_tokens_lower_95": 5.724912015723173,
            "loss_tokens_upper_95": 5.913400882135623,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9315950861035502,
            "data_time": 0.0035305564656799094,
            "batch_time": 0.017184224320855333,
            "samples_per_second": 2247944.1321381098,
            "samples_per_second_per_gpu": 280993.0165172637,
            "loss_sequences_lower_95": 3.891992963971713,
            "loss_sequences_upper_95": 3.9713146188718462,
            "loss_tokens_lower_95": 3.893062135655581,
            "loss_tokens_upper_95": 3.9705642784188644,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.553590599773,
            "data_time": 0.02164169658314098,
            "batch_time": 0.03579019416462292,
            "samples_per_second": 1929994.5347568116,
            "samples_per_second_per_gpu": 241249.31684460145,
            "loss_sequences_lower_95": 5.3437210675582145,
            "loss_sequences_upper_95": 5.7632634135125915,
            "loss_tokens_lower_95": 5.3384194975917785,
            "loss_tokens_upper_95": 5.762238526575774,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.7521250466505687,
            "data_time": 0.07050136476755142,
            "batch_time": 0.08616237342357635,
            "samples_per_second": 1450009.0219745808,
            "samples_per_second_per_gpu": 181251.1277468226,
            "loss_sequences_lower_95": 2.5355268923441567,
            "loss_sequences_upper_95": 3.1634918149312337,
            "loss_tokens_lower_95": 2.246690856085883,
            "loss_tokens_upper_95": 2.9578009605407716,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.7044690370559694,
            "data_time": 0.06986195594072342,
            "batch_time": 0.08648013323545456,
            "samples_per_second": 1337256.3425396758,
            "samples_per_second_per_gpu": 167157.04281745947,
            "loss_sequences_lower_95": 2.535035228729248,
            "loss_sequences_upper_95": 3.246336574554443,
            "loss_tokens_lower_95": 2.0534548555867054,
            "loss_tokens_upper_95": 2.9417484240585496,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.485494611119311,
            "data_time": 0.0034902874619347547,
            "batch_time": 0.01724095243506743,
            "samples_per_second": 2239748.7161879768,
            "samples_per_second_per_gpu": 279968.5895234971,
            "loss_sequences_lower_95": 4.46229644640556,
            "loss_sequences_upper_95": 4.508892903166421,
            "loss_tokens_lower_95": 4.462281330541237,
            "loss_tokens_upper_95": 4.508704407906849,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 0.659838172345044,
            "data_time": 0.0015250352380442671,
            "batch_time": 0.015308562225990275,
            "samples_per_second": 2247493.3038338777,
            "samples_per_second_per_gpu": 280936.6629792347,
            "loss_sequences_lower_95": 0.768120364951083,
            "loss_sequences_upper_95": 0.7911136685892942,
            "loss_tokens_lower_95": 0.5483811918144604,
            "loss_tokens_upper_95": 0.559742132519362,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.950428880105807,
            "data_time": 0.03620719909667969,
            "batch_time": 0.0519171804189682,
            "samples_per_second": 1835577.6203580676,
            "samples_per_second_per_gpu": 229447.20254475845,
            "loss_sequences_lower_95": 4.971388665146715,
            "loss_sequences_upper_95": 5.357148646557425,
            "loss_tokens_lower_95": 4.651139838664487,
            "loss_tokens_upper_95": 4.959955923951425,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.31216912656217,
            "data_time": 0.11040099461873372,
            "batch_time": 0.12639097940354121,
            "samples_per_second": 1035389.3895256214,
            "samples_per_second_per_gpu": 129423.67369070268,
            "loss_sequences_lower_95": 7.860700741329709,
            "loss_sequences_upper_95": 9.001001925081821,
            "loss_tokens_lower_95": 7.1272171585648145,
            "loss_tokens_upper_95": 9.203000612612122,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.903444327959201,
            "data_time": 0.02916054782413301,
            "batch_time": 0.04326868908745902,
            "samples_per_second": 1961991.1148458307,
            "samples_per_second_per_gpu": 245248.88935572884,
            "loss_sequences_lower_95": 4.883595480569979,
            "loss_sequences_upper_95": 5.232627812827506,
            "loss_tokens_lower_95": 4.524163014481707,
            "loss_tokens_upper_95": 4.788017955937238,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.043111790971058,
            "data_time": 0.02938841070447649,
            "batch_time": 0.044243511699494864,
            "samples_per_second": 1880571.2250576627,
            "samples_per_second_per_gpu": 235071.40313220784,
            "loss_sequences_lower_95": 5.004668593988186,
            "loss_sequences_upper_95": 5.306400522371618,
            "loss_tokens_lower_95": 4.722506140879705,
            "loss_tokens_upper_95": 4.94379918888164,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.017881053250011,
            "data_time": 0.028140573274521602,
            "batch_time": 0.042291448229835146,
            "samples_per_second": 1977177.037527715,
            "samples_per_second_per_gpu": 247147.1296909644,
            "loss_sequences_lower_95": 5.008769858755716,
            "loss_sequences_upper_95": 5.427614760980373,
            "loss_tokens_lower_95": 4.548840914655513,
            "loss_tokens_upper_95": 4.888559253027962,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.188641635383048,
            "data_time": 0.029162247975667317,
            "batch_time": 0.04401380391347976,
            "samples_per_second": 1918300.1454487857,
            "samples_per_second_per_gpu": 239787.51818109822,
            "loss_sequences_lower_95": 5.1288459033500855,
            "loss_sequences_upper_95": 5.426580410468869,
            "loss_tokens_lower_95": 4.897982712029668,
            "loss_tokens_upper_95": 5.103635442145517,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.422714194896058,
            "data_time": 0.03016049479260857,
            "batch_time": 0.044479181737075615,
            "samples_per_second": 1979876.8832212319,
            "samples_per_second_per_gpu": 247484.61040265398,
            "loss_sequences_lower_95": 4.339072735709434,
            "loss_sequences_upper_95": 4.579128753472559,
            "loss_tokens_lower_95": 4.176517187957005,
            "loss_tokens_upper_95": 4.341184625438725,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8298155709010797,
            "data_time": 0.028811321372077578,
            "batch_time": 0.04317388648078555,
            "samples_per_second": 1976512.1539615062,
            "samples_per_second_per_gpu": 247064.01924518828,
            "loss_sequences_lower_95": 3.810930186946218,
            "loss_sequences_upper_95": 4.05297738051996,
            "loss_tokens_lower_95": 3.598764321739376,
            "loss_tokens_upper_95": 3.7252131072464585,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-16.0/params.txt",
    "uuid": "8d9843b3-dd7b-458e-aefa-c6a02a11c63a",
    "creation_date": "2023_12_14-05_01_27"
}