{
    "name": "rw_original-d=96_l=8_h=4-0.5",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 105693120,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.5
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "21138624",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=96_l=8_h=4-0.5",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 6.487451100349427,
            "data_time": 0.12547317147254944,
            "batch_time": 1.2336084842681885,
            "samples_per_second": 377370.9745179067,
            "samples_per_second_per_gpu": 47171.37181473834,
            "loss_sequences_lower_95": 6.336146456400553,
            "loss_sequences_upper_95": 6.641166025797526,
            "loss_tokens_lower_95": 6.472096036275228,
            "loss_tokens_upper_95": 6.502336311340332,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.611784724653798,
            "data_time": 0.018492243885657778,
            "batch_time": 0.06357276760157976,
            "samples_per_second": 4676753.001036364,
            "samples_per_second_per_gpu": 584594.1251295455,
            "loss_sequences_lower_95": 5.609364108327745,
            "loss_sequences_upper_95": 5.6141918029918,
            "loss_tokens_lower_95": 5.600216177083333,
            "loss_tokens_upper_95": 5.6232231979166665,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.089032173156738,
            "data_time": 0.08981072902679443,
            "batch_time": 0.13473966717720032,
            "samples_per_second": 4144953.805446795,
            "samples_per_second_per_gpu": 518119.2256808494,
            "loss_sequences_lower_95": 6.043943269690688,
            "loss_sequences_upper_95": 6.147901785714286,
            "loss_tokens_lower_95": 6.076213604166667,
            "loss_tokens_upper_95": 6.1018645625,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.797846044658385,
            "data_time": 0.012892242324979682,
            "batch_time": 0.05685038001913773,
            "samples_per_second": 5362965.870166878,
            "samples_per_second_per_gpu": 670370.7337708598,
            "loss_sequences_lower_95": 5.761549814755155,
            "loss_sequences_upper_95": 5.8347200386597935,
            "loss_tokens_lower_95": 5.784984072916667,
            "loss_tokens_upper_95": 5.810781697916666,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.639883836757869,
            "data_time": 0.08938388526439667,
            "batch_time": 0.13346750289201736,
            "samples_per_second": 4038332.850597715,
            "samples_per_second_per_gpu": 504791.6063247144,
            "loss_sequences_lower_95": 5.5824204827520365,
            "loss_sequences_upper_95": 5.708540347345978,
            "loss_tokens_lower_95": 5.627816354166667,
            "loss_tokens_upper_95": 5.652038625,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.306277184645227,
            "data_time": 0.03279385964075724,
            "batch_time": 0.0759055291612943,
            "samples_per_second": 4974655.524062673,
            "samples_per_second_per_gpu": 621831.9405078341,
            "loss_sequences_lower_95": 6.246132702960359,
            "loss_sequences_upper_95": 6.369340402070232,
            "loss_tokens_lower_95": 6.2928126875,
            "loss_tokens_upper_95": 6.319605229166667,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.08889130611809,
            "data_time": 0.012805221229791641,
            "batch_time": 0.055264579504728316,
            "samples_per_second": 5209482.425188562,
            "samples_per_second_per_gpu": 651185.3031485702,
            "loss_sequences_lower_95": 8.058730110012755,
            "loss_sequences_upper_95": 8.118851682079082,
            "loss_tokens_lower_95": 8.072871750000001,
            "loss_tokens_upper_95": 8.105219354166666,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.650169985306825,
            "data_time": 0.01484100363756481,
            "batch_time": 0.05815387634854568,
            "samples_per_second": 5234008.047154715,
            "samples_per_second_per_gpu": 654251.0058943394,
            "loss_sequences_lower_95": 5.625981583360602,
            "loss_sequences_upper_95": 5.675855284685864,
            "loss_tokens_lower_95": 5.637817239583333,
            "loss_tokens_upper_95": 5.6626311875,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.7651170511555865,
            "data_time": 0.08974950760602951,
            "batch_time": 0.13503491133451462,
            "samples_per_second": 4003699.1800404144,
            "samples_per_second_per_gpu": 500462.3975050518,
            "loss_sequences_lower_95": 5.683277185951791,
            "loss_sequences_upper_95": 5.862206708706491,
            "loss_tokens_lower_95": 5.752837395833334,
            "loss_tokens_upper_95": 5.777627104166666,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.790377728081503,
            "data_time": 0.09329163283109665,
            "batch_time": 0.1398473009467125,
            "samples_per_second": 4098469.161062435,
            "samples_per_second_per_gpu": 512308.64513280435,
            "loss_sequences_lower_95": 6.687830856383554,
            "loss_sequences_upper_95": 6.913806345340292,
            "loss_tokens_lower_95": 6.77734478125,
            "loss_tokens_upper_95": 6.80292809375,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.245349536452438,
            "data_time": 0.009613178413489768,
            "batch_time": 0.05305987185445325,
            "samples_per_second": 5402125.767861942,
            "samples_per_second_per_gpu": 675265.7209827427,
            "loss_sequences_lower_95": 6.23324868096478,
            "loss_sequences_upper_95": 6.257646255524531,
            "loss_tokens_lower_95": 6.23266703125,
            "loss_tokens_upper_95": 6.258155927083333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.051198449744527,
            "data_time": 0.02382594794034958,
            "batch_time": 0.07188103049993515,
            "samples_per_second": 4973153.161672744,
            "samples_per_second_per_gpu": 621644.145209093,
            "loss_sequences_lower_95": 6.026853740140827,
            "loss_sequences_upper_95": 6.07636937368219,
            "loss_tokens_lower_95": 6.038482614583333,
            "loss_tokens_upper_95": 6.063712208333333,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.806349964219223,
            "data_time": 0.09525872766971588,
            "batch_time": 0.1405317783355713,
            "samples_per_second": 4142774.778061903,
            "samples_per_second_per_gpu": 517846.8472577379,
            "loss_sequences_lower_95": 5.713013883826698,
            "loss_sequences_upper_95": 5.916319934156314,
            "loss_tokens_lower_95": 5.793294895833333,
            "loss_tokens_upper_95": 5.81947425,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.248774358551274,
            "data_time": 0.08757729828357697,
            "batch_time": 0.13279429078102112,
            "samples_per_second": 4043902.025592075,
            "samples_per_second_per_gpu": 505487.7531990094,
            "loss_sequences_lower_95": 6.171615942432536,
            "loss_sequences_upper_95": 6.3380091401078795,
            "loss_tokens_lower_95": 6.236495208333333,
            "loss_tokens_upper_95": 6.261534197916667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.291923674670133,
            "data_time": 0.12957054376602173,
            "batch_time": 0.14985701441764832,
            "samples_per_second": 1067627.1405256852,
            "samples_per_second_per_gpu": 133453.39256571064,
            "loss_sequences_lower_95": 7.217700472745029,
            "loss_sequences_upper_95": 7.384405933726918,
            "loss_tokens_lower_95": 7.266841454939408,
            "loss_tokens_upper_95": 7.316898033835671,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.821634145241784,
            "data_time": 0.08967801928520203,
            "batch_time": 0.12491628527641296,
            "samples_per_second": 3314929.0209031,
            "samples_per_second_per_gpu": 414366.1276128875,
            "loss_sequences_lower_95": 6.693610526660441,
            "loss_sequences_upper_95": 6.949064293686225,
            "loss_tokens_lower_95": 6.807639104166666,
            "loss_tokens_upper_95": 6.8355308854166665,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.835934329473249,
            "data_time": 0.09035876393318176,
            "batch_time": 0.12733366340398788,
            "samples_per_second": 3643457.5874222955,
            "samples_per_second_per_gpu": 455432.19842778693,
            "loss_sequences_lower_95": 6.751157848187046,
            "loss_sequences_upper_95": 6.9460617991424805,
            "loss_tokens_lower_95": 6.824796708333333,
            "loss_tokens_upper_95": 6.847265385416667,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.004881374171523,
            "data_time": 0.14989091455936432,
            "batch_time": 0.17896445095539093,
            "samples_per_second": 2293346.972475899,
            "samples_per_second_per_gpu": 286668.37155948736,
            "loss_sequences_lower_95": 6.875068364377881,
            "loss_sequences_upper_95": 7.233452568679559,
            "loss_tokens_lower_95": 6.991221068335361,
            "loss_tokens_upper_95": 7.018357448890561,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.476516577036458,
            "data_time": 0.025345858660611238,
            "batch_time": 0.07106383388692683,
            "samples_per_second": 4620840.901062724,
            "samples_per_second_per_gpu": 577605.1126328405,
            "loss_sequences_lower_95": 5.463251150566158,
            "loss_sequences_upper_95": 5.489348388984832,
            "loss_tokens_lower_95": 5.4635169129887124,
            "loss_tokens_upper_95": 5.489602648527631,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.2434409714106,
            "data_time": 0.030835673958063126,
            "batch_time": 0.07481162212789058,
            "samples_per_second": 4496277.643509539,
            "samples_per_second_per_gpu": 562034.7054386924,
            "loss_sequences_lower_95": 5.245463243829042,
            "loss_sequences_upper_95": 5.271329556487752,
            "loss_tokens_lower_95": 5.231246736998656,
            "loss_tokens_upper_95": 5.2532836097725175,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.810977435753939,
            "data_time": 0.05276964770423041,
            "batch_time": 0.09462864365842608,
            "samples_per_second": 4273393.653866004,
            "samples_per_second_per_gpu": 534174.2067332505,
            "loss_sequences_lower_95": 8.23857082822833,
            "loss_sequences_upper_95": 8.460876770451996,
            "loss_tokens_lower_95": 7.6860272799953835,
            "loss_tokens_upper_95": 7.859055924432631,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.321764642397563,
            "data_time": 0.04006527612606684,
            "batch_time": 0.08381365363796552,
            "samples_per_second": 4599896.0324845435,
            "samples_per_second_per_gpu": 574987.0040605679,
            "loss_sequences_lower_95": 7.674195719401041,
            "loss_sequences_upper_95": 7.825155794270834,
            "loss_tokens_lower_95": 7.231376203812894,
            "loss_tokens_upper_95": 7.346779505699685,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.192069644002502,
            "data_time": 0.06950262188911438,
            "batch_time": 0.10964807371298473,
            "samples_per_second": 3971449.837920127,
            "samples_per_second_per_gpu": 496431.2297400159,
            "loss_sequences_lower_95": 6.239902314134761,
            "loss_sequences_upper_95": 6.300426163286581,
            "loss_tokens_lower_95": 6.172537674837867,
            "loss_tokens_upper_95": 6.2053949556149774,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.799901329387318,
            "data_time": 0.3419450372457504,
            "batch_time": 0.38414466381073,
            "samples_per_second": 2745623.0856670584,
            "samples_per_second_per_gpu": 343202.8857083823,
            "loss_sequences_lower_95": 6.727536343661222,
            "loss_sequences_upper_95": 6.904811234907671,
            "loss_tokens_lower_95": 6.764263847248816,
            "loss_tokens_upper_95": 6.832047074977325,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.875348474541489,
            "data_time": 0.3788478374481201,
            "batch_time": 0.42439456284046173,
            "samples_per_second": 2723379.891469516,
            "samples_per_second_per_gpu": 340422.4864336895,
            "loss_sequences_lower_95": 5.843854307836416,
            "loss_sequences_upper_95": 6.021975184849331,
            "loss_tokens_lower_95": 5.836844744414353,
            "loss_tokens_upper_95": 5.935005091084124,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.1419950151443485,
            "data_time": 0.18541871011257172,
            "batch_time": 0.21661200374364853,
            "samples_per_second": 2399876.6081058066,
            "samples_per_second_per_gpu": 299984.57601322583,
            "loss_sequences_lower_95": 5.063886474609375,
            "loss_sequences_upper_95": 5.195358408610026,
            "loss_tokens_lower_95": 5.048876296751873,
            "loss_tokens_upper_95": 5.245468399934333,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.631174867321295,
            "data_time": 0.024004788883030415,
            "batch_time": 0.06835383735597134,
            "samples_per_second": 4506773.287978238,
            "samples_per_second_per_gpu": 563346.6609972798,
            "loss_sequences_lower_95": 9.691757276185966,
            "loss_sequences_upper_95": 9.755942436395847,
            "loss_tokens_lower_95": 9.584523642433943,
            "loss_tokens_upper_95": 9.65030463859955,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.678549864075401,
            "data_time": 0.052389463782310484,
            "batch_time": 0.09493376761674881,
            "samples_per_second": 4273582.437133306,
            "samples_per_second_per_gpu": 534197.8046416632,
            "loss_sequences_lower_95": 7.706394162161985,
            "loss_sequences_upper_95": 7.955165752577862,
            "loss_tokens_lower_95": 6.5366256616854,
            "loss_tokens_upper_95": 6.687183027414708,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.373194848311233,
            "data_time": 0.07850989699363708,
            "batch_time": 0.12079399824142456,
            "samples_per_second": 4321628.017446916,
            "samples_per_second_per_gpu": 540203.5021808645,
            "loss_sequences_lower_95": 6.993879902810367,
            "loss_sequences_upper_95": 7.272760426387852,
            "loss_tokens_lower_95": 6.273645669469991,
            "loss_tokens_upper_95": 6.447557333262746,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.49625634711627,
            "data_time": 0.3251035511493683,
            "batch_time": 0.3691157251596451,
            "samples_per_second": 2337466.909493492,
            "samples_per_second_per_gpu": 292183.3636866865,
            "loss_sequences_lower_95": 5.469912719726563,
            "loss_sequences_upper_95": 5.5229058531321344,
            "loss_tokens_lower_95": 5.469978283747146,
            "loss_tokens_upper_95": 5.522527532708155,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.482360033988953,
            "data_time": 0.28795821964740753,
            "batch_time": 0.3127041310071945,
            "samples_per_second": 1787376.7899651034,
            "samples_per_second_per_gpu": 223422.09874563792,
            "loss_sequences_lower_95": 5.394549545288086,
            "loss_sequences_upper_95": 5.8796252899169925,
            "loss_tokens_lower_95": 5.223522228588998,
            "loss_tokens_upper_95": 5.731870090982781,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.287253707799843,
            "data_time": 0.05087682045996189,
            "batch_time": 0.09480075351893902,
            "samples_per_second": 4433135.920885433,
            "samples_per_second_per_gpu": 554141.9901106792,
            "loss_sequences_lower_95": 5.258476739920516,
            "loss_sequences_upper_95": 5.3165367915174,
            "loss_tokens_lower_95": 5.258001852670004,
            "loss_tokens_upper_95": 5.316430838984135,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.654907263272322,
            "data_time": 0.07832626402378082,
            "batch_time": 0.12208742797374725,
            "samples_per_second": 4234147.253039347,
            "samples_per_second_per_gpu": 529268.4066299184,
            "loss_sequences_lower_95": 5.62931093158144,
            "loss_sequences_upper_95": 5.680107787786011,
            "loss_tokens_lower_95": 5.628823050234183,
            "loss_tokens_upper_95": 5.680829572248029,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.772171666754472,
            "data_time": 0.0518089234828949,
            "batch_time": 0.09325383603572845,
            "samples_per_second": 4209002.763766018,
            "samples_per_second_per_gpu": 526125.3454707522,
            "loss_sequences_lower_95": 5.926275482011698,
            "loss_sequences_upper_95": 6.0416401096215315,
            "loss_tokens_lower_95": 5.750876351723448,
            "loss_tokens_upper_95": 5.813559417163133,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.2297846183776855,
            "data_time": 0.16793502867221832,
            "batch_time": 0.21345677971839905,
            "samples_per_second": 3916818.3878148375,
            "samples_per_second_per_gpu": 489602.2984768547,
            "loss_sequences_lower_95": 7.904753588867187,
            "loss_sequences_upper_95": 8.371110400390625,
            "loss_tokens_lower_95": 7.004138660241562,
            "loss_tokens_upper_95": 7.33357586358889,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.604933649301529,
            "data_time": 0.14272423088550568,
            "batch_time": 0.1602158546447754,
            "samples_per_second": 839452.5030344793,
            "samples_per_second_per_gpu": 104931.56287930992,
            "loss_sequences_lower_95": 5.283280789852142,
            "loss_sequences_upper_95": 6.101215076446533,
            "loss_tokens_lower_95": 4.9996713660229215,
            "loss_tokens_upper_95": 5.997289056887571,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.002949446097188,
            "data_time": 0.30924755334854126,
            "batch_time": 0.3475869596004486,
            "samples_per_second": 2242261.3065996915,
            "samples_per_second_per_gpu": 280282.66332496144,
            "loss_sequences_lower_95": 6.265615862265401,
            "loss_sequences_upper_95": 6.741768707626168,
            "loss_tokens_lower_95": 5.764049453477311,
            "loss_tokens_upper_95": 6.168846541515466,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.3804512073921815,
            "data_time": 0.05722416109508938,
            "batch_time": 0.1017055826054679,
            "samples_per_second": 4286123.3965714825,
            "samples_per_second_per_gpu": 535765.4245714353,
            "loss_sequences_lower_95": 5.358699810916339,
            "loss_sequences_upper_95": 5.402130540253659,
            "loss_tokens_lower_95": 5.359332624696237,
            "loss_tokens_upper_95": 5.402737542748607,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.676286910775332,
            "data_time": 0.03365057139169602,
            "batch_time": 0.07689818456059411,
            "samples_per_second": 4335256.973147355,
            "samples_per_second_per_gpu": 541907.1216434194,
            "loss_sequences_lower_95": 8.715469314749903,
            "loss_sequences_upper_95": 8.864074041365466,
            "loss_tokens_lower_95": 8.589745719929137,
            "loss_tokens_upper_95": 8.736526089848024,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.829628844837566,
            "data_time": 0.1776033714413643,
            "batch_time": 0.20711395889520645,
            "samples_per_second": 1951511.7242449534,
            "samples_per_second_per_gpu": 243938.96553061917,
            "loss_sequences_lower_95": 4.698947825449291,
            "loss_sequences_upper_95": 5.0694350357894065,
            "loss_tokens_lower_95": 4.61946762889541,
            "loss_tokens_upper_95": 4.958901884377168,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.259445422262118,
            "data_time": 0.08033882975578308,
            "batch_time": 0.12516289949417114,
            "samples_per_second": 4476227.612688059,
            "samples_per_second_per_gpu": 559528.4515860074,
            "loss_sequences_lower_95": 5.3110522576552635,
            "loss_sequences_upper_95": 5.445471229944628,
            "loss_tokens_lower_95": 5.176321092568079,
            "loss_tokens_upper_95": 5.334577863664061,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.018348455429077,
            "data_time": 0.3070976138114929,
            "batch_time": 0.34149643778800964,
            "samples_per_second": 2190250.866966838,
            "samples_per_second_per_gpu": 273781.35837085475,
            "loss_sequences_lower_95": 6.7731469689345944,
            "loss_sequences_upper_95": 7.310405935892245,
            "loss_tokens_lower_95": 6.861390593934798,
            "loss_tokens_upper_95": 7.224540912074327,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.13905925711624,
            "data_time": 0.027236646245639526,
            "batch_time": 0.07162329433106329,
            "samples_per_second": 4437775.184673466,
            "samples_per_second_per_gpu": 554721.8980841832,
            "loss_sequences_lower_95": 5.131219896323015,
            "loss_sequences_upper_95": 5.146968065488098,
            "loss_tokens_lower_95": 5.131035347694539,
            "loss_tokens_upper_95": 5.1469247072070665,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.706232320915148,
            "data_time": 0.30927523970603943,
            "batch_time": 0.33602823317050934,
            "samples_per_second": 1646563.1837288989,
            "samples_per_second_per_gpu": 205820.39796611236,
            "loss_sequences_lower_95": 6.566081726666793,
            "loss_sequences_upper_95": 6.955270089455022,
            "loss_tokens_lower_95": 6.438539022018136,
            "loss_tokens_upper_95": 6.866528957527028,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.90180306924464,
            "data_time": 0.02744387209415436,
            "batch_time": 0.07165992081165314,
            "samples_per_second": 4405448.312766173,
            "samples_per_second_per_gpu": 550681.0390957716,
            "loss_sequences_lower_95": 7.340316451290619,
            "loss_sequences_upper_95": 7.381773249148323,
            "loss_tokens_lower_95": 6.835266997098646,
            "loss_tokens_upper_95": 6.875990316731142,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.017645496845246,
            "data_time": 0.09206660464406013,
            "batch_time": 0.1365567184984684,
            "samples_per_second": 4359641.852262575,
            "samples_per_second_per_gpu": 544955.2315328219,
            "loss_sequences_lower_95": 4.989106457519531,
            "loss_sequences_upper_95": 5.174387463378906,
            "loss_tokens_lower_95": 4.928163169394697,
            "loss_tokens_upper_95": 5.103340836928413,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.3150227463763695,
            "data_time": 0.34959952533245087,
            "batch_time": 0.39299529790878296,
            "samples_per_second": 2199283.127058963,
            "samples_per_second_per_gpu": 274910.39088237035,
            "loss_sequences_lower_95": 5.220547883406929,
            "loss_sequences_upper_95": 5.409886925738791,
            "loss_tokens_lower_95": 5.221789418096128,
            "loss_tokens_upper_95": 5.408031311035156,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 10.66298342040091,
            "data_time": 0.06529692063728969,
            "batch_time": 0.10529391715923946,
            "samples_per_second": 4028188.2062755562,
            "samples_per_second_per_gpu": 503523.52578444453,
            "loss_sequences_lower_95": 10.516290301698628,
            "loss_sequences_upper_95": 10.810659438624526,
            "loss_tokens_lower_95": 10.514634972774623,
            "loss_tokens_upper_95": 10.812303041400332,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.063939517339071,
            "data_time": 0.06541951249043147,
            "batch_time": 0.10944088051716487,
            "samples_per_second": 4457714.871886922,
            "samples_per_second_per_gpu": 557214.3589858653,
            "loss_sequences_lower_95": 5.172642586263021,
            "loss_sequences_upper_95": 5.257472672526041,
            "loss_tokens_lower_95": 5.0077906006152455,
            "loss_tokens_upper_95": 5.104468709358743,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.1722264539627805,
            "data_time": 0.32796667516231537,
            "batch_time": 0.3696947395801544,
            "samples_per_second": 2197376.964496474,
            "samples_per_second_per_gpu": 274672.1205620592,
            "loss_sequences_lower_95": 5.8361834571475075,
            "loss_sequences_upper_95": 6.5165392049153645,
            "loss_tokens_lower_95": 5.831992812383743,
            "loss_tokens_upper_95": 6.505948602585565,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.068736135959625,
            "data_time": 0.14583784341812134,
            "batch_time": 0.1635432094335556,
            "samples_per_second": 949864.3695939251,
            "samples_per_second_per_gpu": 118733.04619924063,
            "loss_sequences_lower_95": 6.864343822002411,
            "loss_sequences_upper_95": 8.309546160697938,
            "loss_tokens_lower_95": 6.7235832544700385,
            "loss_tokens_upper_95": 7.304267490033022,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.9314991145133975,
            "data_time": 0.08905041590332985,
            "batch_time": 0.13383584469556808,
            "samples_per_second": 4359170.199486377,
            "samples_per_second_per_gpu": 544896.2749357971,
            "loss_sequences_lower_95": 8.05684306640625,
            "loss_sequences_upper_95": 8.360581298828125,
            "loss_tokens_lower_95": 7.779555721896151,
            "loss_tokens_upper_95": 8.052738712523794,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.581867668628693,
            "data_time": 0.09954428300261497,
            "batch_time": 0.1444518007338047,
            "samples_per_second": 4248126.483573565,
            "samples_per_second_per_gpu": 531015.8104466957,
            "loss_sequences_lower_95": 7.899494689941406,
            "loss_sequences_upper_95": 8.139334326171875,
            "loss_tokens_lower_95": 7.46427468350964,
            "loss_tokens_upper_95": 7.662169719422884,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.218296395665321,
            "data_time": 0.04041569431622823,
            "batch_time": 0.08427910134196281,
            "samples_per_second": 4521140.498782877,
            "samples_per_second_per_gpu": 565142.5623478596,
            "loss_sequences_lower_95": 5.203016630371585,
            "loss_sequences_upper_95": 5.233458527358154,
            "loss_tokens_lower_95": 5.2030314605027455,
            "loss_tokens_upper_95": 5.2332560535089465,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.35691011592906,
            "data_time": 0.11684451500574748,
            "batch_time": 0.15732068320115408,
            "samples_per_second": 3980038.647895789,
            "samples_per_second_per_gpu": 497504.8309869736,
            "loss_sequences_lower_95": 5.296676002889185,
            "loss_sequences_upper_95": 5.4149027975290425,
            "loss_tokens_lower_95": 5.297031928793443,
            "loss_tokens_upper_95": 5.41531747094314,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.9031745681762695,
            "data_time": 0.09109890460968018,
            "batch_time": 0.1351291425526142,
            "samples_per_second": 4334504.963803393,
            "samples_per_second_per_gpu": 541813.1204754241,
            "loss_sequences_lower_95": 7.8101129150390625,
            "loss_sequences_upper_95": 7.9979243652343746,
            "loss_tokens_lower_95": 7.807954931640625,
            "loss_tokens_upper_95": 7.995470129394532,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.285882098243952,
            "data_time": 0.031507277417750584,
            "batch_time": 0.07535022781008766,
            "samples_per_second": 4373318.101639491,
            "samples_per_second_per_gpu": 546664.7627049363,
            "loss_sequences_lower_95": 7.8111945659886475,
            "loss_sequences_upper_95": 7.880673356935903,
            "loss_tokens_lower_95": 7.204166155675468,
            "loss_tokens_upper_95": 7.259290036604743,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.411781647312107,
            "data_time": 0.22064777782985143,
            "batch_time": 0.2540262086050851,
            "samples_per_second": 1752229.2937424332,
            "samples_per_second_per_gpu": 219028.66171780415,
            "loss_sequences_lower_95": 5.30190020888599,
            "loss_sequences_upper_95": 5.520117517727524,
            "loss_tokens_lower_95": 5.298686058841534,
            "loss_tokens_upper_95": 5.520186011470965,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.581354587218341,
            "data_time": 0.17202924937009811,
            "batch_time": 0.2184727042913437,
            "samples_per_second": 3711649.0129906526,
            "samples_per_second_per_gpu": 463956.1266238316,
            "loss_sequences_lower_95": 5.5041293035768994,
            "loss_sequences_upper_95": 5.656751589307598,
            "loss_tokens_lower_95": 5.50492036707261,
            "loss_tokens_upper_95": 5.65776320513557,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.986430554017826,
            "data_time": 0.029951824340969324,
            "batch_time": 0.0737944170832634,
            "samples_per_second": 4440236.532416216,
            "samples_per_second_per_gpu": 555029.566552027,
            "loss_sequences_lower_95": 7.2586834180837405,
            "loss_sequences_upper_95": 7.331664785227201,
            "loss_tokens_lower_95": 6.915770168663159,
            "loss_tokens_upper_95": 6.984973475249294,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.422905858862337,
            "data_time": 0.33328716456890106,
            "batch_time": 0.37166261672973633,
            "samples_per_second": 2317240.891039181,
            "samples_per_second_per_gpu": 289655.11137989763,
            "loss_sequences_lower_95": 5.375320232734478,
            "loss_sequences_upper_95": 5.470999807529348,
            "loss_tokens_lower_95": 5.374200269911024,
            "loss_tokens_upper_95": 5.470615641276042,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.379724021934953,
            "data_time": 0.05070096368973072,
            "batch_time": 0.09455338693582095,
            "samples_per_second": 4364527.102786492,
            "samples_per_second_per_gpu": 545565.8878483115,
            "loss_sequences_lower_95": 9.356713493883792,
            "loss_sequences_upper_95": 9.403114905867737,
            "loss_tokens_lower_95": 9.35560982893731,
            "loss_tokens_upper_95": 9.402912485665139,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.282667713257873,
            "data_time": 0.33930788934230804,
            "batch_time": 0.3795258402824402,
            "samples_per_second": 2514258.492562681,
            "samples_per_second_per_gpu": 314282.3115703351,
            "loss_sequences_lower_95": 5.168766888368476,
            "loss_sequences_upper_95": 5.392166152509671,
            "loss_tokens_lower_95": 5.167271097423961,
            "loss_tokens_upper_95": 5.395013664764107,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.352729264895121,
            "data_time": 0.27498796582221985,
            "batch_time": 0.2950289249420166,
            "samples_per_second": 1108120.1285660896,
            "samples_per_second_per_gpu": 138515.0160707612,
            "loss_sequences_lower_95": 8.11939811706543,
            "loss_sequences_upper_95": 8.816492741902668,
            "loss_tokens_lower_95": 7.787939940558539,
            "loss_tokens_upper_95": 8.856419902377658,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.037083061536153,
            "data_time": 0.27571795880794525,
            "batch_time": 0.29565557837486267,
            "samples_per_second": 1098504.024355537,
            "samples_per_second_per_gpu": 137313.00304444213,
            "loss_sequences_lower_95": 7.88283519744873,
            "loss_sequences_upper_95": 8.717352701822916,
            "loss_tokens_lower_95": 7.3787312025434515,
            "loss_tokens_upper_95": 8.527268227566495,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.066223420099588,
            "data_time": 0.04160170044217791,
            "batch_time": 0.08412660551922661,
            "samples_per_second": 4430894.32744673,
            "samples_per_second_per_gpu": 553861.7909308412,
            "loss_sequences_lower_95": 9.047771970498896,
            "loss_sequences_upper_95": 9.084562574788292,
            "loss_tokens_lower_95": 9.047858624240611,
            "loss_tokens_upper_95": 9.08483788199558,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.851299957587326,
            "data_time": 0.022234206441708527,
            "batch_time": 0.06671054973612706,
            "samples_per_second": 4524236.765828844,
            "samples_per_second_per_gpu": 565529.5957286055,
            "loss_sequences_lower_95": 7.320311752034466,
            "loss_sequences_upper_95": 7.34856740558538,
            "loss_tokens_lower_95": 6.797195125455318,
            "loss_tokens_upper_95": 6.826318518584008,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.15216315261961,
            "data_time": 0.31343047320842743,
            "batch_time": 0.34301401674747467,
            "samples_per_second": 2197457.2043470987,
            "samples_per_second_per_gpu": 274682.15054338734,
            "loss_sequences_lower_95": 8.181736190675752,
            "loss_sequences_upper_95": 8.563116262841412,
            "loss_tokens_lower_95": 7.997550293808339,
            "loss_tokens_upper_95": 8.301254874416605,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 11.310455373815588,
            "data_time": 0.22826391458511353,
            "batch_time": 0.245742529630661,
            "samples_per_second": 1060141.6522812056,
            "samples_per_second_per_gpu": 132517.7065351507,
            "loss_sequences_lower_95": 10.874927128972233,
            "loss_sequences_upper_95": 11.91452451138883,
            "loss_tokens_lower_95": 10.289843410915799,
            "loss_tokens_upper_95": 12.103425485116464,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.214182469902969,
            "data_time": 0.32861971855163574,
            "batch_time": 0.3624855577945709,
            "samples_per_second": 2386415.1752497028,
            "samples_per_second_per_gpu": 298301.89690621285,
            "loss_sequences_lower_95": 8.20966655452077,
            "loss_sequences_upper_95": 8.481509380805784,
            "loss_tokens_lower_95": 8.049563068361017,
            "loss_tokens_upper_95": 8.303445220510932,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.218840645580757,
            "data_time": 0.33227112889289856,
            "batch_time": 0.36640140414237976,
            "samples_per_second": 2438990.8144941437,
            "samples_per_second_per_gpu": 304873.85181176796,
            "loss_sequences_lower_95": 8.204164737608375,
            "loss_sequences_upper_95": 8.444746101193312,
            "loss_tokens_lower_95": 8.084456427927465,
            "loss_tokens_upper_95": 8.296807315182654,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.282270210545237,
            "data_time": 0.35004207491874695,
            "batch_time": 0.3849668353796005,
            "samples_per_second": 1986576.3483844453,
            "samples_per_second_per_gpu": 248322.04354805566,
            "loss_sequences_lower_95": 8.410778641119236,
            "loss_sequences_upper_95": 8.78962147410323,
            "loss_tokens_lower_95": 8.082102653060307,
            "loss_tokens_upper_95": 8.419072675567854,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.264390864023348,
            "data_time": 0.31833456456661224,
            "batch_time": 0.35197946429252625,
            "samples_per_second": 2416445.5899742167,
            "samples_per_second_per_gpu": 302055.6987467771,
            "loss_sequences_lower_95": 8.202246577565262,
            "loss_sequences_upper_95": 8.438174754817313,
            "loss_tokens_lower_95": 8.140342374902648,
            "loss_tokens_upper_95": 8.330168879143544,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.85474135120463,
            "data_time": 0.3365401327610016,
            "batch_time": 0.37117867171764374,
            "samples_per_second": 2354675.940822594,
            "samples_per_second_per_gpu": 294334.49260282423,
            "loss_sequences_lower_95": 7.732744963272758,
            "loss_sequences_upper_95": 7.8641418836131605,
            "loss_tokens_lower_95": 7.787110883116408,
            "loss_tokens_upper_95": 7.914042985887686,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.364164125628587,
            "data_time": 0.33126644790172577,
            "batch_time": 0.36643049120903015,
            "samples_per_second": 2407790.466957492,
            "samples_per_second_per_gpu": 300973.8083696865,
            "loss_sequences_lower_95": 7.391810031053497,
            "loss_sequences_upper_95": 7.570710624136575,
            "loss_tokens_lower_95": 7.279330945788125,
            "loss_tokens_upper_95": 7.407220275768536,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.5/params.txt",
    "uuid": "8f523b9e-28a3-4acc-8d5e-8677676423b3",
    "creation_date": "2023_12_14-05_00_54"
}