{
    "name": "rw_original-d=96_l=8_h=4-0.25",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 52846560,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.25
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "10569312",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=96_l=8_h=4-0.25",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 6.941783161958059,
            "data_time": 0.1418636590242386,
            "batch_time": 1.2947801053524017,
            "samples_per_second": 372951.6914217647,
            "samples_per_second_per_gpu": 46618.96142772059,
            "loss_sequences_lower_95": 6.791097081502278,
            "loss_sequences_upper_95": 7.093854268391927,
            "loss_tokens_lower_95": 6.926763242085775,
            "loss_tokens_upper_95": 6.956634038289388,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.025375922080577,
            "data_time": 0.018503390480912166,
            "batch_time": 0.06377032413185035,
            "samples_per_second": 4687069.200960434,
            "samples_per_second_per_gpu": 585883.6501200543,
            "loss_sequences_lower_95": 6.022767455352415,
            "loss_sequences_upper_95": 6.028037103745338,
            "loss_tokens_lower_95": 6.0136650625,
            "loss_tokens_upper_95": 6.036903927083333,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.454035801790199,
            "data_time": 0.09775500744581223,
            "batch_time": 0.1426946371793747,
            "samples_per_second": 4040081.303106074,
            "samples_per_second_per_gpu": 505010.1628882593,
            "loss_sequences_lower_95": 6.420606863839286,
            "loss_sequences_upper_95": 6.495114970304528,
            "loss_tokens_lower_95": 6.441503458333333,
            "loss_tokens_upper_95": 6.466865645833334,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.160772069852377,
            "data_time": 0.014110632632908068,
            "batch_time": 0.05809060444957331,
            "samples_per_second": 5341966.075572974,
            "samples_per_second_per_gpu": 667745.7594466218,
            "loss_sequences_lower_95": 6.135228626369201,
            "loss_sequences_upper_95": 6.186813718186212,
            "loss_tokens_lower_95": 6.148508729166667,
            "loss_tokens_upper_95": 6.17308453125,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.037096073088481,
            "data_time": 0.1000317707657814,
            "batch_time": 0.14449328184127808,
            "samples_per_second": 4058496.159310536,
            "samples_per_second_per_gpu": 507312.019913817,
            "loss_sequences_lower_95": 5.988464081991472,
            "loss_sequences_upper_95": 6.090802722938677,
            "loss_tokens_lower_95": 6.0251445625,
            "loss_tokens_upper_95": 6.048978916666667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.73211751577402,
            "data_time": 0.03704154988129934,
            "batch_time": 0.08062106122573216,
            "samples_per_second": 4888985.842913012,
            "samples_per_second_per_gpu": 611123.2303641265,
            "loss_sequences_lower_95": 6.67599822147455,
            "loss_sequences_upper_95": 6.789845276916213,
            "loss_tokens_lower_95": 6.719116125,
            "loss_tokens_upper_95": 6.745098479166666,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.590405621820567,
            "data_time": 0.013048795610666275,
            "batch_time": 0.05567117929458618,
            "samples_per_second": 5228487.348910776,
            "samples_per_second_per_gpu": 653560.918613847,
            "loss_sequences_lower_95": 8.562979591836736,
            "loss_sequences_upper_95": 8.617521763392858,
            "loss_tokens_lower_95": 8.575475291666667,
            "loss_tokens_upper_95": 8.605666020833333,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.923856666125552,
            "data_time": 0.013937333696766904,
            "batch_time": 0.05746431335022575,
            "samples_per_second": 5336913.029313873,
            "samples_per_second_per_gpu": 667114.1286642341,
            "loss_sequences_lower_95": 5.90569906536322,
            "loss_sequences_upper_95": 5.942995122300392,
            "loss_tokens_lower_95": 5.911846958333333,
            "loss_tokens_upper_95": 5.935838520833334,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.143628958764115,
            "data_time": 0.09506440907716751,
            "batch_time": 0.14057695120573044,
            "samples_per_second": 4079222.6374777136,
            "samples_per_second_per_gpu": 509902.8296847142,
            "loss_sequences_lower_95": 6.08140499456142,
            "loss_sequences_upper_95": 6.214449688671081,
            "loss_tokens_lower_95": 6.131471135416667,
            "loss_tokens_upper_95": 6.155534322916666,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.069123078711891,
            "data_time": 0.1025429293513298,
            "batch_time": 0.14918015897274017,
            "samples_per_second": 4083377.2444815566,
            "samples_per_second_per_gpu": 510422.1555601946,
            "loss_sequences_lower_95": 7.001987719630065,
            "loss_sequences_upper_95": 7.14961292131145,
            "loss_tokens_lower_95": 7.0572628125,
            "loss_tokens_upper_95": 7.080752104166667,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.65979655821127,
            "data_time": 0.011550123321598974,
            "batch_time": 0.055085960133322356,
            "samples_per_second": 5340806.438793792,
            "samples_per_second_per_gpu": 667600.804849224,
            "loss_sequences_lower_95": 6.649504585574551,
            "loss_sequences_upper_95": 6.670244514012608,
            "loss_tokens_lower_95": 6.64718559375,
            "loss_tokens_upper_95": 6.672792114583333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.448478857212789,
            "data_time": 0.02557992786169052,
            "batch_time": 0.0736139878630638,
            "samples_per_second": 4985846.981664304,
            "samples_per_second_per_gpu": 623230.872708038,
            "loss_sequences_lower_95": 6.430505101634085,
            "loss_sequences_upper_95": 6.466952327806122,
            "loss_tokens_lower_95": 6.436159875,
            "loss_tokens_upper_95": 6.460755447916666,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.039191297295128,
            "data_time": 0.10108420252799988,
            "batch_time": 0.1466331109404564,
            "samples_per_second": 4036960.9727270035,
            "samples_per_second_per_gpu": 504620.12159087544,
            "loss_sequences_lower_95": 5.967209351328759,
            "loss_sequences_upper_95": 6.120142913880262,
            "loss_tokens_lower_95": 6.026884000000001,
            "loss_tokens_upper_95": 6.05181096875,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.693623334958452,
            "data_time": 0.0998239666223526,
            "batch_time": 0.14470423012971878,
            "samples_per_second": 4120495.2558605587,
            "samples_per_second_per_gpu": 515061.90698256984,
            "loss_sequences_lower_95": 6.6257613980357375,
            "loss_sequences_upper_95": 6.767883077027113,
            "loss_tokens_lower_95": 6.681952145833333,
            "loss_tokens_upper_95": 6.706050854166667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.521030111746355,
            "data_time": 0.14795386791229248,
            "batch_time": 0.17051127552986145,
            "samples_per_second": 1056415.419294268,
            "samples_per_second_per_gpu": 132051.9274117835,
            "loss_sequences_lower_95": 7.457945893027565,
            "loss_sequences_upper_95": 7.593469931862571,
            "loss_tokens_lower_95": 7.498264208706942,
            "loss_tokens_upper_95": 7.5441906668923115,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.300944627200201,
            "data_time": 0.09646257013082504,
            "batch_time": 0.13131753355264664,
            "samples_per_second": 3287574.070690869,
            "samples_per_second_per_gpu": 410946.7588363586,
            "loss_sequences_lower_95": 7.166739896991163,
            "loss_sequences_upper_95": 7.433267647879464,
            "loss_tokens_lower_95": 7.287046395833333,
            "loss_tokens_upper_95": 7.314922520833333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.100384872003722,
            "data_time": 0.09739359468221664,
            "batch_time": 0.13403690606355667,
            "samples_per_second": 3696817.2993953167,
            "samples_per_second_per_gpu": 462102.1624244146,
            "loss_sequences_lower_95": 7.034210462746331,
            "loss_sequences_upper_95": 7.17906516686593,
            "loss_tokens_lower_95": 7.090039,
            "loss_tokens_upper_95": 7.110760125,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.333841343395045,
            "data_time": 0.16506445407867432,
            "batch_time": 0.19391924142837524,
            "samples_per_second": 2219277.7025366905,
            "samples_per_second_per_gpu": 277409.7128170863,
            "loss_sequences_lower_95": 7.24323694197858,
            "loss_sequences_upper_95": 7.484254493088018,
            "loss_tokens_lower_95": 7.320925302974514,
            "loss_tokens_upper_95": 7.346508088659067,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.568183697657171,
            "data_time": 0.02783102664080533,
            "batch_time": 0.07231269641356035,
            "samples_per_second": 4502992.883464567,
            "samples_per_second_per_gpu": 562874.1104330709,
            "loss_sequences_lower_95": 5.556879125458447,
            "loss_sequences_upper_95": 5.579074244899231,
            "loss_tokens_lower_95": 5.556990955147949,
            "loss_tokens_upper_95": 5.579191220979918,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.702990891994696,
            "data_time": 0.02785845287144184,
            "batch_time": 0.071951974183321,
            "samples_per_second": 4483995.1242348375,
            "samples_per_second_per_gpu": 560499.3905293547,
            "loss_sequences_lower_95": 5.696847303823939,
            "loss_sequences_upper_95": 5.723663756752888,
            "loss_tokens_lower_95": 5.68983531074077,
            "loss_tokens_upper_95": 5.71281410681349,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.420047280469705,
            "data_time": 0.046353816986083984,
            "batch_time": 0.08808838658862644,
            "samples_per_second": 4450106.139818389,
            "samples_per_second_per_gpu": 556263.2674772986,
            "loss_sequences_lower_95": 8.85837426561762,
            "loss_sequences_upper_95": 9.048454614061763,
            "loss_tokens_lower_95": 8.29985715708393,
            "loss_tokens_upper_95": 8.455832523603169,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.003610274950663,
            "data_time": 0.04120549062887827,
            "batch_time": 0.08504114051659901,
            "samples_per_second": 4579407.013722016,
            "samples_per_second_per_gpu": 572425.876715252,
            "loss_sequences_lower_95": 8.323701204427083,
            "loss_sequences_upper_95": 8.45108642578125,
            "loss_tokens_lower_95": 7.922065656937893,
            "loss_tokens_upper_95": 8.027435166077044,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.655160711605139,
            "data_time": 0.0643713374932607,
            "batch_time": 0.10437324643135071,
            "samples_per_second": 3973602.604197797,
            "samples_per_second_per_gpu": 496700.3255247246,
            "loss_sequences_lower_95": 6.694561679657884,
            "loss_sequences_upper_95": 6.758452559348939,
            "loss_tokens_lower_95": 6.636383417223244,
            "loss_tokens_upper_95": 6.669468761091213,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.115350944345647,
            "data_time": 0.39579738676548004,
            "batch_time": 0.4381911903619766,
            "samples_per_second": 2477749.205620713,
            "samples_per_second_per_gpu": 309718.65070258913,
            "loss_sequences_lower_95": 7.03629731611772,
            "loss_sequences_upper_95": 7.215551646839488,
            "loss_tokens_lower_95": 7.082454146931371,
            "loss_tokens_upper_95": 7.148910706187645,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.304666577553262,
            "data_time": 0.39286117255687714,
            "batch_time": 0.43858253955841064,
            "samples_per_second": 1826928.5097522377,
            "samples_per_second_per_gpu": 228366.0637190297,
            "loss_sequences_lower_95": 6.260132433832908,
            "loss_sequences_upper_95": 6.436436692841199,
            "loss_tokens_lower_95": 6.266903179481211,
            "loss_tokens_upper_95": 6.365018552703961,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.552438848813375,
            "data_time": 0.18706350028514862,
            "batch_time": 0.21819676458835602,
            "samples_per_second": 2669559.9455826674,
            "samples_per_second_per_gpu": 333694.9931978334,
            "loss_sequences_lower_95": 5.473018218994141,
            "loss_sequences_upper_95": 5.59224272664388,
            "loss_tokens_lower_95": 5.460149716446809,
            "loss_tokens_upper_95": 5.652361253283375,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 10.271845291099776,
            "data_time": 0.02427043206989765,
            "batch_time": 0.06867101658135652,
            "samples_per_second": 4528925.557403308,
            "samples_per_second_per_gpu": 566115.6946754135,
            "loss_sequences_lower_95": 10.322569171054575,
            "loss_sequences_upper_95": 10.370625272962698,
            "loss_tokens_lower_95": 10.236623748745759,
            "loss_tokens_upper_95": 10.285796957869463,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.181817002970763,
            "data_time": 0.051112118363380435,
            "batch_time": 0.09318411946296692,
            "samples_per_second": 4364426.093152258,
            "samples_per_second_per_gpu": 545553.2616440322,
            "loss_sequences_lower_95": 8.21457605843592,
            "loss_sequences_upper_95": 8.43379933777883,
            "loss_tokens_lower_95": 7.042026402666609,
            "loss_tokens_upper_95": 7.190418101353173,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.7948690563338605,
            "data_time": 0.08407747149467468,
            "batch_time": 0.1263707160949707,
            "samples_per_second": 4188187.752885822,
            "samples_per_second_per_gpu": 523523.4691107278,
            "loss_sequences_lower_95": 7.399983668408703,
            "loss_sequences_upper_95": 7.6589551281196675,
            "loss_tokens_lower_95": 6.694399324252783,
            "loss_tokens_upper_95": 6.868751969511335,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.71909413707855,
            "data_time": 0.37703707814216614,
            "batch_time": 0.4189712852239609,
            "samples_per_second": 2139596.7727841316,
            "samples_per_second_per_gpu": 267449.59659801645,
            "loss_sequences_lower_95": 5.693742502987657,
            "loss_sequences_upper_95": 5.7457168648776396,
            "loss_tokens_lower_95": 5.693752187789847,
            "loss_tokens_upper_95": 5.744689091373252,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.9168735694885255,
            "data_time": 0.30087141692638397,
            "batch_time": 0.3269065171480179,
            "samples_per_second": 1587119.9891465434,
            "samples_per_second_per_gpu": 198389.99864331793,
            "loss_sequences_lower_95": 5.817033767700195,
            "loss_sequences_upper_95": 6.307827438354492,
            "loss_tokens_lower_95": 5.640814077960784,
            "loss_tokens_upper_95": 6.176373367446053,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.487850868836152,
            "data_time": 0.05439634621143341,
            "batch_time": 0.09797193668782711,
            "samples_per_second": 4467122.218051056,
            "samples_per_second_per_gpu": 558390.277256382,
            "loss_sequences_lower_95": 5.454825150832427,
            "loss_sequences_upper_95": 5.52101263633892,
            "loss_tokens_lower_95": 5.454464275004798,
            "loss_tokens_upper_95": 5.520847710225818,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.522797319363805,
            "data_time": 0.07905330955982208,
            "batch_time": 0.1223947525024414,
            "samples_per_second": 4404667.382062613,
            "samples_per_second_per_gpu": 550583.4227578266,
            "loss_sequences_lower_95": 5.4957860748266025,
            "loss_sequences_upper_95": 5.549200874347359,
            "loss_tokens_lower_95": 5.495387531752278,
            "loss_tokens_upper_95": 5.549626820757192,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.413597382451037,
            "data_time": 0.05679812654852867,
            "batch_time": 0.09795518964529037,
            "samples_per_second": 4305546.905786831,
            "samples_per_second_per_gpu": 538193.3632233539,
            "loss_sequences_lower_95": 6.5110534867213,
            "loss_sequences_upper_95": 6.624003207566818,
            "loss_tokens_lower_95": 6.3968592963841395,
            "loss_tokens_upper_95": 6.462387473716489,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.69855493068695,
            "data_time": 0.18702194839715958,
            "batch_time": 0.23271579295396805,
            "samples_per_second": 3689309.088970416,
            "samples_per_second_per_gpu": 461163.636121302,
            "loss_sequences_lower_95": 8.353217700195312,
            "loss_sequences_upper_95": 8.781113842773438,
            "loss_tokens_lower_95": 7.476255086397667,
            "loss_tokens_upper_95": 7.799810407273743,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.875066190958023,
            "data_time": 0.16085775196552277,
            "batch_time": 0.17857012152671814,
            "samples_per_second": 776183.9228174929,
            "samples_per_second_per_gpu": 97022.9903521866,
            "loss_sequences_lower_95": 5.552291214466095,
            "loss_sequences_upper_95": 6.330471193790435,
            "loss_tokens_lower_95": 5.2549251337161005,
            "loss_tokens_upper_95": 6.305844414371183,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.475218890727251,
            "data_time": 0.3461413234472275,
            "batch_time": 0.3822665363550186,
            "samples_per_second": 1648790.4413214594,
            "samples_per_second_per_gpu": 206098.80516518242,
            "loss_sequences_lower_95": 6.900197311927532,
            "loss_sequences_upper_95": 7.418193703136224,
            "loss_tokens_lower_95": 6.221135119037133,
            "loss_tokens_upper_95": 6.635256713729565,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.629405447258706,
            "data_time": 0.05016488002406226,
            "batch_time": 0.09479709300729963,
            "samples_per_second": 4475489.2303557135,
            "samples_per_second_per_gpu": 559436.1537944642,
            "loss_sequences_lower_95": 5.6040036502703146,
            "loss_sequences_upper_95": 5.6557269695076995,
            "loss_tokens_lower_95": 5.604403533202272,
            "loss_tokens_upper_95": 5.65589223639212,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.302642914431317,
            "data_time": 0.033572801521846225,
            "batch_time": 0.076618173292705,
            "samples_per_second": 4431652.287875755,
            "samples_per_second_per_gpu": 553956.5359844693,
            "loss_sequences_lower_95": 9.353046484602416,
            "loss_sequences_upper_95": 9.477494147826508,
            "loss_tokens_lower_95": 9.227666304001094,
            "loss_tokens_upper_95": 9.349861543873041,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.111904122453907,
            "data_time": 0.18640243262052536,
            "batch_time": 0.21566857397556305,
            "samples_per_second": 1974145.546187498,
            "samples_per_second_per_gpu": 246768.19327343724,
            "loss_sequences_lower_95": 4.980410811315963,
            "loss_sequences_upper_95": 5.343602408943596,
            "loss_tokens_lower_95": 4.895586836371126,
            "loss_tokens_upper_95": 5.244652032984505,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.560771616218404,
            "data_time": 0.0750753939151764,
            "batch_time": 0.1199516475200653,
            "samples_per_second": 4493042.204597306,
            "samples_per_second_per_gpu": 561630.2755746633,
            "loss_sequences_lower_95": 5.5993002972079715,
            "loss_sequences_upper_95": 5.730427957126825,
            "loss_tokens_lower_95": 5.476567047114011,
            "loss_tokens_upper_95": 5.632887387848798,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.354116154880058,
            "data_time": 0.32503244280815125,
            "batch_time": 0.35922588407993317,
            "samples_per_second": 2335090.883170409,
            "samples_per_second_per_gpu": 291886.36039630114,
            "loss_sequences_lower_95": 7.114592463795732,
            "loss_sequences_upper_95": 7.6342048831102325,
            "loss_tokens_lower_95": 7.205122552221401,
            "loss_tokens_upper_95": 7.558608406402937,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.253748612586058,
            "data_time": 0.03393388653997344,
            "batch_time": 0.07781534546221311,
            "samples_per_second": 4330530.976191024,
            "samples_per_second_per_gpu": 541316.372023878,
            "loss_sequences_lower_95": 5.243791697792684,
            "loss_sequences_upper_95": 5.263534855408582,
            "loss_tokens_lower_95": 5.243935466780856,
            "loss_tokens_upper_95": 5.2636387629088315,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.128178290950442,
            "data_time": 0.34299787878990173,
            "batch_time": 0.36913228034973145,
            "samples_per_second": 1592580.5147502134,
            "samples_per_second_per_gpu": 199072.56434377667,
            "loss_sequences_lower_95": 7.0033914510486195,
            "loss_sequences_upper_95": 7.391927433939814,
            "loss_tokens_lower_95": 6.864590126123496,
            "loss_tokens_upper_95": 7.289101807390289,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.32075875377755,
            "data_time": 0.022844203611214954,
            "batch_time": 0.06718402832746506,
            "samples_per_second": 4482223.798691313,
            "samples_per_second_per_gpu": 560277.9748364141,
            "loss_sequences_lower_95": 7.691248628308439,
            "loss_sequences_upper_95": 7.733043087493448,
            "loss_tokens_lower_95": 7.258480174081238,
            "loss_tokens_upper_95": 7.298470224854932,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.7223693175315855,
            "data_time": 0.0962086170911789,
            "batch_time": 0.14054885134100914,
            "samples_per_second": 4293043.305273928,
            "samples_per_second_per_gpu": 536630.413159241,
            "loss_sequences_lower_95": 6.710855529785156,
            "loss_sequences_upper_95": 6.838571325683594,
            "loss_tokens_lower_95": 6.647665350988496,
            "loss_tokens_upper_95": 6.77789073451397,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.472546025981074,
            "data_time": 0.35171249508857727,
            "batch_time": 0.39444272220134735,
            "samples_per_second": 2608978.0992194586,
            "samples_per_second_per_gpu": 326122.26240243233,
            "loss_sequences_lower_95": 5.390662377398947,
            "loss_sequences_upper_95": 5.553434010381284,
            "loss_tokens_lower_95": 5.393361922554348,
            "loss_tokens_upper_95": 5.550053671131963,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 10.896167452407605,
            "data_time": 0.07006741066773732,
            "batch_time": 0.11057048290967941,
            "samples_per_second": 3910569.474046407,
            "samples_per_second_per_gpu": 488821.18425580085,
            "loss_sequences_lower_95": 10.745878739790482,
            "loss_sequences_upper_95": 11.045948763760654,
            "loss_tokens_lower_95": 10.744689534505207,
            "loss_tokens_upper_95": 11.049247048117898,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.364482887903849,
            "data_time": 0.06914115448792775,
            "batch_time": 0.11345507701237996,
            "samples_per_second": 4461467.152733678,
            "samples_per_second_per_gpu": 557683.3940917097,
            "loss_sequences_lower_95": 5.479693652343751,
            "loss_sequences_upper_95": 5.569789990234375,
            "loss_tokens_lower_95": 5.303512295543217,
            "loss_tokens_upper_95": 5.409990027260904,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.774388926369803,
            "data_time": 0.37086549401283264,
            "batch_time": 0.41244810819625854,
            "samples_per_second": 1870941.855436718,
            "samples_per_second_per_gpu": 233867.73192958976,
            "loss_sequences_lower_95": 6.418905944824219,
            "loss_sequences_upper_95": 7.137076154436383,
            "loss_tokens_lower_95": 6.425430719284784,
            "loss_tokens_upper_95": 7.128906627836682,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.666759103536606,
            "data_time": 0.1572563648223877,
            "batch_time": 0.17488814890384674,
            "samples_per_second": 912683.0958736266,
            "samples_per_second_per_gpu": 114085.38698420333,
            "loss_sequences_lower_95": 7.507651352882386,
            "loss_sequences_upper_95": 8.933656930923462,
            "loss_tokens_lower_95": 7.2640373308634025,
            "loss_tokens_upper_95": 7.835601215165915,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.968622920036316,
            "data_time": 0.09862921386957169,
            "batch_time": 0.1431262455880642,
            "samples_per_second": 4201975.858239891,
            "samples_per_second_per_gpu": 525246.9822799864,
            "loss_sequences_lower_95": 8.070669738769531,
            "loss_sequences_upper_95": 8.387332202148437,
            "loss_tokens_lower_95": 7.816780413064456,
            "loss_tokens_upper_95": 8.096256985478798,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.267661867141723,
            "data_time": 0.09029118716716766,
            "batch_time": 0.13525951653718948,
            "samples_per_second": 4333083.662031556,
            "samples_per_second_per_gpu": 541635.4577539444,
            "loss_sequences_lower_95": 7.577138549804688,
            "loss_sequences_upper_95": 7.817820275878907,
            "loss_tokens_lower_95": 7.150098438673635,
            "loss_tokens_upper_95": 7.350208332518309,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.052373534594851,
            "data_time": 0.04104219128688177,
            "batch_time": 0.08486298223336537,
            "samples_per_second": 4539949.619610117,
            "samples_per_second_per_gpu": 567493.7024512646,
            "loss_sequences_lower_95": 5.034589095695923,
            "loss_sequences_upper_95": 5.070323491647985,
            "loss_tokens_lower_95": 5.034781682791024,
            "loss_tokens_upper_95": 5.070569850245139,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.605332978127007,
            "data_time": 0.11926154792308807,
            "batch_time": 0.15938017268975577,
            "samples_per_second": 4010353.4672077443,
            "samples_per_second_per_gpu": 501294.18340096803,
            "loss_sequences_lower_95": 5.550911064558131,
            "loss_sequences_upper_95": 5.659200416951685,
            "loss_tokens_lower_95": 5.5507592923447096,
            "loss_tokens_upper_95": 5.658616935858895,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.047818154811859,
            "data_time": 0.09088212251663208,
            "batch_time": 0.13498277589678764,
            "samples_per_second": 4365302.603379963,
            "samples_per_second_per_gpu": 545662.8254224954,
            "loss_sequences_lower_95": 7.982120080566407,
            "loss_sequences_upper_95": 8.117105554199219,
            "loss_tokens_lower_95": 7.979853076171875,
            "loss_tokens_upper_95": 8.114804809570312,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.90594317116823,
            "data_time": 0.029537999559016453,
            "batch_time": 0.0734557786158153,
            "samples_per_second": 4470003.895759039,
            "samples_per_second_per_gpu": 558750.4869698798,
            "loss_sequences_lower_95": 8.456524878784297,
            "loss_sequences_upper_95": 8.525536693619916,
            "loss_tokens_lower_95": 7.821337826056818,
            "loss_tokens_upper_95": 7.876997098014332,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.664886967459721,
            "data_time": 0.19627731187003,
            "batch_time": 0.22838619777134486,
            "samples_per_second": 1873118.67810424,
            "samples_per_second_per_gpu": 234139.83476303,
            "loss_sequences_lower_95": 5.557750906873105,
            "loss_sequences_upper_95": 5.770501583725658,
            "loss_tokens_lower_95": 5.554652450333781,
            "loss_tokens_upper_95": 5.771418898852904,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.7398844204696955,
            "data_time": 0.17226574569940567,
            "batch_time": 0.21794717013835907,
            "samples_per_second": 3736914.0854035895,
            "samples_per_second_per_gpu": 467114.2606754487,
            "loss_sequences_lower_95": 5.6608192234413295,
            "loss_sequences_upper_95": 5.8188118130553,
            "loss_tokens_lower_95": 5.660640007467831,
            "loss_tokens_upper_95": 5.817946119121476,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.452011944401552,
            "data_time": 0.033194181974977255,
            "batch_time": 0.07697226339951158,
            "samples_per_second": 4383944.694468379,
            "samples_per_second_per_gpu": 547993.0868085474,
            "loss_sequences_lower_95": 7.737968231319679,
            "loss_sequences_upper_95": 7.814571858754541,
            "loss_tokens_lower_95": 7.379616354896067,
            "loss_tokens_upper_95": 7.448203586125674,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.081227562414906,
            "data_time": 0.30945010483264923,
            "batch_time": 0.3464897572994232,
            "samples_per_second": 2575513.1785357124,
            "samples_per_second_per_gpu": 321939.14731696405,
            "loss_sequences_lower_95": 5.003043789333767,
            "loss_sequences_upper_95": 5.166518406136325,
            "loss_tokens_lower_95": 5.001006078972387,
            "loss_tokens_upper_95": 5.162004533394303,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.071817025350867,
            "data_time": 0.04671277564305525,
            "batch_time": 0.09097544734294598,
            "samples_per_second": 4398379.596782053,
            "samples_per_second_per_gpu": 549797.4495977566,
            "loss_sequences_lower_95": 9.049367414468653,
            "loss_sequences_upper_95": 9.09480328387806,
            "loss_tokens_lower_95": 9.048883493405963,
            "loss_tokens_upper_95": 9.094273595780772,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.523970305340962,
            "data_time": 0.36892133951187134,
            "batch_time": 0.4089491367340088,
            "samples_per_second": 2684966.137732477,
            "samples_per_second_per_gpu": 335620.7672165596,
            "loss_sequences_lower_95": 5.406476007850425,
            "loss_sequences_upper_95": 5.6399349175610585,
            "loss_tokens_lower_95": 5.406479222566179,
            "loss_tokens_upper_95": 5.639849764629475,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.250668573379517,
            "data_time": 0.302009254693985,
            "batch_time": 0.3225891590118408,
            "samples_per_second": 918124.4983648015,
            "samples_per_second_per_gpu": 114765.56229560019,
            "loss_sequences_lower_95": 9.042090377807616,
            "loss_sequences_upper_95": 9.61595428466797,
            "loss_tokens_lower_95": 8.837052133348253,
            "loss_tokens_upper_95": 9.6283504486084,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.90210657119751,
            "data_time": 0.29930153489112854,
            "batch_time": 0.3191612958908081,
            "samples_per_second": 1299177.9348874586,
            "samples_per_second_per_gpu": 162397.24186093232,
            "loss_sequences_lower_95": 8.731989339192708,
            "loss_sequences_upper_95": 9.426784159342446,
            "loss_tokens_lower_95": 8.378607263457909,
            "loss_tokens_upper_95": 9.265426104256276,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.600159238329342,
            "data_time": 0.04082992460046496,
            "batch_time": 0.08395384784255709,
            "samples_per_second": 4393699.16512943,
            "samples_per_second_per_gpu": 549212.3956411787,
            "loss_sequences_lower_95": 8.588263876104566,
            "loss_sequences_upper_95": 8.61196166812408,
            "loss_tokens_lower_95": 8.588447855301915,
            "loss_tokens_upper_95": 8.612093295517305,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.554187004856247,
            "data_time": 0.02305329984712809,
            "batch_time": 0.06735242994331377,
            "samples_per_second": 4497332.668147115,
            "samples_per_second_per_gpu": 562166.5835183894,
            "loss_sequences_lower_95": 8.011226922591979,
            "loss_sequences_upper_95": 8.039690430422109,
            "loss_tokens_lower_95": 7.50230646815433,
            "loss_tokens_upper_95": 7.529946201860169,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.517162653404897,
            "data_time": 0.3430529534816742,
            "batch_time": 0.373841792345047,
            "samples_per_second": 1405749.7834415727,
            "samples_per_second_per_gpu": 175718.72293019659,
            "loss_sequences_lower_95": 8.539334563007506,
            "loss_sequences_upper_95": 8.89060214786079,
            "loss_tokens_lower_95": 8.376049294936747,
            "loss_tokens_upper_95": 8.664265464758351,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 11.6121427175161,
            "data_time": 0.22244030237197876,
            "batch_time": 0.23970389366149902,
            "samples_per_second": 1071378.5297343251,
            "samples_per_second_per_gpu": 133922.31621679064,
            "loss_sequences_lower_95": 11.165601513836835,
            "loss_sequences_upper_95": 12.215584358009131,
            "loss_tokens_lower_95": 10.663480631510417,
            "loss_tokens_upper_95": 12.30491675859616,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.587592816934354,
            "data_time": 0.33922041952610016,
            "batch_time": 0.37383483350276947,
            "samples_per_second": 2275408.9054947034,
            "samples_per_second_per_gpu": 284426.11318683793,
            "loss_sequences_lower_95": 8.564252341665872,
            "loss_sequences_upper_95": 8.809315546547493,
            "loss_tokens_lower_95": 8.434914548727923,
            "loss_tokens_upper_95": 8.675285905303827,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.570848953433153,
            "data_time": 0.35519076883792877,
            "batch_time": 0.39072656631469727,
            "samples_per_second": 1951049.325971801,
            "samples_per_second_per_gpu": 243881.16574647513,
            "loss_sequences_lower_95": 8.538441374243758,
            "loss_sequences_upper_95": 8.753212068139053,
            "loss_tokens_lower_95": 8.449031431409054,
            "loss_tokens_upper_95": 8.648533060425706,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.636269470540489,
            "data_time": 0.3393315523862839,
            "batch_time": 0.37441912293434143,
            "samples_per_second": 2026009.58790457,
            "samples_per_second_per_gpu": 253251.19848807124,
            "loss_sequences_lower_95": 8.742786556336936,
            "loss_sequences_upper_95": 9.095402452422352,
            "loss_tokens_lower_95": 8.449848573848117,
            "loss_tokens_upper_95": 8.76965942733791,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.63778003832189,
            "data_time": 0.3487277030944824,
            "batch_time": 0.38310037553310394,
            "samples_per_second": 2040986.2005238854,
            "samples_per_second_per_gpu": 255123.27506548568,
            "loss_sequences_lower_95": 8.59953979864353,
            "loss_sequences_upper_95": 8.807633283661634,
            "loss_tokens_lower_95": 8.524021688205801,
            "loss_tokens_upper_95": 8.703685420621593,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.552228661057372,
            "data_time": 0.3233083635568619,
            "batch_time": 0.3575897663831711,
            "samples_per_second": 2281053.3897673953,
            "samples_per_second_per_gpu": 285131.6737209244,
            "loss_sequences_lower_95": 8.43745094441479,
            "loss_sequences_upper_95": 8.56875299489276,
            "loss_tokens_lower_95": 8.488771159330213,
            "loss_tokens_upper_95": 8.614619292007898,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.067243067229667,
            "data_time": 0.3364272266626358,
            "batch_time": 0.3714253455400467,
            "samples_per_second": 2205392.5227372274,
            "samples_per_second_per_gpu": 275674.0653421534,
            "loss_sequences_lower_95": 8.109912593190263,
            "loss_sequences_upper_95": 8.299523776915017,
            "loss_tokens_lower_95": 7.9805392932489445,
            "loss_tokens_upper_95": 8.105482759757384,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-0.25/params.txt",
    "uuid": "11303891-2d3e-4442-b970-293c5c16c4ba",
    "creation_date": "2023_12_14-05_00_54"
}