{
    "name": "rw_original-d=1024_l=24_h=8-4.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 32929300480,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 4.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "6585860096",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=1024_l=24_h=8-4.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 2.884507910410563,
            "data_time": 0.04753477871417999,
            "batch_time": 0.4376922696828842,
            "samples_per_second": 688668.7949890794,
            "samples_per_second_per_gpu": 86083.59937363492,
            "loss_sequences_lower_95": 2.8245294189453123,
            "loss_sequences_upper_95": 2.944861373901367,
            "loss_tokens_lower_95": 2.871701513926188,
            "loss_tokens_upper_95": 2.8970591417948404,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.975748178487632,
            "data_time": 0.0010642506690725726,
            "batch_time": 0.03677543023612963,
            "samples_per_second": 897812.3963516405,
            "samples_per_second_per_gpu": 112226.54954395506,
            "loss_sequences_lower_95": 2.9733033440099295,
            "loss_sequences_upper_95": 2.978203792287262,
            "loss_tokens_lower_95": 2.9654401406249997,
            "loss_tokens_upper_95": 2.9859061510416667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6484517476996596,
            "data_time": 0.00925645637512207,
            "batch_time": 0.04468706607818604,
            "samples_per_second": 868649.2675583332,
            "samples_per_second_per_gpu": 108581.15844479165,
            "loss_sequences_lower_95": 2.5994236163703763,
            "loss_sequences_upper_95": 2.7099939213966837,
            "loss_tokens_lower_95": 2.636845854166667,
            "loss_tokens_upper_95": 2.660126223958333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.047018145826674,
            "data_time": 0.0015953334146424343,
            "batch_time": 0.036900293081998825,
            "samples_per_second": 906740.8230087232,
            "samples_per_second_per_gpu": 113342.6028760904,
            "loss_sequences_lower_95": 3.0112399575144977,
            "loss_sequences_upper_95": 3.0842493959407213,
            "loss_tokens_lower_95": 3.035488395833333,
            "loss_tokens_upper_95": 3.0586168385416665,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.03947677388939,
            "data_time": 0.010053031473045804,
            "batch_time": 0.04539082819722088,
            "samples_per_second": 868022.6659998891,
            "samples_per_second_per_gpu": 108502.83324998614,
            "loss_sequences_lower_95": 2.98623086031974,
            "loss_sequences_upper_95": 3.106689757679242,
            "loss_tokens_lower_95": 3.0288381979166665,
            "loss_tokens_upper_95": 3.049853875,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0836431702977536,
            "data_time": 0.003794370462065158,
            "batch_time": 0.03926563424908597,
            "samples_per_second": 900013.3141049154,
            "samples_per_second_per_gpu": 112501.66426311442,
            "loss_sequences_lower_95": 3.0399412568777615,
            "loss_sequences_upper_95": 3.1322236995320996,
            "loss_tokens_lower_95": 3.072157395833333,
            "loss_tokens_upper_95": 3.0950699010416667,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5595441319747847,
            "data_time": 0.00158869831247003,
            "batch_time": 0.03700663749778835,
            "samples_per_second": 906271.7015696795,
            "samples_per_second_per_gpu": 113283.96269620994,
            "loss_sequences_lower_95": 2.532987175143495,
            "loss_sequences_upper_95": 2.5862654655612243,
            "loss_tokens_lower_95": 2.54634628125,
            "loss_tokens_upper_95": 2.5733085260416666,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4975424051534443,
            "data_time": 0.0017447194410935949,
            "batch_time": 0.037668037095462764,
            "samples_per_second": 901844.3633699968,
            "samples_per_second_per_gpu": 112730.5454212496,
            "loss_sequences_lower_95": 3.4760462614528795,
            "loss_sequences_upper_95": 3.520767762189136,
            "loss_tokens_lower_95": 3.4865625885416667,
            "loss_tokens_upper_95": 3.5086084479166666,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1239245873156603,
            "data_time": 0.010774690007406568,
            "batch_time": 0.04676711937737844,
            "samples_per_second": 866436.7250788269,
            "samples_per_second_per_gpu": 108304.59063485336,
            "loss_sequences_lower_95": 3.0431143473803512,
            "loss_sequences_upper_95": 3.22057849915047,
            "loss_tokens_lower_95": 3.11270525,
            "loss_tokens_upper_95": 3.135193682291667,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.156065205811512,
            "data_time": 0.00945376604795456,
            "batch_time": 0.044763785786926746,
            "samples_per_second": 877034.3083680128,
            "samples_per_second_per_gpu": 109629.2885460016,
            "loss_sequences_lower_95": 4.049161707459702,
            "loss_sequences_upper_95": 4.288534594147573,
            "loss_tokens_lower_95": 4.142728229166667,
            "loss_tokens_upper_95": 4.16950953125,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1638833110718365,
            "data_time": 0.0012439113997472218,
            "batch_time": 0.036638739730404254,
            "samples_per_second": 908035.6472893376,
            "samples_per_second_per_gpu": 113504.4559111672,
            "loss_sequences_lower_95": 3.152937101719885,
            "loss_sequences_upper_95": 3.1751280157406296,
            "loss_tokens_lower_95": 3.1532226875,
            "loss_tokens_upper_95": 3.174632421875,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9925271562117928,
            "data_time": 0.002593261216899735,
            "batch_time": 0.03798561092221072,
            "samples_per_second": 904938.3993800019,
            "samples_per_second_per_gpu": 113117.29992250024,
            "loss_sequences_lower_95": 2.970905509545177,
            "loss_sequences_upper_95": 3.015299543565832,
            "loss_tokens_lower_95": 2.981898354166667,
            "loss_tokens_upper_95": 3.003314666666667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.546893820559519,
            "data_time": 0.0097114427287588,
            "batch_time": 0.045032804662531074,
            "samples_per_second": 866666.6126685946,
            "samples_per_second_per_gpu": 108333.32658357432,
            "loss_sequences_lower_95": 3.4696404014107665,
            "loss_sequences_upper_95": 3.641670050630705,
            "loss_tokens_lower_95": 3.5344961458333333,
            "loss_tokens_upper_95": 3.5593801666666667,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.774317943637094,
            "data_time": 0.00983127274836202,
            "batch_time": 0.04514510983014962,
            "samples_per_second": 867612.6147137367,
            "samples_per_second_per_gpu": 108451.57683921709,
            "loss_sequences_lower_95": 2.693231387633656,
            "loss_sequences_upper_95": 2.8688451700929223,
            "loss_tokens_lower_95": 2.763111453125,
            "loss_tokens_upper_95": 2.7854503385416667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5864556702700527,
            "data_time": 0.08296740906579154,
            "batch_time": 0.11751902103424072,
            "samples_per_second": 516003.85021415347,
            "samples_per_second_per_gpu": 64500.481276769184,
            "loss_sequences_lower_95": 3.5171110673384236,
            "loss_sequences_upper_95": 3.660718397660689,
            "loss_tokens_lower_95": 3.566328421506015,
            "loss_tokens_upper_95": 3.607239532470703,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9691031576940685,
            "data_time": 0.014277320016514172,
            "batch_time": 0.049550452015616676,
            "samples_per_second": 855375.3040212422,
            "samples_per_second_per_gpu": 106921.91300265527,
            "loss_sequences_lower_95": 2.911736362112507,
            "loss_sequences_upper_95": 3.0253127740353953,
            "loss_tokens_lower_95": 2.9571741562500002,
            "loss_tokens_upper_95": 2.9808627552083333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.059366979825465,
            "data_time": 0.012725681066513062,
            "batch_time": 0.04828442260622978,
            "samples_per_second": 865915.2529265918,
            "samples_per_second_per_gpu": 108239.40661582397,
            "loss_sequences_lower_95": 4.980029176093028,
            "loss_sequences_upper_95": 5.163254218994784,
            "loss_tokens_lower_95": 5.047692333333334,
            "loss_tokens_upper_95": 5.07094465625,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.252505753861099,
            "data_time": 0.03608965873718262,
            "batch_time": 0.07235748693346977,
            "samples_per_second": 773442.5694488107,
            "samples_per_second_per_gpu": 96680.32118110133,
            "loss_sequences_lower_95": 3.1123428594870646,
            "loss_sequences_upper_95": 3.4982861128009732,
            "loss_tokens_lower_95": 3.239421419237481,
            "loss_tokens_upper_95": 3.265823814517162,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.8423337539518139,
            "data_time": 0.001631062808775413,
            "batch_time": 0.037061274696591234,
            "samples_per_second": 901883.6643637868,
            "samples_per_second_per_gpu": 112735.45804547335,
            "loss_sequences_lower_95": 1.8328090227086598,
            "loss_sequences_upper_95": 1.8519695725880394,
            "loss_tokens_lower_95": 1.8326943694585884,
            "loss_tokens_upper_95": 1.8521193047531512,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6392784571771104,
            "data_time": 0.0018028880760168574,
            "batch_time": 0.03724201199165575,
            "samples_per_second": 900575.9481617579,
            "samples_per_second_per_gpu": 112571.99352021974,
            "loss_sequences_lower_95": 2.636041808000149,
            "loss_sequences_upper_95": 2.660406108796181,
            "loss_tokens_lower_95": 2.618187271804577,
            "loss_tokens_upper_95": 2.635935095683221,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4025239132672027,
            "data_time": 0.0032371263477062977,
            "batch_time": 0.038786506562947555,
            "samples_per_second": 896910.3340211572,
            "samples_per_second_per_gpu": 112113.79175264465,
            "loss_sequences_lower_95": 3.6509564673162935,
            "loss_sequences_upper_95": 3.935343316843558,
            "loss_tokens_lower_95": 2.8614649880122704,
            "loss_tokens_upper_95": 3.064965320048174,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2362320482892293,
            "data_time": 0.003728363108127675,
            "batch_time": 0.03919358836843612,
            "samples_per_second": 893209.9712952679,
            "samples_per_second_per_gpu": 111651.24641190849,
            "loss_sequences_lower_95": 3.2741977701822917,
            "loss_sequences_upper_95": 3.4648431803385416,
            "loss_tokens_lower_95": 3.0607197572720124,
            "loss_tokens_upper_95": 3.1990652208628143,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.428336710347833,
            "data_time": 0.004418526659544057,
            "batch_time": 0.03988062238621316,
            "samples_per_second": 891038.8484574341,
            "samples_per_second_per_gpu": 111379.85605717926,
            "loss_sequences_lower_95": 2.4709300934138674,
            "loss_sequences_upper_95": 2.5231533973121207,
            "loss_tokens_lower_95": 2.344137568614284,
            "loss_tokens_upper_95": 2.37290461127311,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.034952533245087,
            "data_time": 0.021303221583366394,
            "batch_time": 0.0568755886384419,
            "samples_per_second": 835839.7205283188,
            "samples_per_second_per_gpu": 104479.96506603985,
            "loss_sequences_lower_95": 2.0152550506591798,
            "loss_sequences_upper_95": 2.1099966500022194,
            "loss_tokens_lower_95": 1.974479161417918,
            "loss_tokens_upper_95": 2.0177513061511134,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8134891451621544,
            "data_time": 0.019793089479207993,
            "batch_time": 0.054797668009996414,
            "samples_per_second": 829229.9268234973,
            "samples_per_second_per_gpu": 103653.74085293716,
            "loss_sequences_lower_95": 2.7922328590860173,
            "loss_sequences_upper_95": 2.955827599350287,
            "loss_tokens_lower_95": 2.721444704214624,
            "loss_tokens_upper_95": 2.8059039781863575,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.805474897225698,
            "data_time": 0.01611261642896212,
            "batch_time": 0.0513114944482461,
            "samples_per_second": 837847.7738879496,
            "samples_per_second_per_gpu": 104730.9717359937,
            "loss_sequences_lower_95": 2.7848733622233075,
            "loss_sequences_upper_95": 2.8768811340332032,
            "loss_tokens_lower_95": 2.689666883848212,
            "loss_tokens_upper_95": 2.847407937256161,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.830292223812331,
            "data_time": 0.0012860109505485847,
            "batch_time": 0.03697741558415462,
            "samples_per_second": 901709.8164252423,
            "samples_per_second_per_gpu": 112713.7270531553,
            "loss_sequences_lower_95": 4.833425570376704,
            "loss_sequences_upper_95": 4.916339335693371,
            "loss_tokens_lower_95": 4.6991601067896225,
            "loss_tokens_upper_95": 4.782779044253786,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5453517701389012,
            "data_time": 0.0030022313530813126,
            "batch_time": 0.038463576928081125,
            "samples_per_second": 897531.3515785955,
            "samples_per_second_per_gpu": 112191.41894732443,
            "loss_sequences_lower_95": 3.949504711009838,
            "loss_sequences_upper_95": 4.221248896435053,
            "loss_tokens_lower_95": 2.9719353311176633,
            "loss_tokens_upper_95": 3.094502500833759,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.355996877107604,
            "data_time": 0.0048560677347956474,
            "batch_time": 0.04026816302054637,
            "samples_per_second": 887214.1901659805,
            "samples_per_second_per_gpu": 110901.77377074756,
            "loss_sequences_lower_95": 3.676599058600416,
            "loss_sequences_upper_95": 3.9811957557860493,
            "loss_tokens_lower_95": 3.0149366951788883,
            "loss_tokens_upper_95": 3.1577254003307518,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.935613096576848,
            "data_time": 0.02306036651134491,
            "batch_time": 0.05976318461554391,
            "samples_per_second": 815208.4055920467,
            "samples_per_second_per_gpu": 101901.05069900583,
            "loss_sequences_lower_95": 5.848768901389484,
            "loss_sequences_upper_95": 6.019394157792879,
            "loss_tokens_lower_95": 5.84904459078018,
            "loss_tokens_upper_95": 6.017649987625749,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9566261768341064,
            "data_time": 0.04969447392683763,
            "batch_time": 0.0852749622785128,
            "samples_per_second": 756768.3632087442,
            "samples_per_second_per_gpu": 94596.04540109303,
            "loss_sequences_lower_95": 2.8195234909057616,
            "loss_sequences_upper_95": 3.161761795043945,
            "loss_tokens_lower_95": 2.6799475326947535,
            "loss_tokens_upper_95": 3.101107957324743,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8058919873671986,
            "data_time": 0.0033085938611644907,
            "batch_time": 0.03883360890035493,
            "samples_per_second": 896892.6955416908,
            "samples_per_second_per_gpu": 112111.58694271135,
            "loss_sequences_lower_95": 2.7680773138633894,
            "loss_sequences_upper_95": 2.8442389372061316,
            "loss_tokens_lower_95": 2.76801414216431,
            "loss_tokens_upper_95": 2.8435052971268875,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.3241110062813974,
            "data_time": 0.004643477195627732,
            "batch_time": 0.04013367617110833,
            "samples_per_second": 892259.3226000866,
            "samples_per_second_per_gpu": 111532.41532501082,
            "loss_sequences_lower_95": 2.284933289224061,
            "loss_sequences_upper_95": 2.3645482818578523,
            "loss_tokens_lower_95": 2.28409686764179,
            "loss_tokens_upper_95": 2.3650404821547397,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0171918793134513,
            "data_time": 0.0034161553784529024,
            "batch_time": 0.03880170689288277,
            "samples_per_second": 893971.5331760497,
            "samples_per_second_per_gpu": 111746.44164700621,
            "loss_sequences_lower_95": 3.147649621807841,
            "loss_sequences_upper_95": 3.2784148804640743,
            "loss_tokens_lower_95": 2.8653020833985203,
            "loss_tokens_upper_95": 2.9214977660970862,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.912089841365814,
            "data_time": 0.01044955663383007,
            "batch_time": 0.04573289677500725,
            "samples_per_second": 864425.5844178247,
            "samples_per_second_per_gpu": 108053.1980522281,
            "loss_sequences_lower_95": 5.085366369628907,
            "loss_sequences_upper_95": 5.636284997558594,
            "loss_tokens_lower_95": 4.35430310373666,
            "loss_tokens_upper_95": 4.7098478184718315,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.287021577358246,
            "data_time": 0.14743679761886597,
            "batch_time": 0.1868254542350769,
            "samples_per_second": 475101.4176103908,
            "samples_per_second_per_gpu": 59387.67720129885,
            "loss_sequences_lower_95": 3.0808587312698363,
            "loss_sequences_upper_95": 3.52662478685379,
            "loss_tokens_lower_95": 2.8505222189015353,
            "loss_tokens_upper_95": 3.6337276677975705,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.115439813027437,
            "data_time": 0.027538938725248298,
            "batch_time": 0.062478648855331095,
            "samples_per_second": 791263.6067845798,
            "samples_per_second_per_gpu": 98907.95084807248,
            "loss_sequences_lower_95": 4.422405576157844,
            "loss_sequences_upper_95": 5.035358858656609,
            "loss_tokens_lower_95": 3.0823952245873554,
            "loss_tokens_upper_95": 3.4435745944568246,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.172921055005908,
            "data_time": 0.0028628346820672355,
            "batch_time": 0.038222369427482285,
            "samples_per_second": 895997.5056228276,
            "samples_per_second_per_gpu": 111999.68820285345,
            "loss_sequences_lower_95": 2.1503985017082377,
            "loss_sequences_upper_95": 2.196275863547469,
            "loss_tokens_lower_95": 2.1499643921591716,
            "loss_tokens_upper_95": 2.1965391919830575,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.1378932949533755,
            "data_time": 0.0023004693211535867,
            "batch_time": 0.0377421962877184,
            "samples_per_second": 900052.1459573642,
            "samples_per_second_per_gpu": 112506.51824467053,
            "loss_sequences_lower_95": 2.1146315234299196,
            "loss_sequences_upper_95": 2.2379160576969728,
            "loss_tokens_lower_95": 2.013579105122699,
            "loss_tokens_upper_95": 2.1346911767951946,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8976515459490346,
            "data_time": 0.01801049874888526,
            "batch_time": 0.05292564299371508,
            "samples_per_second": 827046.2290898903,
            "samples_per_second_per_gpu": 103380.77863623628,
            "loss_sequences_lower_95": 2.761747646681118,
            "loss_sequences_upper_95": 3.1675257644373853,
            "loss_tokens_lower_95": 2.651346906437905,
            "loss_tokens_upper_95": 2.9302332931927904,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.356196435352151,
            "data_time": 0.00459795854985714,
            "batch_time": 0.03989569991827011,
            "samples_per_second": 890635.4559673347,
            "samples_per_second_per_gpu": 111329.43199591684,
            "loss_sequences_lower_95": 3.399157599228616,
            "loss_sequences_upper_95": 3.5518260157945685,
            "loss_tokens_lower_95": 3.2063037035504918,
            "loss_tokens_upper_95": 3.3454723384444867,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.461761072641466,
            "data_time": 0.031082749366760254,
            "batch_time": 0.06710843812851679,
            "samples_per_second": 808011.2798267072,
            "samples_per_second_per_gpu": 101001.4099783384,
            "loss_sequences_lower_95": 2.31780636019823,
            "loss_sequences_upper_95": 2.7266013308269224,
            "loss_tokens_lower_95": 2.2269410772774774,
            "loss_tokens_upper_95": 2.5614526048570045,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.267191681773168,
            "data_time": 0.0018885079927554952,
            "batch_time": 0.03732542876381899,
            "samples_per_second": 899544.5936312181,
            "samples_per_second_per_gpu": 112443.07420390226,
            "loss_sequences_lower_95": 5.257659324833718,
            "loss_sequences_upper_95": 5.276417060755901,
            "loss_tokens_lower_95": 5.257656912241823,
            "loss_tokens_upper_95": 5.276474523420309,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.2821320191170404,
            "data_time": 0.04662916443564675,
            "batch_time": 0.0828101244839755,
            "samples_per_second": 733937.6239386257,
            "samples_per_second_per_gpu": 91742.20299232821,
            "loss_sequences_lower_95": 1.223949976800715,
            "loss_sequences_upper_95": 1.4248405938009614,
            "loss_tokens_lower_95": 1.0892076349795292,
            "loss_tokens_upper_95": 1.341225874558198,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.707527723974027,
            "data_time": 0.0011865311526292927,
            "batch_time": 0.03659041445742908,
            "samples_per_second": 903015.3948299707,
            "samples_per_second_per_gpu": 112876.92435374633,
            "loss_sequences_lower_95": 5.091362703911163,
            "loss_sequences_upper_95": 5.1358035041601156,
            "loss_tokens_lower_95": 4.118870696324952,
            "loss_tokens_upper_95": 4.164966972920697,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.5184986481666565,
            "data_time": 0.005390673402755979,
            "batch_time": 0.04073096646202935,
            "samples_per_second": 888186.3476702077,
            "samples_per_second_per_gpu": 111023.29345877597,
            "loss_sequences_lower_95": 4.5065433715820316,
            "loss_sequences_upper_95": 4.657114172363281,
            "loss_tokens_lower_95": 4.371271247598245,
            "loss_tokens_upper_95": 4.523307083967759,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.4196958282719487,
            "data_time": 0.02112793114225743,
            "batch_time": 0.05707739975492833,
            "samples_per_second": 824808.0093422682,
            "samples_per_second_per_gpu": 103101.00116778353,
            "loss_sequences_lower_95": 2.340067238185717,
            "loss_sequences_upper_95": 2.498883414890455,
            "loss_tokens_lower_95": 2.3418275849715524,
            "loss_tokens_upper_95": 2.4988209401006283,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.067002602418264,
            "data_time": 0.004563180078943092,
            "batch_time": 0.04001481597682079,
            "samples_per_second": 890823.4113832047,
            "samples_per_second_per_gpu": 111352.9264229006,
            "loss_sequences_lower_95": 6.003234798546994,
            "loss_sequences_upper_95": 6.12963652639678,
            "loss_tokens_lower_95": 6.003488806522253,
            "loss_tokens_upper_95": 6.130538505785394,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.2919860973358155,
            "data_time": 0.00396149050682149,
            "batch_time": 0.039445837127401476,
            "samples_per_second": 894548.1120466365,
            "samples_per_second_per_gpu": 111818.51400582957,
            "loss_sequences_lower_95": 1.3320174458821614,
            "loss_sequences_upper_95": 1.3808883138020833,
            "loss_tokens_lower_95": 1.2123912357911915,
            "loss_tokens_upper_95": 1.2797479812237393,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.678067562125978,
            "data_time": 0.023318976163864136,
            "batch_time": 0.05836739071777889,
            "samples_per_second": 804526.9383026854,
            "samples_per_second_per_gpu": 100565.86728783567,
            "loss_sequences_lower_95": 5.360711597260975,
            "loss_sequences_upper_95": 5.991141124906994,
            "loss_tokens_lower_95": 5.364230768112909,
            "loss_tokens_upper_95": 6.003749273390997,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.802602980285883,
            "data_time": 0.1533307284116745,
            "batch_time": 0.1917533129453659,
            "samples_per_second": 459692.192874951,
            "samples_per_second_per_gpu": 57461.524109368875,
            "loss_sequences_lower_95": 1.6436559498310088,
            "loss_sequences_upper_95": 2.286685609817505,
            "loss_tokens_lower_95": 1.3989191114288015,
            "loss_tokens_upper_95": 1.8138451991130395,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.522397634983062,
            "data_time": 0.005550437976443578,
            "batch_time": 0.041201553647480314,
            "samples_per_second": 882943.7966954276,
            "samples_per_second_per_gpu": 110367.97458692845,
            "loss_sequences_lower_95": 7.476974279785157,
            "loss_sequences_upper_95": 7.784722717285156,
            "loss_tokens_lower_95": 7.2244390548990065,
            "loss_tokens_upper_95": 7.504382634041878,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.162466847419739,
            "data_time": 0.005537148032869611,
            "batch_time": 0.04102889744062272,
            "samples_per_second": 885351.9958634021,
            "samples_per_second_per_gpu": 110668.99948292527,
            "loss_sequences_lower_95": 7.254056079101563,
            "loss_sequences_upper_95": 7.4612439453125,
            "loss_tokens_lower_95": 6.915702227657893,
            "loss_tokens_upper_95": 7.1270850367347895,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.605925106019171,
            "data_time": 0.0036239509199773984,
            "batch_time": 0.039250855621286856,
            "samples_per_second": 889020.9428010796,
            "samples_per_second_per_gpu": 111127.61785013495,
            "loss_sequences_lower_95": 5.5875061546681195,
            "loss_sequences_upper_95": 5.623395317607065,
            "loss_tokens_lower_95": 5.587839128760895,
            "loss_tokens_upper_95": 5.623698713281512,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.041859838087255,
            "data_time": 0.008723297147952537,
            "batch_time": 0.04400921084009268,
            "samples_per_second": 874194.7581714428,
            "samples_per_second_per_gpu": 109274.34477143035,
            "loss_sequences_lower_95": 1.9884720809631815,
            "loss_sequences_upper_95": 2.098016001149073,
            "loss_tokens_lower_95": 1.9865749798612111,
            "loss_tokens_upper_95": 2.0973736787904427,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.263151585102081,
            "data_time": 0.005719604473265391,
            "batch_time": 0.041121191448635526,
            "samples_per_second": 888524.899268191,
            "samples_per_second_per_gpu": 111065.61240852387,
            "loss_sequences_lower_95": 7.178966613769531,
            "loss_sequences_upper_95": 7.347966809082031,
            "loss_tokens_lower_95": 7.180570568847656,
            "loss_tokens_upper_95": 7.348219860839844,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5991657349898665,
            "data_time": 0.0016834093627497019,
            "batch_time": 0.037105897919667466,
            "samples_per_second": 900671.4689989855,
            "samples_per_second_per_gpu": 112583.93362487319,
            "loss_sequences_lower_95": 3.0429681956598866,
            "loss_sequences_upper_95": 3.1171208867963576,
            "loss_tokens_lower_95": 2.063186538892207,
            "loss_tokens_upper_95": 2.114833714854914,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5034325256276486,
            "data_time": 0.018766776153019495,
            "batch_time": 0.05397876160485404,
            "samples_per_second": 828560.1858285478,
            "samples_per_second_per_gpu": 103570.02322856848,
            "loss_sequences_lower_95": 2.418644344272898,
            "loss_sequences_upper_95": 2.5928366703773613,
            "loss_tokens_lower_95": 2.4179988234790404,
            "loss_tokens_upper_95": 2.5909672751355526,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.3501599208981383,
            "data_time": 0.010079793632030487,
            "batch_time": 0.04578836169093847,
            "samples_per_second": 873374.4850904413,
            "samples_per_second_per_gpu": 109171.81063630516,
            "loss_sequences_lower_95": 2.3044941741344975,
            "loss_sequences_upper_95": 2.396454479741115,
            "loss_tokens_lower_95": 2.305655637254902,
            "loss_tokens_upper_95": 2.395417085535386,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6625204390785413,
            "data_time": 0.002039077073879676,
            "batch_time": 0.03745280602300844,
            "samples_per_second": 899234.0972887975,
            "samples_per_second_per_gpu": 112404.26216109969,
            "loss_sequences_lower_95": 4.268051335161437,
            "loss_sequences_upper_95": 4.374353729213642,
            "loss_tokens_lower_95": 2.882757608860029,
            "loss_tokens_upper_95": 2.958797841072607,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.914716662553253,
            "data_time": 0.026870240767796833,
            "batch_time": 0.06313108652830124,
            "samples_per_second": 810515.803007386,
            "samples_per_second_per_gpu": 101314.47537592326,
            "loss_sequences_lower_95": 5.8289401907138725,
            "loss_sequences_upper_95": 5.998917320291832,
            "loss_tokens_lower_95": 5.830231423352761,
            "loss_tokens_upper_95": 5.9964925776083,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.353436826420122,
            "data_time": 0.0031426002399970905,
            "batch_time": 0.038574288616250285,
            "samples_per_second": 895611.0879375415,
            "samples_per_second_per_gpu": 111951.38599219269,
            "loss_sequences_lower_95": 3.3257531252986428,
            "loss_sequences_upper_95": 3.3813265123279814,
            "loss_tokens_lower_95": 3.3248151176056004,
            "loss_tokens_upper_95": 3.3818632857296445,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.476271661161219,
            "data_time": 0.022927795756946912,
            "batch_time": 0.057897090911865234,
            "samples_per_second": 804063.3474269361,
            "samples_per_second_per_gpu": 100507.91842836702,
            "loss_sequences_lower_95": 2.393993992481417,
            "loss_sequences_upper_95": 2.561583909710634,
            "loss_tokens_lower_95": 2.393513029524424,
            "loss_tokens_upper_95": 2.5608925902727737,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.736733288069566,
            "data_time": 0.07941620796918869,
            "batch_time": 0.11552654951810837,
            "samples_per_second": 641765.2572633044,
            "samples_per_second_per_gpu": 80220.65715791305,
            "loss_sequences_lower_95": 1.573755391438802,
            "loss_sequences_upper_95": 2.0450571537017823,
            "loss_tokens_lower_95": 1.39391012456682,
            "loss_tokens_upper_95": 1.8892616589864095,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.7123623500267664,
            "data_time": 0.08256105333566666,
            "batch_time": 0.1207827627658844,
            "samples_per_second": 606239.7713938503,
            "samples_per_second_per_gpu": 75779.97142423129,
            "loss_sequences_lower_95": 1.5940177790323893,
            "loss_sequences_upper_95": 2.1479513486226396,
            "loss_tokens_lower_95": 1.300259341550677,
            "loss_tokens_upper_95": 1.933200107531601,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4650644480743185,
            "data_time": 0.003209690124305155,
            "batch_time": 0.03864605038077639,
            "samples_per_second": 897107.7550399968,
            "samples_per_second_per_gpu": 112138.4693799996,
            "loss_sequences_lower_95": 3.4373301658574187,
            "loss_sequences_upper_95": 3.4940083360180414,
            "loss_tokens_lower_95": 3.4368066190514543,
            "loss_tokens_upper_95": 3.493377611837261,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.47358673902460685,
            "data_time": 0.001070813722174851,
            "batch_time": 0.036529148718916875,
            "samples_per_second": 902241.7819646735,
            "samples_per_second_per_gpu": 112780.2227455842,
            "loss_sequences_lower_95": 0.5417941853292971,
            "loss_sequences_upper_95": 0.5537649028793468,
            "loss_tokens_lower_95": 0.3977097884322948,
            "loss_tokens_upper_95": 0.4045739477147947,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8114703170896513,
            "data_time": 0.04060855135321617,
            "batch_time": 0.07717163488268852,
            "samples_per_second": 792291.5738914058,
            "samples_per_second_per_gpu": 99036.44673642573,
            "loss_sequences_lower_95": 3.8843468733659883,
            "loss_sequences_upper_95": 4.266366036482683,
            "loss_tokens_lower_95": 3.470840104022737,
            "loss_tokens_upper_95": 3.7429106504083762,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.006307460166313,
            "data_time": 0.11414112363542829,
            "batch_time": 0.15104290417262486,
            "samples_per_second": 497734.5294033489,
            "samples_per_second_per_gpu": 62216.816175418615,
            "loss_sequences_lower_95": 6.550508550695471,
            "loss_sequences_upper_95": 7.653297362456451,
            "loss_tokens_lower_95": 5.869459957546658,
            "loss_tokens_upper_95": 7.98084178029755,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6789886369937803,
            "data_time": 0.030425392446063813,
            "batch_time": 0.06614371992292858,
            "samples_per_second": 807356.4991267086,
            "samples_per_second_per_gpu": 100919.56239083858,
            "loss_sequences_lower_95": 3.6917005957626716,
            "loss_sequences_upper_95": 4.0109471949135385,
            "loss_tokens_lower_95": 3.3230879415475187,
            "loss_tokens_upper_95": 3.544067781158011,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.820747420555208,
            "data_time": 0.03126902807326544,
            "batch_time": 0.06715556269600279,
            "samples_per_second": 808310.1241256875,
            "samples_per_second_per_gpu": 101038.76551571094,
            "loss_sequences_lower_95": 3.8171047303734755,
            "loss_sequences_upper_95": 4.097476494021532,
            "loss_tokens_lower_95": 3.5014253895713616,
            "loss_tokens_upper_95": 3.694318105814618,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7468885561315024,
            "data_time": 0.02881608974365961,
            "batch_time": 0.06458700554711479,
            "samples_per_second": 805942.8325141547,
            "samples_per_second_per_gpu": 100742.85406426934,
            "loss_sequences_lower_95": 3.7708768425918207,
            "loss_sequences_upper_95": 4.12075336735423,
            "loss_tokens_lower_95": 3.3508716620543417,
            "loss_tokens_upper_95": 3.633821436754241,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.028788101382371,
            "data_time": 0.03053517284847441,
            "batch_time": 0.06672540448960804,
            "samples_per_second": 794990.2982098451,
            "samples_per_second_per_gpu": 99373.78727623064,
            "loss_sequences_lower_95": 4.013625261260242,
            "loss_sequences_upper_95": 4.296630868679139,
            "loss_tokens_lower_95": 3.720476990696797,
            "loss_tokens_upper_95": 3.9066150415723566,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5509141927920513,
            "data_time": 0.030372375323448653,
            "batch_time": 0.06635651765046297,
            "samples_per_second": 816385.6846979775,
            "samples_per_second_per_gpu": 102048.21058724719,
            "loss_sequences_lower_95": 3.4965863624714917,
            "loss_sequences_upper_95": 3.740141504891911,
            "loss_tokens_lower_95": 3.2955562893024277,
            "loss_tokens_upper_95": 3.4625403878144194,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8073793875008093,
            "data_time": 0.0304232466788519,
            "batch_time": 0.06740666287285942,
            "samples_per_second": 794167.127605083,
            "samples_per_second_per_gpu": 99270.89095063538,
            "loss_sequences_lower_95": 2.8369300563160964,
            "loss_sequences_upper_95": 3.0839965587709006,
            "loss_tokens_lower_95": 2.5560734888298673,
            "loss_tokens_upper_95": 2.6664907147095387,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-4.0/params.txt",
    "uuid": "d819d0b6-ddf0-4342-956c-8c09dedbaed5",
    "creation_date": "2023_12_14-05_25_55"
}