{
    "name": "rpj-d=1024_l=24_h=8-2.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 16464650240,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 2.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "3292930048",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=1024_l=24_h=8-2.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 2.5195374578237533,
            "data_time": 0.041252829134464264,
            "batch_time": 0.44617848843336105,
            "samples_per_second": 684862.9631183672,
            "samples_per_second_per_gpu": 85607.8703897959,
            "loss_sequences_lower_95": 2.454534174601237,
            "loss_sequences_upper_95": 2.5819957288106283,
            "loss_tokens_lower_95": 2.5083411089579264,
            "loss_tokens_upper_95": 2.5306427574157717,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.048349079376072,
            "data_time": 0.0011537009356030463,
            "batch_time": 0.03695736286362668,
            "samples_per_second": 893752.2806496789,
            "samples_per_second_per_gpu": 111719.03508120986,
            "loss_sequences_lower_95": 3.045712444962588,
            "loss_sequences_upper_95": 3.0509666060409533,
            "loss_tokens_lower_95": 3.0379578020833335,
            "loss_tokens_upper_95": 3.0585749322916667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.530409661117865,
            "data_time": 0.009511533737182618,
            "batch_time": 0.04470546436309814,
            "samples_per_second": 871761.3617536364,
            "samples_per_second_per_gpu": 108970.17021920455,
            "loss_sequences_lower_95": 2.5062483931560906,
            "loss_sequences_upper_95": 2.5543811159717795,
            "loss_tokens_lower_95": 2.5195604166666667,
            "loss_tokens_upper_95": 2.5414358020833334,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8879596993357866,
            "data_time": 0.0016172513561813456,
            "batch_time": 0.03691462348950537,
            "samples_per_second": 905646.1068300981,
            "samples_per_second_per_gpu": 113205.76335376226,
            "loss_sequences_lower_95": 2.8758393705702323,
            "loss_sequences_upper_95": 2.899607089642397,
            "loss_tokens_lower_95": 2.877618088541667,
            "loss_tokens_upper_95": 2.8981208645833334,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.048655947939684,
            "data_time": 0.009196425814077674,
            "batch_time": 0.044613027002706945,
            "samples_per_second": 864354.3272314706,
            "samples_per_second_per_gpu": 108044.29090393383,
            "loss_sequences_lower_95": 3.0146620554253913,
            "loss_sequences_upper_95": 3.081848442870099,
            "loss_tokens_lower_95": 3.0381406145833334,
            "loss_tokens_upper_95": 3.0588784375,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.838382432608277,
            "data_time": 0.0036441878132198167,
            "batch_time": 0.03915195717759754,
            "samples_per_second": 897411.8378149864,
            "samples_per_second_per_gpu": 112176.4797268733,
            "loss_sequences_lower_95": 2.7980921762804853,
            "loss_sequences_upper_95": 2.878372775349465,
            "loss_tokens_lower_95": 2.8277907916666667,
            "loss_tokens_upper_95": 2.848930765625,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.5671476445757613,
            "data_time": 0.0017141490737143388,
            "batch_time": 0.03711991345123988,
            "samples_per_second": 906627.5629637024,
            "samples_per_second_per_gpu": 113328.4453704628,
            "loss_sequences_lower_95": 1.5462014733139349,
            "loss_sequences_upper_95": 1.5883993467992665,
            "loss_tokens_lower_95": 1.5578848151041667,
            "loss_tokens_upper_95": 1.5767179817708334,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4593143519556335,
            "data_time": 0.001810325034638919,
            "batch_time": 0.03789606606895482,
            "samples_per_second": 906576.9553460801,
            "samples_per_second_per_gpu": 113322.11941826002,
            "loss_sequences_lower_95": 3.45092233516034,
            "loss_sequences_upper_95": 3.467622801456152,
            "loss_tokens_lower_95": 3.4488557239583333,
            "loss_tokens_upper_95": 3.469545458333333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.22225870446461,
            "data_time": 0.009051691918146042,
            "batch_time": 0.04422230758364239,
            "samples_per_second": 868396.3228051887,
            "samples_per_second_per_gpu": 108549.54035064859,
            "loss_sequences_lower_95": 3.179641028923717,
            "loss_sequences_upper_95": 3.2686073675388245,
            "loss_tokens_lower_95": 3.2115960364583334,
            "loss_tokens_upper_95": 3.2330813958333335,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.848172487006357,
            "data_time": 0.008962620981037617,
            "batch_time": 0.04437888041138649,
            "samples_per_second": 874682.9681604069,
            "samples_per_second_per_gpu": 109335.37102005086,
            "loss_sequences_lower_95": 3.813847483759341,
            "loss_sequences_upper_95": 3.8783200290363298,
            "loss_tokens_lower_95": 3.83634625,
            "loss_tokens_upper_95": 3.8602401354166664,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9725666921465823,
            "data_time": 0.0012766702693271193,
            "batch_time": 0.036648961673798254,
            "samples_per_second": 907331.602234774,
            "samples_per_second_per_gpu": 113416.45027934675,
            "loss_sequences_lower_95": 2.9643972506894953,
            "loss_sequences_upper_95": 2.9806524125389715,
            "loss_tokens_lower_95": 2.9623470052083336,
            "loss_tokens_upper_95": 2.9827975416666663,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.879190230061739,
            "data_time": 0.002575070732936176,
            "batch_time": 0.03801229990689979,
            "samples_per_second": 903510.4726207231,
            "samples_per_second_per_gpu": 112938.80907759038,
            "loss_sequences_lower_95": 2.869070367205396,
            "loss_sequences_upper_95": 2.889035920091889,
            "loss_tokens_lower_95": 2.8691617135416667,
            "loss_tokens_upper_95": 2.889329421875,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4024189065000834,
            "data_time": 0.009480141839491049,
            "batch_time": 0.04505654757201907,
            "samples_per_second": 861812.4135148148,
            "samples_per_second_per_gpu": 107726.55168935185,
            "loss_sequences_lower_95": 3.3679128526915885,
            "loss_sequences_upper_95": 3.4354205953663794,
            "loss_tokens_lower_95": 3.3915553072916667,
            "loss_tokens_upper_95": 3.4132397395833336,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.783314483228866,
            "data_time": 0.009472384395827335,
            "batch_time": 0.04483748812124549,
            "samples_per_second": 868929.2395352079,
            "samples_per_second_per_gpu": 108616.15494190098,
            "loss_sequences_lower_95": 2.7220729229649314,
            "loss_sequences_upper_95": 2.8427123526206084,
            "loss_tokens_lower_95": 2.7725258072916668,
            "loss_tokens_upper_95": 2.7940455833333333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.50828757611188,
            "data_time": 0.0788647106715611,
            "batch_time": 0.11479783058166504,
            "samples_per_second": 510328.54112908896,
            "samples_per_second_per_gpu": 63791.06764113612,
            "loss_sequences_lower_95": 3.4489026763222435,
            "loss_sequences_upper_95": 3.5674353686246008,
            "loss_tokens_lower_95": 3.489087148146196,
            "loss_tokens_upper_95": 3.5281567746942692,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.469745397915298,
            "data_time": 0.012883277101950212,
            "batch_time": 0.048367218537764115,
            "samples_per_second": 851131.9817949976,
            "samples_per_second_per_gpu": 106391.4977243747,
            "loss_sequences_lower_95": 2.380063471224148,
            "loss_sequences_upper_95": 2.5587605156634363,
            "loss_tokens_lower_95": 2.4593643541666665,
            "loss_tokens_upper_95": 2.4799071041666667,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.421876896025323,
            "data_time": 0.012208116551240286,
            "batch_time": 0.047981309394041695,
            "samples_per_second": 863254.9492779461,
            "samples_per_second_per_gpu": 107906.86865974327,
            "loss_sequences_lower_95": 5.366670802778178,
            "loss_sequences_upper_95": 5.47279142918247,
            "loss_tokens_lower_95": 5.410195802083334,
            "loss_tokens_upper_95": 5.433485729166667,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0508869671430743,
            "data_time": 0.03499838709831238,
            "batch_time": 0.07422986254096031,
            "samples_per_second": 767993.7222213288,
            "samples_per_second_per_gpu": 95999.2152776661,
            "loss_sequences_lower_95": 3.002745725287766,
            "loss_sequences_upper_95": 3.0927615431488538,
            "loss_tokens_lower_95": 3.0393319552061984,
            "loss_tokens_upper_95": 3.0624209919913867,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.395313499201704,
            "data_time": 0.0016117170622788693,
            "batch_time": 0.03712448980107666,
            "samples_per_second": 899302.4559144174,
            "samples_per_second_per_gpu": 112412.80698930217,
            "loss_sequences_lower_95": 3.3763382007993874,
            "loss_sequences_upper_95": 3.4148111469163935,
            "loss_tokens_lower_95": 3.37614471040005,
            "loss_tokens_upper_95": 3.4144256196254985,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8134409081672347,
            "data_time": 0.0016722617445478014,
            "batch_time": 0.03715005061429018,
            "samples_per_second": 898723.0835587407,
            "samples_per_second_per_gpu": 112340.38544484258,
            "loss_sequences_lower_95": 2.804970398739357,
            "loss_sequences_upper_95": 2.829435375329864,
            "loss_tokens_lower_95": 2.79579772117997,
            "loss_tokens_upper_95": 2.8143167672329406,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.309938042254368,
            "data_time": 0.003096675782918256,
            "batch_time": 0.03855990329853888,
            "samples_per_second": 898100.7879904683,
            "samples_per_second_per_gpu": 112262.59849880854,
            "loss_sequences_lower_95": 3.568565037817002,
            "loss_sequences_upper_95": 3.85717319638086,
            "loss_tokens_lower_95": 2.761270117606326,
            "loss_tokens_upper_95": 2.963617898806067,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4426929376025996,
            "data_time": 0.003326098018504204,
            "batch_time": 0.03880895991274651,
            "samples_per_second": 892687.6090762019,
            "samples_per_second_per_gpu": 111585.95113452524,
            "loss_sequences_lower_95": 3.505656404622396,
            "loss_sequences_upper_95": 3.7064668538411456,
            "loss_tokens_lower_95": 3.213412379618711,
            "loss_tokens_upper_95": 3.3530298742138362,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.4626877366637894,
            "data_time": 0.0041774865548894775,
            "batch_time": 0.03960589226194816,
            "samples_per_second": 891750.0652310402,
            "samples_per_second_per_gpu": 111468.75815388003,
            "loss_sequences_lower_95": 2.5030164277580673,
            "loss_sequences_upper_95": 2.55616493948126,
            "loss_tokens_lower_95": 2.380646077087971,
            "loss_tokens_upper_95": 2.4096095161609075,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.8922492693771016,
            "data_time": 0.022273985402924672,
            "batch_time": 0.0582655668258667,
            "samples_per_second": 829627.5240666588,
            "samples_per_second_per_gpu": 103703.44050833235,
            "loss_sequences_lower_95": 1.8755238376964223,
            "loss_sequences_upper_95": 1.9709695261174982,
            "loss_tokens_lower_95": 1.8301232655509927,
            "loss_tokens_upper_95": 1.871857997990779,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9151345817410217,
            "data_time": 0.01936587132513523,
            "batch_time": 0.054766278713941574,
            "samples_per_second": 824254.2318597194,
            "samples_per_second_per_gpu": 103031.77898246492,
            "loss_sequences_lower_95": 2.89822547134088,
            "loss_sequences_upper_95": 3.0776325708506063,
            "loss_tokens_lower_95": 2.804030042685765,
            "loss_tokens_upper_95": 2.8911936344257785,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8853293887774147,
            "data_time": 0.016138168481680062,
            "batch_time": 0.051590778888800204,
            "samples_per_second": 831249.1157544373,
            "samples_per_second_per_gpu": 103906.13946930466,
            "loss_sequences_lower_95": 2.865590845743815,
            "loss_sequences_upper_95": 2.957766418457031,
            "loss_tokens_lower_95": 2.758359937821095,
            "loss_tokens_upper_95": 2.9400922513568064,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.603593629590996,
            "data_time": 0.0015044139602641612,
            "batch_time": 0.036959671596410344,
            "samples_per_second": 900453.7996908871,
            "samples_per_second_per_gpu": 112556.72496136089,
            "loss_sequences_lower_95": 4.606441006609542,
            "loss_sequences_upper_95": 4.6844239026130605,
            "loss_tokens_lower_95": 4.4776791380231975,
            "loss_tokens_upper_95": 4.556161185959076,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8404608710546686,
            "data_time": 0.002982084183084885,
            "batch_time": 0.03842102621225703,
            "samples_per_second": 896212.2246804787,
            "samples_per_second_per_gpu": 112026.52808505984,
            "loss_sequences_lower_95": 4.338740986525411,
            "loss_sequences_upper_95": 4.648116068727641,
            "loss_tokens_lower_95": 3.155659483477378,
            "loss_tokens_upper_95": 3.2864786950162492,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.534348283306324,
            "data_time": 0.004848394039514902,
            "batch_time": 0.040208701749105714,
            "samples_per_second": 885553.7391647584,
            "samples_per_second_per_gpu": 110694.2173955948,
            "loss_sequences_lower_95": 3.921039807837164,
            "loss_sequences_upper_95": 4.262521029006906,
            "loss_tokens_lower_95": 3.1497975027226524,
            "loss_tokens_upper_95": 3.2956630651394603,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.655358264435372,
            "data_time": 0.022338588322911943,
            "batch_time": 0.05816170786108289,
            "samples_per_second": 826406.0313037056,
            "samples_per_second_per_gpu": 103300.7539129632,
            "loss_sequences_lower_95": 5.573450780023723,
            "loss_sequences_upper_95": 5.736954083725742,
            "loss_tokens_lower_95": 5.57243163226402,
            "loss_tokens_upper_95": 5.737304102231378,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0674666213989257,
            "data_time": 0.045611133942237266,
            "batch_time": 0.08129349121680626,
            "samples_per_second": 752145.4925330443,
            "samples_per_second_per_gpu": 94018.18656663054,
            "loss_sequences_lower_95": 2.9331505813598633,
            "loss_sequences_upper_95": 3.268351089477539,
            "loss_tokens_lower_95": 2.7853127904356274,
            "loss_tokens_upper_95": 3.218904323953209,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2148217925517804,
            "data_time": 0.0032702352859247438,
            "batch_time": 0.03892259997580436,
            "samples_per_second": 891692.0595419067,
            "samples_per_second_per_gpu": 111461.50744273834,
            "loss_sequences_lower_95": 3.1719476174673744,
            "loss_sequences_upper_95": 3.2567236190686573,
            "loss_tokens_lower_95": 3.17215162607152,
            "loss_tokens_upper_95": 3.258073451843574,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.326518661364681,
            "data_time": 0.004815142461954089,
            "batch_time": 0.04037618092572708,
            "samples_per_second": 886665.8998632398,
            "samples_per_second_per_gpu": 110833.23748290498,
            "loss_sequences_lower_95": 4.252161174402385,
            "loss_sequences_upper_95": 4.399023257543765,
            "loss_tokens_lower_95": 4.250473834763385,
            "loss_tokens_upper_95": 4.401125366310913,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.229857029670989,
            "data_time": 0.0033771896465293275,
            "batch_time": 0.03873567215567274,
            "samples_per_second": 892402.4218640267,
            "samples_per_second_per_gpu": 111550.30273300334,
            "loss_sequences_lower_95": 3.3867231333735717,
            "loss_sequences_upper_95": 3.5237985438698654,
            "loss_tokens_lower_95": 3.041524061342274,
            "loss_tokens_upper_95": 3.0952269612505945,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.076756331443787,
            "data_time": 0.009758534841239452,
            "batch_time": 0.045329274609684944,
            "samples_per_second": 859270.8780779333,
            "samples_per_second_per_gpu": 107408.85975974167,
            "loss_sequences_lower_95": 5.272140612792969,
            "loss_sequences_upper_95": 5.831369165039063,
            "loss_tokens_lower_95": 4.497906982906602,
            "loss_tokens_upper_95": 4.859966592643531,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4067366123199463,
            "data_time": 0.13974155485630035,
            "batch_time": 0.1795002520084381,
            "samples_per_second": 473606.146892597,
            "samples_per_second_per_gpu": 59200.76836157463,
            "loss_sequences_lower_95": 3.1945583879947663,
            "loss_sequences_upper_95": 3.6438523054122927,
            "loss_tokens_lower_95": 2.9345513749396663,
            "loss_tokens_upper_95": 3.7767553746015174,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.3891269852375165,
            "data_time": 0.026408575950784885,
            "batch_time": 0.061200370179845934,
            "samples_per_second": 784477.6719166462,
            "samples_per_second_per_gpu": 98059.70898958077,
            "loss_sequences_lower_95": 4.8632925274728365,
            "loss_sequences_upper_95": 5.671901632725508,
            "loss_tokens_lower_95": 3.059649100835858,
            "loss_tokens_upper_95": 3.5013011429194263,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.1039162542998087,
            "data_time": 0.0029035605904128817,
            "batch_time": 0.038344072798887886,
            "samples_per_second": 892737.5845978385,
            "samples_per_second_per_gpu": 111592.19807472981,
            "loss_sequences_lower_95": 2.0748373355757153,
            "loss_sequences_upper_95": 2.132809812213303,
            "loss_tokens_lower_95": 2.074072180298438,
            "loss_tokens_upper_95": 2.1333835640136933,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.1701585914157606,
            "data_time": 0.0023543318436503825,
            "batch_time": 0.037840205104858964,
            "samples_per_second": 898053.9725376336,
            "samples_per_second_per_gpu": 112256.7465672042,
            "loss_sequences_lower_95": 2.1460078151531876,
            "loss_sequences_upper_95": 2.272297974935717,
            "loss_tokens_lower_95": 2.0420297030936703,
            "loss_tokens_upper_95": 2.1657588501199077,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.995703016881978,
            "data_time": 0.017577939563327365,
            "batch_time": 0.052226942446496755,
            "samples_per_second": 827855.8242928955,
            "samples_per_second_per_gpu": 103481.97803661194,
            "loss_sequences_lower_95": 2.8629249656593405,
            "loss_sequences_upper_95": 3.26472654237852,
            "loss_tokens_lower_95": 2.7353113052692817,
            "loss_tokens_upper_95": 3.018505876313526,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4610016502163594,
            "data_time": 0.00455540120601654,
            "batch_time": 0.03986263126134872,
            "samples_per_second": 888356.3271924459,
            "samples_per_second_per_gpu": 111044.54089905573,
            "loss_sequences_lower_95": 3.501778912261987,
            "loss_sequences_upper_95": 3.6564769332391966,
            "loss_tokens_lower_95": 3.31504405283451,
            "loss_tokens_upper_95": 3.4565694470709363,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5845046864777075,
            "data_time": 0.029808762527647473,
            "batch_time": 0.06565942934581212,
            "samples_per_second": 806528.4170287973,
            "samples_per_second_per_gpu": 100816.05212859967,
            "loss_sequences_lower_95": 2.454803127195777,
            "loss_sequences_upper_95": 2.8761466793897674,
            "loss_tokens_lower_95": 2.3039620180114446,
            "loss_tokens_upper_95": 2.6390708039674244,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7759204317889563,
            "data_time": 0.0017843736061038278,
            "batch_time": 0.03723100681472577,
            "samples_per_second": 898435.1023955374,
            "samples_per_second_per_gpu": 112304.38779944218,
            "loss_sequences_lower_95": 3.758292185780906,
            "loss_sequences_upper_95": 3.7933217014496647,
            "loss_tokens_lower_95": 3.7579337058818014,
            "loss_tokens_upper_95": 3.793399343931286,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.7391354288291005,
            "data_time": 0.04262337251143022,
            "batch_time": 0.07811193466186524,
            "samples_per_second": 742163.0267972501,
            "samples_per_second_per_gpu": 92770.37834965627,
            "loss_sequences_lower_95": 0.6999361260423382,
            "loss_sequences_upper_95": 0.8191874587420122,
            "loss_tokens_lower_95": 0.6218321988624096,
            "loss_tokens_upper_95": 0.7763321438245665,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.9916031129611365,
            "data_time": 0.0011862800686877012,
            "batch_time": 0.03660292547486554,
            "samples_per_second": 901598.2486213648,
            "samples_per_second_per_gpu": 112699.7810776706,
            "loss_sequences_lower_95": 4.311729477856394,
            "loss_sequences_upper_95": 4.354831824472615,
            "loss_tokens_lower_95": 3.492455657640232,
            "loss_tokens_upper_95": 3.5331308752417794,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.7582335610389705,
            "data_time": 0.0054263269144391255,
            "batch_time": 0.0408257402124859,
            "samples_per_second": 886111.5772668693,
            "samples_per_second_per_gpu": 110763.94715835866,
            "loss_sequences_lower_95": 4.7587353759765625,
            "loss_sequences_upper_95": 4.997936975097656,
            "loss_tokens_lower_95": 4.49510063767341,
            "loss_tokens_upper_95": 4.716650975958285,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1351540648418923,
            "data_time": 0.021330631385415286,
            "batch_time": 0.056874428765248446,
            "samples_per_second": 829337.8681946753,
            "samples_per_second_per_gpu": 103667.23352433441,
            "loss_sequences_lower_95": 3.0337442414656928,
            "loss_sequences_upper_95": 3.2373141147779383,
            "loss_tokens_lower_95": 3.0347585661514946,
            "loss_tokens_upper_95": 3.238305312446926,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.71438323837338,
            "data_time": 0.004318345023925046,
            "batch_time": 0.03978251944105309,
            "samples_per_second": 889571.9054125126,
            "samples_per_second_per_gpu": 111196.48817656407,
            "loss_sequences_lower_95": 6.634305105498343,
            "loss_sequences_upper_95": 6.790913640802557,
            "loss_tokens_lower_95": 6.636296257250237,
            "loss_tokens_upper_95": 6.791270862926136,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.0444053541024525,
            "data_time": 0.0038997825789958873,
            "batch_time": 0.03944801237988979,
            "samples_per_second": 891425.9505874234,
            "samples_per_second_per_gpu": 111428.24382342793,
            "loss_sequences_lower_95": 1.0743687622070313,
            "loss_sequences_upper_95": 1.121018868001302,
            "loss_tokens_lower_95": 0.9777334781568877,
            "loss_tokens_upper_95": 1.0338525859562575,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.204572612898691,
            "data_time": 0.020623485956873213,
            "batch_time": 0.055882151637758525,
            "samples_per_second": 800629.2857690532,
            "samples_per_second_per_gpu": 100078.66072113164,
            "loss_sequences_lower_95": 5.835176522391183,
            "loss_sequences_upper_95": 6.579166753859747,
            "loss_tokens_lower_95": 5.830562918526786,
            "loss_tokens_upper_95": 6.583074137369792,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.9467244893312454,
            "data_time": 0.14174482226371765,
            "batch_time": 0.1794811338186264,
            "samples_per_second": 486818.32456207566,
            "samples_per_second_per_gpu": 60852.29057025946,
            "loss_sequences_lower_95": 1.792243528366089,
            "loss_sequences_upper_95": 2.5284529328346252,
            "loss_tokens_lower_95": 1.5257417564785358,
            "loss_tokens_upper_95": 1.9713313340649161,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.524619596958161,
            "data_time": 0.005474681892092266,
            "batch_time": 0.04100727893057324,
            "samples_per_second": 883784.7832002264,
            "samples_per_second_per_gpu": 110473.0979000283,
            "loss_sequences_lower_95": 7.452846240234375,
            "loss_sequences_upper_95": 7.809564367675781,
            "loss_tokens_lower_95": 7.231113657168729,
            "loss_tokens_upper_95": 7.546172878826936,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.505656433582306,
            "data_time": 0.005721343888176812,
            "batch_time": 0.041178917127942284,
            "samples_per_second": 884675.6200206433,
            "samples_per_second_per_gpu": 110584.45250258042,
            "loss_sequences_lower_95": 6.57276298828125,
            "loss_sequences_upper_95": 6.759590808105468,
            "loss_tokens_lower_95": 6.31001750428377,
            "loss_tokens_upper_95": 6.481015884177829,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.313572364248288,
            "data_time": 0.003399636514210781,
            "batch_time": 0.03886962479173539,
            "samples_per_second": 892056.3430348381,
            "samples_per_second_per_gpu": 111507.04287935476,
            "loss_sequences_lower_95": 4.270915649291296,
            "loss_sequences_upper_95": 4.354425861129735,
            "loss_tokens_lower_95": 4.27203459578235,
            "loss_tokens_upper_95": 4.354580276187458,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5944950432272003,
            "data_time": 0.0078112178698767345,
            "batch_time": 0.04311761251028931,
            "samples_per_second": 874341.568422627,
            "samples_per_second_per_gpu": 109292.69605282838,
            "loss_sequences_lower_95": 2.5348094559301795,
            "loss_sequences_upper_95": 2.6557946458573345,
            "loss_tokens_lower_95": 2.531901313559068,
            "loss_tokens_upper_95": 2.6567207676291282,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.871535015583038,
            "data_time": 0.005098123399038163,
            "batch_time": 0.04053721021092127,
            "samples_per_second": 885035.0274302544,
            "samples_per_second_per_gpu": 110629.3784287818,
            "loss_sequences_lower_95": 3.7487128051757814,
            "loss_sequences_upper_95": 4.0004127319335945,
            "loss_tokens_lower_95": 3.7477675476074217,
            "loss_tokens_upper_95": 3.9993266601562496,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7250810858429664,
            "data_time": 0.0017174481444187561,
            "batch_time": 0.03716318706483607,
            "samples_per_second": 899147.4977317669,
            "samples_per_second_per_gpu": 112393.43721647086,
            "loss_sequences_lower_95": 3.1833707389723274,
            "loss_sequences_upper_95": 3.2752015303482738,
            "loss_tokens_lower_95": 2.1581574315560617,
            "loss_tokens_upper_95": 2.219137313277151,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.676717047815892,
            "data_time": 0.01697490385600499,
            "batch_time": 0.05249110800879342,
            "samples_per_second": 824446.738649724,
            "samples_per_second_per_gpu": 103055.8423312155,
            "loss_sequences_lower_95": 2.5907359279803375,
            "loss_sequences_upper_95": 2.7647151662342586,
            "loss_tokens_lower_95": 2.5895083128516356,
            "loss_tokens_upper_95": 2.7676793795913013,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5932456858017865,
            "data_time": 0.009950990788638592,
            "batch_time": 0.04549230169504881,
            "samples_per_second": 876424.5440698265,
            "samples_per_second_per_gpu": 109553.06800872831,
            "loss_sequences_lower_95": 2.5341421807981006,
            "loss_sequences_upper_95": 2.652674877690334,
            "loss_tokens_lower_95": 2.5346088783413756,
            "loss_tokens_upper_95": 2.6519673426011026,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.526551250212028,
            "data_time": 0.0018379583928727543,
            "batch_time": 0.03731208254200544,
            "samples_per_second": 897385.7833910874,
            "samples_per_second_per_gpu": 112173.22292388593,
            "loss_sequences_lower_95": 2.757958696898879,
            "loss_sequences_upper_95": 2.838903256578432,
            "loss_tokens_lower_95": 2.134599136434771,
            "loss_tokens_upper_95": 2.195796427765379,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.040031421752203,
            "data_time": 0.02501273900270462,
            "batch_time": 0.06082254648208618,
            "samples_per_second": 824320.1679993345,
            "samples_per_second_per_gpu": 103040.02099991681,
            "loss_sequences_lower_95": 2.947457175280051,
            "loss_sequences_upper_95": 3.1296636026372355,
            "loss_tokens_lower_95": 2.9479464091951884,
            "loss_tokens_upper_95": 3.1286913594240864,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3422042065804156,
            "data_time": 0.002873349102425488,
            "batch_time": 0.03832928281418424,
            "samples_per_second": 894920.9422985846,
            "samples_per_second_per_gpu": 111865.11778732308,
            "loss_sequences_lower_95": 3.3093499886515674,
            "loss_sequences_upper_95": 3.3744125916833907,
            "loss_tokens_lower_95": 3.309547414050554,
            "loss_tokens_upper_95": 3.3746857380064985,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.417482398667382,
            "data_time": 0.022599088061939585,
            "batch_time": 0.05722592310471968,
            "samples_per_second": 805905.6694147503,
            "samples_per_second_per_gpu": 100738.20867684379,
            "loss_sequences_lower_95": 3.288442030230772,
            "loss_sequences_upper_95": 3.550869484318113,
            "loss_tokens_lower_95": 3.2878612629418233,
            "loss_tokens_upper_95": 3.5527242716076306,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.566522055864334,
            "data_time": 0.07075853645801544,
            "batch_time": 0.1066298633813858,
            "samples_per_second": 656369.116978433,
            "samples_per_second_per_gpu": 82046.13962230412,
            "loss_sequences_lower_95": 1.4267891915639241,
            "loss_sequences_upper_95": 1.8213916397094725,
            "loss_tokens_lower_95": 1.2815247641669378,
            "loss_tokens_upper_95": 1.738056257035997,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.6565626829862594,
            "data_time": 0.0713028684258461,
            "batch_time": 0.1067671924829483,
            "samples_per_second": 656061.3828532354,
            "samples_per_second_per_gpu": 82007.67285665442,
            "loss_sequences_lower_95": 1.5276747767130534,
            "loss_sequences_upper_95": 2.035506553649902,
            "loss_tokens_lower_95": 1.264349675982186,
            "loss_tokens_upper_95": 1.8742381192325206,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.1912567319504817,
            "data_time": 0.0033099188251731115,
            "batch_time": 0.038817720037126906,
            "samples_per_second": 893396.0625849018,
            "samples_per_second_per_gpu": 111674.50782311273,
            "loss_sequences_lower_95": 2.1798177898333946,
            "loss_sequences_upper_95": 2.2031955239207472,
            "loss_tokens_lower_95": 2.179754241359076,
            "loss_tokens_upper_95": 2.2034133699891845,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.5291424936583787,
            "data_time": 0.0011713978237344086,
            "batch_time": 0.036668186342950986,
            "samples_per_second": 899525.371645378,
            "samples_per_second_per_gpu": 112440.67145567224,
            "loss_sequences_lower_95": 0.6001584518235399,
            "loss_sequences_upper_95": 0.6126829860947864,
            "loss_tokens_lower_95": 0.453169744293065,
            "loss_tokens_upper_95": 0.4607021510967394,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.3512009607525322,
            "data_time": 0.040657952427864075,
            "batch_time": 0.09683476388454437,
            "samples_per_second": 769180.723021881,
            "samples_per_second_per_gpu": 96147.59037773512,
            "loss_sequences_lower_95": 1.2752741295521655,
            "loss_sequences_upper_95": 1.4855206677294153,
            "loss_tokens_lower_95": 1.1906636091216838,
            "loss_tokens_upper_95": 1.2979308947097228,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.663770862527796,
            "data_time": 0.11294503439040411,
            "batch_time": 0.15060017222449892,
            "samples_per_second": 494107.83273568813,
            "samples_per_second_per_gpu": 61763.47909196102,
            "loss_sequences_lower_95": 3.254074514234388,
            "loss_sequences_upper_95": 4.106907777528505,
            "loss_tokens_lower_95": 3.0636156294080945,
            "loss_tokens_upper_95": 4.166046952612606,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.1928315627865675,
            "data_time": 0.028083154133387973,
            "batch_time": 0.06440875359943934,
            "samples_per_second": 798587.3704129629,
            "samples_per_second_per_gpu": 99823.42130162036,
            "loss_sequences_lower_95": 1.150138085062911,
            "loss_sequences_upper_95": 1.3212981828829138,
            "loss_tokens_lower_95": 1.0677514525237857,
            "loss_tokens_upper_95": 1.1530427375453374,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.238657014035597,
            "data_time": 0.028356977871486118,
            "batch_time": 0.06403195290338426,
            "samples_per_second": 810482.6269967702,
            "samples_per_second_per_gpu": 101310.32837459627,
            "loss_sequences_lower_95": 1.2213693618774415,
            "loss_sequences_upper_95": 1.3804919173077839,
            "loss_tokens_lower_95": 1.1059819549528336,
            "loss_tokens_upper_95": 1.1797315867520084,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.1897359279597677,
            "data_time": 0.03046833901178269,
            "batch_time": 0.06606049480892363,
            "samples_per_second": 810603.4899712293,
            "samples_per_second_per_gpu": 101325.43624640367,
            "loss_sequences_lower_95": 1.0846093805824837,
            "loss_sequences_upper_95": 1.2771674691176995,
            "loss_tokens_lower_95": 1.1114061668181021,
            "loss_tokens_upper_95": 1.2277002739316778,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.2917152835828503,
            "data_time": 0.02876447779791696,
            "batch_time": 0.06482757840837751,
            "samples_per_second": 805573.8120999908,
            "samples_per_second_per_gpu": 100696.72651249885,
            "loss_sequences_lower_95": 1.257471959183856,
            "loss_sequences_upper_95": 1.3987416360436415,
            "loss_tokens_lower_95": 1.1677302517994914,
            "loss_tokens_upper_95": 1.2399611404751691,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.1060877631169668,
            "data_time": 0.029998028719866718,
            "batch_time": 0.0668763702298388,
            "samples_per_second": 801881.2301494832,
            "samples_per_second_per_gpu": 100235.1537686854,
            "loss_sequences_lower_95": 1.0676161298100253,
            "loss_sequences_upper_95": 1.1623223606844126,
            "loss_tokens_lower_95": 1.0487198548095205,
            "loss_tokens_upper_95": 1.1030004747139148,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.0561147332191467,
            "data_time": 0.027008584567478726,
            "batch_time": 0.06276696636563256,
            "samples_per_second": 811060.95035377,
            "samples_per_second_per_gpu": 101382.61879422126,
            "loss_sequences_lower_95": 1.0398571107445693,
            "loss_sequences_upper_95": 1.1450660752087105,
            "loss_tokens_lower_95": 0.94275866839022,
            "loss_tokens_upper_95": 0.9894324959265747,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-2.0/params.txt",
    "uuid": "d4eda944-bd23-43e2-a671-8783d5c04afd",
    "creation_date": "2023_12_14-07_47_43"
}