{
    "name": "rw_original-d=96_l=8_h=4-1.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 211386240,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 1.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "42277248",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=96_l=8_h=4-1.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 6.10843079884847,
            "data_time": 0.12295988202095032,
            "batch_time": 1.292650192975998,
            "samples_per_second": 373661.4245595709,
            "samples_per_second_per_gpu": 46707.67806994636,
            "loss_sequences_lower_95": 5.954117851257324,
            "loss_sequences_upper_95": 6.26331106821696,
            "loss_tokens_lower_95": 6.0931395848592125,
            "loss_tokens_upper_95": 6.123342475891113,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.271892380117802,
            "data_time": 0.01878728636182056,
            "batch_time": 0.06392394140443983,
            "samples_per_second": 4677350.427975302,
            "samples_per_second_per_gpu": 584668.8034969127,
            "loss_sequences_lower_95": 5.269586085203944,
            "loss_sequences_upper_95": 5.2742014520023766,
            "loss_tokens_lower_95": 5.26044171875,
            "loss_tokens_upper_95": 5.283189947916667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.612542756722898,
            "data_time": 0.08863689750432968,
            "batch_time": 0.13356389850378036,
            "samples_per_second": 4129357.67340223,
            "samples_per_second_per_gpu": 516169.7091752787,
            "loss_sequences_lower_95": 5.564079328264509,
            "loss_sequences_upper_95": 5.675792510363521,
            "loss_tokens_lower_95": 5.5992449375,
            "loss_tokens_upper_95": 5.625929177083333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.459095644410124,
            "data_time": 0.013427526543014929,
            "batch_time": 0.05753376609400699,
            "samples_per_second": 5314130.887309275,
            "samples_per_second_per_gpu": 664266.3609136593,
            "loss_sequences_lower_95": 5.421229602931701,
            "loss_sequences_upper_95": 5.49770536001933,
            "loss_tokens_lower_95": 5.4462561875,
            "loss_tokens_upper_95": 5.472036427083333,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.307089509400719,
            "data_time": 0.09382914751768112,
            "batch_time": 0.13814204931259155,
            "samples_per_second": 4039921.56002971,
            "samples_per_second_per_gpu": 504990.1950037138,
            "loss_sequences_lower_95": 5.248034195598905,
            "loss_sequences_upper_95": 5.37844111487239,
            "loss_tokens_lower_95": 5.295117385416667,
            "loss_tokens_upper_95": 5.319214625,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.945914075943501,
            "data_time": 0.03313479572534561,
            "batch_time": 0.07672175765037537,
            "samples_per_second": 4920898.617767786,
            "samples_per_second_per_gpu": 615112.3272209732,
            "loss_sequences_lower_95": 5.887032531281866,
            "loss_sequences_upper_95": 6.009348527919995,
            "loss_tokens_lower_95": 5.932565802083333,
            "loss_tokens_upper_95": 5.95905584375,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.579996718192587,
            "data_time": 0.014743510633707047,
            "batch_time": 0.057657917588949205,
            "samples_per_second": 5053694.338848789,
            "samples_per_second_per_gpu": 631711.7923560987,
            "loss_sequences_lower_95": 7.547596400669644,
            "loss_sequences_upper_95": 7.612072106186225,
            "loss_tokens_lower_95": 7.563568624999999,
            "loss_tokens_upper_95": 7.597055041666667,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.405159402218165,
            "data_time": 0.014833884019600717,
            "batch_time": 0.05826092785910556,
            "samples_per_second": 5227042.129408617,
            "samples_per_second_per_gpu": 653380.2661760771,
            "loss_sequences_lower_95": 5.380270165248691,
            "loss_sequences_upper_95": 5.431884673592932,
            "loss_tokens_lower_95": 5.39299678125,
            "loss_tokens_upper_95": 5.417343,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.43486668811581,
            "data_time": 0.09009812772274017,
            "batch_time": 0.13545570522546768,
            "samples_per_second": 4029180.82934658,
            "samples_per_second_per_gpu": 503647.6036683225,
            "loss_sequences_lower_95": 5.349953032702934,
            "loss_sequences_upper_95": 5.5363884282305955,
            "loss_tokens_lower_95": 5.422617197916667,
            "loss_tokens_upper_95": 5.447457552083333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.496391805264318,
            "data_time": 0.09411734342575073,
            "batch_time": 0.1402379646897316,
            "samples_per_second": 4136042.938426314,
            "samples_per_second_per_gpu": 517005.36730328924,
            "loss_sequences_lower_95": 6.387381053819015,
            "loss_sequences_upper_95": 6.627369695972548,
            "loss_tokens_lower_95": 6.4830541041666665,
            "loss_tokens_upper_95": 6.50922446875,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.894241075279191,
            "data_time": 0.009647514285712406,
            "batch_time": 0.0531247910754434,
            "samples_per_second": 5382766.10341514,
            "samples_per_second_per_gpu": 672845.7629268925,
            "loss_sequences_lower_95": 5.8820255354298006,
            "loss_sequences_upper_95": 5.906670496008634,
            "loss_tokens_lower_95": 5.881811208333334,
            "loss_tokens_upper_95": 5.906832520833333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.685468130536697,
            "data_time": 0.021167995035648347,
            "batch_time": 0.06361637711524963,
            "samples_per_second": 5057732.206139347,
            "samples_per_second_per_gpu": 632216.5257674183,
            "loss_sequences_lower_95": 5.6597988041278375,
            "loss_sequences_upper_95": 5.712195114014994,
            "loss_tokens_lower_95": 5.6726401979166665,
            "loss_tokens_upper_95": 5.6978629375,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.58376810758641,
            "data_time": 0.09487687051296234,
            "batch_time": 0.14057593792676926,
            "samples_per_second": 4026747.6119169053,
            "samples_per_second_per_gpu": 503343.45148961316,
            "loss_sequences_lower_95": 5.487349578671717,
            "loss_sequences_upper_95": 5.698052916613843,
            "loss_tokens_lower_95": 5.570475552083333,
            "loss_tokens_upper_95": 5.59722484375,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.845679266632453,
            "data_time": 0.08970919251441956,
            "batch_time": 0.13418541103601456,
            "samples_per_second": 4089450.5773447715,
            "samples_per_second_per_gpu": 511181.32216809643,
            "loss_sequences_lower_95": 5.767815677308745,
            "loss_sequences_upper_95": 5.937743755270653,
            "loss_tokens_lower_95": 5.833430260416667,
            "loss_tokens_upper_95": 5.858452718750001,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.028111804615367,
            "data_time": 0.14071878790855408,
            "batch_time": 0.16018380224704742,
            "samples_per_second": 1118973.9013024413,
            "samples_per_second_per_gpu": 139871.73766280516,
            "loss_sequences_lower_95": 6.956642948497426,
            "loss_sequences_upper_95": 7.121895044500178,
            "loss_tokens_lower_95": 7.00232552615079,
            "loss_tokens_upper_95": 7.053552974354137,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.395754736296984,
            "data_time": 0.09349115192890167,
            "batch_time": 0.1288057416677475,
            "samples_per_second": 3369375.54687383,
            "samples_per_second_per_gpu": 421171.9433592287,
            "loss_sequences_lower_95": 6.273401982165634,
            "loss_sequences_upper_95": 6.517942165007744,
            "loss_tokens_lower_95": 6.38181028125,
            "loss_tokens_upper_95": 6.409694395833333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.702482220994451,
            "data_time": 0.0911795124411583,
            "batch_time": 0.1277569979429245,
            "samples_per_second": 3743743.970460003,
            "samples_per_second_per_gpu": 467967.99630750035,
            "loss_sequences_lower_95": 6.613275597403735,
            "loss_sequences_upper_95": 6.818943283362879,
            "loss_tokens_lower_95": 6.691046500000001,
            "loss_tokens_upper_95": 6.714258364583333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.603051896955146,
            "data_time": 0.16088728606700897,
            "batch_time": 0.19022664427757263,
            "samples_per_second": 2276863.872555137,
            "samples_per_second_per_gpu": 284607.98406939214,
            "loss_sequences_lower_95": 6.464790231673445,
            "loss_sequences_upper_95": 6.847518483146292,
            "loss_tokens_lower_95": 6.589018562191822,
            "loss_tokens_upper_95": 6.616592319676133,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.217667792125605,
            "data_time": 0.03187465180050243,
            "batch_time": 0.07632410580461675,
            "samples_per_second": 4393551.246827841,
            "samples_per_second_per_gpu": 549193.9058534801,
            "loss_sequences_lower_95": 5.203650043174049,
            "loss_sequences_upper_95": 5.231311102963431,
            "loss_tokens_lower_95": 5.20362314284824,
            "loss_tokens_upper_95": 5.231295733335707,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.897566822652583,
            "data_time": 0.027209505066275598,
            "batch_time": 0.0713484525680542,
            "samples_per_second": 4461630.428080778,
            "samples_per_second_per_gpu": 557703.8035100972,
            "loss_sequences_lower_95": 4.905690406931513,
            "loss_sequences_upper_95": 4.931050593367233,
            "loss_tokens_lower_95": 4.885681284626455,
            "loss_tokens_upper_95": 4.907341076807617,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.24944890283108,
            "data_time": 0.05275393194622464,
            "batch_time": 0.09466080367565155,
            "samples_per_second": 4299804.851375647,
            "samples_per_second_per_gpu": 537475.6064219559,
            "loss_sequences_lower_95": 7.649907867763196,
            "loss_sequences_upper_95": 7.8859307189566605,
            "loss_tokens_lower_95": 7.125682593221348,
            "loss_tokens_upper_95": 7.308675386157523,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.883081009070079,
            "data_time": 0.04342353716492653,
            "batch_time": 0.08714012925823529,
            "samples_per_second": 4535000.842204362,
            "samples_per_second_per_gpu": 566875.1052755453,
            "loss_sequences_lower_95": 7.181434521484375,
            "loss_sequences_upper_95": 7.3369245442708335,
            "loss_tokens_lower_95": 6.794568813875786,
            "loss_tokens_upper_95": 6.913648953419811,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.667708914599878,
            "data_time": 0.06610387812058131,
            "batch_time": 0.1059916044274966,
            "samples_per_second": 4007161.555074544,
            "samples_per_second_per_gpu": 500895.194384318,
            "loss_sequences_lower_95": 5.721742055341997,
            "loss_sequences_upper_95": 5.781312349332472,
            "loss_tokens_lower_95": 5.647584661254154,
            "loss_tokens_upper_95": 5.681181209264995,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.129371140219948,
            "data_time": 0.3794872909784317,
            "batch_time": 0.4217606484889984,
            "samples_per_second": 2266612.5571468156,
            "samples_per_second_per_gpu": 283326.56964335195,
            "loss_sequences_lower_95": 6.070332336425781,
            "loss_sequences_upper_95": 6.225390389182351,
            "loss_tokens_lower_95": 6.093919901995364,
            "loss_tokens_upper_95": 6.158359619066815,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.487434579888169,
            "data_time": 0.36032672226428986,
            "batch_time": 0.40614402294158936,
            "samples_per_second": 2785871.7424320877,
            "samples_per_second_per_gpu": 348233.96780401096,
            "loss_sequences_lower_95": 5.4689459851323345,
            "loss_sequences_upper_95": 5.6561515839245855,
            "loss_tokens_lower_95": 5.4453800461238995,
            "loss_tokens_upper_95": 5.5437866029113065,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.951315744717916,
            "data_time": 0.17247236520051956,
            "batch_time": 0.20422330498695374,
            "samples_per_second": 2535032.2135506696,
            "samples_per_second_per_gpu": 316879.0266938337,
            "loss_sequences_lower_95": 4.891614034016928,
            "loss_sequences_upper_95": 4.999364481608073,
            "loss_tokens_lower_95": 4.852894726200363,
            "loss_tokens_upper_95": 5.053281412961604,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.342334305792482,
            "data_time": 0.024003267474472522,
            "batch_time": 0.06834829710423947,
            "samples_per_second": 4534084.755517421,
            "samples_per_second_per_gpu": 566760.5944396777,
            "loss_sequences_lower_95": 9.41395548708848,
            "loss_sequences_upper_95": 9.487078061180553,
            "loss_tokens_lower_95": 9.287970470847867,
            "loss_tokens_upper_95": 9.364575432115725,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.184303004532952,
            "data_time": 0.04477836638689041,
            "batch_time": 0.08720802068710327,
            "samples_per_second": 4356552.881045532,
            "samples_per_second_per_gpu": 544569.1101306916,
            "loss_sequences_lower_95": 7.241795580314867,
            "loss_sequences_upper_95": 7.507109218815762,
            "loss_tokens_lower_95": 6.045279493340571,
            "loss_tokens_upper_95": 6.190126917645889,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.918745643042867,
            "data_time": 0.08421384990215301,
            "batch_time": 0.12673964500427246,
            "samples_per_second": 4323592.9629560625,
            "samples_per_second_per_gpu": 540449.1203695078,
            "loss_sequences_lower_95": 6.564477445322499,
            "loss_sequences_upper_95": 6.850876864801088,
            "loss_tokens_lower_95": 5.818232002762988,
            "loss_tokens_upper_95": 5.984648005783115,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.022512274790028,
            "data_time": 0.3494671881198883,
            "batch_time": 0.39124561846256256,
            "samples_per_second": 2217190.1724910266,
            "samples_per_second_per_gpu": 277148.7715613783,
            "loss_sequences_lower_95": 5.982772896823274,
            "loss_sequences_upper_95": 6.0614224089879425,
            "loss_tokens_lower_95": 5.9834282722647325,
            "loss_tokens_upper_95": 6.0616330499518405,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.151625328063965,
            "data_time": 0.2908164858818054,
            "batch_time": 0.3182934671640396,
            "samples_per_second": 1710086.9251006283,
            "samples_per_second_per_gpu": 213760.86563757854,
            "loss_sequences_lower_95": 5.074072647094726,
            "loss_sequences_upper_95": 5.508865768432617,
            "loss_tokens_lower_95": 4.8924738738959075,
            "loss_tokens_upper_95": 5.400632166990441,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.931557178985377,
            "data_time": 0.05097433924674988,
            "batch_time": 0.0942341610789299,
            "samples_per_second": 4513322.098424738,
            "samples_per_second_per_gpu": 564165.2623030923,
            "loss_sequences_lower_95": 4.895663555227259,
            "loss_sequences_upper_95": 4.968019652195816,
            "loss_tokens_lower_95": 4.895343973399917,
            "loss_tokens_upper_95": 4.968211791117579,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.312923111552395,
            "data_time": 0.08153197169303894,
            "batch_time": 0.12489810883998871,
            "samples_per_second": 4336771.73646822,
            "samples_per_second_per_gpu": 542096.4670585275,
            "loss_sequences_lower_95": 5.282592833422911,
            "loss_sequences_upper_95": 5.34220921471898,
            "loss_tokens_lower_95": 5.2822677924728705,
            "loss_tokens_upper_95": 5.343306977743013,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.377957269598014,
            "data_time": 0.05060437135398388,
            "batch_time": 0.09186299331486225,
            "samples_per_second": 4253856.4359716885,
            "samples_per_second_per_gpu": 531732.0544964611,
            "loss_sequences_lower_95": 5.559296192256359,
            "loss_sequences_upper_95": 5.674316818021457,
            "loss_tokens_lower_95": 5.349518947204561,
            "loss_tokens_upper_95": 5.412442416815815,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.921349143981933,
            "data_time": 0.21095093339681625,
            "batch_time": 0.2570660188794136,
            "samples_per_second": 3463454.283543167,
            "samples_per_second_per_gpu": 432931.7854428959,
            "loss_sequences_lower_95": 7.601929431152344,
            "loss_sequences_upper_95": 8.107482299804687,
            "loss_tokens_lower_95": 6.692042211283918,
            "loss_tokens_upper_95": 7.034735264958016,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.501480460166931,
            "data_time": 0.14341993629932404,
            "batch_time": 0.1618402898311615,
            "samples_per_second": 697029.4688591054,
            "samples_per_second_per_gpu": 87128.68360738817,
            "loss_sequences_lower_95": 5.201570045948029,
            "loss_sequences_upper_95": 5.990342974662781,
            "loss_tokens_lower_95": 4.930077739145564,
            "loss_tokens_upper_95": 5.898445532787805,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.655880777315161,
            "data_time": 0.31766556203365326,
            "batch_time": 0.35347720980644226,
            "samples_per_second": 1823941.9080301633,
            "samples_per_second_per_gpu": 227992.7385037704,
            "loss_sequences_lower_95": 6.050738314924569,
            "loss_sequences_upper_95": 6.518093854531474,
            "loss_tokens_lower_95": 5.4173668484005075,
            "loss_tokens_upper_95": 5.812851334048231,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.707882267463254,
            "data_time": 0.04866092734866672,
            "batch_time": 0.09343856738673316,
            "samples_per_second": 4500854.6975663025,
            "samples_per_second_per_gpu": 562606.8371957878,
            "loss_sequences_lower_95": 4.688627889157383,
            "loss_sequences_upper_95": 4.727518594790984,
            "loss_tokens_lower_95": 4.688568971166449,
            "loss_tokens_upper_95": 4.726997921444954,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.0761608450986,
            "data_time": 0.03446223480360849,
            "batch_time": 0.07763189999830156,
            "samples_per_second": 4407946.246388189,
            "samples_per_second_per_gpu": 550993.2807985236,
            "loss_sequences_lower_95": 8.104887701945469,
            "loss_sequences_upper_95": 8.266194430883708,
            "loss_tokens_lower_95": 7.983501754465913,
            "loss_tokens_upper_95": 8.14264000125319,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.580337541007297,
            "data_time": 0.17033760994672775,
            "batch_time": 0.2003132402896881,
            "samples_per_second": 1884020.0181081193,
            "samples_per_second_per_gpu": 235502.5022635149,
            "loss_sequences_lower_95": 4.439090178213713,
            "loss_sequences_upper_95": 4.797058329040751,
            "loss_tokens_lower_95": 4.3753065421557,
            "loss_tokens_upper_95": 4.712635519502341,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.908664196423874,
            "data_time": 0.07784271836280823,
            "batch_time": 0.12289614677429199,
            "samples_per_second": 4361907.04487667,
            "samples_per_second_per_gpu": 545238.3806095838,
            "loss_sequences_lower_95": 4.949842985013196,
            "loss_sequences_upper_95": 5.081634661185995,
            "loss_tokens_lower_95": 4.827904300814738,
            "loss_tokens_upper_95": 4.9827933676484495,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.530313750592674,
            "data_time": 0.33308541774749756,
            "batch_time": 0.36736394464969635,
            "samples_per_second": 2078312.6764616156,
            "samples_per_second_per_gpu": 259789.08455770195,
            "loss_sequences_lower_95": 6.295629287347561,
            "loss_sequences_upper_95": 6.838122893542778,
            "loss_tokens_lower_95": 6.368962378447441,
            "loss_tokens_upper_95": 6.731956056280587,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.4044258784332095,
            "data_time": 0.028011152736819277,
            "batch_time": 0.071982877381543,
            "samples_per_second": 4429136.023379671,
            "samples_per_second_per_gpu": 553642.0029224589,
            "loss_sequences_lower_95": 4.398353879760327,
            "loss_sequences_upper_95": 4.410318997393229,
            "loss_tokens_lower_95": 4.398368277171059,
            "loss_tokens_upper_95": 4.410323890950065,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.005181539405897,
            "data_time": 0.3246517479419708,
            "batch_time": 0.3509196788072586,
            "samples_per_second": 1544476.350855269,
            "samples_per_second_per_gpu": 193059.54385690863,
            "loss_sequences_lower_95": 5.848543555991164,
            "loss_sequences_upper_95": 6.2131636536237105,
            "loss_tokens_lower_95": 5.750344320744306,
            "loss_tokens_upper_95": 6.1659119464479915,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.911897730464955,
            "data_time": 0.021981632709503172,
            "batch_time": 0.06631448239088059,
            "samples_per_second": 4520583.567266677,
            "samples_per_second_per_gpu": 565072.9459083346,
            "loss_sequences_lower_95": 6.419756985390461,
            "loss_sequences_upper_95": 6.459386649141772,
            "loss_tokens_lower_95": 5.843704521276596,
            "loss_tokens_upper_95": 5.884056141199227,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.2859874882698055,
            "data_time": 0.09427981078624725,
            "batch_time": 0.13927168399095535,
            "samples_per_second": 4237977.756522487,
            "samples_per_second_per_gpu": 529747.2195653109,
            "loss_sequences_lower_95": 5.25583515625,
            "loss_sequences_upper_95": 5.488403759765625,
            "loss_tokens_lower_95": 5.175662974912993,
            "loss_tokens_upper_95": 5.390285450049546,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.435785482240759,
            "data_time": 0.37181591987609863,
            "batch_time": 0.41501861810684204,
            "samples_per_second": 2190331.614430328,
            "samples_per_second_per_gpu": 273791.451803791,
            "loss_sequences_lower_95": 5.31426865287449,
            "loss_sequences_upper_95": 5.554820715862772,
            "loss_tokens_lower_95": 5.315452363387398,
            "loss_tokens_upper_95": 5.552537801991338,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 10.418006817499796,
            "data_time": 0.06309834371010463,
            "batch_time": 0.10301257421573003,
            "samples_per_second": 4087212.3027916774,
            "samples_per_second_per_gpu": 510901.5378489597,
            "loss_sequences_lower_95": 10.296424209132338,
            "loss_sequences_upper_95": 10.542235791755445,
            "loss_tokens_lower_95": 10.294634343927557,
            "loss_tokens_upper_95": 10.543188624526515,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.479752643903097,
            "data_time": 0.06312890599171321,
            "batch_time": 0.10733300944169362,
            "samples_per_second": 4508150.833082729,
            "samples_per_second_per_gpu": 563518.8541353411,
            "loss_sequences_lower_95": 4.575810164388021,
            "loss_sequences_upper_95": 4.656204060872396,
            "loss_tokens_lower_95": 4.4245977531637655,
            "loss_tokens_upper_95": 4.519671055922369,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.336337200800577,
            "data_time": 0.3506710082292557,
            "batch_time": 0.3909436911344528,
            "samples_per_second": 2261817.0586903626,
            "samples_per_second_per_gpu": 282727.1323362953,
            "loss_sequences_lower_95": 6.010300845191592,
            "loss_sequences_upper_95": 6.674026460193452,
            "loss_tokens_lower_95": 6.008956996372768,
            "loss_tokens_upper_95": 6.672811119442894,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.528369247913361,
            "data_time": 0.1434747725725174,
            "batch_time": 0.16129526495933533,
            "samples_per_second": 959909.6783035558,
            "samples_per_second_per_gpu": 119988.70978794448,
            "loss_sequences_lower_95": 6.293146753311158,
            "loss_sequences_upper_95": 7.811116158962249,
            "loss_tokens_lower_95": 6.138256219293654,
            "loss_tokens_upper_95": 6.731294423526095,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.813563617229462,
            "data_time": 0.09927394241094589,
            "batch_time": 0.1441548764705658,
            "samples_per_second": 4047077.4281014334,
            "samples_per_second_per_gpu": 505884.6785126792,
            "loss_sequences_lower_95": 7.868323327636719,
            "loss_sequences_upper_95": 8.1853390625,
            "loss_tokens_lower_95": 7.6655431653844115,
            "loss_tokens_upper_95": 7.944112513714837,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.571317314147949,
            "data_time": 0.10338329523801804,
            "batch_time": 0.1478908620774746,
            "samples_per_second": 4280428.118787024,
            "samples_per_second_per_gpu": 535053.514848378,
            "loss_sequences_lower_95": 7.799331494140625,
            "loss_sequences_upper_95": 8.014440844726563,
            "loss_tokens_lower_95": 7.462406231417439,
            "loss_tokens_upper_95": 7.6475740221660615,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.98481204627226,
            "data_time": 0.03932242343823115,
            "batch_time": 0.0833057090640068,
            "samples_per_second": 4517286.282146904,
            "samples_per_second_per_gpu": 564660.785268363,
            "loss_sequences_lower_95": 4.970456504502074,
            "loss_sequences_upper_95": 4.999116264954534,
            "loss_tokens_lower_95": 4.970835941428511,
            "loss_tokens_upper_95": 4.999176911351827,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.788999830705962,
            "data_time": 0.1267592857281367,
            "batch_time": 0.1673919508854548,
            "samples_per_second": 3774243.797385372,
            "samples_per_second_per_gpu": 471780.4746731715,
            "loss_sequences_lower_95": 4.717004835184452,
            "loss_sequences_upper_95": 4.859270921463974,
            "loss_tokens_lower_95": 4.717793557537682,
            "loss_tokens_upper_95": 4.858356603572628,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.401599334239959,
            "data_time": 0.09518606588244438,
            "batch_time": 0.1397906206548214,
            "samples_per_second": 4378095.794825279,
            "samples_per_second_per_gpu": 547261.9743531599,
            "loss_sequences_lower_95": 7.334883496093751,
            "loss_sequences_upper_95": 7.470833215332031,
            "loss_tokens_lower_95": 7.334303491210938,
            "loss_tokens_upper_95": 7.468851391601562,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.812204282331151,
            "data_time": 0.033114444641839894,
            "batch_time": 0.07728601353509086,
            "samples_per_second": 4354697.985251713,
            "samples_per_second_per_gpu": 544337.2481564641,
            "loss_sequences_lower_95": 7.395527565486045,
            "loss_sequences_upper_95": 7.470552178187086,
            "loss_tokens_lower_95": 6.72632655536834,
            "loss_tokens_upper_95": 6.783861453845497,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.830501976297863,
            "data_time": 0.19544705322810582,
            "batch_time": 0.22745636531284877,
            "samples_per_second": 2069466.474495019,
            "samples_per_second_per_gpu": 258683.30931187738,
            "loss_sequences_lower_95": 5.685717557081535,
            "loss_sequences_upper_95": 5.974660127554367,
            "loss_tokens_lower_95": 5.681255761900944,
            "loss_tokens_upper_95": 5.974655117205719,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.660073531842699,
            "data_time": 0.1726880744099617,
            "batch_time": 0.2183980718255043,
            "samples_per_second": 3800177.215602169,
            "samples_per_second_per_gpu": 475022.1519502711,
            "loss_sequences_lower_95": 5.558727967505361,
            "loss_sequences_upper_95": 5.760583256740196,
            "loss_tokens_lower_95": 5.559727424172794,
            "loss_tokens_upper_95": 5.759252750172334,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.600214684739771,
            "data_time": 0.03168036649003625,
            "batch_time": 0.07569687394425273,
            "samples_per_second": 4405962.659898799,
            "samples_per_second_per_gpu": 550745.3324873499,
            "loss_sequences_lower_95": 6.900101521886352,
            "loss_sequences_upper_95": 6.978754756200677,
            "loss_tokens_lower_95": 6.527044934835209,
            "loss_tokens_upper_95": 6.597619102173993,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.236571932595874,
            "data_time": 0.31650324165821075,
            "batch_time": 0.3539261221885681,
            "samples_per_second": 2129216.5836890163,
            "samples_per_second_per_gpu": 266152.07296112704,
            "loss_sequences_lower_95": 5.180209568568639,
            "loss_sequences_upper_95": 5.292920164078001,
            "loss_tokens_lower_95": 5.180701482863654,
            "loss_tokens_upper_95": 5.2923177971411,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.255016821823354,
            "data_time": 0.044239741105299726,
            "batch_time": 0.08881379205446976,
            "samples_per_second": 4447793.26463913,
            "samples_per_second_per_gpu": 555974.1580798912,
            "loss_sequences_lower_95": 9.220250665973815,
            "loss_sequences_upper_95": 9.289616258720375,
            "loss_tokens_lower_95": 9.220006435755925,
            "loss_tokens_upper_95": 9.288944745078364,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.493155222494625,
            "data_time": 0.3538605123758316,
            "batch_time": 0.39423760771751404,
            "samples_per_second": 2141061.2761772284,
            "samples_per_second_per_gpu": 267632.65952215355,
            "loss_sequences_lower_95": 5.3427002249412165,
            "loss_sequences_upper_95": 5.636768615130082,
            "loss_tokens_lower_95": 5.343757999753489,
            "loss_tokens_upper_95": 5.638125580722845,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.129395612080892,
            "data_time": 0.3039748966693878,
            "batch_time": 0.32522906363010406,
            "samples_per_second": 1255696.1395727876,
            "samples_per_second_per_gpu": 156962.01744659845,
            "loss_sequences_lower_95": 6.805308405558268,
            "loss_sequences_upper_95": 7.675797945658366,
            "loss_tokens_lower_95": 6.496667777167427,
            "loss_tokens_upper_95": 7.706187460157606,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.806397382418314,
            "data_time": 0.2839321196079254,
            "batch_time": 0.30424830317497253,
            "samples_per_second": 1396410.8387213587,
            "samples_per_second_per_gpu": 174551.35484016984,
            "loss_sequences_lower_95": 6.550977045694987,
            "loss_sequences_upper_95": 7.508271916707357,
            "loss_tokens_lower_95": 6.126620637700799,
            "loss_tokens_upper_95": 7.387898408696893,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.664044046472204,
            "data_time": 0.051164139594350545,
            "batch_time": 0.0942227350814002,
            "samples_per_second": 4119854.173589205,
            "samples_per_second_per_gpu": 514981.7716986506,
            "loss_sequences_lower_95": 8.623351622905929,
            "loss_sequences_upper_95": 8.704409230025773,
            "loss_tokens_lower_95": 8.623571529247975,
            "loss_tokens_upper_95": 8.705303611699192,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.15619718160266,
            "data_time": 0.022243365059773475,
            "batch_time": 0.06665771789686128,
            "samples_per_second": 4515279.380853101,
            "samples_per_second_per_gpu": 564409.9226066376,
            "loss_sequences_lower_95": 6.6371145439761,
            "loss_sequences_upper_95": 6.665243898204028,
            "loss_tokens_lower_95": 6.100660928051067,
            "loss_tokens_upper_95": 6.129935161527036,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.646104384595015,
            "data_time": 0.3220771998167038,
            "batch_time": 0.4078672528266907,
            "samples_per_second": 2332398.6430184785,
            "samples_per_second_per_gpu": 291549.8303773098,
            "loss_sequences_lower_95": 7.670283508300781,
            "loss_sequences_upper_95": 8.038405639167845,
            "loss_tokens_lower_95": 7.500048264400639,
            "loss_tokens_upper_95": 7.790845440213554,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 10.815792805439717,
            "data_time": 0.20045138895511627,
            "batch_time": 0.21979042887687683,
            "samples_per_second": 1033981.9856867362,
            "samples_per_second_per_gpu": 129247.74821084202,
            "loss_sequences_lower_95": 10.365857345993454,
            "loss_sequences_upper_95": 11.45311555604677,
            "loss_tokens_lower_95": 9.856779235086323,
            "loss_tokens_upper_95": 11.558182101779513,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.638650521999452,
            "data_time": 0.31981439888477325,
            "batch_time": 0.3545718938112259,
            "samples_per_second": 2007459.862774659,
            "samples_per_second_per_gpu": 250932.48284683237,
            "loss_sequences_lower_95": 7.657506319371666,
            "loss_sequences_upper_95": 7.926597464956889,
            "loss_tokens_lower_95": 7.478687053195963,
            "loss_tokens_upper_95": 7.723816692073171,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.666356485064437,
            "data_time": 0.34363362193107605,
            "batch_time": 0.37835946679115295,
            "samples_per_second": 2074625.1439904617,
            "samples_per_second_per_gpu": 259328.1429988077,
            "loss_sequences_lower_95": 7.6760666637885855,
            "loss_sequences_upper_95": 7.918907519084652,
            "loss_tokens_lower_95": 7.534977761357307,
            "loss_tokens_upper_95": 7.739286329453501,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.719867555106559,
            "data_time": 0.33665867149829865,
            "batch_time": 0.3853597193956375,
            "samples_per_second": 1904545.4190734802,
            "samples_per_second_per_gpu": 238068.17738418502,
            "loss_sequences_lower_95": 7.850625722001238,
            "loss_sequences_upper_95": 8.222228017667444,
            "loss_tokens_lower_95": 7.528923167162342,
            "loss_tokens_upper_95": 7.8466157813398505,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.690738771019912,
            "data_time": 0.29907770454883575,
            "batch_time": 0.33352452516555786,
            "samples_per_second": 2227551.9712649416,
            "samples_per_second_per_gpu": 278443.9964081177,
            "loss_sequences_lower_95": 7.6407801186166155,
            "loss_sequences_upper_95": 7.870852958865282,
            "loss_tokens_lower_95": 7.568591860001703,
            "loss_tokens_upper_95": 7.752857357987733,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.269137243306415,
            "data_time": 0.31061360239982605,
            "batch_time": 0.34438641369342804,
            "samples_per_second": 2292586.809053624,
            "samples_per_second_per_gpu": 286573.351131703,
            "loss_sequences_lower_95": 7.137576483495487,
            "loss_sequences_upper_95": 7.270135043126456,
            "loss_tokens_lower_95": 7.207213743510529,
            "loss_tokens_upper_95": 7.327614445470167,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.773945543824173,
            "data_time": 0.3106791824102402,
            "batch_time": 0.34598349034786224,
            "samples_per_second": 2037155.3292803813,
            "samples_per_second_per_gpu": 254644.41616004767,
            "loss_sequences_lower_95": 6.822435444157298,
            "loss_sequences_upper_95": 6.997540636760433,
            "loss_tokens_lower_95": 6.688995158981314,
            "loss_tokens_upper_95": 6.8140068730221515,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=96_l=8_h=4-1.0/params.txt",
    "uuid": "2dcb8242-c72b-4e83-81c3-20e190ed2ee6",
    "creation_date": "2023_12_13-16_17_40"
}