{
    "name": "rw_original-d=1024_l=24_h=8-1.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 8232325120,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 1.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "1646465024",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=1024_l=24_h=8-1.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.1728111426035563,
            "data_time": 0.04748115688562393,
            "batch_time": 0.43401747196912766,
            "samples_per_second": 692466.9039938401,
            "samples_per_second_per_gpu": 86558.36299923001,
            "loss_sequences_lower_95": 3.112297032674154,
            "loss_sequences_upper_95": 3.2333296203613284,
            "loss_tokens_lower_95": 3.1596517817179364,
            "loss_tokens_upper_95": 3.185891939798991,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1305160505406797,
            "data_time": 0.001064169085188742,
            "batch_time": 0.03666436952963814,
            "samples_per_second": 900253.6133934164,
            "samples_per_second_per_gpu": 112531.70167417705,
            "loss_sequences_lower_95": 3.128101038321039,
            "loss_sequences_upper_95": 3.1329284664378405,
            "loss_tokens_lower_95": 3.1200809739583337,
            "loss_tokens_upper_95": 3.1409334583333335,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.791405796518131,
            "data_time": 0.01010677433013916,
            "batch_time": 0.04541806316375732,
            "samples_per_second": 870199.3385089791,
            "samples_per_second_per_gpu": 108774.91731362238,
            "loss_sequences_lower_95": 2.754704253527583,
            "loss_sequences_upper_95": 2.836176963339047,
            "loss_tokens_lower_95": 2.779348546875,
            "loss_tokens_upper_95": 2.8035364791666666,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1456298328674945,
            "data_time": 0.001586876221393284,
            "batch_time": 0.03685228271703971,
            "samples_per_second": 907848.9930653669,
            "samples_per_second_per_gpu": 113481.12413317086,
            "loss_sequences_lower_95": 3.120529095521907,
            "loss_sequences_upper_95": 3.171304878785438,
            "loss_tokens_lower_95": 3.1345685208333336,
            "loss_tokens_upper_95": 3.156612098958333,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1754684744444495,
            "data_time": 0.009792902555123743,
            "batch_time": 0.04497838020324707,
            "samples_per_second": 870274.38014785,
            "samples_per_second_per_gpu": 108784.29751848125,
            "loss_sequences_lower_95": 3.133820431586924,
            "loss_sequences_upper_95": 3.2254116578888747,
            "loss_tokens_lower_95": 3.1648510520833333,
            "loss_tokens_upper_95": 3.1859122239583333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2767009866391614,
            "data_time": 0.003877214763475501,
            "batch_time": 0.039379291884277176,
            "samples_per_second": 899596.6731298644,
            "samples_per_second_per_gpu": 112449.58414123305,
            "loss_sequences_lower_95": 3.237335956088545,
            "loss_sequences_upper_95": 3.319026189268567,
            "loss_tokens_lower_95": 3.264885239583333,
            "loss_tokens_upper_95": 3.28830934375,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8194809139504726,
            "data_time": 0.001654803266727516,
            "batch_time": 0.03706974928764107,
            "samples_per_second": 908422.277815916,
            "samples_per_second_per_gpu": 113552.7847269895,
            "loss_sequences_lower_95": 2.7924114068478953,
            "loss_sequences_upper_95": 2.846189861686862,
            "loss_tokens_lower_95": 2.8059617604166665,
            "loss_tokens_upper_95": 2.8336923229166664,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.656719384917414,
            "data_time": 0.0017582599411744644,
            "batch_time": 0.03793928424766601,
            "samples_per_second": 906862.0326096708,
            "samples_per_second_per_gpu": 113357.75407620885,
            "loss_sequences_lower_95": 3.6383015788612565,
            "loss_sequences_upper_95": 3.677021494600785,
            "loss_tokens_lower_95": 3.6455248854166666,
            "loss_tokens_upper_95": 3.66786075,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2582131667834955,
            "data_time": 0.013235585084037176,
            "batch_time": 0.05548797145722404,
            "samples_per_second": 869271.6716761171,
            "samples_per_second_per_gpu": 108658.95895951464,
            "loss_sequences_lower_95": 3.195437814355866,
            "loss_sequences_upper_95": 3.3319359011766387,
            "loss_tokens_lower_95": 3.247256010416667,
            "loss_tokens_upper_95": 3.2692054010416665,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.2916729299447285,
            "data_time": 0.010070276446640491,
            "batch_time": 0.04549624118953943,
            "samples_per_second": 875068.0402132217,
            "samples_per_second_per_gpu": 109383.50502665271,
            "loss_sequences_lower_95": 4.214404284812717,
            "loss_sequences_upper_95": 4.387630357101501,
            "loss_tokens_lower_95": 4.278919364583333,
            "loss_tokens_upper_95": 4.304481520833333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3131313898048123,
            "data_time": 0.0013003442737624298,
            "batch_time": 0.036647957297147676,
            "samples_per_second": 909151.1437904104,
            "samples_per_second_per_gpu": 113643.8929738013,
            "loss_sequences_lower_95": 3.3043643055900542,
            "loss_sequences_upper_95": 3.322038648160203,
            "loss_tokens_lower_95": 3.3023800885416663,
            "loss_tokens_upper_95": 3.323855598958333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1486233060233846,
            "data_time": 0.0025725866932356785,
            "batch_time": 0.03796778213570855,
            "samples_per_second": 905255.4671516301,
            "samples_per_second_per_gpu": 113156.93339395376,
            "loss_sequences_lower_95": 3.1324891128126953,
            "loss_sequences_upper_95": 3.1653759836803026,
            "loss_tokens_lower_95": 3.1380109635416664,
            "loss_tokens_upper_95": 3.1592505364583334,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6893011580616175,
            "data_time": 0.010104302832260433,
            "batch_time": 0.04545055359248587,
            "samples_per_second": 869123.4453602526,
            "samples_per_second_per_gpu": 108640.43067003158,
            "loss_sequences_lower_95": 3.62600755672184,
            "loss_sequences_upper_95": 3.7687437479201,
            "loss_tokens_lower_95": 3.6768119895833333,
            "loss_tokens_upper_95": 3.7016524166666667,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.916888208350436,
            "data_time": 0.009927051475798466,
            "batch_time": 0.04561989050937364,
            "samples_per_second": 859801.9083296307,
            "samples_per_second_per_gpu": 107475.23854120384,
            "loss_sequences_lower_95": 2.8478338299846455,
            "loss_sequences_upper_95": 2.9935374024929193,
            "loss_tokens_lower_95": 2.9059513333333333,
            "loss_tokens_upper_95": 2.9279206666666666,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8175114501606333,
            "data_time": 0.08999747889382499,
            "batch_time": 0.12455026592527117,
            "samples_per_second": 514516.9643418445,
            "samples_per_second_per_gpu": 64314.62054273056,
            "loss_sequences_lower_95": 3.7555150985717773,
            "loss_sequences_upper_95": 3.8819720441644843,
            "loss_tokens_lower_95": 3.7962478290904653,
            "loss_tokens_upper_95": 3.839303502169522,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.182627230969532,
            "data_time": 0.014246265996586193,
            "batch_time": 0.049835400147871536,
            "samples_per_second": 850529.7848745331,
            "samples_per_second_per_gpu": 106316.22310931663,
            "loss_sequences_lower_95": 3.1255612921089195,
            "loss_sequences_upper_95": 3.2384763756576853,
            "loss_tokens_lower_95": 3.1705574999999997,
            "loss_tokens_upper_95": 3.194612583333333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.163061582318711,
            "data_time": 0.012990521887938181,
            "batch_time": 0.04852215821544329,
            "samples_per_second": 866490.3757639182,
            "samples_per_second_per_gpu": 108311.29697048977,
            "loss_sequences_lower_95": 5.093784463122526,
            "loss_sequences_upper_95": 5.24866184043381,
            "loss_tokens_lower_95": 5.151689447916667,
            "loss_tokens_upper_95": 5.174447572916667,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4649756326050056,
            "data_time": 0.038070134818553925,
            "batch_time": 0.07474401965737343,
            "samples_per_second": 769794.3976965337,
            "samples_per_second_per_gpu": 96224.29971206671,
            "loss_sequences_lower_95": 3.3297696285560483,
            "loss_sequences_upper_95": 3.7014960929995677,
            "loss_tokens_lower_95": 3.451572587060147,
            "loss_tokens_upper_95": 3.478547981137135,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5401595754342243,
            "data_time": 0.0015295309805924365,
            "batch_time": 0.03689846634050165,
            "samples_per_second": 904045.1316443422,
            "samples_per_second_per_gpu": 113005.64145554278,
            "loss_sequences_lower_95": 2.52662318568847,
            "loss_sequences_upper_95": 2.5538695298034466,
            "loss_tokens_lower_95": 2.5263330196352904,
            "loss_tokens_upper_95": 2.5541386721531834,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7897050443148617,
            "data_time": 0.0016482833559346047,
            "batch_time": 0.03709383978016058,
            "samples_per_second": 900568.3148803506,
            "samples_per_second_per_gpu": 112571.03936004383,
            "loss_sequences_lower_95": 2.7891581529451304,
            "loss_sequences_upper_95": 2.813994353597702,
            "loss_tokens_lower_95": 2.767561108286993,
            "loss_tokens_upper_95": 2.7857252063419007,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.819906417496879,
            "data_time": 0.0031996584522847745,
            "batch_time": 0.038708553350162773,
            "samples_per_second": 899023.0271421477,
            "samples_per_second_per_gpu": 112377.87839276846,
            "loss_sequences_lower_95": 4.071475864098665,
            "loss_sequences_upper_95": 4.353485352485091,
            "loss_tokens_lower_95": 3.2770010514858083,
            "loss_tokens_upper_95": 3.485949006079491,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.783202985420823,
            "data_time": 0.0035252577446876687,
            "batch_time": 0.03890093098929588,
            "samples_per_second": 895630.055535718,
            "samples_per_second_per_gpu": 111953.75694196475,
            "loss_sequences_lower_95": 3.85580107421875,
            "loss_sequences_upper_95": 4.052336246744792,
            "loss_tokens_lower_95": 3.545117279628538,
            "loss_tokens_upper_95": 3.6831156827338836,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7096193971915894,
            "data_time": 0.00473333843513132,
            "batch_time": 0.04049558912826699,
            "samples_per_second": 883881.4663230429,
            "samples_per_second_per_gpu": 110485.18329038036,
            "loss_sequences_lower_95": 2.752356003305061,
            "loss_sequences_upper_95": 2.8071234829593914,
            "loss_tokens_lower_95": 2.6207194404280805,
            "loss_tokens_upper_95": 2.649754657301649,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.244860698960044,
            "data_time": 0.022184763635907854,
            "batch_time": 0.05767880167279925,
            "samples_per_second": 836077.382630542,
            "samples_per_second_per_gpu": 104509.67282881775,
            "loss_sequences_lower_95": 2.223101113059304,
            "loss_sequences_upper_95": 2.3196735867587006,
            "loss_tokens_lower_95": 2.1800205764070846,
            "loss_tokens_upper_95": 2.2267526595409652,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0502760157293203,
            "data_time": 0.021461350843310356,
            "batch_time": 0.056587859988212585,
            "samples_per_second": 829440.8900400661,
            "samples_per_second_per_gpu": 103680.11125500826,
            "loss_sequences_lower_95": 3.029123173927774,
            "loss_sequences_upper_95": 3.2057636992785397,
            "loss_tokens_lower_95": 2.939556844268365,
            "loss_tokens_upper_95": 3.0263464859618314,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.42345308860143,
            "data_time": 0.01724425034645276,
            "batch_time": 0.05268576053472666,
            "samples_per_second": 834406.0515292418,
            "samples_per_second_per_gpu": 104300.75644115523,
            "loss_sequences_lower_95": 3.394300221761068,
            "loss_sequences_upper_95": 3.484314025878906,
            "loss_tokens_lower_95": 3.2975775833035574,
            "loss_tokens_upper_95": 3.496158157556107,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.001382122725647,
            "data_time": 0.0013094168457385652,
            "batch_time": 0.03670594272457724,
            "samples_per_second": 903957.9713860695,
            "samples_per_second_per_gpu": 112994.7464232587,
            "loss_sequences_lower_95": 5.005848783278382,
            "loss_sequences_upper_95": 5.085029783690517,
            "loss_tokens_lower_95": 4.870247174972526,
            "loss_tokens_upper_95": 4.9507489421078406,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.9950863737299387,
            "data_time": 0.002923261999283861,
            "batch_time": 0.03835145899113393,
            "samples_per_second": 898074.3471418907,
            "samples_per_second_per_gpu": 112259.29339273633,
            "loss_sequences_lower_95": 4.480672457402805,
            "loss_sequences_upper_95": 4.779769784432871,
            "loss_tokens_lower_95": 3.328351611727265,
            "loss_tokens_upper_95": 3.4589384350962904,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.694025695018801,
            "data_time": 0.0050906434252455425,
            "batch_time": 0.04040882474667317,
            "samples_per_second": 889338.278507923,
            "samples_per_second_per_gpu": 111167.28481349038,
            "loss_sequences_lower_95": 4.062863867437473,
            "loss_sequences_upper_95": 4.393091111948059,
            "loss_tokens_lower_95": 3.302095851547374,
            "loss_tokens_upper_95": 3.4506913221124758,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.9263831730846945,
            "data_time": 0.022672365818704878,
            "batch_time": 0.0585646778345108,
            "samples_per_second": 825669.852322683,
            "samples_per_second_per_gpu": 103208.73154033537,
            "loss_sequences_lower_95": 5.848403805249358,
            "loss_sequences_upper_95": 6.001788148923551,
            "loss_tokens_lower_95": 5.848899917950913,
            "loss_tokens_upper_95": 6.002809428733233,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.098838233947754,
            "data_time": 0.04923810408665584,
            "batch_time": 0.08558588761549729,
            "samples_per_second": 746419.7458052298,
            "samples_per_second_per_gpu": 93302.46822565372,
            "loss_sequences_lower_95": 2.9671254119873045,
            "loss_sequences_upper_95": 3.3079138031005857,
            "loss_tokens_lower_95": 2.80544448333904,
            "loss_tokens_upper_95": 3.2265039870478813,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.9225050908882273,
            "data_time": 0.0033295015615919616,
            "batch_time": 0.03887387994364483,
            "samples_per_second": 897178.7285754534,
            "samples_per_second_per_gpu": 112147.34107193167,
            "loss_sequences_lower_95": 3.8882798318853156,
            "loss_sequences_upper_95": 3.956908273843318,
            "loss_tokens_lower_95": 3.8880642534644476,
            "loss_tokens_upper_95": 3.9569064371661495,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.202188686305241,
            "data_time": 0.005018216750750145,
            "batch_time": 0.04052545973760658,
            "samples_per_second": 891051.4318317767,
            "samples_per_second_per_gpu": 111381.4289789721,
            "loss_sequences_lower_95": 4.15170735477132,
            "loss_sequences_upper_95": 4.252650285450578,
            "loss_tokens_lower_95": 4.150670616905969,
            "loss_tokens_upper_95": 4.253132638142404,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1904774288098623,
            "data_time": 0.003441821679183268,
            "batch_time": 0.038773690907538065,
            "samples_per_second": 894014.1093878952,
            "samples_per_second_per_gpu": 111751.7636734869,
            "loss_sequences_lower_95": 3.3273380117655056,
            "loss_sequences_upper_95": 3.457349741938843,
            "loss_tokens_lower_95": 3.032092627083907,
            "loss_tokens_upper_95": 3.088520260793449,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.095016088485718,
            "data_time": 0.010240700095891953,
            "batch_time": 0.0457444041967392,
            "samples_per_second": 857346.0778392368,
            "samples_per_second_per_gpu": 107168.2597299046,
            "loss_sequences_lower_95": 5.269946704101562,
            "loss_sequences_upper_95": 5.814633337402344,
            "loss_tokens_lower_95": 4.525090248011975,
            "loss_tokens_upper_95": 4.880911483108248,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5527015179395676,
            "data_time": 0.15143415331840515,
            "batch_time": 0.19009113311767578,
            "samples_per_second": 484897.1897889729,
            "samples_per_second_per_gpu": 60612.14872362161,
            "loss_sequences_lower_95": 3.3453994035720824,
            "loss_sequences_upper_95": 3.79023659825325,
            "loss_tokens_lower_95": 3.0704932815727144,
            "loss_tokens_upper_95": 3.943206638029252,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.013013451948933,
            "data_time": 0.02760824497709883,
            "batch_time": 0.06311874186739008,
            "samples_per_second": 772500.030057793,
            "samples_per_second_per_gpu": 96562.50375722413,
            "loss_sequences_lower_95": 4.299926205339103,
            "loss_sequences_upper_95": 4.845413988486103,
            "loss_tokens_lower_95": 3.1020067010683836,
            "loss_tokens_upper_95": 3.451977821186672,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.158180216505518,
            "data_time": 0.002857730206516054,
            "batch_time": 0.03822721064918571,
            "samples_per_second": 896824.0595074989,
            "samples_per_second_per_gpu": 112103.00743843736,
            "loss_sequences_lower_95": 2.133733578903178,
            "loss_sequences_upper_95": 2.182302988434838,
            "loss_tokens_lower_95": 2.132899165122256,
            "loss_tokens_upper_95": 2.182989136648782,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.4197950544150326,
            "data_time": 0.002322322044624207,
            "batch_time": 0.03777292156848604,
            "samples_per_second": 899909.2416160019,
            "samples_per_second_per_gpu": 112488.65520200024,
            "loss_sequences_lower_95": 2.39432086644764,
            "loss_sequences_upper_95": 2.5285253991531027,
            "loss_tokens_lower_95": 2.2808177206753557,
            "loss_tokens_upper_95": 2.4124602770831434,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0447261660963623,
            "data_time": 0.019650951027870178,
            "batch_time": 0.05452541344695621,
            "samples_per_second": 824965.8524760292,
            "samples_per_second_per_gpu": 103120.73155950365,
            "loss_sequences_lower_95": 2.9117362724555718,
            "loss_sequences_upper_95": 3.3168699662763994,
            "loss_tokens_lower_95": 2.7812397521916195,
            "loss_tokens_upper_95": 3.069244740474676,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.471868290615157,
            "data_time": 0.004694492742419243,
            "batch_time": 0.040051809325814244,
            "samples_per_second": 888689.04859774,
            "samples_per_second_per_gpu": 111086.1310747175,
            "loss_sequences_lower_95": 3.51691119909851,
            "loss_sequences_upper_95": 3.6702439691292175,
            "loss_tokens_lower_95": 3.322152661227202,
            "loss_tokens_upper_95": 3.4619810134187468,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6091350215237314,
            "data_time": 0.03128819522403535,
            "batch_time": 0.06688319217591059,
            "samples_per_second": 813413.4672962597,
            "samples_per_second_per_gpu": 101676.68341203246,
            "loss_sequences_lower_95": 2.464711966165682,
            "loss_sequences_upper_95": 2.882438566626572,
            "loss_tokens_lower_95": 2.34741432974039,
            "loss_tokens_upper_95": 2.678410540980577,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.25316923197376,
            "data_time": 0.0018814976006758198,
            "batch_time": 0.0373015886680917,
            "samples_per_second": 899245.0469571499,
            "samples_per_second_per_gpu": 112405.63086964373,
            "loss_sequences_lower_95": 5.2435511809393125,
            "loss_sequences_upper_95": 5.262859437121799,
            "loss_tokens_lower_95": 5.243272013777755,
            "loss_tokens_upper_95": 5.262801857246449,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.1889497232668609,
            "data_time": 0.045138952948830344,
            "batch_time": 0.08007386814464222,
            "samples_per_second": 747426.928415734,
            "samples_per_second_per_gpu": 93428.36605196675,
            "loss_sequences_lower_95": 1.1392886689565713,
            "loss_sequences_upper_95": 1.3025000266658449,
            "loss_tokens_lower_95": 1.0127285460488702,
            "loss_tokens_upper_95": 1.2575660510341367,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.402886101728465,
            "data_time": 0.001146605495934914,
            "batch_time": 0.03657085946249742,
            "samples_per_second": 902869.4849710679,
            "samples_per_second_per_gpu": 112858.68562138348,
            "loss_sequences_lower_95": 4.767557058274371,
            "loss_sequences_upper_95": 4.812953088148586,
            "loss_tokens_lower_95": 3.8429213189071567,
            "loss_tokens_upper_95": 3.887770218810445,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.925842994213104,
            "data_time": 0.005729903777440389,
            "batch_time": 0.04129200227676876,
            "samples_per_second": 885748.3644516751,
            "samples_per_second_per_gpu": 110718.54555645939,
            "loss_sequences_lower_95": 4.916317443847656,
            "loss_sequences_upper_95": 5.122858117675781,
            "loss_tokens_lower_95": 4.7274857782893465,
            "loss_tokens_upper_95": 4.921517691792948,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8657328885534534,
            "data_time": 0.022044753624221026,
            "batch_time": 0.057448332592592403,
            "samples_per_second": 832103.0616390172,
            "samples_per_second_per_gpu": 104012.88270487715,
            "loss_sequences_lower_95": 2.7674734895125677,
            "loss_sequences_upper_95": 2.9649320254118545,
            "loss_tokens_lower_95": 2.7655388077445653,
            "loss_tokens_upper_95": 2.965096913213315,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.522511305231037,
            "data_time": 0.004452339137893125,
            "batch_time": 0.03983502323368946,
            "samples_per_second": 891687.3344960308,
            "samples_per_second_per_gpu": 111460.91681200385,
            "loss_sequences_lower_95": 7.4034667228929925,
            "loss_sequences_upper_95": 7.636732251716382,
            "loss_tokens_lower_95": 7.404304957534328,
            "loss_tokens_upper_95": 7.640731977982954,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.464781619787216,
            "data_time": 0.003957176145086897,
            "batch_time": 0.0394519567489624,
            "samples_per_second": 893765.8606717485,
            "samples_per_second_per_gpu": 111720.73258396857,
            "loss_sequences_lower_95": 1.5006734375,
            "loss_sequences_upper_95": 1.5496433390299478,
            "loss_tokens_lower_95": 1.3857899605154562,
            "loss_tokens_upper_95": 1.4602311823166765,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.33024286883218,
            "data_time": 0.025533348321914673,
            "batch_time": 0.060551126088414876,
            "samples_per_second": 807283.5671933938,
            "samples_per_second_per_gpu": 100910.44589917423,
            "loss_sequences_lower_95": 5.015629606701078,
            "loss_sequences_upper_95": 5.645103730701265,
            "loss_tokens_lower_95": 5.0139146931966145,
            "loss_tokens_upper_95": 5.648565659295945,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.299095407128334,
            "data_time": 0.14459457993507385,
            "batch_time": 0.18413284420967102,
            "samples_per_second": 489561.8649905831,
            "samples_per_second_per_gpu": 61195.23312382289,
            "loss_sequences_lower_95": 2.110056531429291,
            "loss_sequences_upper_95": 2.995883160829544,
            "loss_tokens_lower_95": 1.8016159907075546,
            "loss_tokens_upper_95": 2.275949241402223,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.525286318778992,
            "data_time": 0.005800214551744007,
            "batch_time": 0.04127372115377396,
            "samples_per_second": 885551.3107853831,
            "samples_per_second_per_gpu": 110693.91384817289,
            "loss_sequences_lower_95": 7.466007543945312,
            "loss_sequences_upper_95": 7.7825384765625,
            "loss_tokens_lower_95": 7.241894985657255,
            "loss_tokens_upper_95": 7.52138081145569,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.794870886325836,
            "data_time": 0.005996604287435138,
            "batch_time": 0.04129054054381356,
            "samples_per_second": 889508.6715645817,
            "samples_per_second_per_gpu": 111188.58394557271,
            "loss_sequences_lower_95": 6.910658740234375,
            "loss_sequences_upper_95": 7.139181164550781,
            "loss_tokens_lower_95": 6.531894313638426,
            "loss_tokens_upper_95": 6.726962183998654,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.852498981963057,
            "data_time": 0.0034220891652697303,
            "batch_time": 0.03888433099191723,
            "samples_per_second": 894099.0239275739,
            "samples_per_second_per_gpu": 111762.37799094674,
            "loss_sequences_lower_95": 5.830635777216727,
            "loss_sequences_upper_95": 5.873879326600738,
            "loss_tokens_lower_95": 5.830762046125964,
            "loss_tokens_upper_95": 5.874097784581378,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.3763128632590886,
            "data_time": 0.008503338361434706,
            "batch_time": 0.04372316184720964,
            "samples_per_second": 875968.2986375934,
            "samples_per_second_per_gpu": 109496.03732969917,
            "loss_sequences_lower_95": 2.3205459946311566,
            "loss_sequences_upper_95": 2.433934778075797,
            "loss_tokens_lower_95": 2.3197532360821094,
            "loss_tokens_upper_95": 2.4321962136826754,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.244174214363098,
            "data_time": 0.006041829075132098,
            "batch_time": 0.041556613312827215,
            "samples_per_second": 882225.070066739,
            "samples_per_second_per_gpu": 110278.13375834237,
            "loss_sequences_lower_95": 6.1813004760742185,
            "loss_sequences_upper_95": 6.310574353027344,
            "loss_tokens_lower_95": 6.180427551269531,
            "loss_tokens_upper_95": 6.309649462890625,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.677491630138715,
            "data_time": 0.001618668983473895,
            "batch_time": 0.037001444306400635,
            "samples_per_second": 902213.7627656781,
            "samples_per_second_per_gpu": 112776.72034570976,
            "loss_sequences_lower_95": 3.1230559107438505,
            "loss_sequences_upper_95": 3.1987088864415796,
            "loss_tokens_lower_95": 2.1325848784151984,
            "loss_tokens_upper_95": 2.1856800104242207,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9204795302739783,
            "data_time": 0.01850506578172956,
            "batch_time": 0.054128028665270125,
            "samples_per_second": 825439.394171775,
            "samples_per_second_per_gpu": 103179.92427147187,
            "loss_sequences_lower_95": 2.8228140361273466,
            "loss_sequences_upper_95": 3.0230321400201143,
            "loss_tokens_lower_95": 2.8225072433699423,
            "loss_tokens_upper_95": 3.018479315914325,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6179745211320764,
            "data_time": 0.010645315051078796,
            "batch_time": 0.04648967180401087,
            "samples_per_second": 872449.7317618834,
            "samples_per_second_per_gpu": 109056.21647023542,
            "loss_sequences_lower_95": 2.5588730217428766,
            "loss_sequences_upper_95": 2.678126573749617,
            "loss_tokens_lower_95": 2.558241002699908,
            "loss_tokens_upper_95": 2.6786940271714155,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.711778336967216,
            "data_time": 0.0018761962853422402,
            "batch_time": 0.03726728459124745,
            "samples_per_second": 900281.8889707614,
            "samples_per_second_per_gpu": 112535.23612134518,
            "loss_sequences_lower_95": 4.247164017482462,
            "loss_sequences_upper_95": 4.341738266570368,
            "loss_tokens_lower_95": 2.9822710783123143,
            "loss_tokens_upper_95": 3.0578019385035926,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.3212154580171775,
            "data_time": 0.027862300475438435,
            "batch_time": 0.06380543609460194,
            "samples_per_second": 818929.9665893689,
            "samples_per_second_per_gpu": 102366.24582367111,
            "loss_sequences_lower_95": 6.253135285554109,
            "loss_sequences_upper_95": 6.38710784104766,
            "loss_tokens_lower_95": 6.252823521851232,
            "loss_tokens_upper_95": 6.385635149920428,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.529313910845952,
            "data_time": 0.0033668768711579153,
            "batch_time": 0.038820337608765916,
            "samples_per_second": 895398.142797331,
            "samples_per_second_per_gpu": 111924.76784966637,
            "loss_sequences_lower_95": 3.4980062446244267,
            "loss_sequences_upper_95": 3.5601186657826833,
            "loss_tokens_lower_95": 3.4985954597309825,
            "loss_tokens_upper_95": 3.5601336576595948,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5477231758312113,
            "data_time": 0.023604512214660645,
            "batch_time": 0.058502121405168014,
            "samples_per_second": 805177.3215288218,
            "samples_per_second_per_gpu": 100647.16519110273,
            "loss_sequences_lower_95": 3.4255451239428476,
            "loss_sequences_upper_95": 3.674584101704718,
            "loss_tokens_lower_95": 3.42215725056176,
            "loss_tokens_upper_95": 3.6711479483298883,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.1908252269029616,
            "data_time": 0.08008220791816711,
            "batch_time": 0.11600758135318756,
            "samples_per_second": 639437.9942708344,
            "samples_per_second_per_gpu": 79929.7492838543,
            "loss_sequences_lower_95": 1.9965144983927408,
            "loss_sequences_upper_95": 2.5334333419799804,
            "loss_tokens_lower_95": 1.7983696937561036,
            "loss_tokens_upper_95": 2.4658983919355606,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.2520555476347606,
            "data_time": 0.07941778749227524,
            "batch_time": 0.11537996679544449,
            "samples_per_second": 651836.04857884,
            "samples_per_second_per_gpu": 81479.506072355,
            "loss_sequences_lower_95": 2.0784610398610432,
            "loss_sequences_upper_95": 2.7066895484924314,
            "loss_tokens_lower_95": 1.7158870954192087,
            "loss_tokens_upper_95": 2.52143124355359,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.595196667195069,
            "data_time": 0.003255299079270837,
            "batch_time": 0.03871447917922516,
            "samples_per_second": 896911.1168674667,
            "samples_per_second_per_gpu": 112113.88960843334,
            "loss_sequences_lower_95": 6.570842763369845,
            "loss_sequences_upper_95": 6.620021458486745,
            "loss_tokens_lower_95": 6.570067625874447,
            "loss_tokens_upper_95": 6.620076729910714,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.5699215367110513,
            "data_time": 0.001065186235067359,
            "batch_time": 0.03648146867947714,
            "samples_per_second": 903199.0908679668,
            "samples_per_second_per_gpu": 112899.88635849585,
            "loss_sequences_lower_95": 0.6488632593452951,
            "loss_sequences_upper_95": 0.6649358849954908,
            "loss_tokens_lower_95": 0.49305762523428937,
            "loss_tokens_upper_95": 0.5019295814067087,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.041988432876707,
            "data_time": 0.03962969034910202,
            "batch_time": 0.07576807960867882,
            "samples_per_second": 794734.13234737,
            "samples_per_second_per_gpu": 99341.76654342125,
            "loss_sequences_lower_95": 4.091248675969642,
            "loss_sequences_upper_95": 4.464756558636042,
            "loss_tokens_lower_95": 3.7028634678787458,
            "loss_tokens_upper_95": 3.962718401210544,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.729879456597406,
            "data_time": 0.1228769960857573,
            "batch_time": 0.1583529540470668,
            "samples_per_second": 505732.17766131443,
            "samples_per_second_per_gpu": 63216.522207664304,
            "loss_sequences_lower_95": 6.276624329025681,
            "loss_sequences_upper_95": 7.441356432115709,
            "loss_tokens_lower_95": 5.734116825056665,
            "loss_tokens_upper_95": 7.531152230721932,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8720401707218914,
            "data_time": 0.030597783270336333,
            "batch_time": 0.06608785334087554,
            "samples_per_second": 811386.1018915047,
            "samples_per_second_per_gpu": 101423.26273643809,
            "loss_sequences_lower_95": 3.879589378543016,
            "loss_sequences_upper_95": 4.190898355623571,
            "loss_tokens_lower_95": 3.532148051474453,
            "loss_tokens_upper_95": 3.7416504809714044,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.99826349136306,
            "data_time": 0.030592472780318486,
            "batch_time": 0.06620330469948905,
            "samples_per_second": 812129.093665629,
            "samples_per_second_per_gpu": 101516.13670820363,
            "loss_sequences_lower_95": 3.992793134363686,
            "loss_sequences_upper_95": 4.270049751095656,
            "loss_tokens_lower_95": 3.695926072603935,
            "loss_tokens_upper_95": 3.8761583036787557,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.957270067639467,
            "data_time": 0.032904806591215585,
            "batch_time": 0.06863188459759667,
            "samples_per_second": 809101.2718921875,
            "samples_per_second_per_gpu": 101137.65898652344,
            "loss_sequences_lower_95": 3.9611712665092655,
            "loss_sequences_upper_95": 4.311609510096108,
            "loss_tokens_lower_95": 3.5738577054656053,
            "loss_tokens_upper_95": 3.8436855533330037,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.134413543270855,
            "data_time": 0.03172913619450161,
            "batch_time": 0.06754241670880999,
            "samples_per_second": 813564.6903985443,
            "samples_per_second_per_gpu": 101695.58629981804,
            "loss_sequences_lower_95": 4.108167024938072,
            "loss_sequences_upper_95": 4.379583786755074,
            "loss_tokens_lower_95": 3.8547739189361856,
            "loss_tokens_upper_95": 4.0213516758238415,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5930618632654228,
            "data_time": 0.03210044201509452,
            "batch_time": 0.06885436140460732,
            "samples_per_second": 799340.0923883683,
            "samples_per_second_per_gpu": 99917.51154854604,
            "loss_sequences_lower_95": 3.527440174173864,
            "loss_sequences_upper_95": 3.752582772770283,
            "loss_tokens_lower_95": 3.3729313327179002,
            "loss_tokens_upper_95": 3.514376016836063,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.081526445179451,
            "data_time": 0.02924660841623942,
            "batch_time": 0.0651623010635376,
            "samples_per_second": 803530.5818488011,
            "samples_per_second_per_gpu": 100441.32273110014,
            "loss_sequences_lower_95": 3.0977729704321884,
            "loss_sequences_upper_95": 3.3251253081531056,
            "loss_tokens_lower_95": 2.8404531038087706,
            "loss_tokens_upper_95": 2.9492042163483276,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-1.0/params.txt",
    "uuid": "59285fd4-b4cc-4739-99fb-1cd96834ab72",
    "creation_date": "2023_12_13-16_18_34"
}