{
    "name": "rpj-d=576_l=24_h=8-32.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 98353520640,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 32.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "19670704128",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=576_l=24_h=8-32.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 2.5881362656752267,
            "data_time": 0.03792188689112663,
            "batch_time": 0.382742702960968,
            "samples_per_second": 829505.5011778545,
            "samples_per_second_per_gpu": 103688.18764723181,
            "loss_sequences_lower_95": 2.5226561482747396,
            "loss_sequences_upper_95": 2.65030392964681,
            "loss_tokens_lower_95": 2.5766869735717775,
            "loss_tokens_upper_95": 2.5993667793273927,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1051019658045784,
            "data_time": 0.0012085709777032167,
            "batch_time": 0.030872386508702283,
            "samples_per_second": 1076171.5706206854,
            "samples_per_second_per_gpu": 134521.44632758567,
            "loss_sequences_lower_95": 3.1024247252173556,
            "loss_sequences_upper_95": 3.107736975214414,
            "loss_tokens_lower_95": 3.094548265625,
            "loss_tokens_upper_95": 3.1155276666666665,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.6172258085134077,
            "data_time": 0.010270883560180664,
            "batch_time": 0.0391885871887207,
            "samples_per_second": 1070401.8615543533,
            "samples_per_second_per_gpu": 133800.23269429416,
            "loss_sequences_lower_95": 2.5926551476303414,
            "loss_sequences_upper_95": 2.6416465292171556,
            "loss_tokens_lower_95": 2.6061047552083334,
            "loss_tokens_upper_95": 2.628577927083333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.948537926133146,
            "data_time": 0.001559450438148097,
            "batch_time": 0.02984853059445557,
            "samples_per_second": 1127584.9351935354,
            "samples_per_second_per_gpu": 140948.11689919193,
            "loss_sequences_lower_95": 2.936396831709085,
            "loss_sequences_upper_95": 2.9602592018363403,
            "loss_tokens_lower_95": 2.9379720364583335,
            "loss_tokens_upper_95": 2.9587630572916668,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1055666594301368,
            "data_time": 0.009322714520640582,
            "batch_time": 0.03890101463196287,
            "samples_per_second": 1050522.3663500226,
            "samples_per_second_per_gpu": 131315.29579375283,
            "loss_sequences_lower_95": 3.071183562424421,
            "loss_sequences_upper_95": 3.139058466210142,
            "loss_tokens_lower_95": 3.0950355104166665,
            "loss_tokens_upper_95": 3.11603609375,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.895631591938208,
            "data_time": 0.003826508055562558,
            "batch_time": 0.03245782333871593,
            "samples_per_second": 1111730.4662790818,
            "samples_per_second_per_gpu": 138966.30828488522,
            "loss_sequences_lower_95": 2.8547815132270777,
            "loss_sequences_upper_95": 2.9362734381638766,
            "loss_tokens_lower_95": 2.8848927083333336,
            "loss_tokens_upper_95": 2.9061993802083332,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.6016037618505712,
            "data_time": 0.0015821222574442478,
            "batch_time": 0.03008536753802276,
            "samples_per_second": 1122151.963340568,
            "samples_per_second_per_gpu": 140268.995417571,
            "loss_sequences_lower_95": 1.5805299396125636,
            "loss_sequences_upper_95": 1.6229810467155612,
            "loss_tokens_lower_95": 1.5920945182291666,
            "loss_tokens_upper_95": 1.6113328229166666,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.516929026049469,
            "data_time": 0.0017742297979440214,
            "batch_time": 0.030687764318155872,
            "samples_per_second": 1121412.4560705135,
            "samples_per_second_per_gpu": 140176.55700881418,
            "loss_sequences_lower_95": 3.508530616001309,
            "loss_sequences_upper_95": 3.5251998629744765,
            "loss_tokens_lower_95": 3.5063596875,
            "loss_tokens_upper_95": 3.5273566041666666,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2857461600768856,
            "data_time": 0.009535414831978934,
            "batch_time": 0.03994341218282306,
            "samples_per_second": 1020299.9487729135,
            "samples_per_second_per_gpu": 127537.4935966142,
            "loss_sequences_lower_95": 3.243803225881685,
            "loss_sequences_upper_95": 3.331976963446392,
            "loss_tokens_lower_95": 3.2748782135416663,
            "loss_tokens_upper_95": 3.2967651874999997,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.911627171539035,
            "data_time": 0.00914271455258131,
            "batch_time": 0.03846006374806166,
            "samples_per_second": 1068108.1994023258,
            "samples_per_second_per_gpu": 133513.52492529072,
            "loss_sequences_lower_95": 3.882382515767817,
            "loss_sequences_upper_95": 3.938089869804533,
            "loss_tokens_lower_95": 3.89986415625,
            "loss_tokens_upper_95": 3.923902989583333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0379575236261553,
            "data_time": 0.001268063090735509,
            "batch_time": 0.029538383873301554,
            "samples_per_second": 1131763.0906609115,
            "samples_per_second_per_gpu": 141470.38633261394,
            "loss_sequences_lower_95": 3.0296934755400335,
            "loss_sequences_upper_95": 3.046134688806187,
            "loss_tokens_lower_95": 3.02763315625,
            "loss_tokens_upper_95": 3.0483390052083332,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9410414371029727,
            "data_time": 0.002575033808826507,
            "batch_time": 0.03184376688027362,
            "samples_per_second": 1095403.140749698,
            "samples_per_second_per_gpu": 136925.39259371225,
            "loss_sequences_lower_95": 2.9310735344241334,
            "loss_sequences_upper_95": 2.9509022272214054,
            "loss_tokens_lower_95": 2.9308874427083333,
            "loss_tokens_upper_95": 2.9514088072916667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4775828899282955,
            "data_time": 0.009932790349123507,
            "batch_time": 0.03950281670913395,
            "samples_per_second": 1046621.7353824124,
            "samples_per_second_per_gpu": 130827.71692280156,
            "loss_sequences_lower_95": 3.444393605199353,
            "loss_sequences_upper_95": 3.5093626691651876,
            "loss_tokens_lower_95": 3.4665941041666666,
            "loss_tokens_upper_95": 3.488620838541667,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.840200973139761,
            "data_time": 0.009872979851832903,
            "batch_time": 0.04004953202023449,
            "samples_per_second": 1030269.5035683489,
            "samples_per_second_per_gpu": 128783.68794604362,
            "loss_sequences_lower_95": 2.7784895017045095,
            "loss_sequences_upper_95": 2.8996692043700674,
            "loss_tokens_lower_95": 2.8291992604166665,
            "loss_tokens_upper_95": 2.85118375,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.625803524797613,
            "data_time": 0.08561919416700091,
            "batch_time": 0.11903082472937447,
            "samples_per_second": 540364.762373428,
            "samples_per_second_per_gpu": 67545.5952966785,
            "loss_sequences_lower_95": 3.5658725651827723,
            "loss_sequences_upper_95": 3.684831948713823,
            "loss_tokens_lower_95": 3.60584565075961,
            "loss_tokens_upper_95": 3.6462638681585138,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.5149191652720595,
            "data_time": 0.013619015162641352,
            "batch_time": 0.04263473505323583,
            "samples_per_second": 1053394.3945373958,
            "samples_per_second_per_gpu": 131674.29931717447,
            "loss_sequences_lower_95": 2.4244674638130923,
            "loss_sequences_upper_95": 2.605327096833432,
            "loss_tokens_lower_95": 2.5044492760416666,
            "loss_tokens_upper_95": 2.525135953125,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.483953111089943,
            "data_time": 0.011531263589859009,
            "batch_time": 0.041310312847296395,
            "samples_per_second": 1045264.6512950142,
            "samples_per_second_per_gpu": 130658.08141187677,
            "loss_sequences_lower_95": 5.432372628164165,
            "loss_sequences_upper_95": 5.533364328671256,
            "loss_tokens_lower_95": 5.472056135416667,
            "loss_tokens_upper_95": 5.495692677083333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1545723422628935,
            "data_time": 0.03723708540201187,
            "batch_time": 0.06662694364786148,
            "samples_per_second": 952479.8433838274,
            "samples_per_second_per_gpu": 119059.98042297842,
            "loss_sequences_lower_95": 3.1188810754994876,
            "loss_sequences_upper_95": 3.1887741839299437,
            "loss_tokens_lower_95": 3.1427197065509733,
            "loss_tokens_upper_95": 3.166318311847624,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8801681138039177,
            "data_time": 0.0017213714489361148,
            "batch_time": 0.030658347290306264,
            "samples_per_second": 1100791.4911505566,
            "samples_per_second_per_gpu": 137598.93639381957,
            "loss_sequences_lower_95": 3.859884068497632,
            "loss_sequences_upper_95": 3.900739279093523,
            "loss_tokens_lower_95": 3.8593265821953784,
            "loss_tokens_upper_95": 3.9008037480749715,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.8776768535001658,
            "data_time": 0.0019468883894811012,
            "batch_time": 0.030711497281007707,
            "samples_per_second": 1105061.0196092145,
            "samples_per_second_per_gpu": 138132.6274511518,
            "loss_sequences_lower_95": 2.8712998667316025,
            "loss_sequences_upper_95": 2.8963911192133662,
            "loss_tokens_lower_95": 2.8581167587748975,
            "loss_tokens_upper_95": 2.876854694256774,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3483181895761205,
            "data_time": 0.0030783627192778593,
            "batch_time": 0.032204575183591565,
            "samples_per_second": 1090054.8795688224,
            "samples_per_second_per_gpu": 136256.8599461028,
            "loss_sequences_lower_95": 3.60155688372771,
            "loss_sequences_upper_95": 3.884581741095152,
            "loss_tokens_lower_95": 2.8044143265930277,
            "loss_tokens_upper_95": 3.0061425653275404,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5441997268535195,
            "data_time": 0.003927041717032169,
            "batch_time": 0.0328384002155446,
            "samples_per_second": 1092886.1452989548,
            "samples_per_second_per_gpu": 136610.76816236935,
            "loss_sequences_lower_95": 3.6194728841145833,
            "loss_sequences_upper_95": 3.8137799560546877,
            "loss_tokens_lower_95": 3.314685534591195,
            "loss_tokens_upper_95": 3.4535130331171384,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.580003859872073,
            "data_time": 0.004500787901842397,
            "batch_time": 0.03323922941228024,
            "samples_per_second": 1096792.812184559,
            "samples_per_second_per_gpu": 137099.10152306987,
            "loss_sequences_lower_95": 2.624416265133387,
            "loss_sequences_upper_95": 2.6814124673491992,
            "loss_tokens_lower_95": 2.489631189401639,
            "loss_tokens_upper_95": 2.5192569756174783,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.912163425575603,
            "data_time": 0.02306977553027017,
            "batch_time": 0.05275385933262961,
            "samples_per_second": 1009447.9429250655,
            "samples_per_second_per_gpu": 126180.99286563319,
            "loss_sequences_lower_95": 1.894627238186923,
            "loss_sequences_upper_95": 1.9910485458374023,
            "loss_tokens_lower_95": 1.852405782336491,
            "loss_tokens_upper_95": 1.8941932568275723,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9698247014259804,
            "data_time": 0.01985890232026577,
            "batch_time": 0.04932234808802605,
            "samples_per_second": 995891.7582497518,
            "samples_per_second_per_gpu": 124486.46978121897,
            "loss_sequences_lower_95": 2.953526131766183,
            "loss_sequences_upper_95": 3.1324067034040177,
            "loss_tokens_lower_95": 2.8618431443593435,
            "loss_tokens_upper_95": 2.9496264995556873,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.192864816983541,
            "data_time": 0.016616219129317846,
            "batch_time": 0.04600180570895855,
            "samples_per_second": 1016025.3689911887,
            "samples_per_second_per_gpu": 127003.17112389859,
            "loss_sequences_lower_95": 3.1710255177815756,
            "loss_sequences_upper_95": 3.2938731587727865,
            "loss_tokens_lower_95": 3.04372142136318,
            "loss_tokens_upper_95": 3.2334934229903816,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.210488897166266,
            "data_time": 0.0015108194445991197,
            "batch_time": 0.030655970173127985,
            "samples_per_second": 1094300.6384962169,
            "samples_per_second_per_gpu": 136787.5798120271,
            "loss_sequences_lower_95": 5.216963949316593,
            "loss_sequences_upper_95": 5.299572198249348,
            "loss_tokens_lower_95": 5.071254085613861,
            "loss_tokens_upper_95": 5.155515784019781,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9531277446985644,
            "data_time": 0.002862281047257801,
            "batch_time": 0.03170919418334961,
            "samples_per_second": 1100308.0025881855,
            "samples_per_second_per_gpu": 137538.50032352319,
            "loss_sequences_lower_95": 4.4370874886560925,
            "loss_sequences_upper_95": 4.730198921177925,
            "loss_tokens_lower_95": 3.273878993173819,
            "loss_tokens_upper_95": 3.40274784597454,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.657837779550422,
            "data_time": 0.004911937020920418,
            "batch_time": 0.0335529378137073,
            "samples_per_second": 1092990.1453435607,
            "samples_per_second_per_gpu": 136623.7681679451,
            "loss_sequences_lower_95": 4.028282597772905,
            "loss_sequences_upper_95": 4.359148534572979,
            "loss_tokens_lower_95": 3.2713494713516456,
            "loss_tokens_upper_95": 3.4187488419273353,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.533805226626462,
            "data_time": 0.023498469165393283,
            "batch_time": 0.05397094999040876,
            "samples_per_second": 981223.7497395078,
            "samples_per_second_per_gpu": 122652.96871743847,
            "loss_sequences_lower_95": 5.435051765616082,
            "loss_sequences_upper_95": 5.629853228564675,
            "loss_tokens_lower_95": 5.435714345435573,
            "loss_tokens_upper_95": 5.6294632306381995,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1985859632492066,
            "data_time": 0.04724946847328773,
            "batch_time": 0.07747054558533889,
            "samples_per_second": 919663.1836653004,
            "samples_per_second_per_gpu": 114957.89795816255,
            "loss_sequences_lower_95": 3.0619877166748046,
            "loss_sequences_upper_95": 3.4130797119140626,
            "loss_tokens_lower_95": 2.887157728164482,
            "loss_tokens_upper_95": 3.3341899182587484,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.510062270057116,
            "data_time": 0.003227960600199631,
            "batch_time": 0.032124875513322514,
            "samples_per_second": 1102294.5447721751,
            "samples_per_second_per_gpu": 137786.8180965219,
            "loss_sequences_lower_95": 3.4539178260639236,
            "loss_sequences_upper_95": 3.565671878998209,
            "loss_tokens_lower_95": 3.4544487694512855,
            "loss_tokens_upper_95": 3.5663318395858656,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.365607822458828,
            "data_time": 0.004732310480239146,
            "batch_time": 0.03349985422940278,
            "samples_per_second": 1097862.9053590752,
            "samples_per_second_per_gpu": 137232.8631698844,
            "loss_sequences_lower_95": 4.291122818930436,
            "loss_sequences_upper_95": 4.440249271377201,
            "loss_tokens_lower_95": 4.2886093302108925,
            "loss_tokens_upper_95": 4.440035773299933,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.313428401428156,
            "data_time": 0.003371869513591989,
            "batch_time": 0.031916925768100415,
            "samples_per_second": 1103863.9393445726,
            "samples_per_second_per_gpu": 137982.99241807158,
            "loss_sequences_lower_95": 3.4689429614178118,
            "loss_sequences_upper_95": 3.598520810663765,
            "loss_tokens_lower_95": 3.123018577201557,
            "loss_tokens_upper_95": 3.177260122867594,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.261597804546356,
            "data_time": 0.010118820704519749,
            "batch_time": 0.03914234694093466,
            "samples_per_second": 1052196.6757750008,
            "samples_per_second_per_gpu": 131524.5844718751,
            "loss_sequences_lower_95": 5.4648166015625,
            "loss_sequences_upper_95": 6.041625512695313,
            "loss_tokens_lower_95": 4.6303888185855815,
            "loss_tokens_upper_95": 4.997308975714552,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.39328171312809,
            "data_time": 0.1513894498348236,
            "batch_time": 0.18561924993991852,
            "samples_per_second": 576147.8660850958,
            "samples_per_second_per_gpu": 72018.48326063697,
            "loss_sequences_lower_95": 3.1734509229660035,
            "loss_sequences_upper_95": 3.6264530777931214,
            "loss_tokens_lower_95": 2.96681522939397,
            "loss_tokens_upper_95": 3.735119944605334,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.7898326886111295,
            "data_time": 0.02716434762832966,
            "batch_time": 0.055794870599787286,
            "samples_per_second": 961155.4654993684,
            "samples_per_second_per_gpu": 120144.43318742105,
            "loss_sequences_lower_95": 5.285389525314857,
            "loss_sequences_upper_95": 6.154724436792834,
            "loss_tokens_lower_95": 3.2695997236008845,
            "loss_tokens_upper_95": 3.7590071720093183,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.344968571419172,
            "data_time": 0.0029982891347673205,
            "batch_time": 0.03174056588775582,
            "samples_per_second": 1098182.6028991777,
            "samples_per_second_per_gpu": 137272.8253623972,
            "loss_sequences_lower_95": 2.3199917414553983,
            "loss_sequences_upper_95": 2.3702680981903943,
            "loss_tokens_lower_95": 2.31903687856665,
            "loss_tokens_upper_95": 2.3703190328148893,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.3054814038571574,
            "data_time": 0.0024809064817003005,
            "batch_time": 0.030900128552929392,
            "samples_per_second": 1116981.3370633158,
            "samples_per_second_per_gpu": 139622.66713291447,
            "loss_sequences_lower_95": 2.2794685457045656,
            "loss_sequences_upper_95": 2.414434221039322,
            "loss_tokens_lower_95": 2.1698813688394325,
            "loss_tokens_upper_95": 2.3008928755796005,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.075854418041942,
            "data_time": 0.017887390322155423,
            "batch_time": 0.04721823665830824,
            "samples_per_second": 986454.0219218872,
            "samples_per_second_per_gpu": 123306.7527402359,
            "loss_sequences_lower_95": 2.9303317618457387,
            "loss_sequences_upper_95": 3.339503954094408,
            "loss_tokens_lower_95": 2.8103652262445076,
            "loss_tokens_upper_95": 3.1010814571468837,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.530609649377445,
            "data_time": 0.004822317510843277,
            "batch_time": 0.03358540460467339,
            "samples_per_second": 1089462.1272747365,
            "samples_per_second_per_gpu": 136182.76590934207,
            "loss_sequences_lower_95": 3.57811339031361,
            "loss_sequences_upper_95": 3.7341505235824046,
            "loss_tokens_lower_95": 3.3858532060272735,
            "loss_tokens_upper_95": 3.528901120133111,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.650433177628168,
            "data_time": 0.030518049285525366,
            "batch_time": 0.05996561901909964,
            "samples_per_second": 993377.9310651333,
            "samples_per_second_per_gpu": 124172.24138314166,
            "loss_sequences_lower_95": 2.4828686644391316,
            "loss_sequences_upper_95": 2.9312420030919513,
            "loss_tokens_lower_95": 2.3683305102499235,
            "loss_tokens_upper_95": 2.7146044882047815,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.676894398731431,
            "data_time": 0.002066425499895121,
            "batch_time": 0.03079984057583049,
            "samples_per_second": 1104015.4892064065,
            "samples_per_second_per_gpu": 138001.93615080082,
            "loss_sequences_lower_95": 3.6638439406631322,
            "loss_sequences_upper_95": 3.6895942079040807,
            "loss_tokens_lower_95": 3.6640292902330467,
            "loss_tokens_upper_95": 3.689536422909582,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.8178031378000685,
            "data_time": 0.045526630228216,
            "batch_time": 0.07754053636030718,
            "samples_per_second": 855058.7422989962,
            "samples_per_second_per_gpu": 106882.34278737452,
            "loss_sequences_lower_95": 0.7679865105638226,
            "loss_sequences_upper_95": 0.9093047503128793,
            "loss_tokens_lower_95": 0.6798837308385843,
            "loss_tokens_upper_95": 0.8668511600552952,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7602698089504143,
            "data_time": 0.0013681505896519536,
            "batch_time": 0.030167104075260753,
            "samples_per_second": 1103598.8836069496,
            "samples_per_second_per_gpu": 137949.8604508687,
            "loss_sequences_lower_95": 4.079888442577306,
            "loss_sequences_upper_95": 4.121695298168894,
            "loss_tokens_lower_95": 3.269245067698259,
            "loss_tokens_upper_95": 3.309349196083172,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.175664630651474,
            "data_time": 0.005804774307069324,
            "batch_time": 0.034502874291132364,
            "samples_per_second": 1092164.081130839,
            "samples_per_second_per_gpu": 136520.51014135487,
            "loss_sequences_lower_95": 5.196776489257812,
            "loss_sequences_upper_95": 5.459006274414063,
            "loss_tokens_lower_95": 4.861901496716091,
            "loss_tokens_upper_95": 5.1035368291702925,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.057728904226552,
            "data_time": 0.021570114766137075,
            "batch_time": 0.053430912858348785,
            "samples_per_second": 958796.2919521814,
            "samples_per_second_per_gpu": 119849.53649402267,
            "loss_sequences_lower_95": 3.911164776345958,
            "loss_sequences_upper_95": 4.20598680579144,
            "loss_tokens_lower_95": 3.907657430897588,
            "loss_tokens_upper_95": 4.206611819059953,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.494104302651954,
            "data_time": 0.004362809011735112,
            "batch_time": 0.03306380057909403,
            "samples_per_second": 1096462.0181409512,
            "samples_per_second_per_gpu": 137057.7522676189,
            "loss_sequences_lower_95": 6.408044748017282,
            "loss_sequences_upper_95": 6.578029063831676,
            "loss_tokens_lower_95": 6.412160182143702,
            "loss_tokens_upper_95": 6.576505200935133,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.005405489563942,
            "data_time": 0.004157118657802014,
            "batch_time": 0.033273655366390306,
            "samples_per_second": 1087970.0292912808,
            "samples_per_second_per_gpu": 135996.2536614101,
            "loss_sequences_lower_95": 1.0424267883300782,
            "loss_sequences_upper_95": 1.095223596191406,
            "loss_tokens_lower_95": 0.9324234459408763,
            "loss_tokens_upper_95": 0.991176556560124,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.247537967136928,
            "data_time": 0.024260046226637706,
            "batch_time": 0.05416164015020643,
            "samples_per_second": 964270.5671623048,
            "samples_per_second_per_gpu": 120533.8208952881,
            "loss_sequences_lower_95": 5.853391026088169,
            "loss_sequences_upper_95": 6.6406175304594495,
            "loss_tokens_lower_95": 5.858746919177827,
            "loss_tokens_upper_95": 6.6441683669317335,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.9543893039226532,
            "data_time": 0.15320122241973877,
            "batch_time": 0.18790976703166962,
            "samples_per_second": 526500.15411534,
            "samples_per_second_per_gpu": 65812.5192644175,
            "loss_sequences_lower_95": 1.7977968633174897,
            "loss_sequences_upper_95": 2.5463601529598234,
            "loss_tokens_lower_95": 1.5513772583007812,
            "loss_tokens_upper_95": 1.999687087855388,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.482384666442871,
            "data_time": 0.005701452966720339,
            "batch_time": 0.03556925389501783,
            "samples_per_second": 1053007.0656961445,
            "samples_per_second_per_gpu": 131625.88321201806,
            "loss_sequences_lower_95": 7.400061425781249,
            "loss_sequences_upper_95": 7.744547155761719,
            "loss_tokens_lower_95": 7.20452639197335,
            "loss_tokens_upper_95": 7.506331710686337,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.415714186668396,
            "data_time": 0.005716164906819661,
            "batch_time": 0.0342703882663969,
            "samples_per_second": 1096156.9960664902,
            "samples_per_second_per_gpu": 137019.62450831127,
            "loss_sequences_lower_95": 6.509912585449219,
            "loss_sequences_upper_95": 6.742100756835938,
            "loss_tokens_lower_95": 6.182430640589791,
            "loss_tokens_upper_95": 6.367016161449205,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.703699913234308,
            "data_time": 0.0037637767983120817,
            "batch_time": 0.03228518045865572,
            "samples_per_second": 1105531.5128235773,
            "samples_per_second_per_gpu": 138191.43910294716,
            "loss_sequences_lower_95": 3.6692015578512613,
            "loss_sequences_upper_95": 3.7374989360281177,
            "loss_tokens_lower_95": 3.669980769935887,
            "loss_tokens_upper_95": 3.738149546649765,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.057017295408175,
            "data_time": 0.008501948906935954,
            "batch_time": 0.037257994769923274,
            "samples_per_second": 1073152.1664467296,
            "samples_per_second_per_gpu": 134144.0208058412,
            "loss_sequences_lower_95": 3.96553475984963,
            "loss_sequences_upper_95": 4.145559659568212,
            "loss_tokens_lower_95": 3.964284157936108,
            "loss_tokens_upper_95": 4.146869580903178,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.800767433166504,
            "data_time": 0.005637634841222611,
            "batch_time": 0.03479734583506509,
            "samples_per_second": 1079179.588485166,
            "samples_per_second_per_gpu": 134897.44856064575,
            "loss_sequences_lower_95": 5.69154569091797,
            "loss_sequences_upper_95": 5.909288488769531,
            "loss_tokens_lower_95": 5.693370080566407,
            "loss_tokens_upper_95": 5.911967687988281,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.8276720479906112,
            "data_time": 0.0019136017797574654,
            "batch_time": 0.030496339942196592,
            "samples_per_second": 1109799.7710063476,
            "samples_per_second_per_gpu": 138724.97137579345,
            "loss_sequences_lower_95": 3.30739546069359,
            "loss_sequences_upper_95": 3.4054702744353125,
            "loss_tokens_lower_95": 2.2354642735504737,
            "loss_tokens_upper_95": 2.300678665739773,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.702985669249919,
            "data_time": 0.017591445786612373,
            "batch_time": 0.04691216434751238,
            "samples_per_second": 1003325.3820787132,
            "samples_per_second_per_gpu": 125415.67275983914,
            "loss_sequences_lower_95": 3.560474088298741,
            "loss_sequences_upper_95": 3.84379316870846,
            "loss_tokens_lower_95": 3.558742728162168,
            "loss_tokens_upper_95": 3.847037927428288,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9526971821691474,
            "data_time": 0.010325676761567593,
            "batch_time": 0.039812611415982246,
            "samples_per_second": 1067170.6266503064,
            "samples_per_second_per_gpu": 133396.3283312883,
            "loss_sequences_lower_95": 3.8401454013001683,
            "loss_sequences_upper_95": 4.063359207452512,
            "loss_tokens_lower_95": 3.8422573313993564,
            "loss_tokens_upper_95": 4.062051630955117,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.6479522619841167,
            "data_time": 0.002210775593812241,
            "batch_time": 0.03092884611982082,
            "samples_per_second": 1104955.79921943,
            "samples_per_second_per_gpu": 138119.47490242874,
            "loss_sequences_lower_95": 2.8898652678934926,
            "loss_sequences_upper_95": 2.9727660903443254,
            "loss_tokens_lower_95": 2.2413707347307255,
            "loss_tokens_upper_95": 2.3066451968748853,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8001990066003546,
            "data_time": 0.028589492042859394,
            "batch_time": 0.05915192266305288,
            "samples_per_second": 985181.2995731336,
            "samples_per_second_per_gpu": 123147.6624466417,
            "loss_sequences_lower_95": 3.6499726472077545,
            "loss_sequences_upper_95": 3.9439018047675884,
            "loss_tokens_lower_95": 3.6516485627996857,
            "loss_tokens_upper_95": 3.942967756967696,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.184459706329789,
            "data_time": 0.003673919536837437,
            "batch_time": 0.03235850840697795,
            "samples_per_second": 1100585.731748913,
            "samples_per_second_per_gpu": 137573.21646861412,
            "loss_sequences_lower_95": 3.153143359076357,
            "loss_sequences_upper_95": 3.2152967615156727,
            "loss_tokens_lower_95": 3.153608361107129,
            "loss_tokens_upper_95": 3.2160483786673355,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9317347800847395,
            "data_time": 0.023154113509438256,
            "batch_time": 0.05224329991774126,
            "samples_per_second": 973172.2528286774,
            "samples_per_second_per_gpu": 121646.53160358468,
            "loss_sequences_lower_95": 3.759728307631409,
            "loss_sequences_upper_95": 4.103667338843485,
            "loss_tokens_lower_95": 3.7609230930365403,
            "loss_tokens_upper_95": 4.106059605866959,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.9917792161305745,
            "data_time": 0.08178116381168365,
            "batch_time": 0.11367436498403549,
            "samples_per_second": 759852.6188486578,
            "samples_per_second_per_gpu": 94981.57735608223,
            "loss_sequences_lower_95": 1.8131882540384927,
            "loss_sequences_upper_95": 2.393495890299479,
            "loss_tokens_lower_95": 1.6394563330544367,
            "loss_tokens_upper_95": 2.2853094418843587,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.8923649768034616,
            "data_time": 0.07595191150903702,
            "batch_time": 0.10745503008365631,
            "samples_per_second": 775857.4108900942,
            "samples_per_second_per_gpu": 96982.17636126178,
            "loss_sequences_lower_95": 1.757400484085083,
            "loss_sequences_upper_95": 2.3233677927652994,
            "loss_tokens_lower_95": 1.4620512458715547,
            "loss_tokens_upper_95": 2.1796775432115187,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1163665625413492,
            "data_time": 0.00338764131735745,
            "batch_time": 0.032075327799697705,
            "samples_per_second": 1102750.168699866,
            "samples_per_second_per_gpu": 137843.77108748324,
            "loss_sequences_lower_95": 3.1009118130983064,
            "loss_sequences_upper_95": 3.1318399717530374,
            "loss_tokens_lower_95": 3.101165992613218,
            "loss_tokens_upper_95": 3.1316708207727357,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.42166673685593126,
            "data_time": 0.0013083690365764192,
            "batch_time": 0.030114368712875983,
            "samples_per_second": 1102620.5041596433,
            "samples_per_second_per_gpu": 137827.5630199554,
            "loss_sequences_lower_95": 0.47360806953301304,
            "loss_sequences_upper_95": 0.4847460610265079,
            "loss_tokens_lower_95": 0.3669485034491371,
            "loss_tokens_upper_95": 0.37367239441507233,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.3649729509053268,
            "data_time": 0.03865722566843033,
            "batch_time": 0.06884005293250084,
            "samples_per_second": 978741.70767885,
            "samples_per_second_per_gpu": 122342.71345985626,
            "loss_sequences_lower_95": 1.2926209547388272,
            "loss_sequences_upper_95": 1.491000762699157,
            "loss_tokens_lower_95": 1.2114637007089015,
            "loss_tokens_upper_95": 1.3234268132254052,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.652790670862069,
            "data_time": 0.10956096649169922,
            "batch_time": 0.14597775822594053,
            "samples_per_second": 521015.67578199774,
            "samples_per_second_per_gpu": 65126.95947274972,
            "loss_sequences_lower_95": 3.2309249671729834,
            "loss_sequences_upper_95": 4.129336052971917,
            "loss_tokens_lower_95": 3.04889668594172,
            "loss_tokens_upper_95": 4.141859718605324,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.2202110152419021,
            "data_time": 0.029421885808308918,
            "batch_time": 0.058767764341263545,
            "samples_per_second": 990828.3668754601,
            "samples_per_second_per_gpu": 123853.54585943252,
            "loss_sequences_lower_95": 1.1692571360890458,
            "loss_sequences_upper_95": 1.3323154961190573,
            "loss_tokens_lower_95": 1.0995916178248528,
            "loss_tokens_upper_95": 1.1889584106753313,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.2677818253272917,
            "data_time": 0.029294249557313464,
            "batch_time": 0.05918160506657192,
            "samples_per_second": 1001954.5054008471,
            "samples_per_second_per_gpu": 125244.31317510588,
            "loss_sequences_lower_95": 1.2410441375360255,
            "loss_sequences_upper_95": 1.3889657671858624,
            "loss_tokens_lower_95": 1.13608949307164,
            "loss_tokens_upper_95": 1.2124576254104609,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.2120091360880108,
            "data_time": 0.030904250485556468,
            "batch_time": 0.06028855698449271,
            "samples_per_second": 995618.9167254361,
            "samples_per_second_per_gpu": 124452.36459067951,
            "loss_sequences_lower_95": 1.1026627610369426,
            "loss_sequences_upper_95": 1.282939710849669,
            "loss_tokens_lower_95": 1.1351843879440862,
            "loss_tokens_upper_95": 1.2546668414071485,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.3366145764182253,
            "data_time": 0.030135325023106167,
            "batch_time": 0.05948443639846075,
            "samples_per_second": 986705.1746540582,
            "samples_per_second_per_gpu": 123338.14683175727,
            "loss_sequences_lower_95": 1.301120946465469,
            "loss_sequences_upper_95": 1.440741024947748,
            "loss_tokens_lower_95": 1.2100793975164585,
            "loss_tokens_upper_95": 1.2848006726796755,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.112593390186381,
            "data_time": 0.030884139331770533,
            "batch_time": 0.061073888967066635,
            "samples_per_second": 991262.1725899049,
            "samples_per_second_per_gpu": 123907.77157373811,
            "loss_sequences_lower_95": 1.0675333674650016,
            "loss_sequences_upper_95": 1.1659742485662425,
            "loss_tokens_lower_95": 1.0597963614406625,
            "loss_tokens_upper_95": 1.1152807809986471,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.0707332953447248,
            "data_time": 0.029733345622108096,
            "batch_time": 0.06448175793602354,
            "samples_per_second": 959022.3045788201,
            "samples_per_second_per_gpu": 119877.78807235252,
            "loss_sequences_lower_95": 1.0513731049328314,
            "loss_sequences_upper_95": 1.1602996407485588,
            "loss_tokens_lower_95": 0.954029743362436,
            "loss_tokens_upper_95": 1.0020163925934298,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-32.0/params.txt",
    "uuid": "d8073af0-9b42-4c24-933f-a65254de273b",
    "creation_date": "2023_12_14-07_26_31"
}