{
    "name": "rpj-d=576_l=24_h=8-0.25",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 768386880,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.25
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "153677376",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=576_l=24_h=8-0.25",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.7249569873015087,
            "data_time": 0.042031679302453995,
            "batch_time": 0.3921470642089844,
            "samples_per_second": 824073.4745059799,
            "samples_per_second_per_gpu": 103009.18431324749,
            "loss_sequences_lower_95": 3.6510571543375647,
            "loss_sequences_upper_95": 3.794927177429199,
            "loss_tokens_lower_95": 3.711905727386475,
            "loss_tokens_upper_95": 3.738166077931722,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.0789643763923635,
            "data_time": 0.0011245547789751893,
            "batch_time": 0.030653906219682028,
            "samples_per_second": 1083382.355964618,
            "samples_per_second_per_gpu": 135422.79449557725,
            "loss_sequences_lower_95": 4.076539843483453,
            "loss_sequences_upper_95": 4.081410636081013,
            "loss_tokens_lower_95": 4.0676715,
            "loss_tokens_upper_95": 4.090450156249999,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3239572272008777,
            "data_time": 0.01046584129333496,
            "batch_time": 0.039741670608520505,
            "samples_per_second": 1057195.900414185,
            "samples_per_second_per_gpu": 132149.4875517731,
            "loss_sequences_lower_95": 3.2954954622229753,
            "loss_sequences_upper_95": 3.3520822828643175,
            "loss_tokens_lower_95": 3.3117370833333335,
            "loss_tokens_upper_95": 3.3363064999999996,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9245616980680484,
            "data_time": 0.0016425603902653645,
            "batch_time": 0.02985377541105998,
            "samples_per_second": 1131302.1776098653,
            "samples_per_second_per_gpu": 141412.77220123317,
            "loss_sequences_lower_95": 3.911414002094072,
            "loss_sequences_upper_95": 3.9372429828447166,
            "loss_tokens_lower_95": 3.913107822916667,
            "loss_tokens_upper_95": 3.9358651875,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.065779912253261,
            "data_time": 0.01012298880345318,
            "batch_time": 0.03889158617452796,
            "samples_per_second": 1074459.1305047292,
            "samples_per_second_per_gpu": 134307.39131309115,
            "loss_sequences_lower_95": 4.0313174307953314,
            "loss_sequences_upper_95": 4.09872460763469,
            "loss_tokens_lower_95": 4.05444790625,
            "loss_tokens_upper_95": 4.0770861145833335,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.858504703626756,
            "data_time": 0.003943295582481052,
            "batch_time": 0.033148910040440766,
            "samples_per_second": 1096279.4136814722,
            "samples_per_second_per_gpu": 137034.92671018402,
            "loss_sequences_lower_95": 3.8173045580440603,
            "loss_sequences_upper_95": 3.899519793149324,
            "loss_tokens_lower_95": 3.8468735,
            "loss_tokens_upper_95": 3.8701802604166664,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.5360117565125835,
            "data_time": 0.0016558058499123104,
            "batch_time": 0.030518000413698627,
            "samples_per_second": 1111507.4287047265,
            "samples_per_second_per_gpu": 138938.4285880908,
            "loss_sequences_lower_95": 2.5100401636240433,
            "loss_sequences_upper_95": 2.5617662179129463,
            "loss_tokens_lower_95": 2.5245130989583333,
            "loss_tokens_upper_95": 2.5481330416666665,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.26528542962998,
            "data_time": 0.0018220777443790476,
            "batch_time": 0.030845733827649175,
            "samples_per_second": 1113480.6336293952,
            "samples_per_second_per_gpu": 139185.0792036744,
            "loss_sequences_lower_95": 4.256617985111256,
            "loss_sequences_upper_95": 4.273846245091623,
            "loss_tokens_lower_95": 4.254261114583334,
            "loss_tokens_upper_95": 4.276231697916667,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.20915527653888,
            "data_time": 0.011605259918031238,
            "batch_time": 0.04560728584017072,
            "samples_per_second": 1059287.395463819,
            "samples_per_second_per_gpu": 132410.92443297736,
            "loss_sequences_lower_95": 4.167773884098704,
            "loss_sequences_upper_95": 4.2541320676726055,
            "loss_tokens_lower_95": 4.197420864583333,
            "loss_tokens_upper_95": 4.220847395833333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.751351240595339,
            "data_time": 0.01040505338460207,
            "batch_time": 0.0396550502628088,
            "samples_per_second": 1067833.2807312428,
            "samples_per_second_per_gpu": 133479.16009140536,
            "loss_sequences_lower_95": 4.714216462990983,
            "loss_sequences_upper_95": 4.783884197732677,
            "loss_tokens_lower_95": 4.7393631562500005,
            "loss_tokens_upper_95": 4.76347959375,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.048101753310927,
            "data_time": 0.0013353920662425975,
            "batch_time": 0.03004275442443831,
            "samples_per_second": 1117319.8011554251,
            "samples_per_second_per_gpu": 139664.97514442814,
            "loss_sequences_lower_95": 4.040049969700733,
            "loss_sequences_upper_95": 4.056056949238557,
            "loss_tokens_lower_95": 4.036575291666666,
            "loss_tokens_upper_95": 4.059683354166666,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9816940373949388,
            "data_time": 0.0027112631277676725,
            "batch_time": 0.03170056406603963,
            "samples_per_second": 1105446.9665142063,
            "samples_per_second_per_gpu": 138180.87081427578,
            "loss_sequences_lower_95": 3.9709143915458407,
            "loss_sequences_upper_95": 3.992161336991748,
            "loss_tokens_lower_95": 3.97029728125,
            "loss_tokens_upper_95": 3.9931485520833334,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.275974451649262,
            "data_time": 0.010203235234196478,
            "batch_time": 0.03899208264859769,
            "samples_per_second": 1068756.2118809726,
            "samples_per_second_per_gpu": 133594.52648512158,
            "loss_sequences_lower_95": 4.24126822140831,
            "loss_sequences_upper_95": 4.310031864560884,
            "loss_tokens_lower_95": 4.264422885416667,
            "loss_tokens_upper_95": 4.287407083333333,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.858475742903358,
            "data_time": 0.010527283071996681,
            "batch_time": 0.039125943088911444,
            "samples_per_second": 1076921.7931793586,
            "samples_per_second_per_gpu": 134615.22414741982,
            "loss_sequences_lower_95": 3.796630337282014,
            "loss_sequences_upper_95": 3.9196492119379296,
            "loss_tokens_lower_95": 3.8463805,
            "loss_tokens_upper_95": 3.8705163333333332,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.717155640775507,
            "data_time": 0.08797394377844674,
            "batch_time": 0.12082939488547188,
            "samples_per_second": 585070.2462443074,
            "samples_per_second_per_gpu": 73133.78078053842,
            "loss_sequences_lower_95": 4.647714571519331,
            "loss_sequences_upper_95": 4.784909985282204,
            "loss_tokens_lower_95": 4.695478803461248,
            "loss_tokens_upper_95": 4.739856381849809,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.507279664364918,
            "data_time": 0.015017764134840532,
            "batch_time": 0.04467050595716997,
            "samples_per_second": 1029545.2004235719,
            "samples_per_second_per_gpu": 128693.15005294648,
            "loss_sequences_lower_95": 3.3992505843715835,
            "loss_sequences_upper_95": 3.614070747753621,
            "loss_tokens_lower_95": 3.49545528125,
            "loss_tokens_upper_95": 3.51880515625,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.09872542081848,
            "data_time": 0.013922836631536484,
            "batch_time": 0.043056278179089226,
            "samples_per_second": 1070496.465021076,
            "samples_per_second_per_gpu": 133812.0581276345,
            "loss_sequences_lower_95": 6.041707464132585,
            "loss_sequences_upper_95": 6.151538746212277,
            "loss_tokens_lower_95": 6.0871833125,
            "loss_tokens_upper_95": 6.110185958333334,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.427976190066729,
            "data_time": 0.038955073803663254,
            "batch_time": 0.06951518729329109,
            "samples_per_second": 930397.6611665329,
            "samples_per_second_per_gpu": 116299.70764581661,
            "loss_sequences_lower_95": 4.372986315117507,
            "loss_sequences_upper_95": 4.476512183517706,
            "loss_tokens_lower_95": 4.414940280601626,
            "loss_tokens_upper_95": 4.441072732894146,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.061135889474967,
            "data_time": 0.0016286046942165608,
            "batch_time": 0.030721138475424608,
            "samples_per_second": 1096981.897504548,
            "samples_per_second_per_gpu": 137122.7371880685,
            "loss_sequences_lower_95": 5.045379208357072,
            "loss_sequences_upper_95": 5.077523943237253,
            "loss_tokens_lower_95": 5.045359318250605,
            "loss_tokens_upper_95": 5.07687567876727,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8892517980648877,
            "data_time": 0.0019231058040242285,
            "batch_time": 0.03053169289402142,
            "samples_per_second": 1110671.1730841321,
            "samples_per_second_per_gpu": 138833.89663551652,
            "loss_sequences_lower_95": 3.8767283658633738,
            "loss_sequences_upper_95": 3.902766875217835,
            "loss_tokens_lower_95": 3.875498638126282,
            "loss_tokens_upper_95": 3.8963044685086023,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.767434219031273,
            "data_time": 0.0032633422794036433,
            "batch_time": 0.032517990450270325,
            "samples_per_second": 1096853.2162925524,
            "samples_per_second_per_gpu": 137106.65203656905,
            "loss_sequences_lower_95": 5.967866884428879,
            "loss_sequences_upper_95": 6.260491381155527,
            "loss_tokens_lower_95": 5.284886595894389,
            "loss_tokens_upper_95": 5.494734450388671,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.946016602516174,
            "data_time": 0.003617292547479589,
            "batch_time": 0.032481129816237915,
            "samples_per_second": 1097277.9640240688,
            "samples_per_second_per_gpu": 137159.7455030086,
            "loss_sequences_lower_95": 6.107456184895834,
            "loss_sequences_upper_95": 6.308643636067709,
            "loss_tokens_lower_95": 5.572203997150157,
            "loss_tokens_upper_95": 5.709689268867924,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.03305329094338,
            "data_time": 0.005043513454822754,
            "batch_time": 0.03374918841308778,
            "samples_per_second": 1100666.9486170122,
            "samples_per_second_per_gpu": 137583.36857712653,
            "loss_sequences_lower_95": 4.081058981709629,
            "loss_sequences_upper_95": 4.153594188675726,
            "loss_tokens_lower_95": 3.926286694273707,
            "loss_tokens_upper_95": 3.9618090236605847,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.7767161651091143,
            "data_time": 0.024386301636695862,
            "batch_time": 0.05396176874637604,
            "samples_per_second": 1017740.2634570053,
            "samples_per_second_per_gpu": 127217.53293212566,
            "loss_sequences_lower_95": 2.75284782409668,
            "loss_sequences_upper_95": 2.873798342618075,
            "loss_tokens_lower_95": 2.708539701010279,
            "loss_tokens_upper_95": 2.7577999187493702,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.119955921173096,
            "data_time": 0.02151024341583252,
            "batch_time": 0.0513863880187273,
            "samples_per_second": 995157.9257834061,
            "samples_per_second_per_gpu": 124394.74072292577,
            "loss_sequences_lower_95": 4.106410808952487,
            "loss_sequences_upper_95": 4.318004000916773,
            "loss_tokens_lower_95": 3.9743109779642007,
            "loss_tokens_upper_95": 4.074508915347834,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.411081190903982,
            "data_time": 0.017475704352060955,
            "batch_time": 0.04691151472238394,
            "samples_per_second": 1028946.6859570158,
            "samples_per_second_per_gpu": 128618.33574462698,
            "loss_sequences_lower_95": 4.366888865152995,
            "loss_sequences_upper_95": 4.486554066975911,
            "loss_tokens_lower_95": 4.271147821565494,
            "loss_tokens_upper_95": 4.5028346642870245,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.940565892409741,
            "data_time": 0.0015776811557871517,
            "batch_time": 0.030425896877825303,
            "samples_per_second": 1104280.8437661103,
            "samples_per_second_per_gpu": 138035.10547076378,
            "loss_sequences_lower_95": 7.962312597651445,
            "loss_sequences_upper_95": 8.035786313265833,
            "loss_tokens_lower_95": 7.781373725602036,
            "loss_tokens_upper_95": 7.858698624665536,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.578629863683624,
            "data_time": 0.0033545458076784276,
            "batch_time": 0.03240237580049758,
            "samples_per_second": 1105088.8892537504,
            "samples_per_second_per_gpu": 138136.1111567188,
            "loss_sequences_lower_95": 6.175378202187894,
            "loss_sequences_upper_95": 6.482005407673743,
            "loss_tokens_lower_95": 4.7574678980578735,
            "loss_tokens_upper_95": 4.904410142058356,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.97297790738096,
            "data_time": 0.005432328259622729,
            "batch_time": 0.033999222355919914,
            "samples_per_second": 1097535.2526338338,
            "samples_per_second_per_gpu": 137191.90657922922,
            "loss_sequences_lower_95": 5.459539628272984,
            "loss_sequences_upper_95": 5.805327494233948,
            "loss_tokens_lower_95": 4.510509036748245,
            "loss_tokens_upper_95": 4.67076714199641,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.377168446371,
            "data_time": 0.024368620344570706,
            "batch_time": 0.05493299450193133,
            "samples_per_second": 992438.5092225274,
            "samples_per_second_per_gpu": 124054.81365281592,
            "loss_sequences_lower_95": 5.316312955394728,
            "loss_sequences_upper_95": 5.438977440960331,
            "loss_tokens_lower_95": 5.3154726629387845,
            "loss_tokens_upper_95": 5.436300018171197,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.2456572723388675,
            "data_time": 0.05111419237576998,
            "batch_time": 0.0840654785816486,
            "samples_per_second": 848356.8466812929,
            "samples_per_second_per_gpu": 106044.60583516161,
            "loss_sequences_lower_95": 4.110640274047851,
            "loss_sequences_upper_95": 4.513827575683593,
            "loss_tokens_lower_95": 3.9228666083757266,
            "loss_tokens_upper_95": 4.410787630848893,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.98077583825454,
            "data_time": 0.003385887555549481,
            "batch_time": 0.032408248915018964,
            "samples_per_second": 1095516.9284350928,
            "samples_per_second_per_gpu": 136939.6160543866,
            "loss_sequences_lower_95": 4.940700428707938,
            "loss_sequences_upper_95": 5.020302941782082,
            "loss_tokens_lower_95": 4.940103021346437,
            "loss_tokens_upper_95": 5.021544772941722,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.340714257823747,
            "data_time": 0.005088146125706335,
            "batch_time": 0.03366758656151726,
            "samples_per_second": 1105413.8676540607,
            "samples_per_second_per_gpu": 138176.7334567576,
            "loss_sequences_lower_95": 5.300328400133088,
            "loss_sequences_upper_95": 5.381296938584536,
            "loss_tokens_lower_95": 5.298638941009546,
            "loss_tokens_upper_95": 5.381311635010365,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.500549165286748,
            "data_time": 0.003609799926790769,
            "batch_time": 0.03235814767058931,
            "samples_per_second": 1098638.229187767,
            "samples_per_second_per_gpu": 137329.77864847088,
            "loss_sequences_lower_95": 4.645050228147952,
            "loss_sequences_upper_95": 4.762732650873062,
            "loss_tokens_lower_95": 4.335459693531127,
            "loss_tokens_upper_95": 4.394486349001227,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.352148285865784,
            "data_time": 0.011546564288437366,
            "batch_time": 0.040381609462201595,
            "samples_per_second": 1066028.6910027836,
            "samples_per_second_per_gpu": 133253.58637534795,
            "loss_sequences_lower_95": 6.525832702636718,
            "loss_sequences_upper_95": 7.076834863281249,
            "loss_tokens_lower_95": 5.651248510273619,
            "loss_tokens_upper_95": 6.016940741851423,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.610095292329788,
            "data_time": 0.16387121379375458,
            "batch_time": 0.198082834482193,
            "samples_per_second": 560581.5035002342,
            "samples_per_second_per_gpu": 70072.68793752928,
            "loss_sequences_lower_95": 4.328722560405732,
            "loss_sequences_upper_95": 4.992824065685272,
            "loss_tokens_lower_95": 4.111741883727326,
            "loss_tokens_upper_95": 4.958811670062186,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.910500545611327,
            "data_time": 0.029764350424421594,
            "batch_time": 0.059949808932365255,
            "samples_per_second": 925643.9650654222,
            "samples_per_second_per_gpu": 115705.49563317778,
            "loss_sequences_lower_95": 6.351614292188622,
            "loss_sequences_upper_95": 7.171625141713811,
            "loss_tokens_lower_95": 4.4585796575535515,
            "loss_tokens_upper_95": 4.949379849783329,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8605844203745656,
            "data_time": 0.003040672176414066,
            "batch_time": 0.03161948070757919,
            "samples_per_second": 1106250.259884105,
            "samples_per_second_per_gpu": 138281.28248551313,
            "loss_sequences_lower_95": 3.8303861965514416,
            "loss_sequences_upper_95": 3.890501287151185,
            "loss_tokens_lower_95": 3.829432928202818,
            "loss_tokens_upper_95": 3.890845633157219,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.74831319288225,
            "data_time": 0.002562363849946312,
            "batch_time": 0.031056383389764207,
            "samples_per_second": 1115047.7387842932,
            "samples_per_second_per_gpu": 139380.96734803665,
            "loss_sequences_lower_95": 4.7233948594871915,
            "loss_sequences_upper_95": 4.927864693882811,
            "loss_tokens_lower_95": 4.494813431347977,
            "loss_tokens_upper_95": 4.693182895951057,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8430293383615792,
            "data_time": 0.01916616161664327,
            "batch_time": 0.048775638143221535,
            "samples_per_second": 986631.4855499773,
            "samples_per_second_per_gpu": 123328.93569374716,
            "loss_sequences_lower_95": 3.7095914288754863,
            "loss_sequences_upper_95": 4.110602007855426,
            "loss_tokens_lower_95": 3.57462198855588,
            "loss_tokens_upper_95": 3.888269776971554,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.21869675569301,
            "data_time": 0.004956481233239174,
            "batch_time": 0.03363745249807835,
            "samples_per_second": 1093896.9186280568,
            "samples_per_second_per_gpu": 136737.1148285071,
            "loss_sequences_lower_95": 4.251215760578014,
            "loss_sequences_upper_95": 4.400092723337608,
            "loss_tokens_lower_95": 4.072027451436165,
            "loss_tokens_upper_95": 4.22222286242961,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7587796929405957,
            "data_time": 0.0324360245750064,
            "batch_time": 0.06282659655525571,
            "samples_per_second": 977841.476220537,
            "samples_per_second_per_gpu": 122230.18452756712,
            "loss_sequences_lower_95": 3.5593715016434833,
            "loss_sequences_upper_95": 4.048492934064168,
            "loss_tokens_lower_95": 3.5004087861949427,
            "loss_tokens_upper_95": 3.9013348876554854,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.106453816421892,
            "data_time": 0.0018085445635708341,
            "batch_time": 0.030288309132724203,
            "samples_per_second": 1114382.904814979,
            "samples_per_second_per_gpu": 139297.86310187238,
            "loss_sequences_lower_95": 4.093907326387153,
            "loss_sequences_upper_95": 4.118971597053785,
            "loss_tokens_lower_95": 4.094163569041934,
            "loss_tokens_upper_95": 4.118846552513627,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.9331961453539654,
            "data_time": 0.05101427598433061,
            "batch_time": 0.08081456964666194,
            "samples_per_second": 907436.667620115,
            "samples_per_second_per_gpu": 113429.58345251437,
            "loss_sequences_lower_95": 1.824221427695265,
            "loss_sequences_upper_95": 2.1108777685072813,
            "loss_tokens_lower_95": 1.6974450789796172,
            "loss_tokens_upper_95": 2.0077490471768646,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.023480021653685,
            "data_time": 0.0012862621638720913,
            "batch_time": 0.02984860307113123,
            "samples_per_second": 1113227.3738465337,
            "samples_per_second_per_gpu": 139153.4217308167,
            "loss_sequences_lower_95": 6.457974523388365,
            "loss_sequences_upper_95": 6.513193277482966,
            "loss_tokens_lower_95": 5.335417021276596,
            "loss_tokens_upper_95": 5.389499588974855,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.014450112819672,
            "data_time": 0.005877668895418682,
            "batch_time": 0.03520246299486312,
            "samples_per_second": 1075070.4539894222,
            "samples_per_second_per_gpu": 134383.80674867777,
            "loss_sequences_lower_95": 6.988106030273437,
            "loss_sequences_upper_95": 7.3430052001953126,
            "loss_tokens_lower_95": 6.669441939469741,
            "loss_tokens_upper_95": 6.975524118752417,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.533616601902506,
            "data_time": 0.022734801648026805,
            "batch_time": 0.05305121712765451,
            "samples_per_second": 982775.8392083502,
            "samples_per_second_per_gpu": 122846.97990104377,
            "loss_sequences_lower_95": 5.394436207646909,
            "loss_sequences_upper_95": 5.675152800186821,
            "loss_tokens_lower_95": 5.39425733483356,
            "loss_tokens_upper_95": 5.674152420707371,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.231086492177212,
            "data_time": 0.004874735352504684,
            "batch_time": 0.03362499841724534,
            "samples_per_second": 1098363.443618642,
            "samples_per_second_per_gpu": 137295.43045233024,
            "loss_sequences_lower_95": 7.142066650390626,
            "loss_sequences_upper_95": 7.317604481090198,
            "loss_tokens_lower_95": 7.142115349047112,
            "loss_tokens_upper_95": 7.319926535866477,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.2757014090617498,
            "data_time": 0.0043522709227622825,
            "batch_time": 0.0329716072437611,
            "samples_per_second": 1106287.5183749706,
            "samples_per_second_per_gpu": 138285.93979687133,
            "loss_sequences_lower_95": 1.3319200805664062,
            "loss_sequences_upper_95": 1.4135599202473959,
            "loss_tokens_lower_95": 1.1743866491909263,
            "loss_tokens_upper_95": 1.2406667667066826,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.246421655019124,
            "data_time": 0.025866825665746416,
            "batch_time": 0.055144129054886956,
            "samples_per_second": 971761.6946205612,
            "samples_per_second_per_gpu": 121470.21182757015,
            "loss_sequences_lower_95": 5.927228713262649,
            "loss_sequences_upper_95": 6.565634504045758,
            "loss_tokens_lower_95": 5.9266900634765625,
            "loss_tokens_upper_95": 6.576064133417039,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3396396338939667,
            "data_time": 0.15864773094654083,
            "batch_time": 0.19322030246257782,
            "samples_per_second": 542892.8563055804,
            "samples_per_second_per_gpu": 67861.60703819754,
            "loss_sequences_lower_95": 3.0534840404987333,
            "loss_sequences_upper_95": 4.330140697956085,
            "loss_tokens_lower_95": 2.720055837729543,
            "loss_tokens_upper_95": 3.303120733831346,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.786227084159851,
            "data_time": 0.006046100741341,
            "batch_time": 0.03484154693664066,
            "samples_per_second": 1089745.29970572,
            "samples_per_second_per_gpu": 136218.162463215,
            "loss_sequences_lower_95": 7.71256875,
            "loss_sequences_upper_95": 8.125007275390624,
            "loss_tokens_lower_95": 7.419935094120135,
            "loss_tokens_upper_95": 7.782643006094014,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.12065132522583,
            "data_time": 0.006216627737832448,
            "batch_time": 0.03537234758573865,
            "samples_per_second": 1076408.111745439,
            "samples_per_second_per_gpu": 134551.01396817988,
            "loss_sequences_lower_95": 7.167411022949219,
            "loss_sequences_upper_95": 7.400705200195312,
            "loss_tokens_lower_95": 6.896565184691099,
            "loss_tokens_upper_95": 7.1094489879350276,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8587286975852924,
            "data_time": 0.0036364850391911027,
            "batch_time": 0.03236471322866587,
            "samples_per_second": 1100199.1272788267,
            "samples_per_second_per_gpu": 137524.89090985333,
            "loss_sequences_lower_95": 3.828828514577397,
            "loss_sequences_upper_95": 3.8879359665709856,
            "loss_tokens_lower_95": 3.8290624492567256,
            "loss_tokens_upper_95": 3.888434200050285,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.408576801625265,
            "data_time": 0.008822601007190716,
            "batch_time": 0.03759527782659156,
            "samples_per_second": 1079807.5052752055,
            "samples_per_second_per_gpu": 134975.93815940068,
            "loss_sequences_lower_95": 5.322248242637529,
            "loss_sequences_upper_95": 5.49289299515169,
            "loss_tokens_lower_95": 5.319850451678907,
            "loss_tokens_upper_95": 5.493173231836837,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.919185472965241,
            "data_time": 0.006151819512957618,
            "batch_time": 0.03529883992104303,
            "samples_per_second": 1079107.3077100299,
            "samples_per_second_per_gpu": 134888.41346375374,
            "loss_sequences_lower_95": 7.86348330078125,
            "loss_sequences_upper_95": 7.9760617187500005,
            "loss_tokens_lower_95": 7.864006872558594,
            "loss_tokens_upper_95": 7.975813647460938,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.667394079074842,
            "data_time": 0.0019045032490403982,
            "batch_time": 0.030512580438913153,
            "samples_per_second": 1110943.6042584006,
            "samples_per_second_per_gpu": 138867.95053230008,
            "loss_sequences_lower_95": 5.317537334806646,
            "loss_sequences_upper_95": 5.435053918815043,
            "loss_tokens_lower_95": 3.902454007459583,
            "loss_tokens_upper_95": 3.9786228498379046,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.28972202005671,
            "data_time": 0.019649791717529296,
            "batch_time": 0.050608137675694054,
            "samples_per_second": 968956.0985368943,
            "samples_per_second_per_gpu": 121119.51231711179,
            "loss_sequences_lower_95": 5.144615378308652,
            "loss_sequences_upper_95": 5.434900038989622,
            "loss_tokens_lower_95": 5.145016912204116,
            "loss_tokens_upper_95": 5.43530642381355,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.377460196438958,
            "data_time": 0.01096791960299015,
            "batch_time": 0.04038723558187485,
            "samples_per_second": 1061301.6630747074,
            "samples_per_second_per_gpu": 132662.70788433842,
            "loss_sequences_lower_95": 5.273845885033701,
            "loss_sequences_upper_95": 5.479721009497549,
            "loss_tokens_lower_95": 5.275057983398438,
            "loss_tokens_upper_95": 5.477741328220741,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.387162883135664,
            "data_time": 0.0018414081588748934,
            "batch_time": 0.03039112082717716,
            "samples_per_second": 1112116.3503467096,
            "samples_per_second_per_gpu": 139014.5437933387,
            "loss_sequences_lower_95": 5.929966706595265,
            "loss_sequences_upper_95": 6.055314051147752,
            "loss_tokens_lower_95": 4.5673852971367825,
            "loss_tokens_upper_95": 4.662343217270961,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.2294710805176425,
            "data_time": 0.028765579064687092,
            "batch_time": 0.05873243510723114,
            "samples_per_second": 1000318.2498366069,
            "samples_per_second_per_gpu": 125039.78122957586,
            "loss_sequences_lower_95": 4.109725790679771,
            "loss_sequences_upper_95": 4.345954265291729,
            "loss_tokens_lower_95": 4.1103511103877315,
            "loss_tokens_upper_95": 4.344217120781147,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.577338888156669,
            "data_time": 0.003320334886311029,
            "batch_time": 0.03213283415970203,
            "samples_per_second": 1098659.1827195229,
            "samples_per_second_per_gpu": 137332.39783994036,
            "loss_sequences_lower_95": 6.555248799455275,
            "loss_sequences_upper_95": 6.599051629396024,
            "loss_tokens_lower_95": 6.555686535383218,
            "loss_tokens_upper_95": 6.598887151782301,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.5978727873089245,
            "data_time": 0.025245482271367855,
            "batch_time": 0.055975842475891116,
            "samples_per_second": 938685.4671487905,
            "samples_per_second_per_gpu": 117335.68339359881,
            "loss_sequences_lower_95": 5.436520785729862,
            "loss_sequences_upper_95": 5.7556630254949175,
            "loss_tokens_lower_95": 5.439389304744387,
            "loss_tokens_upper_95": 5.757877120230962,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.075638564427694,
            "data_time": 0.08349432051181793,
            "batch_time": 0.11681794375181198,
            "samples_per_second": 741283.0782705011,
            "samples_per_second_per_gpu": 92660.38478381264,
            "loss_sequences_lower_95": 4.699009691874186,
            "loss_sequences_upper_95": 5.654531237284342,
            "loss_tokens_lower_95": 4.190104654100207,
            "loss_tokens_upper_95": 5.547046958075629,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.442348392804464,
            "data_time": 0.08745456486940384,
            "batch_time": 0.12282533198595047,
            "samples_per_second": 703309.8902451482,
            "samples_per_second_per_gpu": 87913.73628064353,
            "loss_sequences_lower_95": 4.149824860890707,
            "loss_sequences_upper_95": 5.182542546590169,
            "loss_tokens_lower_95": 3.4452145608623375,
            "loss_tokens_upper_95": 4.857171785161736,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.934909904828303,
            "data_time": 0.003465440008063538,
            "batch_time": 0.032479193857517435,
            "samples_per_second": 1092330.519205357,
            "samples_per_second_per_gpu": 136541.31490066962,
            "loss_sequences_lower_95": 5.893073870696797,
            "loss_sequences_upper_95": 5.978607988540133,
            "loss_tokens_lower_95": 5.892076209269145,
            "loss_tokens_upper_95": 5.9777161669274665,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.5988359399727603,
            "data_time": 0.0012502029000147688,
            "batch_time": 0.030283692237845657,
            "samples_per_second": 1096626.9730968429,
            "samples_per_second_per_gpu": 137078.37163710536,
            "loss_sequences_lower_95": 1.8626381131714593,
            "loss_sequences_upper_95": 1.8987257004697224,
            "loss_tokens_lower_95": 1.3358703241592107,
            "loss_tokens_upper_95": 1.355476035625243,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.721092507595152,
            "data_time": 0.042525049299001694,
            "batch_time": 0.07363883033394814,
            "samples_per_second": 955887.0905328862,
            "samples_per_second_per_gpu": 119485.88631661078,
            "loss_sequences_lower_95": 2.6026040625384472,
            "loss_sequences_upper_95": 2.937943580206923,
            "loss_tokens_lower_95": 2.4362784932679316,
            "loss_tokens_upper_95": 2.5974011467352924,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.5464172943218335,
            "data_time": 0.12980227243332637,
            "batch_time": 0.16314847128731863,
            "samples_per_second": 563673.72684405,
            "samples_per_second_per_gpu": 70459.21585550625,
            "loss_sequences_lower_95": 4.139474054284998,
            "loss_sequences_upper_95": 5.080269829002586,
            "loss_tokens_lower_95": 3.935675632806472,
            "loss_tokens_upper_95": 4.9589763217502165,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.5710230607812,
            "data_time": 0.03396049283799671,
            "batch_time": 0.06352967023849487,
            "samples_per_second": 977834.8007171225,
            "samples_per_second_per_gpu": 122229.35008964031,
            "loss_sequences_lower_95": 2.495009934029928,
            "loss_sequences_upper_95": 2.779172781037121,
            "loss_tokens_lower_95": 2.3390923841266296,
            "loss_tokens_upper_95": 2.4748706034154226,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.613512840212845,
            "data_time": 0.03311622994286673,
            "batch_time": 0.06290807610466367,
            "samples_per_second": 982266.2284557575,
            "samples_per_second_per_gpu": 122783.27855696969,
            "loss_sequences_lower_95": 2.5795016591141864,
            "loss_sequences_upper_95": 2.8316745013725466,
            "loss_tokens_lower_95": 2.372398910262988,
            "loss_tokens_upper_95": 2.484051029084086,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.6134476836134746,
            "data_time": 0.033934615907214936,
            "batch_time": 0.06375462668282646,
            "samples_per_second": 973189.3717964448,
            "samples_per_second_per_gpu": 121648.6714745556,
            "loss_sequences_lower_95": 2.4203274052317547,
            "loss_sequences_upper_95": 2.7405847968124766,
            "loss_tokens_lower_95": 2.4598547033361484,
            "loss_tokens_upper_95": 2.6390828988687822,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.707159336020307,
            "data_time": 0.03363668067114694,
            "batch_time": 0.06355967124303182,
            "samples_per_second": 976527.8646358935,
            "samples_per_second_per_gpu": 122065.98307948669,
            "loss_sequences_lower_95": 2.67521462789396,
            "loss_sequences_upper_95": 2.9188503916670636,
            "loss_tokens_lower_95": 2.4655529366846767,
            "loss_tokens_upper_95": 2.570191812960901,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.2238510736027117,
            "data_time": 0.034646982028160564,
            "batch_time": 0.06497258610195583,
            "samples_per_second": 987429.668983677,
            "samples_per_second_per_gpu": 123428.70862295963,
            "loss_sequences_lower_95": 2.172051708150354,
            "loss_sequences_upper_95": 2.3147967178629054,
            "loss_tokens_lower_95": 2.1318930802592133,
            "loss_tokens_upper_95": 2.210499089417593,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.9730029760337457,
            "data_time": 0.031679204532078335,
            "batch_time": 0.06206912369955154,
            "samples_per_second": 967809.3442033489,
            "samples_per_second_per_gpu": 120976.1680254186,
            "loss_sequences_lower_95": 1.945226287841797,
            "loss_sequences_upper_95": 2.105344549039515,
            "loss_tokens_lower_95": 1.791307454721406,
            "loss_tokens_upper_95": 1.8595035189214135,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-0.25/params.txt",
    "uuid": "1b90fa26-88ee-403e-996c-0f5f09d912e4",
    "creation_date": "2023_12_14-06_41_24"
}