{
    "name": "rpj-d=1024_l=24_h=8-1.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 8232325120,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 1.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "1646465024",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=1024_l=24_h=8-1.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 2.641669084628423,
            "data_time": 0.038881052285432816,
            "batch_time": 0.42418914288282394,
            "samples_per_second": 692171.2208012741,
            "samples_per_second_per_gpu": 86521.40260015926,
            "loss_sequences_lower_95": 2.5756512451171876,
            "loss_sequences_upper_95": 2.7048416900634766,
            "loss_tokens_lower_95": 2.6303314145406085,
            "loss_tokens_upper_95": 2.6530553754170736,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.149769456671634,
            "data_time": 0.0010261925389322816,
            "batch_time": 0.03659363252158569,
            "samples_per_second": 901526.1065795862,
            "samples_per_second_per_gpu": 112690.76332244827,
            "loss_sequences_lower_95": 3.1471539371051422,
            "loss_sequences_upper_95": 3.152362104995353,
            "loss_tokens_lower_95": 3.139273494791667,
            "loss_tokens_upper_95": 3.1601631197916666,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6047305204430407,
            "data_time": 0.008422588348388672,
            "batch_time": 0.04392278671264648,
            "samples_per_second": 868986.4556183214,
            "samples_per_second_per_gpu": 108623.30695229018,
            "loss_sequences_lower_95": 2.5798521641322543,
            "loss_sequences_upper_95": 2.6295506442323022,
            "loss_tokens_lower_95": 2.5936197395833336,
            "loss_tokens_upper_95": 2.6158728697916667,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9874104877353944,
            "data_time": 0.0014570542660198714,
            "batch_time": 0.03677056533725638,
            "samples_per_second": 906410.8933982486,
            "samples_per_second_per_gpu": 113301.36167478107,
            "loss_sequences_lower_95": 2.9752265524323454,
            "loss_sequences_upper_95": 2.9991368798324745,
            "loss_tokens_lower_95": 2.9767701250000003,
            "loss_tokens_upper_95": 2.997804619791667,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.147891891949531,
            "data_time": 0.008628223046838525,
            "batch_time": 0.04422117039501904,
            "samples_per_second": 866632.6621628599,
            "samples_per_second_per_gpu": 108329.08277035749,
            "loss_sequences_lower_95": 3.1137695436807853,
            "loss_sequences_upper_95": 3.181199792057822,
            "loss_tokens_lower_95": 3.1372768541666667,
            "loss_tokens_upper_95": 3.158342848958333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.93514068567534,
            "data_time": 0.0032556866822035417,
            "batch_time": 0.03863142301207003,
            "samples_per_second": 903107.0154291631,
            "samples_per_second_per_gpu": 112888.37692864539,
            "loss_sequences_lower_95": 2.894149230500616,
            "loss_sequences_upper_95": 2.9760035158905507,
            "loss_tokens_lower_95": 2.924437302083333,
            "loss_tokens_upper_95": 2.9458016666666667,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.6428124857192137,
            "data_time": 0.0015797107488062798,
            "batch_time": 0.03683231005271924,
            "samples_per_second": 910372.1184563539,
            "samples_per_second_per_gpu": 113796.51480704424,
            "loss_sequences_lower_95": 1.6213377710459185,
            "loss_sequences_upper_95": 1.6643721500318878,
            "loss_tokens_lower_95": 1.6333065338541668,
            "loss_tokens_upper_95": 1.6524725598958332,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.546620801406381,
            "data_time": 0.0016042775361919841,
            "batch_time": 0.03702352755940245,
            "samples_per_second": 908873.9432581832,
            "samples_per_second_per_gpu": 113609.2429072729,
            "loss_sequences_lower_95": 3.537811109293194,
            "loss_sequences_upper_95": 3.555415780431937,
            "loss_tokens_lower_95": 3.5360895104166667,
            "loss_tokens_upper_95": 3.5570205625,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.315645149568232,
            "data_time": 0.008584465299333845,
            "batch_time": 0.04390114261990502,
            "samples_per_second": 865256.6808232362,
            "samples_per_second_per_gpu": 108157.08510290453,
            "loss_sequences_lower_95": 3.2735474997419653,
            "loss_sequences_upper_95": 3.3618587835048275,
            "loss_tokens_lower_95": 3.30480590625,
            "loss_tokens_upper_95": 3.3265511458333337,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.933468927978998,
            "data_time": 0.008139913901686668,
            "batch_time": 0.04358369950205088,
            "samples_per_second": 876751.0379950241,
            "samples_per_second_per_gpu": 109593.87974937801,
            "loss_sequences_lower_95": 3.903773528690866,
            "loss_sequences_upper_95": 3.9603807170400507,
            "loss_tokens_lower_95": 3.9216240729166665,
            "loss_tokens_upper_95": 3.945567635416667,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.072953563039068,
            "data_time": 0.0012009733832022953,
            "batch_time": 0.0365381006281093,
            "samples_per_second": 909815.7288774475,
            "samples_per_second_per_gpu": 113726.96610968094,
            "loss_sequences_lower_95": 3.0649867079364808,
            "loss_sequences_upper_95": 3.080968908723191,
            "loss_tokens_lower_95": 3.0626163697916664,
            "loss_tokens_upper_95": 3.083255213541667,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9830272865017373,
            "data_time": 0.0023384302680835834,
            "batch_time": 0.03781917113050831,
            "samples_per_second": 902128.9196932786,
            "samples_per_second_per_gpu": 112766.11496165983,
            "loss_sequences_lower_95": 2.9729544096616647,
            "loss_sequences_upper_95": 2.9928963009390617,
            "loss_tokens_lower_95": 2.972877635416667,
            "loss_tokens_upper_95": 2.993357578125,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4975418860723715,
            "data_time": 0.008188682111355627,
            "batch_time": 0.04330689728024449,
            "samples_per_second": 868678.9412325459,
            "samples_per_second_per_gpu": 108584.86765406824,
            "loss_sequences_lower_95": 3.4638165023215644,
            "loss_sequences_upper_95": 3.529779993641449,
            "loss_tokens_lower_95": 3.4866261041666666,
            "loss_tokens_upper_95": 3.508491541666667,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8795339898519265,
            "data_time": 0.008668004754055068,
            "batch_time": 0.04406372199495475,
            "samples_per_second": 867351.2904725338,
            "samples_per_second_per_gpu": 108418.91130906672,
            "loss_sequences_lower_95": 2.8176641957580193,
            "loss_sequences_upper_95": 2.9393874158684445,
            "loss_tokens_lower_95": 2.868665588541667,
            "loss_tokens_upper_95": 2.8904280052083333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6033941128037195,
            "data_time": 0.07270738908222743,
            "batch_time": 0.10909618650163923,
            "samples_per_second": 517069.11079362984,
            "samples_per_second_per_gpu": 64633.63884920373,
            "loss_sequences_lower_95": 3.5442406567660245,
            "loss_sequences_upper_95": 3.662399326671254,
            "loss_tokens_lower_95": 3.584075658971613,
            "loss_tokens_upper_95": 3.6234666477550164,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5698505469730923,
            "data_time": 0.011730773882432417,
            "batch_time": 0.04737247662110762,
            "samples_per_second": 849751.640600564,
            "samples_per_second_per_gpu": 106218.9550750705,
            "loss_sequences_lower_95": 2.4778084802210505,
            "loss_sequences_upper_95": 2.661555948062819,
            "loss_tokens_lower_95": 2.559325244791667,
            "loss_tokens_upper_95": 2.5801625729166666,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.43290242796523,
            "data_time": 0.011107183992862701,
            "batch_time": 0.04648444801568985,
            "samples_per_second": 868338.6925053712,
            "samples_per_second_per_gpu": 108542.3365631714,
            "loss_sequences_lower_95": 5.379690390715184,
            "loss_sequences_upper_95": 5.482403822121331,
            "loss_tokens_lower_95": 5.421114052083333,
            "loss_tokens_upper_95": 5.444649552083333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.198718647487828,
            "data_time": 0.0341838076710701,
            "batch_time": 0.07034055143594742,
            "samples_per_second": 777163.6432620528,
            "samples_per_second_per_gpu": 97145.4554077566,
            "loss_sequences_lower_95": 3.158759063970847,
            "loss_sequences_upper_95": 3.2389386661717148,
            "loss_tokens_lower_95": 3.187026852467021,
            "loss_tokens_upper_95": 3.2104461357241774,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.897326402082825,
            "data_time": 0.0015083437356970576,
            "batch_time": 0.03690597163789093,
            "samples_per_second": 903002.1596486672,
            "samples_per_second_per_gpu": 112875.2699560834,
            "loss_sequences_lower_95": 4.874370220810782,
            "loss_sequences_upper_95": 4.920644483958838,
            "loss_tokens_lower_95": 4.87359062600146,
            "loss_tokens_upper_95": 4.92082021235401,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.9067219022592448,
            "data_time": 0.0017353908461370286,
            "batch_time": 0.03712832230101725,
            "samples_per_second": 901788.7499749317,
            "samples_per_second_per_gpu": 112723.59374686646,
            "loss_sequences_lower_95": 2.895969525648526,
            "loss_sequences_upper_95": 2.9206579193558926,
            "loss_tokens_lower_95": 2.890230461276761,
            "loss_tokens_upper_95": 2.9091908658288195,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5232927319917073,
            "data_time": 0.0029379076602658945,
            "batch_time": 0.03925672148220052,
            "samples_per_second": 901599.5862823542,
            "samples_per_second_per_gpu": 112699.94828529427,
            "loss_sequences_lower_95": 3.7811418146810047,
            "loss_sequences_upper_95": 4.064748700069379,
            "loss_tokens_lower_95": 2.981900675193963,
            "loss_tokens_upper_95": 3.184299582244386,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.701595135700889,
            "data_time": 0.00318209763537062,
            "batch_time": 0.038490887017960246,
            "samples_per_second": 897303.0408405194,
            "samples_per_second_per_gpu": 112162.88010506492,
            "loss_sequences_lower_95": 3.7702811442057294,
            "loss_sequences_upper_95": 3.9675873209635415,
            "loss_tokens_lower_95": 3.475340568494497,
            "loss_tokens_upper_95": 3.6150206490762575,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.570925239634568,
            "data_time": 0.003933929929546281,
            "batch_time": 0.03938908562581104,
            "samples_per_second": 892050.4046834905,
            "samples_per_second_per_gpu": 111506.30058543631,
            "loss_sequences_lower_95": 2.6104312737662294,
            "loss_sequences_upper_95": 2.6650743694175274,
            "loss_tokens_lower_95": 2.4891676900832445,
            "loss_tokens_upper_95": 2.5181847324899977,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.9755655294114893,
            "data_time": 0.019484019705227444,
            "batch_time": 0.055228935820715766,
            "samples_per_second": 832851.4945965831,
            "samples_per_second_per_gpu": 104106.43682457288,
            "loss_sequences_lower_95": 1.9579614916714754,
            "loss_sequences_upper_95": 2.0552546206387605,
            "loss_tokens_lower_95": 1.913251867504787,
            "loss_tokens_upper_95": 1.9554732258704524,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.042705924170358,
            "data_time": 0.018007535487413406,
            "batch_time": 0.053911617025732994,
            "samples_per_second": 821209.2239563343,
            "samples_per_second_per_gpu": 102651.15299454179,
            "loss_sequences_lower_95": 3.029614780970982,
            "loss_sequences_upper_95": 3.2145508746711577,
            "loss_tokens_lower_95": 2.9269012752835137,
            "loss_tokens_upper_95": 3.0140926035143023,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.083399844964345,
            "data_time": 0.015186591026110526,
            "batch_time": 0.050306109281686634,
            "samples_per_second": 838831.3119655178,
            "samples_per_second_per_gpu": 104853.91399568973,
            "loss_sequences_lower_95": 3.053918212890625,
            "loss_sequences_upper_95": 3.1490315755208336,
            "loss_tokens_lower_95": 2.9638143532355725,
            "loss_tokens_upper_95": 3.1436490996069604,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.418510976034103,
            "data_time": 0.0012416740111884212,
            "batch_time": 0.03655445271854196,
            "samples_per_second": 906122.3060147257,
            "samples_per_second_per_gpu": 113265.28825184071,
            "loss_sequences_lower_95": 5.426188781386865,
            "loss_sequences_upper_95": 5.509378171749668,
            "loss_tokens_lower_95": 5.2750150714170525,
            "loss_tokens_upper_95": 5.3612881927439195,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.9720134841723476,
            "data_time": 0.0026839581511964735,
            "batch_time": 0.03797450701662358,
            "samples_per_second": 901338.9007055281,
            "samples_per_second_per_gpu": 112667.36258819101,
            "loss_sequences_lower_95": 4.424550261320891,
            "loss_sequences_upper_95": 4.7102124352246415,
            "loss_tokens_lower_95": 3.3177636641139445,
            "loss_tokens_upper_95": 3.445533019298862,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.691271115335995,
            "data_time": 0.00442958965494826,
            "batch_time": 0.039871092986416175,
            "samples_per_second": 888978.2101873737,
            "samples_per_second_per_gpu": 111122.27627342171,
            "loss_sequences_lower_95": 4.054930119953871,
            "loss_sequences_upper_95": 4.37169081131346,
            "loss_tokens_lower_95": 3.3207535523682035,
            "loss_tokens_upper_95": 3.467917463924855,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.043720406484386,
            "data_time": 0.020129182509013584,
            "batch_time": 0.05541139841079712,
            "samples_per_second": 835353.5936236875,
            "samples_per_second_per_gpu": 104419.19920296094,
            "loss_sequences_lower_95": 5.951225468883775,
            "loss_sequences_upper_95": 6.132908149614726,
            "loss_tokens_lower_95": 5.952736102935931,
            "loss_tokens_upper_95": 6.131780448460687,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2221320629119874,
            "data_time": 0.04206302991280189,
            "batch_time": 0.08029904732337365,
            "samples_per_second": 705998.8805720757,
            "samples_per_second_per_gpu": 88249.86007150947,
            "loss_sequences_lower_95": 3.0894662399291994,
            "loss_sequences_upper_95": 3.45258935546875,
            "loss_tokens_lower_95": 2.917682501668367,
            "loss_tokens_upper_95": 3.365051689898605,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.5712471890473925,
            "data_time": 0.0030746013840283353,
            "batch_time": 0.038410709191928856,
            "samples_per_second": 901886.9695507978,
            "samples_per_second_per_gpu": 112735.87119384973,
            "loss_sequences_lower_95": 4.513143549189963,
            "loss_sequences_upper_95": 4.629840031665814,
            "loss_tokens_lower_95": 4.51129583946392,
            "loss_tokens_upper_95": 4.630569942165909,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.089723693650829,
            "data_time": 0.004494745229624613,
            "batch_time": 0.03987041538730142,
            "samples_per_second": 894584.8924131643,
            "samples_per_second_per_gpu": 111823.11155164553,
            "loss_sequences_lower_95": 5.016729331426597,
            "loss_sequences_upper_95": 5.162104866096565,
            "loss_tokens_lower_95": 5.013474602976555,
            "loss_tokens_upper_95": 5.163249087821842,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3421952674516007,
            "data_time": 0.003064073445215081,
            "batch_time": 0.03835051209056352,
            "samples_per_second": 896175.4361265552,
            "samples_per_second_per_gpu": 112021.9295158194,
            "loss_sequences_lower_95": 3.4931732543015506,
            "loss_sequences_upper_95": 3.623208488655298,
            "loss_tokens_lower_95": 3.158214032461701,
            "loss_tokens_upper_95": 3.213168220144747,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.231370785236359,
            "data_time": 0.009757491759955883,
            "batch_time": 0.04485353920608759,
            "samples_per_second": 868675.8847977726,
            "samples_per_second_per_gpu": 108584.48559972158,
            "loss_sequences_lower_95": 5.42086181640625,
            "loss_sequences_upper_95": 5.988351623535157,
            "loss_tokens_lower_95": 4.628516440956424,
            "loss_tokens_upper_95": 4.988446751814816,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6651246398687363,
            "data_time": 0.13366952538490295,
            "batch_time": 0.172504261136055,
            "samples_per_second": 464715.15701761644,
            "samples_per_second_per_gpu": 58089.394627202055,
            "loss_sequences_lower_95": 3.468993973731995,
            "loss_sequences_upper_95": 3.8928820133209228,
            "loss_tokens_lower_95": 3.223150389221893,
            "loss_tokens_upper_95": 4.035730234781901,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.845352952507721,
            "data_time": 0.023457585497105374,
            "batch_time": 0.058005715938324626,
            "samples_per_second": 792769.2251475672,
            "samples_per_second_per_gpu": 99096.1531434459,
            "loss_sequences_lower_95": 5.354069597967739,
            "loss_sequences_upper_95": 6.248370291172773,
            "loss_tokens_lower_95": 3.3600423200020257,
            "loss_tokens_upper_95": 3.8170704047064365,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.4081958323495782,
            "data_time": 0.002601493149995804,
            "batch_time": 0.03803930597172843,
            "samples_per_second": 895116.5172014602,
            "samples_per_second_per_gpu": 111889.56465018252,
            "loss_sequences_lower_95": 2.376668928887137,
            "loss_sequences_upper_95": 2.4389249589152606,
            "loss_tokens_lower_95": 2.376879312191118,
            "loss_tokens_upper_95": 2.439149965058773,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.3702556642255446,
            "data_time": 0.002153258199695473,
            "batch_time": 0.037498861585978996,
            "samples_per_second": 902916.5739429115,
            "samples_per_second_per_gpu": 112864.57174286393,
            "loss_sequences_lower_95": 2.3443568360891107,
            "loss_sequences_upper_95": 2.4782539070080536,
            "loss_tokens_lower_95": 2.23327183683467,
            "loss_tokens_upper_95": 2.363356396893798,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0836335942422077,
            "data_time": 0.016772187418407865,
            "batch_time": 0.05154729220602247,
            "samples_per_second": 830821.5393864292,
            "samples_per_second_per_gpu": 103852.69242330366,
            "loss_sequences_lower_95": 2.9526634970864096,
            "loss_sequences_upper_95": 3.3670397412645947,
            "loss_tokens_lower_95": 2.8264508976967218,
            "loss_tokens_upper_95": 3.113735597735748,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.522938344077793,
            "data_time": 0.004209118708968162,
            "batch_time": 0.03944052644073963,
            "samples_per_second": 891708.633761171,
            "samples_per_second_per_gpu": 111463.57922014638,
            "loss_sequences_lower_95": 3.5709931070257004,
            "loss_sequences_upper_95": 3.728980022090198,
            "loss_tokens_lower_95": 3.3732074849764664,
            "loss_tokens_upper_95": 3.515482504609493,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.641481743353169,
            "data_time": 0.026495785940261113,
            "batch_time": 0.06224435000192551,
            "samples_per_second": 812167.585018274,
            "samples_per_second_per_gpu": 101520.94812728425,
            "loss_sequences_lower_95": 2.5301342057018745,
            "loss_sequences_upper_95": 2.962350189395067,
            "loss_tokens_lower_95": 2.398026144951825,
            "loss_tokens_upper_95": 2.736879611987663,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.253504275083303,
            "data_time": 0.0018215629038123003,
            "batch_time": 0.037140162573115997,
            "samples_per_second": 902538.2066886565,
            "samples_per_second_per_gpu": 112817.27583608206,
            "loss_sequences_lower_95": 4.233110411144729,
            "loss_sequences_upper_95": 4.2733326059743195,
            "loss_tokens_lower_95": 4.2333328384426885,
            "loss_tokens_upper_95": 4.27352462680036,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.7810429328853644,
            "data_time": 0.037902428887107154,
            "batch_time": 0.07359522039240057,
            "samples_per_second": 744437.0948409413,
            "samples_per_second_per_gpu": 93054.63685511766,
            "loss_sequences_lower_95": 0.741513144854203,
            "loss_sequences_upper_95": 0.8624729823140265,
            "loss_tokens_lower_95": 0.6585850632666564,
            "loss_tokens_upper_95": 0.8330788869086648,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.330031414276899,
            "data_time": 0.0011009301144039542,
            "batch_time": 0.0364127028093354,
            "samples_per_second": 905737.5638778051,
            "samples_per_second_per_gpu": 113217.19548472564,
            "loss_sequences_lower_95": 4.677138876605084,
            "loss_sequences_upper_95": 4.7221698010842506,
            "loss_tokens_lower_95": 3.7893080875241782,
            "loss_tokens_upper_95": 3.8328143496131526,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.3433438210487365,
            "data_time": 0.004962165204305497,
            "batch_time": 0.04043591259017823,
            "samples_per_second": 887011.4567453054,
            "samples_per_second_per_gpu": 110876.43209316318,
            "loss_sequences_lower_95": 4.3627164306640625,
            "loss_sequences_upper_95": 4.643213049316406,
            "loss_tokens_lower_95": 4.027891017739753,
            "loss_tokens_upper_95": 4.277866679204128,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.275988343487615,
            "data_time": 0.020073256250155175,
            "batch_time": 0.05545596551086943,
            "samples_per_second": 836805.2556954485,
            "samples_per_second_per_gpu": 104600.65696193106,
            "loss_sequences_lower_95": 5.087328570822011,
            "loss_sequences_upper_95": 5.458650671917459,
            "loss_tokens_lower_95": 5.089110147227412,
            "loss_tokens_upper_95": 5.458138985011888,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.534115114717773,
            "data_time": 0.004027676151459475,
            "batch_time": 0.03946126835891999,
            "samples_per_second": 892028.7996790836,
            "samples_per_second_per_gpu": 111503.59995988545,
            "loss_sequences_lower_95": 7.441758441347065,
            "loss_sequences_upper_95": 7.623596986712831,
            "loss_tokens_lower_95": 7.44308760209517,
            "loss_tokens_upper_95": 7.62306455669981,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.0241375151475272,
            "data_time": 0.003674880304235093,
            "batch_time": 0.03909535801157038,
            "samples_per_second": 895610.4161635708,
            "samples_per_second_per_gpu": 111951.30202044635,
            "loss_sequences_lower_95": 1.0480916056315106,
            "loss_sequences_upper_95": 1.0886715372721356,
            "loss_tokens_lower_95": 0.9653598548794519,
            "loss_tokens_upper_95": 1.023170455682273,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.114630339259193,
            "data_time": 0.019846109407288686,
            "batch_time": 0.05506831620420728,
            "samples_per_second": 799031.2399001844,
            "samples_per_second_per_gpu": 99878.90498752306,
            "loss_sequences_lower_95": 5.717901059105283,
            "loss_sequences_upper_95": 6.51251466296968,
            "loss_tokens_lower_95": 5.723465183803013,
            "loss_tokens_upper_95": 6.5123257736932665,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.070982698351145,
            "data_time": 0.12564526498317719,
            "batch_time": 0.1647530347108841,
            "samples_per_second": 485913.959658613,
            "samples_per_second_per_gpu": 60739.24495732663,
            "loss_sequences_lower_95": 1.9014238744974137,
            "loss_sequences_upper_95": 2.7331552147865295,
            "loss_tokens_lower_95": 1.6142158901568542,
            "loss_tokens_upper_95": 2.098595606223824,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.6084132218360905,
            "data_time": 0.00502951183016338,
            "batch_time": 0.04029925948097592,
            "samples_per_second": 890274.3444262532,
            "samples_per_second_per_gpu": 111284.29305328165,
            "loss_sequences_lower_95": 7.539912963867188,
            "loss_sequences_upper_95": 7.891014086914063,
            "loss_tokens_lower_95": 7.3142676442206005,
            "loss_tokens_upper_95": 7.622517077450825,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.59235026216507,
            "data_time": 0.0053873904167659695,
            "batch_time": 0.04071389777319772,
            "samples_per_second": 888427.466387228,
            "samples_per_second_per_gpu": 111053.4332984035,
            "loss_sequences_lower_95": 6.679424621582031,
            "loss_sequences_upper_95": 6.885394653320312,
            "loss_tokens_lower_95": 6.3651172070605915,
            "loss_tokens_upper_95": 6.546002732125532,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.830849224554785,
            "data_time": 0.0030562137680308875,
            "batch_time": 0.03843032269174838,
            "samples_per_second": 896605.0439500007,
            "samples_per_second_per_gpu": 112075.63049375008,
            "loss_sequences_lower_95": 4.785452868991997,
            "loss_sequences_upper_95": 4.875966937646664,
            "loss_tokens_lower_95": 4.785985223231123,
            "loss_tokens_upper_95": 4.876272158442109,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.678753357695361,
            "data_time": 0.007256587826593405,
            "batch_time": 0.04268899496948611,
            "samples_per_second": 871580.2952097629,
            "samples_per_second_per_gpu": 108947.53690122036,
            "loss_sequences_lower_95": 4.577536071683587,
            "loss_sequences_upper_95": 4.781942463067637,
            "loss_tokens_lower_95": 4.57516112906226,
            "loss_tokens_upper_95": 4.780658465266777,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.20008879494667,
            "data_time": 0.004953339459404113,
            "batch_time": 0.040426985611991276,
            "samples_per_second": 885543.1949542646,
            "samples_per_second_per_gpu": 110692.89936928307,
            "loss_sequences_lower_95": 4.084101550292969,
            "loss_sequences_upper_95": 4.315953784179688,
            "loss_tokens_lower_95": 4.086362902832032,
            "loss_tokens_upper_95": 4.318279467773437,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7754321034888445,
            "data_time": 0.0015182579397479167,
            "batch_time": 0.0369238112509138,
            "samples_per_second": 901909.6990429774,
            "samples_per_second_per_gpu": 112738.71238037218,
            "loss_sequences_lower_95": 3.2420558719400425,
            "loss_sequences_upper_95": 3.333434811450449,
            "loss_tokens_lower_95": 2.2017362675558805,
            "loss_tokens_upper_95": 2.263364747072687,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.8922597739233895,
            "data_time": 0.016827975000653948,
            "batch_time": 0.05193274702344622,
            "samples_per_second": 837117.2886140458,
            "samples_per_second_per_gpu": 104639.66107675573,
            "loss_sequences_lower_95": 4.719864961994229,
            "loss_sequences_upper_95": 5.062564052752594,
            "loss_tokens_lower_95": 4.720911043081711,
            "loss_tokens_upper_95": 5.063131611382784,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.0756346356635,
            "data_time": 0.00964645016938448,
            "batch_time": 0.04498742613941431,
            "samples_per_second": 880182.0909863223,
            "samples_per_second_per_gpu": 110022.76137329028,
            "loss_sequences_lower_95": 4.950475679285386,
            "loss_sequences_upper_95": 5.197232007793352,
            "loss_tokens_lower_95": 4.952032889571845,
            "loss_tokens_upper_95": 5.196849089977788,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.718599504386962,
            "data_time": 0.0018894976453502107,
            "batch_time": 0.0372484544123015,
            "samples_per_second": 900771.3997560263,
            "samples_per_second_per_gpu": 112596.42496950329,
            "loss_sequences_lower_95": 2.9655658288511524,
            "loss_sequences_upper_95": 3.0514880926167325,
            "loss_tokens_lower_95": 2.296724833421931,
            "loss_tokens_upper_95": 2.361098049066063,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.071887175242106,
            "data_time": 0.02435576170682907,
            "batch_time": 0.05988647292057673,
            "samples_per_second": 828584.8720415444,
            "samples_per_second_per_gpu": 103573.10900519305,
            "loss_sequences_lower_95": 4.864537500452113,
            "loss_sequences_upper_95": 5.271102130223834,
            "loss_tokens_lower_95": 4.866549949040489,
            "loss_tokens_upper_95": 5.268760447022776,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3243277950753494,
            "data_time": 0.00291416875026456,
            "batch_time": 0.03837110766269931,
            "samples_per_second": 894674.105166151,
            "samples_per_second_per_gpu": 111834.26314576887,
            "loss_sequences_lower_95": 3.2891325915042047,
            "loss_sequences_upper_95": 3.359699826488437,
            "loss_tokens_lower_95": 3.2899653126194575,
            "loss_tokens_upper_95": 3.359884977661506,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.883178441269884,
            "data_time": 0.019483505595814098,
            "batch_time": 0.0539993936365301,
            "samples_per_second": 808683.0514390569,
            "samples_per_second_per_gpu": 101085.38142988211,
            "loss_sequences_lower_95": 4.695523382390586,
            "loss_sequences_upper_95": 5.069241984839579,
            "loss_tokens_lower_95": 4.6918355256608395,
            "loss_tokens_upper_95": 5.070660741120866,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.012692078948021,
            "data_time": 0.0679101049900055,
            "batch_time": 0.10425685346126556,
            "samples_per_second": 649556.6758929875,
            "samples_per_second_per_gpu": 81194.58448662344,
            "loss_sequences_lower_95": 1.8122598965962728,
            "loss_sequences_upper_95": 2.336436144510905,
            "loss_tokens_lower_95": 1.6553942044576009,
            "loss_tokens_upper_95": 2.2705996566348605,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.1452106753985087,
            "data_time": 0.0703757032752037,
            "batch_time": 0.10690390318632126,
            "samples_per_second": 637849.7005580865,
            "samples_per_second_per_gpu": 79731.21256976081,
            "loss_sequences_lower_95": 1.9826269213358563,
            "loss_sequences_upper_95": 2.6168715222676595,
            "loss_tokens_lower_95": 1.6431490308783028,
            "loss_tokens_upper_95": 2.3917302206660924,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5159355332117546,
            "data_time": 0.0027493239150740527,
            "batch_time": 0.038255855993637976,
            "samples_per_second": 895742.0851891882,
            "samples_per_second_per_gpu": 111967.76064864853,
            "loss_sequences_lower_95": 3.4881788691435016,
            "loss_sequences_upper_95": 3.54269641994201,
            "loss_tokens_lower_95": 3.4893460626840946,
            "loss_tokens_upper_95": 3.5432785101597015,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.5597551606256786,
            "data_time": 0.0010268569269769824,
            "batch_time": 0.036349563233766306,
            "samples_per_second": 905850.6109715722,
            "samples_per_second_per_gpu": 113231.32637144653,
            "loss_sequences_lower_95": 0.6370835434760309,
            "loss_sequences_upper_95": 0.6517466848431195,
            "loss_tokens_lower_95": 0.46808299921314145,
            "loss_tokens_upper_95": 0.4762544827178449,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.5019823753927637,
            "data_time": 0.033151622861623764,
            "batch_time": 0.0703028030693531,
            "samples_per_second": 794189.0392043081,
            "samples_per_second_per_gpu": 99273.62990053851,
            "loss_sequences_lower_95": 1.4253140366922215,
            "loss_sequences_upper_95": 1.6389011232871709,
            "loss_tokens_lower_95": 1.329045420950972,
            "loss_tokens_upper_95": 1.4455604577585743,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.985252581738137,
            "data_time": 0.09919445855276925,
            "batch_time": 0.13436110814412436,
            "samples_per_second": 516079.1805568319,
            "samples_per_second_per_gpu": 64509.897569603985,
            "loss_sequences_lower_95": 3.5031325159846127,
            "loss_sequences_upper_95": 4.5045895653802,
            "loss_tokens_lower_95": 3.3084050119659048,
            "loss_tokens_upper_95": 4.54261070534035,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.3396770826927045,
            "data_time": 0.025880504222143264,
            "batch_time": 0.0615280100277492,
            "samples_per_second": 816868.2072725756,
            "samples_per_second_per_gpu": 102108.52590907195,
            "loss_sequences_lower_95": 1.291486188842029,
            "loss_sequences_upper_95": 1.4673945892147902,
            "loss_tokens_lower_95": 1.200764373702297,
            "loss_tokens_upper_95": 1.2932650020204741,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.380419778387721,
            "data_time": 0.02545732543582008,
            "batch_time": 0.061760084969656806,
            "samples_per_second": 801353.3569349775,
            "samples_per_second_per_gpu": 100169.16961687218,
            "loss_sequences_lower_95": 1.356815045054366,
            "loss_sequences_upper_95": 1.5196214489820525,
            "loss_tokens_lower_95": 1.235300621371819,
            "loss_tokens_upper_95": 1.3135763689741042,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.3480754336569367,
            "data_time": 0.02678468113853818,
            "batch_time": 0.06296916802724202,
            "samples_per_second": 810208.8582306453,
            "samples_per_second_per_gpu": 101276.10727883066,
            "loss_sequences_lower_95": 1.2396064804821478,
            "loss_sequences_upper_95": 1.4350014523762027,
            "loss_tokens_lower_95": 1.257486883232551,
            "loss_tokens_upper_95": 1.382071721327586,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.43961934163803,
            "data_time": 0.02734188238779704,
            "batch_time": 0.06300868874504453,
            "samples_per_second": 811384.996920846,
            "samples_per_second_per_gpu": 101423.12461510576,
            "loss_sequences_lower_95": 1.398312594250935,
            "loss_sequences_upper_95": 1.547432008603724,
            "loss_tokens_lower_95": 1.300994121991214,
            "loss_tokens_upper_95": 1.3768617172478888,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.1887764790043327,
            "data_time": 0.027652999501169465,
            "batch_time": 0.06451026009924618,
            "samples_per_second": 809144.5770801979,
            "samples_per_second_per_gpu": 101143.07213502473,
            "loss_sequences_lower_95": 1.1461491792098335,
            "loss_sequences_upper_95": 1.2461777349436505,
            "loss_tokens_lower_95": 1.1314065384843885,
            "loss_tokens_upper_95": 1.1872996404513383,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.1310456962120243,
            "data_time": 0.027210116386413574,
            "batch_time": 0.06334003664198376,
            "samples_per_second": 808369.5486664894,
            "samples_per_second_per_gpu": 101046.19358331118,
            "loss_sequences_lower_95": 1.1171893980444931,
            "loss_sequences_upper_95": 1.2244238318466558,
            "loss_tokens_lower_95": 1.0136167454274034,
            "loss_tokens_upper_95": 1.0634899129746835,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-1.0/params.txt",
    "uuid": "226d1bca-1c90-4168-8481-13ecb6fbdea1",
    "creation_date": "2023_12_13-16_18_40"
}