{
    "name": "rpj-d=576_l=24_h=8-2.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 6147095040,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 2.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "1229419008",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=576_l=24_h=8-2.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 2.918924154837926,
            "data_time": 0.035532914102077484,
            "batch_time": 0.3706706501543522,
            "samples_per_second": 838631.8416692563,
            "samples_per_second_per_gpu": 104828.98020865704,
            "loss_sequences_lower_95": 2.8493795649210614,
            "loss_sequences_upper_95": 2.984846318562825,
            "loss_tokens_lower_95": 2.9070950317382813,
            "loss_tokens_upper_95": 2.930648880004883,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.400956711809327,
            "data_time": 0.0012889987476696124,
            "batch_time": 0.03095798203208246,
            "samples_per_second": 1073197.2586610771,
            "samples_per_second_per_gpu": 134149.65733263464,
            "loss_sequences_lower_95": 3.398372306218308,
            "loss_sequences_upper_95": 3.403501594504053,
            "loss_tokens_lower_95": 3.3902626249999996,
            "loss_tokens_upper_95": 3.4116306250000004,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.782764957389053,
            "data_time": 0.009753456115722656,
            "batch_time": 0.03910172843933105,
            "samples_per_second": 1052765.5843159761,
            "samples_per_second_per_gpu": 131595.69803949702,
            "loss_sequences_lower_95": 2.7568776033362563,
            "loss_sequences_upper_95": 2.808620163275271,
            "loss_tokens_lower_95": 2.7712755885416667,
            "loss_tokens_upper_95": 2.794453854166667,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.236707434703394,
            "data_time": 0.0016271030824435385,
            "batch_time": 0.030278145580699568,
            "samples_per_second": 1111894.3096449953,
            "samples_per_second_per_gpu": 138986.7887056244,
            "loss_sequences_lower_95": 3.223905338877255,
            "loss_sequences_upper_95": 3.248952606515786,
            "loss_tokens_lower_95": 3.2259426302083334,
            "loss_tokens_upper_95": 3.2473840677083334,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3917120207108695,
            "data_time": 0.009599207881912292,
            "batch_time": 0.038381430257364096,
            "samples_per_second": 1069845.6020888968,
            "samples_per_second_per_gpu": 133730.7002611121,
            "loss_sequences_lower_95": 3.3567849588491283,
            "loss_sequences_upper_95": 3.4256143294138237,
            "loss_tokens_lower_95": 3.380735489583333,
            "loss_tokens_upper_95": 3.402419890625,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.177478621558702,
            "data_time": 0.0038654146635014076,
            "batch_time": 0.033140970636969025,
            "samples_per_second": 1093192.1500499803,
            "samples_per_second_per_gpu": 136649.01875624753,
            "loss_sequences_lower_95": 3.1350882939300435,
            "loss_sequences_upper_95": 3.2197003227768097,
            "loss_tokens_lower_95": 3.1664353645833336,
            "loss_tokens_upper_95": 3.1885981510416666,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.8339461409559055,
            "data_time": 0.0016356014310827457,
            "batch_time": 0.030271995417640026,
            "samples_per_second": 1115260.1547882042,
            "samples_per_second_per_gpu": 139407.51934852553,
            "loss_sequences_lower_95": 1.8113405861367986,
            "loss_sequences_upper_95": 1.8564711565290177,
            "loss_tokens_lower_95": 1.8239671458333333,
            "loss_tokens_upper_95": 1.8442789791666667,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.754242275248023,
            "data_time": 0.0018140566693046311,
            "batch_time": 0.030533534919103784,
            "samples_per_second": 1111540.4690111114,
            "samples_per_second_per_gpu": 138942.55862638893,
            "loss_sequences_lower_95": 3.7448497729875654,
            "loss_sequences_upper_95": 3.763759949689136,
            "loss_tokens_lower_95": 3.7433941041666667,
            "loss_tokens_upper_95": 3.76471015625,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.569907847458754,
            "data_time": 0.009423818853166368,
            "batch_time": 0.039244066155146036,
            "samples_per_second": 1035121.0350662149,
            "samples_per_second_per_gpu": 129390.12938327686,
            "loss_sequences_lower_95": 3.529297830225006,
            "loss_sequences_upper_95": 3.615362319326013,
            "loss_tokens_lower_95": 3.5588124687500002,
            "loss_tokens_upper_95": 3.581175947916667,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.180143085864222,
            "data_time": 0.009402768686413765,
            "batch_time": 0.03870838228613138,
            "samples_per_second": 1062359.311976683,
            "samples_per_second_per_gpu": 132794.9139970854,
            "loss_sequences_lower_95": 4.148817341318244,
            "loss_sequences_upper_95": 4.2083257622398405,
            "loss_tokens_lower_95": 4.168103322916667,
            "loss_tokens_upper_95": 4.1922423645833335,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.331207660676617,
            "data_time": 0.0013449427917513921,
            "batch_time": 0.03002758514851732,
            "samples_per_second": 1114033.6719170474,
            "samples_per_second_per_gpu": 139254.20898963092,
            "loss_sequences_lower_95": 3.323115162106431,
            "loss_sequences_upper_95": 3.339337923547348,
            "loss_tokens_lower_95": 3.3204098281250003,
            "loss_tokens_upper_95": 3.3421613489583333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2456346659400173,
            "data_time": 0.0025990221720750286,
            "batch_time": 0.03125001568282077,
            "samples_per_second": 1111655.751893962,
            "samples_per_second_per_gpu": 138956.96898674525,
            "loss_sequences_lower_95": 3.235247779855724,
            "loss_sequences_upper_95": 3.255878104988807,
            "loss_tokens_lower_95": 3.235152984375,
            "loss_tokens_upper_95": 3.2562361041666663,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7152413354684084,
            "data_time": 0.009564668293526993,
            "batch_time": 0.038800082187878754,
            "samples_per_second": 1048933.6842802118,
            "samples_per_second_per_gpu": 131116.71053502648,
            "loss_sequences_lower_95": 3.678132818194964,
            "loss_sequences_upper_95": 3.752299558318411,
            "loss_tokens_lower_95": 3.7040032708333337,
            "loss_tokens_upper_95": 3.7264706875,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.125494271577497,
            "data_time": 0.009507988553598107,
            "batch_time": 0.03871492750616187,
            "samples_per_second": 1050345.8765136288,
            "samples_per_second_per_gpu": 131293.2345642036,
            "loss_sequences_lower_95": 3.0624733670423083,
            "loss_sequences_upper_95": 3.1863189697265626,
            "loss_tokens_lower_95": 3.1141668072916664,
            "loss_tokens_upper_95": 3.136735697916667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.948537978259,
            "data_time": 0.07703808375767299,
            "batch_time": 0.11096277407237462,
            "samples_per_second": 561407.8789597183,
            "samples_per_second_per_gpu": 70175.98486996478,
            "loss_sequences_lower_95": 3.883937731656161,
            "loss_sequences_upper_95": 4.012502930381081,
            "loss_tokens_lower_95": 3.928412385420366,
            "loss_tokens_upper_95": 3.9692845257845795,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.8094398916288994,
            "data_time": 0.013102225281975487,
            "batch_time": 0.042770599777048286,
            "samples_per_second": 1033317.2242838565,
            "samples_per_second_per_gpu": 129164.65303548206,
            "loss_sequences_lower_95": 2.7124639260873153,
            "loss_sequences_upper_95": 2.9062010473134565,
            "loss_tokens_lower_95": 2.7985030312499997,
            "loss_tokens_upper_95": 2.8200921354166666,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.660887075288315,
            "data_time": 0.012293924887975058,
            "batch_time": 0.04217842345436414,
            "samples_per_second": 1038251.806293346,
            "samples_per_second_per_gpu": 129781.47578666825,
            "loss_sequences_lower_95": 5.613064583247444,
            "loss_sequences_upper_95": 5.705968062670061,
            "loss_tokens_lower_95": 5.649382052083333,
            "loss_tokens_upper_95": 5.6722513333333335,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.552258630267909,
            "data_time": 0.03502342477440834,
            "batch_time": 0.06597453728318214,
            "samples_per_second": 920303.7592602015,
            "samples_per_second_per_gpu": 115037.96990752518,
            "loss_sequences_lower_95": 3.5061985453621287,
            "loss_sequences_upper_95": 3.60251211572866,
            "loss_tokens_lower_95": 3.539999520974081,
            "loss_tokens_upper_95": 3.564615330930616,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.478176907636623,
            "data_time": 0.001733831738828255,
            "batch_time": 0.03080571709298327,
            "samples_per_second": 1092241.4251771185,
            "samples_per_second_per_gpu": 136530.1781471398,
            "loss_sequences_lower_95": 5.455527377131997,
            "loss_sequences_upper_95": 5.501417789043583,
            "loss_tokens_lower_95": 5.455177645078158,
            "loss_tokens_upper_95": 5.501502189858995,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.167868826309064,
            "data_time": 0.002002523773035426,
            "batch_time": 0.031019253478308393,
            "samples_per_second": 1091984.6683300883,
            "samples_per_second_per_gpu": 136498.08354126103,
            "loss_sequences_lower_95": 3.1537448283615066,
            "loss_sequences_upper_95": 3.179068853199375,
            "loss_tokens_lower_95": 3.1530503588442795,
            "loss_tokens_upper_95": 3.1723457952792957,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.059180641827518,
            "data_time": 0.0032974496188869352,
            "batch_time": 0.03251300703007359,
            "samples_per_second": 1095541.199200777,
            "samples_per_second_per_gpu": 136942.64990009714,
            "loss_sequences_lower_95": 4.299408365501003,
            "loss_sequences_upper_95": 4.583512758048683,
            "loss_tokens_lower_95": 3.52543152451714,
            "loss_tokens_upper_95": 3.73555820656868,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.335439226140578,
            "data_time": 0.003752623625258182,
            "batch_time": 0.03284533876687922,
            "samples_per_second": 1083877.1374408666,
            "samples_per_second_per_gpu": 135484.64218010832,
            "loss_sequences_lower_95": 4.434792358398438,
            "loss_sequences_upper_95": 4.637131697591146,
            "loss_tokens_lower_95": 4.058716188826651,
            "loss_tokens_upper_95": 4.201977078419811,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9462066960208246,
            "data_time": 0.004675563464157542,
            "batch_time": 0.03402876458376601,
            "samples_per_second": 1078055.7246301654,
            "samples_per_second_per_gpu": 134756.96557877067,
            "loss_sequences_lower_95": 2.988621145021382,
            "loss_sequences_upper_95": 3.0473093630058283,
            "loss_tokens_lower_95": 2.855623340098974,
            "loss_tokens_upper_95": 2.8861788410387024,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.1923423409461975,
            "data_time": 0.021508974688393728,
            "batch_time": 0.05303415656089783,
            "samples_per_second": 1011965.4385541407,
            "samples_per_second_per_gpu": 126495.67981926759,
            "loss_sequences_lower_95": 2.1712868707830255,
            "loss_sequences_upper_95": 2.2777499146894975,
            "loss_tokens_lower_95": 2.125839074152222,
            "loss_tokens_upper_95": 2.170908562336239,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.360206789873084,
            "data_time": 0.019957533106207848,
            "batch_time": 0.05063258484005928,
            "samples_per_second": 969263.268054726,
            "samples_per_second_per_gpu": 121157.90850684074,
            "loss_sequences_lower_95": 3.349428991200973,
            "loss_sequences_upper_95": 3.542469445053412,
            "loss_tokens_lower_95": 3.229896624545108,
            "loss_tokens_upper_95": 3.3225336540495936,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.622152891953786,
            "data_time": 0.017520614159412872,
            "batch_time": 0.04718484481175741,
            "samples_per_second": 1006626.7775057268,
            "samples_per_second_per_gpu": 125828.34718821585,
            "loss_sequences_lower_95": 3.5937708943684896,
            "loss_sequences_upper_95": 3.7072107442220052,
            "loss_tokens_lower_95": 3.4902124732622255,
            "loss_tokens_upper_95": 3.673350066572833,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.783404406578183,
            "data_time": 0.0015127900993362935,
            "batch_time": 0.030410232324321468,
            "samples_per_second": 1099727.9917195064,
            "samples_per_second_per_gpu": 137465.9989649383,
            "loss_sequences_lower_95": 5.793574256234314,
            "loss_sequences_upper_95": 5.8733328226495995,
            "loss_tokens_lower_95": 5.639941672216183,
            "loss_tokens_upper_95": 5.721708654483611,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.582667286598722,
            "data_time": 0.0037204309197880277,
            "batch_time": 0.03287771044161496,
            "samples_per_second": 1086447.2128665654,
            "samples_per_second_per_gpu": 135805.90160832068,
            "loss_sequences_lower_95": 5.14691750882852,
            "loss_sequences_upper_95": 5.475445248382259,
            "loss_tokens_lower_95": 3.791889564174614,
            "loss_tokens_upper_95": 3.9321161448576576,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.113455339605084,
            "data_time": 0.004720020938563991,
            "batch_time": 0.03379912956340893,
            "samples_per_second": 1077838.1409045819,
            "samples_per_second_per_gpu": 134729.76761307273,
            "loss_sequences_lower_95": 4.578280889621774,
            "loss_sequences_upper_95": 4.945332987398011,
            "loss_tokens_lower_95": 3.681796823004901,
            "loss_tokens_upper_95": 3.8392024062071437,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.026009816557305,
            "data_time": 0.022738011819975718,
            "batch_time": 0.05283823183604649,
            "samples_per_second": 999642.0541195845,
            "samples_per_second_per_gpu": 124955.25676494806,
            "loss_sequences_lower_95": 5.942720205925371,
            "loss_sequences_upper_95": 6.111778949058219,
            "loss_tokens_lower_95": 5.940915195691531,
            "loss_tokens_upper_95": 6.109846350264876,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.50283077955246,
            "data_time": 0.045751915528224066,
            "batch_time": 0.07639504854495709,
            "samples_per_second": 897764.7654633995,
            "samples_per_second_per_gpu": 112220.59568292493,
            "loss_sequences_lower_95": 3.3644950256347657,
            "loss_sequences_upper_95": 3.7443517990112305,
            "loss_tokens_lower_95": 3.1854838253560347,
            "loss_tokens_upper_95": 3.647838487437458,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.452158583589318,
            "data_time": 0.0033376870223593372,
            "batch_time": 0.03229636429277666,
            "samples_per_second": 1096312.9850806403,
            "samples_per_second_per_gpu": 137039.12313508004,
            "loss_sequences_lower_95": 5.396645202949078,
            "loss_sequences_upper_95": 5.5080279784656465,
            "loss_tokens_lower_95": 5.396997657549418,
            "loss_tokens_upper_95": 5.507730724135587,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.772726183929568,
            "data_time": 0.0049078367273725845,
            "batch_time": 0.03422566183628499,
            "samples_per_second": 1077043.4239279425,
            "samples_per_second_per_gpu": 134630.42799099282,
            "loss_sequences_lower_95": 5.703268045211405,
            "loss_sequences_upper_95": 5.841513473923142,
            "loss_tokens_lower_95": 5.701030149467649,
            "loss_tokens_upper_95": 5.842577225218827,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6501312235104764,
            "data_time": 0.0034291141748943287,
            "batch_time": 0.032994916063148055,
            "samples_per_second": 1069652.9236351205,
            "samples_per_second_per_gpu": 133706.61545439006,
            "loss_sequences_lower_95": 3.8046473323585417,
            "loss_sequences_upper_95": 3.9313020359574775,
            "loss_tokens_lower_95": 3.4750006942445366,
            "loss_tokens_upper_95": 3.5310165871642596,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.523032949447632,
            "data_time": 0.010565169155597687,
            "batch_time": 0.04142977483570576,
            "samples_per_second": 1000652.6971353088,
            "samples_per_second_per_gpu": 125081.5871419136,
            "loss_sequences_lower_95": 5.696071032714844,
            "loss_sequences_upper_95": 6.243619287109375,
            "loss_tokens_lower_95": 4.905431708809873,
            "loss_tokens_upper_95": 5.271142957827494,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7459320574998856,
            "data_time": 0.1556103378534317,
            "batch_time": 0.19209425151348114,
            "samples_per_second": 524981.8176167723,
            "samples_per_second_per_gpu": 65622.72720209653,
            "loss_sequences_lower_95": 3.5284528374671935,
            "loss_sequences_upper_95": 4.013094067573547,
            "loss_tokens_lower_95": 3.3087488459444594,
            "loss_tokens_upper_95": 4.083502074493759,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.8172727878066315,
            "data_time": 0.025956526715704736,
            "batch_time": 0.05676246703939235,
            "samples_per_second": 906215.5276898552,
            "samples_per_second_per_gpu": 113276.9409612319,
            "loss_sequences_lower_95": 5.204645327864022,
            "loss_sequences_upper_95": 5.970450057106456,
            "loss_tokens_lower_95": 3.477817147479654,
            "loss_tokens_upper_95": 3.9347545189411286,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.2138441990639457,
            "data_time": 0.0028694522463613087,
            "batch_time": 0.031843935449918113,
            "samples_per_second": 1089908.561077068,
            "samples_per_second_per_gpu": 136238.5701346335,
            "loss_sequences_lower_95": 2.1903073932052752,
            "loss_sequences_upper_95": 2.2378870786147744,
            "loss_tokens_lower_95": 2.1901152217893323,
            "loss_tokens_upper_95": 2.2378453859235066,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9498331923160532,
            "data_time": 0.002409434568340759,
            "batch_time": 0.031450471537585956,
            "samples_per_second": 1094198.8133166805,
            "samples_per_second_per_gpu": 136774.85166458506,
            "loss_sequences_lower_95": 2.923759450085205,
            "loss_sequences_upper_95": 3.076078672314671,
            "loss_tokens_lower_95": 2.782377194506471,
            "loss_tokens_upper_95": 2.932271323097999,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2552854399104696,
            "data_time": 0.017495223217540316,
            "batch_time": 0.047063547703954905,
            "samples_per_second": 993037.8660908923,
            "samples_per_second_per_gpu": 124129.73326136154,
            "loss_sequences_lower_95": 3.099790591285342,
            "loss_sequences_upper_95": 3.494460701855111,
            "loss_tokens_lower_95": 3.0037155447791397,
            "loss_tokens_upper_95": 3.296644895767085,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6993821526288797,
            "data_time": 0.004422225058078766,
            "batch_time": 0.03347070515155792,
            "samples_per_second": 1078813.2492668857,
            "samples_per_second_per_gpu": 134851.6561583607,
            "loss_sequences_lower_95": 3.73217892906472,
            "loss_sequences_upper_95": 3.881509478899467,
            "loss_tokens_lower_95": 3.5540621076677805,
            "loss_tokens_upper_95": 3.698431032263826,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9188521395369276,
            "data_time": 0.030877110503968738,
            "batch_time": 0.06114481460480463,
            "samples_per_second": 977018.390870398,
            "samples_per_second_per_gpu": 122127.29885879975,
            "loss_sequences_lower_95": 2.7937842578422734,
            "loss_sequences_upper_95": 3.2834386127751047,
            "loss_tokens_lower_95": 2.649551436407142,
            "loss_tokens_upper_95": 3.016062586225849,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.217529189612871,
            "data_time": 0.0019591980536747015,
            "batch_time": 0.03159174783868977,
            "samples_per_second": 1072926.4176231474,
            "samples_per_second_per_gpu": 134115.80220289342,
            "loss_sequences_lower_95": 4.197975796331141,
            "loss_sequences_upper_95": 4.236787728639477,
            "loss_tokens_lower_95": 4.198202267406606,
            "loss_tokens_upper_95": 4.2366932077821815,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.8936720797159139,
            "data_time": 0.044006547060879794,
            "batch_time": 0.07701014605435458,
            "samples_per_second": 834385.2692319865,
            "samples_per_second_per_gpu": 104298.15865399831,
            "loss_sequences_lower_95": 0.8433164652111461,
            "loss_sequences_upper_95": 0.9721410732824826,
            "loss_tokens_lower_95": 0.7608416882361703,
            "loss_tokens_upper_95": 0.9459656964304991,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.1911168838790624,
            "data_time": 0.0013245007043512348,
            "batch_time": 0.030260306579269286,
            "samples_per_second": 1097348.0430294764,
            "samples_per_second_per_gpu": 137168.50537868455,
            "loss_sequences_lower_95": 4.528958394752358,
            "loss_sequences_upper_95": 4.572860191873034,
            "loss_tokens_lower_95": 3.6652341694874275,
            "loss_tokens_upper_95": 3.707335209139265,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.77020445895195,
            "data_time": 0.0056333716899629625,
            "batch_time": 0.034769035047954984,
            "samples_per_second": 1077046.4536105585,
            "samples_per_second_per_gpu": 134630.80670131982,
            "loss_sequences_lower_95": 5.785076867675781,
            "loss_sequences_upper_95": 6.055095690917969,
            "loss_tokens_lower_95": 5.459310858800996,
            "loss_tokens_upper_95": 5.707077359157482,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.692586232268292,
            "data_time": 0.022056056281267587,
            "batch_time": 0.052174293388754636,
            "samples_per_second": 1002501.295420018,
            "samples_per_second_per_gpu": 125312.66192750225,
            "loss_sequences_lower_95": 5.493631100861922,
            "loss_sequences_upper_95": 5.89435434092646,
            "loss_tokens_lower_95": 5.495086072838824,
            "loss_tokens_upper_95": 5.885977623980978,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.350405935446421,
            "data_time": 0.004539352583597942,
            "batch_time": 0.0335646379424865,
            "samples_per_second": 1086738.3663669773,
            "samples_per_second_per_gpu": 135842.29579587217,
            "loss_sequences_lower_95": 6.282170835552793,
            "loss_sequences_upper_95": 6.415969164299242,
            "loss_tokens_lower_95": 6.284478038441051,
            "loss_tokens_upper_95": 6.416064767548532,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.8727753545045853,
            "data_time": 0.0041100566691540654,
            "batch_time": 0.033075711511551066,
            "samples_per_second": 1090811.2261393138,
            "samples_per_second_per_gpu": 136351.40326741422,
            "loss_sequences_lower_95": 0.9097078002929687,
            "loss_sequences_upper_95": 0.9649244649251302,
            "loss_tokens_lower_95": 0.8050852215886355,
            "loss_tokens_upper_95": 0.8555670412696329,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.458200503530956,
            "data_time": 0.022189649088042124,
            "batch_time": 0.05325266080243247,
            "samples_per_second": 919089.1599184104,
            "samples_per_second_per_gpu": 114886.1449898013,
            "loss_sequences_lower_95": 6.0804682704380575,
            "loss_sequences_upper_95": 6.8334314691452755,
            "loss_tokens_lower_95": 6.077211129324777,
            "loss_tokens_upper_95": 6.840492146809895,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.3261025808751583,
            "data_time": 0.1463472843170166,
            "batch_time": 0.18253165483474731,
            "samples_per_second": 510363.4681639043,
            "samples_per_second_per_gpu": 63795.43352048804,
            "loss_sequences_lower_95": 2.1021707236766813,
            "loss_sequences_upper_95": 3.1072440862655637,
            "loss_tokens_lower_95": 1.766472882143001,
            "loss_tokens_upper_95": 2.283874757117832,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.621055804252625,
            "data_time": 0.005670518629134647,
            "batch_time": 0.03553358146122524,
            "samples_per_second": 1053882.0110651145,
            "samples_per_second_per_gpu": 131735.25138313932,
            "loss_sequences_lower_95": 7.553023815917968,
            "loss_sequences_upper_95": 7.885696496582031,
            "loss_tokens_lower_95": 7.354318769167724,
            "loss_tokens_upper_95": 7.6448479718564935,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.48493331193924,
            "data_time": 0.005913977585141621,
            "batch_time": 0.035188524495987666,
            "samples_per_second": 1074996.0775779653,
            "samples_per_second_per_gpu": 134374.50969724567,
            "loss_sequences_lower_95": 6.585091430664063,
            "loss_sequences_upper_95": 6.806349987792969,
            "loss_tokens_lower_95": 6.242821519877473,
            "loss_tokens_upper_95": 6.43169840209533,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.835023327957573,
            "data_time": 0.003746178796060109,
            "batch_time": 0.03273129574829918,
            "samples_per_second": 1088649.2008797782,
            "samples_per_second_per_gpu": 136081.15010997228,
            "loss_sequences_lower_95": 4.798319955660199,
            "loss_sequences_upper_95": 4.870944883964864,
            "loss_tokens_lower_95": 4.798812288187752,
            "loss_tokens_upper_95": 4.871254868080581,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.048094224636822,
            "data_time": 0.00812050222990376,
            "batch_time": 0.0373374472211855,
            "samples_per_second": 1061649.9171136057,
            "samples_per_second_per_gpu": 132706.2396392007,
            "loss_sequences_lower_95": 4.923904414257512,
            "loss_sequences_upper_95": 5.1685979371429775,
            "loss_tokens_lower_95": 4.922823163807483,
            "loss_tokens_upper_95": 5.166619244881673,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.700492072582245,
            "data_time": 0.005507815451849075,
            "batch_time": 0.03506013372587779,
            "samples_per_second": 1061477.0766525657,
            "samples_per_second_per_gpu": 132684.63458157072,
            "loss_sequences_lower_95": 5.612756823730469,
            "loss_sequences_upper_95": 5.791309924316407,
            "loss_tokens_lower_95": 5.611248706054687,
            "loss_tokens_upper_95": 5.793465686035156,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.053456363597904,
            "data_time": 0.0018690648736935708,
            "batch_time": 0.03094902205332015,
            "samples_per_second": 1091828.8759175674,
            "samples_per_second_per_gpu": 136478.60948969593,
            "loss_sequences_lower_95": 3.5213570523149245,
            "loss_sequences_upper_95": 3.614503763969371,
            "loss_tokens_lower_95": 2.4610034720185663,
            "loss_tokens_upper_95": 2.5236556571391247,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.7598221942559995,
            "data_time": 0.0183340276990618,
            "batch_time": 0.04776352643966675,
            "samples_per_second": 1004031.8255423639,
            "samples_per_second_per_gpu": 125503.97819279549,
            "loss_sequences_lower_95": 5.560345288177034,
            "loss_sequences_upper_95": 5.957346731157445,
            "loss_tokens_lower_95": 5.564357791729827,
            "loss_tokens_upper_95": 5.951589248429483,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.543432858411004,
            "data_time": 0.0101374052464962,
            "batch_time": 0.039782085455954075,
            "samples_per_second": 1055763.5472137812,
            "samples_per_second_per_gpu": 131970.44340172264,
            "loss_sequences_lower_95": 5.4108013556985295,
            "loss_sequences_upper_95": 5.673507056142769,
            "loss_tokens_lower_95": 5.411874138327207,
            "loss_tokens_upper_95": 5.672458220837163,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4363380221515376,
            "data_time": 0.0020369162229216973,
            "batch_time": 0.031133620373515313,
            "samples_per_second": 1090512.7648624296,
            "samples_per_second_per_gpu": 136314.0956078037,
            "loss_sequences_lower_95": 3.756477252932012,
            "loss_sequences_upper_95": 3.845998601520418,
            "loss_tokens_lower_95": 2.9085846492122482,
            "loss_tokens_upper_95": 2.979815720146552,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.696791833039945,
            "data_time": 0.026098936796188354,
            "batch_time": 0.05735192447900772,
            "samples_per_second": 964966.1856031675,
            "samples_per_second_per_gpu": 120620.77320039594,
            "loss_sequences_lower_95": 4.560495414935723,
            "loss_sequences_upper_95": 4.831050400506882,
            "loss_tokens_lower_95": 4.557826354011657,
            "loss_tokens_upper_95": 4.830187116350446,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.211697196595895,
            "data_time": 0.003888736714373578,
            "batch_time": 0.0328070653081668,
            "samples_per_second": 1091179.279532292,
            "samples_per_second_per_gpu": 136397.4099415365,
            "loss_sequences_lower_95": 4.168670852147841,
            "loss_sequences_upper_95": 4.253731334814602,
            "loss_tokens_lower_95": 4.170385548069572,
            "loss_tokens_upper_95": 4.2524990816728785,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.87448713385943,
            "data_time": 0.022618493166836826,
            "batch_time": 0.05366400588642467,
            "samples_per_second": 928911.0747775305,
            "samples_per_second_per_gpu": 116113.88434719131,
            "loss_sequences_lower_95": 5.660608887903899,
            "loss_sequences_upper_95": 6.085512831604596,
            "loss_tokens_lower_95": 5.6587510784852855,
            "loss_tokens_upper_95": 6.085456714815306,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.8433314343293508,
            "data_time": 0.07853294909000397,
            "batch_time": 0.113914355635643,
            "samples_per_second": 688529.8937322705,
            "samples_per_second_per_gpu": 86066.23671653381,
            "loss_sequences_lower_95": 2.6210503260294598,
            "loss_sequences_upper_95": 3.2345586585998536,
            "loss_tokens_lower_95": 2.378148587544759,
            "loss_tokens_upper_95": 3.262743706173367,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.797352437178294,
            "data_time": 0.07803942263126373,
            "batch_time": 0.11290210485458374,
            "samples_per_second": 710953.2533296165,
            "samples_per_second_per_gpu": 88869.15666620206,
            "loss_sequences_lower_95": 2.639698321024577,
            "loss_sequences_upper_95": 3.2400467173258463,
            "loss_tokens_lower_95": 2.200273218047753,
            "loss_tokens_upper_95": 3.17306066791663,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.9392499636480138,
            "data_time": 0.0033527433766976323,
            "batch_time": 0.03271861115647036,
            "samples_per_second": 1080131.2799209137,
            "samples_per_second_per_gpu": 135016.40999011422,
            "loss_sequences_lower_95": 1.9230037207175075,
            "loss_sequences_upper_95": 1.9559333434968704,
            "loss_tokens_lower_95": 1.9227751022586064,
            "loss_tokens_upper_95": 1.9558012918239138,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.5568062241909281,
            "data_time": 0.0013182224049609914,
            "batch_time": 0.03035162271079923,
            "samples_per_second": 1093619.5044311536,
            "samples_per_second_per_gpu": 136702.4380538942,
            "loss_sequences_lower_95": 0.6252450371819009,
            "loss_sequences_upper_95": 0.6385468762688701,
            "loss_tokens_lower_95": 0.4799891033348658,
            "loss_tokens_upper_95": 0.4881813496282314,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.7489941814753014,
            "data_time": 0.03884366899728775,
            "batch_time": 0.0705338604748249,
            "samples_per_second": 930177.062050123,
            "samples_per_second_per_gpu": 116272.13275626538,
            "loss_sequences_lower_95": 1.6719881192905697,
            "loss_sequences_upper_95": 1.9105057573693942,
            "loss_tokens_lower_95": 1.5590275688799204,
            "loss_tokens_upper_95": 1.6842681802305945,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6111828427057007,
            "data_time": 0.11643267813183013,
            "batch_time": 0.14983229410080684,
            "samples_per_second": 554003.288008781,
            "samples_per_second_per_gpu": 69250.41100109763,
            "loss_sequences_lower_95": 3.201960599744642,
            "loss_sequences_upper_95": 4.093474908777185,
            "loss_tokens_lower_95": 3.0967867721745996,
            "loss_tokens_upper_95": 4.025717464494116,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.613995990738636,
            "data_time": 0.028750155653272356,
            "batch_time": 0.0587172650155567,
            "samples_per_second": 977549.9138750404,
            "samples_per_second_per_gpu": 122193.73923438005,
            "loss_sequences_lower_95": 1.5610610636269173,
            "loss_sequences_upper_95": 1.7616812124484922,
            "loss_tokens_lower_95": 1.4468299582106812,
            "loss_tokens_upper_95": 1.5489013096943336,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.6587788273648518,
            "data_time": 0.03073663938613165,
            "batch_time": 0.06135985397157215,
            "samples_per_second": 961067.1255454576,
            "samples_per_second_per_gpu": 120133.3906931822,
            "loss_sequences_lower_95": 1.630680158661633,
            "loss_sequences_upper_95": 1.8150228546886908,
            "loss_tokens_lower_95": 1.490881838686122,
            "loss_tokens_upper_95": 1.576604877452687,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.6117562662537506,
            "data_time": 0.030672272046407063,
            "batch_time": 0.060469922565278555,
            "samples_per_second": 977520.819834776,
            "samples_per_second_per_gpu": 122190.102479347,
            "loss_sequences_lower_95": 1.4879237523893032,
            "loss_sequences_upper_95": 1.7120537222885506,
            "loss_tokens_lower_95": 1.5075127749144175,
            "loss_tokens_upper_95": 1.6438810764200869,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.7232296205148465,
            "data_time": 0.03094447794414702,
            "batch_time": 0.06143498988378616,
            "samples_per_second": 980403.8633654484,
            "samples_per_second_per_gpu": 122550.48292068105,
            "loss_sequences_lower_95": 1.6827614760980374,
            "loss_sequences_upper_95": 1.8527186975246523,
            "loss_tokens_lower_95": 1.5621235000752958,
            "loss_tokens_upper_95": 1.6449700138650578,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.4299193798385053,
            "data_time": 0.031659073299831815,
            "batch_time": 0.06269433763292101,
            "samples_per_second": 974639.1322984,
            "samples_per_second_per_gpu": 121829.8915373,
            "loss_sequences_lower_95": 1.3859612577450202,
            "loss_sequences_upper_95": 1.493932086636561,
            "loss_tokens_lower_95": 1.3668163376992541,
            "loss_tokens_upper_95": 1.4289254927656114,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.3150386730345285,
            "data_time": 0.03075532118479411,
            "batch_time": 0.062103915782201854,
            "samples_per_second": 958554.2592859417,
            "samples_per_second_per_gpu": 119819.28241074272,
            "loss_sequences_lower_95": 1.2941269548927867,
            "loss_sequences_upper_95": 1.4095580147533884,
            "loss_tokens_lower_95": 1.1836345342874097,
            "loss_tokens_upper_95": 1.2365428510209462,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-2.0/params.txt",
    "uuid": "8a71b7ae-a583-4682-a019-efb96af536f8",
    "creation_date": "2023_12_14-06_48_34"
}