{
    "name": "rpj-d=576_l=24_h=8-1.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 3073547520,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 1.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "614709504",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=576_l=24_h=8-1.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.0783996959527333,
            "data_time": 0.034741856157779694,
            "batch_time": 0.36416464671492577,
            "samples_per_second": 842849.0621267401,
            "samples_per_second_per_gpu": 105356.13276584251,
            "loss_sequences_lower_95": 3.008007106781006,
            "loss_sequences_upper_95": 3.1444191551208496,
            "loss_tokens_lower_95": 3.0661127026875814,
            "loss_tokens_upper_95": 3.0905805079142254,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.531147408490181,
            "data_time": 0.0011390566452838078,
            "batch_time": 0.03026346113278866,
            "samples_per_second": 1094963.9615251662,
            "samples_per_second_per_gpu": 136870.49519064577,
            "loss_sequences_lower_95": 3.528572762620737,
            "loss_sequences_upper_95": 3.5336699089842236,
            "loss_tokens_lower_95": 3.520269572916667,
            "loss_tokens_upper_95": 3.542141333333333,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.8913333995001658,
            "data_time": 0.009125219345092774,
            "batch_time": 0.03836202716827392,
            "samples_per_second": 1059022.4061464665,
            "samples_per_second_per_gpu": 132377.8007683083,
            "loss_sequences_lower_95": 2.8649582531987408,
            "loss_sequences_upper_95": 2.917805281658562,
            "loss_tokens_lower_95": 2.879573328125,
            "loss_tokens_upper_95": 2.9030971770833336,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.366546301448468,
            "data_time": 0.0017224076743188657,
            "batch_time": 0.029936334314314944,
            "samples_per_second": 1129533.2399541545,
            "samples_per_second_per_gpu": 141191.6549942693,
            "loss_sequences_lower_95": 3.3535177291398197,
            "loss_sequences_upper_95": 3.3790767155283508,
            "loss_tokens_lower_95": 3.3554010260416667,
            "loss_tokens_upper_95": 3.377309875,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5191319469521827,
            "data_time": 0.009009185540248673,
            "batch_time": 0.0384639040882369,
            "samples_per_second": 1053786.0874661778,
            "samples_per_second_per_gpu": 131723.26093327222,
            "loss_sequences_lower_95": 3.4837477674309447,
            "loss_sequences_upper_95": 3.5532871285184093,
            "loss_tokens_lower_95": 3.5080351770833333,
            "loss_tokens_upper_95": 3.52994,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3069008229785353,
            "data_time": 0.003412267112213632,
            "batch_time": 0.03210939240196477,
            "samples_per_second": 1113310.8059027267,
            "samples_per_second_per_gpu": 139163.85073784084,
            "loss_sequences_lower_95": 3.264593368934823,
            "loss_sequences_upper_95": 3.3493745037522307,
            "loss_tokens_lower_95": 3.295675375,
            "loss_tokens_upper_95": 3.318000630208333,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.9542670275970382,
            "data_time": 0.0014583151453276442,
            "batch_time": 0.02972557134379376,
            "samples_per_second": 1131260.738394391,
            "samples_per_second_per_gpu": 141407.59229929888,
            "loss_sequences_lower_95": 1.9310793506855868,
            "loss_sequences_upper_95": 1.977241116270727,
            "loss_tokens_lower_95": 1.9439214166666667,
            "loss_tokens_upper_95": 1.9649058854166668,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.85934478200543,
            "data_time": 0.0017038531405115067,
            "batch_time": 0.02993707226091526,
            "samples_per_second": 1131537.2217218708,
            "samples_per_second_per_gpu": 141442.15271523385,
            "loss_sequences_lower_95": 3.8504972492637433,
            "loss_sequences_upper_95": 3.868088340559555,
            "loss_tokens_lower_95": 3.8484226458333333,
            "loss_tokens_upper_95": 3.8699975625,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6845121044453566,
            "data_time": 0.009187011491684686,
            "batch_time": 0.03784042407595922,
            "samples_per_second": 1072534.2889745221,
            "samples_per_second_per_gpu": 134066.78612181527,
            "loss_sequences_lower_95": 3.6428060081916125,
            "loss_sequences_upper_95": 3.729876479482263,
            "loss_tokens_lower_95": 3.673445822916667,
            "loss_tokens_upper_95": 3.695700291666667,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.298143922104666,
            "data_time": 0.008800283074378967,
            "batch_time": 0.037859934382140636,
            "samples_per_second": 1078259.4397192525,
            "samples_per_second_per_gpu": 134782.42996490657,
            "loss_sequences_lower_95": 4.26733997933007,
            "loss_sequences_upper_95": 4.325435762066144,
            "loss_tokens_lower_95": 4.285973479166667,
            "loss_tokens_upper_95": 4.310274916666667,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4727438877294436,
            "data_time": 0.0012468335137820695,
            "batch_time": 0.029400852811797152,
            "samples_per_second": 1136347.794155535,
            "samples_per_second_per_gpu": 142043.47426944188,
            "loss_sequences_lower_95": 3.4644830562324413,
            "loss_sequences_upper_95": 3.4809344807797724,
            "loss_tokens_lower_95": 3.461758338541667,
            "loss_tokens_upper_95": 3.4838890052083333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.388818943515414,
            "data_time": 0.0024747221197911244,
            "batch_time": 0.03114983064745189,
            "samples_per_second": 1113313.4447597717,
            "samples_per_second_per_gpu": 139164.18059497146,
            "loss_sequences_lower_95": 3.378249154811797,
            "loss_sequences_upper_95": 3.3991604246928366,
            "loss_tokens_lower_95": 3.3782100729166666,
            "loss_tokens_upper_95": 3.399609697916667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.819515652395407,
            "data_time": 0.00908397308922568,
            "batch_time": 0.03769686664988401,
            "samples_per_second": 1074761.359854683,
            "samples_per_second_per_gpu": 134345.16998183538,
            "loss_sequences_lower_95": 3.7827554636988148,
            "loss_sequences_upper_95": 3.8555490118494866,
            "loss_tokens_lower_95": 3.8083174479166666,
            "loss_tokens_upper_95": 3.83074809375,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.253217383946032,
            "data_time": 0.008906438531153705,
            "batch_time": 0.03780128091454981,
            "samples_per_second": 1067826.1351046762,
            "samples_per_second_per_gpu": 133478.26688808453,
            "loss_sequences_lower_95": 3.1893714298784372,
            "loss_sequences_upper_95": 3.315478552917356,
            "loss_tokens_lower_95": 3.2417783333333334,
            "loss_tokens_upper_95": 3.264544583333333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.112749657847664,
            "data_time": 0.07299673557281494,
            "batch_time": 0.10897019079753331,
            "samples_per_second": 514839.82245771214,
            "samples_per_second_per_gpu": 64354.97780721402,
            "loss_sequences_lower_95": 4.046041505986993,
            "loss_sequences_upper_95": 4.179682003368031,
            "loss_tokens_lower_95": 4.092484006014737,
            "loss_tokens_upper_95": 4.133643106980757,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9426308380271533,
            "data_time": 0.013457067988135597,
            "batch_time": 0.04259889098730954,
            "samples_per_second": 1044196.0217513056,
            "samples_per_second_per_gpu": 130524.5027189132,
            "loss_sequences_lower_95": 2.8431667094327966,
            "loss_sequences_upper_95": 3.042107068831997,
            "loss_tokens_lower_95": 2.93150640625,
            "loss_tokens_upper_95": 2.9536117083333333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.783689093778505,
            "data_time": 0.011488066365321478,
            "batch_time": 0.040961003551880516,
            "samples_per_second": 1057802.1248529237,
            "samples_per_second_per_gpu": 132225.26560661546,
            "loss_sequences_lower_95": 5.732995685990065,
            "loss_sequences_upper_95": 5.832232955892357,
            "loss_tokens_lower_95": 5.77205390625,
            "loss_tokens_upper_95": 5.795076677083333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7326844051236012,
            "data_time": 0.03233809769153595,
            "batch_time": 0.06309155747294426,
            "samples_per_second": 933299.189081546,
            "samples_per_second_per_gpu": 116662.39863519325,
            "loss_sequences_lower_95": 3.688163557208952,
            "loss_sequences_upper_95": 3.7784528763567815,
            "loss_tokens_lower_95": 3.7201416578449185,
            "loss_tokens_upper_95": 3.745163220264873,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.080417307901036,
            "data_time": 0.001718111794467396,
            "batch_time": 0.030542711801572378,
            "samples_per_second": 1102885.5868363034,
            "samples_per_second_per_gpu": 137860.69835453792,
            "loss_sequences_lower_95": 5.057280849972404,
            "loss_sequences_upper_95": 5.104492423955811,
            "loss_tokens_lower_95": 5.0566061998157315,
            "loss_tokens_upper_95": 5.104302702940286,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.297072776639825,
            "data_time": 0.0018590216518967015,
            "batch_time": 0.030603651218353563,
            "samples_per_second": 1105538.5777024385,
            "samples_per_second_per_gpu": 138192.3222128048,
            "loss_sequences_lower_95": 3.2852907728944682,
            "loss_sequences_upper_95": 3.310970214163015,
            "loss_tokens_lower_95": 3.28219014152839,
            "loss_tokens_upper_95": 3.3018153987044165,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.492225271353589,
            "data_time": 0.003223720527167145,
            "batch_time": 0.03231090604978053,
            "samples_per_second": 1094798.9847116924,
            "samples_per_second_per_gpu": 136849.87308896155,
            "loss_sequences_lower_95": 4.732498115608024,
            "loss_sequences_upper_95": 5.021500070578206,
            "loss_tokens_lower_95": 3.980553771667262,
            "loss_tokens_upper_95": 4.1903535426160055,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.80364606376489,
            "data_time": 0.003726592723359453,
            "batch_time": 0.032782230763993364,
            "samples_per_second": 1088516.5283322614,
            "samples_per_second_per_gpu": 136064.56604153267,
            "loss_sequences_lower_95": 4.926788541666667,
            "loss_sequences_upper_95": 5.129996476236979,
            "loss_tokens_lower_95": 4.494537944673742,
            "loss_tokens_upper_95": 4.637840163128931,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2543459350724757,
            "data_time": 0.004069605923705871,
            "batch_time": 0.033017509123858285,
            "samples_per_second": 1090543.5984040764,
            "samples_per_second_per_gpu": 136317.94980050955,
            "loss_sequences_lower_95": 3.298938053064585,
            "loss_sequences_upper_95": 3.3619232524787956,
            "loss_tokens_lower_95": 3.156633106921321,
            "loss_tokens_upper_95": 3.188546355225373,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.3149720625443893,
            "data_time": 0.02076963654586247,
            "batch_time": 0.05140053800174168,
            "samples_per_second": 997040.7028924355,
            "samples_per_second_per_gpu": 124630.08786155444,
            "loss_sequences_lower_95": 2.2932682037353516,
            "loss_sequences_upper_95": 2.403365991765803,
            "loss_tokens_lower_95": 2.2473413880630857,
            "loss_tokens_upper_95": 2.2930200985085154,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.535253197806222,
            "data_time": 0.01789960451424122,
            "batch_time": 0.048128681257367134,
            "samples_per_second": 980643.8703922821,
            "samples_per_second_per_gpu": 122580.48379903527,
            "loss_sequences_lower_95": 3.531182201151945,
            "loss_sequences_upper_95": 3.731980547223772,
            "loss_tokens_lower_95": 3.400535476261002,
            "loss_tokens_upper_95": 3.4954554206161137,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.153597602844238,
            "data_time": 0.015658297599890292,
            "batch_time": 0.04812340400157831,
            "samples_per_second": 1004031.4959424624,
            "samples_per_second_per_gpu": 125503.9369928078,
            "loss_sequences_lower_95": 4.120763000488281,
            "loss_sequences_upper_95": 4.230375600179036,
            "loss_tokens_lower_95": 4.010461606849699,
            "loss_tokens_upper_95": 4.232308186647771,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.442548438820417,
            "data_time": 0.0014383708875768944,
            "batch_time": 0.030387810278262647,
            "samples_per_second": 1099377.094999799,
            "samples_per_second_per_gpu": 137422.13687497488,
            "loss_sequences_lower_95": 6.457895676732199,
            "loss_sequences_upper_95": 6.534861757849023,
            "loss_tokens_lower_95": 6.291787916139089,
            "loss_tokens_upper_95": 6.372188821431984,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.684923924279936,
            "data_time": 0.0028567014124569476,
            "batch_time": 0.03223496735496009,
            "samples_per_second": 1083736.6345905366,
            "samples_per_second_per_gpu": 135467.07932381707,
            "loss_sequences_lower_95": 5.205885268702652,
            "loss_sequences_upper_95": 5.503937651252104,
            "loss_tokens_lower_95": 3.951506930844119,
            "loss_tokens_upper_95": 4.088136384359389,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.2610313315643795,
            "data_time": 0.004721182020934853,
            "batch_time": 0.03338233200279442,
            "samples_per_second": 1093762.6774702584,
            "samples_per_second_per_gpu": 136720.3346837823,
            "loss_sequences_lower_95": 4.70591152568726,
            "loss_sequences_upper_95": 5.034671093416702,
            "loss_tokens_lower_95": 3.8494833341526498,
            "loss_tokens_upper_95": 4.005389481107817,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.659820672039572,
            "data_time": 0.02193154607500349,
            "batch_time": 0.05166157015732357,
            "samples_per_second": 1008030.6444026884,
            "samples_per_second_per_gpu": 126003.83055033605,
            "loss_sequences_lower_95": 5.595072484125286,
            "loss_sequences_upper_95": 5.7246475568100745,
            "loss_tokens_lower_95": 5.595482478729666,
            "loss_tokens_upper_95": 5.722981631483662,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.616692943572998,
            "data_time": 0.04438580457980816,
            "batch_time": 0.07768603013111995,
            "samples_per_second": 849363.6189479822,
            "samples_per_second_per_gpu": 106170.45236849778,
            "loss_sequences_lower_95": 3.4831273193359378,
            "loss_sequences_upper_95": 3.8542505950927732,
            "loss_tokens_lower_95": 3.3102628759066834,
            "loss_tokens_upper_95": 3.7735227636019957,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.83104764060818,
            "data_time": 0.003166347437353466,
            "batch_time": 0.03189464068851588,
            "samples_per_second": 1106086.5475319612,
            "samples_per_second_per_gpu": 138260.81844149515,
            "loss_sequences_lower_95": 4.779491825162327,
            "loss_sequences_upper_95": 4.883673164422339,
            "loss_tokens_lower_95": 4.777843963404394,
            "loss_tokens_upper_95": 4.8833301306014905,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.023159496516697,
            "data_time": 0.004667394896315906,
            "batch_time": 0.034084643275274926,
            "samples_per_second": 1077821.5429499166,
            "samples_per_second_per_gpu": 134727.69286873957,
            "loss_sequences_lower_95": 4.964357818178619,
            "loss_sequences_upper_95": 5.080284614781686,
            "loss_tokens_lower_95": 4.963256785949657,
            "loss_tokens_upper_95": 5.0808972857800985,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8031015625067455,
            "data_time": 0.003262326732824997,
            "batch_time": 0.03213587126515596,
            "samples_per_second": 1094027.4741175533,
            "samples_per_second_per_gpu": 136753.43426469417,
            "loss_sequences_lower_95": 3.957740984596028,
            "loss_sequences_upper_95": 4.08509035328397,
            "loss_tokens_lower_95": 3.6246749077730356,
            "loss_tokens_upper_95": 3.682299267503567,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.66664590549469,
            "data_time": 0.010339658707380295,
            "batch_time": 0.03933712653815746,
            "samples_per_second": 1056236.8239865934,
            "samples_per_second_per_gpu": 132029.60299832418,
            "loss_sequences_lower_95": 5.856612451171875,
            "loss_sequences_upper_95": 6.397390454101562,
            "loss_tokens_lower_95": 5.062010684666611,
            "loss_tokens_upper_95": 5.42910880108372,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.013099208474159,
            "data_time": 0.14273706078529358,
            "batch_time": 0.17596440017223358,
            "samples_per_second": 570239.0215478793,
            "samples_per_second_per_gpu": 71279.87769348492,
            "loss_sequences_lower_95": 3.7930974304676055,
            "loss_sequences_upper_95": 4.292025709152222,
            "loss_tokens_lower_95": 3.577282986695739,
            "loss_tokens_upper_95": 4.337040447366649,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.24735525764268,
            "data_time": 0.02569446918812204,
            "batch_time": 0.05530246014290668,
            "samples_per_second": 946344.452793327,
            "samples_per_second_per_gpu": 118293.05659916588,
            "loss_sequences_lower_95": 5.7259078365632865,
            "loss_sequences_upper_95": 6.564847363000628,
            "loss_tokens_lower_95": 3.7992594997401703,
            "loss_tokens_upper_95": 4.274081823441464,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.8076052450102558,
            "data_time": 0.0027213655412197113,
            "batch_time": 0.0316885422087378,
            "samples_per_second": 1091600.122017481,
            "samples_per_second_per_gpu": 136450.0152521851,
            "loss_sequences_lower_95": 2.7793604112910932,
            "loss_sequences_upper_95": 2.8363720415147853,
            "loss_tokens_lower_95": 2.7782152541707625,
            "loss_tokens_upper_95": 2.8361986632740828,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.243500712990682,
            "data_time": 0.0023333455501344715,
            "batch_time": 0.030983063154179872,
            "samples_per_second": 1108630.4705775785,
            "samples_per_second_per_gpu": 138578.80882219732,
            "loss_sequences_lower_95": 3.214165007792791,
            "loss_sequences_upper_95": 3.3794086879002525,
            "loss_tokens_lower_95": 3.058533983164532,
            "loss_tokens_upper_95": 3.220257868751139,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3210358176475916,
            "data_time": 0.01620606581370036,
            "batch_time": 0.04502012663417392,
            "samples_per_second": 1009610.712680736,
            "samples_per_second_per_gpu": 126201.339085092,
            "loss_sequences_lower_95": 3.184010929764409,
            "loss_sequences_upper_95": 3.5639464884887246,
            "loss_tokens_lower_95": 3.0814967209271655,
            "loss_tokens_upper_95": 3.374325722439437,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7771358559615233,
            "data_time": 0.0046128291636705395,
            "batch_time": 0.034102143719792366,
            "samples_per_second": 1066070.8006520648,
            "samples_per_second_per_gpu": 133258.8500815081,
            "loss_sequences_lower_95": 3.810981857673145,
            "loss_sequences_upper_95": 3.9566621968571183,
            "loss_tokens_lower_95": 3.634147841286351,
            "loss_tokens_upper_95": 3.780007505200454,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.000201546564335,
            "data_time": 0.030243328639439175,
            "batch_time": 0.05949708961305164,
            "samples_per_second": 988815.134086105,
            "samples_per_second_per_gpu": 123601.89176076313,
            "loss_sequences_lower_95": 2.876636863336331,
            "loss_sequences_upper_95": 3.3659397032202745,
            "loss_tokens_lower_95": 2.7476776471534716,
            "loss_tokens_upper_95": 3.112538154440253,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.331330641671928,
            "data_time": 0.00193713180932149,
            "batch_time": 0.03047811908813367,
            "samples_per_second": 1110667.0332887121,
            "samples_per_second_per_gpu": 138833.37916108902,
            "loss_sequences_lower_95": 4.311812235493973,
            "loss_sequences_upper_95": 4.350484843062362,
            "loss_tokens_lower_95": 4.311732844615798,
            "loss_tokens_upper_95": 4.350446729970995,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.0976876228758432,
            "data_time": 0.042661250721324574,
            "batch_time": 0.07347672202370384,
            "samples_per_second": 899061.8036690647,
            "samples_per_second_per_gpu": 112382.72545863308,
            "loss_sequences_lower_95": 1.039858090298847,
            "loss_sequences_upper_95": 1.1998215888310404,
            "loss_tokens_lower_95": 0.9276964482411879,
            "loss_tokens_upper_95": 1.1534536152803543,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.44580223303546,
            "data_time": 0.001319486719664664,
            "batch_time": 0.03011425916177329,
            "samples_per_second": 1103379.4932809966,
            "samples_per_second_per_gpu": 137922.43666012457,
            "loss_sequences_lower_95": 4.757969118514151,
            "loss_sequences_upper_95": 4.801212073751966,
            "loss_tokens_lower_95": 3.9479635517408123,
            "loss_tokens_upper_95": 3.9891608438104447,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.626673545598984,
            "data_time": 0.005553809423295278,
            "batch_time": 0.035306637249295676,
            "samples_per_second": 1060694.886275446,
            "samples_per_second_per_gpu": 132586.86078443076,
            "loss_sequences_lower_95": 5.625827075195312,
            "loss_sequences_upper_95": 5.897309948730469,
            "loss_tokens_lower_95": 5.345692815581497,
            "loss_tokens_upper_95": 5.589780392392329,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.352775760318922,
            "data_time": 0.02104135489059707,
            "batch_time": 0.050901940313436214,
            "samples_per_second": 1001332.5766870194,
            "samples_per_second_per_gpu": 125166.57208587743,
            "loss_sequences_lower_95": 5.138281077509341,
            "loss_sequences_upper_95": 5.564560095745584,
            "loss_tokens_lower_95": 5.140766256581182,
            "loss_tokens_upper_95": 5.560330040973166,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.850916728828892,
            "data_time": 0.004225658724106938,
            "batch_time": 0.033207256750888134,
            "samples_per_second": 1091811.806383076,
            "samples_per_second_per_gpu": 136476.4757978845,
            "loss_sequences_lower_95": 4.809343243223248,
            "loss_sequences_upper_95": 4.892189007383404,
            "loss_tokens_lower_95": 4.809757459235914,
            "loss_tokens_upper_95": 4.89187314120206,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.0243629668951035,
            "data_time": 0.004141724490104837,
            "batch_time": 0.03330541291135423,
            "samples_per_second": 1087359.8891755084,
            "samples_per_second_per_gpu": 135919.98614693855,
            "loss_sequences_lower_95": 1.0672712341308592,
            "loss_sequences_upper_95": 1.12919658203125,
            "loss_tokens_lower_95": 0.9462193392982193,
            "loss_tokens_upper_95": 1.001666254392382,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.734030404545012,
            "data_time": 0.02014248924595969,
            "batch_time": 0.050504829202379496,
            "samples_per_second": 948230.6379996601,
            "samples_per_second_per_gpu": 118528.82974995751,
            "loss_sequences_lower_95": 6.364806736537389,
            "loss_sequences_upper_95": 7.100925728934152,
            "loss_tokens_lower_95": 6.358469645182291,
            "loss_tokens_upper_95": 7.109090401785714,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.3713480718433857,
            "data_time": 0.13787910342216492,
            "batch_time": 0.17467710375785828,
            "samples_per_second": 533490.7987272351,
            "samples_per_second_per_gpu": 66686.34984090438,
            "loss_sequences_lower_95": 2.147835671901703,
            "loss_sequences_upper_95": 3.1768490076065063,
            "loss_tokens_lower_95": 1.8123072358750805,
            "loss_tokens_upper_95": 2.353680243737919,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.503691425800324,
            "data_time": 0.00552398959795634,
            "batch_time": 0.03471832615988595,
            "samples_per_second": 1075884.1888861628,
            "samples_per_second_per_gpu": 134485.52361077035,
            "loss_sequences_lower_95": 7.417493127441406,
            "loss_sequences_upper_95": 7.778298291015624,
            "loss_tokens_lower_95": 7.222629873724355,
            "loss_tokens_upper_95": 7.536267750799756,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.845673988342285,
            "data_time": 0.005440738939103626,
            "batch_time": 0.03421778811348809,
            "samples_per_second": 1088883.0187592194,
            "samples_per_second_per_gpu": 136110.37734490243,
            "loss_sequences_lower_95": 6.93041123046875,
            "loss_sequences_upper_95": 7.16066455078125,
            "loss_tokens_lower_95": 6.602314568043472,
            "loss_tokens_upper_95": 6.806256589474055,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.864200115763338,
            "data_time": 0.0035368249567854764,
            "batch_time": 0.03267863123711934,
            "samples_per_second": 1085253.322609479,
            "samples_per_second_per_gpu": 135656.66532618488,
            "loss_sequences_lower_95": 4.832233028175285,
            "loss_sequences_upper_95": 4.8952841819791315,
            "loss_tokens_lower_95": 4.833218397088711,
            "loss_tokens_upper_95": 4.895649263469557,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.841375659687728,
            "data_time": 0.007915121551006583,
            "batch_time": 0.03811149438702448,
            "samples_per_second": 1029334.2639077078,
            "samples_per_second_per_gpu": 128666.78298846347,
            "loss_sequences_lower_95": 4.716724448489703,
            "loss_sequences_upper_95": 4.962214916129632,
            "loss_tokens_lower_95": 4.714867639028897,
            "loss_tokens_upper_95": 4.960196368192564,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.976797275066375,
            "data_time": 0.005583481656180488,
            "batch_time": 0.034924663721569,
            "samples_per_second": 1071036.109806173,
            "samples_per_second_per_gpu": 133879.51372577163,
            "loss_sequences_lower_95": 5.906160241699219,
            "loss_sequences_upper_95": 6.049093786621094,
            "loss_tokens_lower_95": 5.906922399902344,
            "loss_tokens_upper_95": 6.049882385253906,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.522192881190788,
            "data_time": 0.0018463010824019608,
            "batch_time": 0.030520958756228702,
            "samples_per_second": 1106796.066295522,
            "samples_per_second_per_gpu": 138349.50828694025,
            "loss_sequences_lower_95": 4.094901530717833,
            "loss_sequences_upper_95": 4.199169201232853,
            "loss_tokens_lower_95": 2.817008017118649,
            "loss_tokens_upper_95": 2.8854997693874505,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.669696439558001,
            "data_time": 0.016843068599700927,
            "batch_time": 0.04625786713191441,
            "samples_per_second": 1008605.8548684857,
            "samples_per_second_per_gpu": 126075.73185856071,
            "loss_sequences_lower_95": 5.437769738951726,
            "loss_sequences_upper_95": 5.901007159788217,
            "loss_tokens_lower_95": 5.440078974481839,
            "loss_tokens_upper_95": 5.896041665148379,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.680725556728887,
            "data_time": 0.009765509516000748,
            "batch_time": 0.038717626594007015,
            "samples_per_second": 1076470.5682205604,
            "samples_per_second_per_gpu": 134558.82102757005,
            "loss_sequences_lower_95": 5.51191115435432,
            "loss_sequences_upper_95": 5.846063639322916,
            "loss_tokens_lower_95": 5.516775668275123,
            "loss_tokens_upper_95": 5.842129265280331,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.963717806866395,
            "data_time": 0.00194954633653149,
            "batch_time": 0.03063843684901175,
            "samples_per_second": 1105875.17846796,
            "samples_per_second_per_gpu": 138234.397308495,
            "loss_sequences_lower_95": 4.379331176406113,
            "loss_sequences_upper_95": 4.477444464506608,
            "loss_tokens_lower_95": 3.3249931260769143,
            "loss_tokens_upper_95": 3.4021729258509916,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.008573516966805,
            "data_time": 0.024854113658269245,
            "batch_time": 0.05570333699385325,
            "samples_per_second": 978802.8618902281,
            "samples_per_second_per_gpu": 122350.35773627851,
            "loss_sequences_lower_95": 4.841203396913236,
            "loss_sequences_upper_95": 5.16967289031498,
            "loss_tokens_lower_95": 4.8426189684994005,
            "loss_tokens_upper_95": 5.167871497421668,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.921521684591194,
            "data_time": 0.003241112526228722,
            "batch_time": 0.03194468551211887,
            "samples_per_second": 1101493.6648666596,
            "samples_per_second_per_gpu": 137686.70810833244,
            "loss_sequences_lower_95": 4.879631504443807,
            "loss_sequences_upper_95": 4.962773444966074,
            "loss_tokens_lower_95": 4.880258938383983,
            "loss_tokens_upper_95": 4.962932629133219,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.714903138216259,
            "data_time": 0.021776797554709695,
            "batch_time": 0.0525563196702437,
            "samples_per_second": 934322.1529704218,
            "samples_per_second_per_gpu": 116790.26912130273,
            "loss_sequences_lower_95": 5.482630727823498,
            "loss_sequences_upper_95": 5.944929859939131,
            "loss_tokens_lower_95": 5.485190308209762,
            "loss_tokens_upper_95": 5.944531546287166,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4153822739919026,
            "data_time": 0.07017813622951508,
            "batch_time": 0.10156026482582092,
            "samples_per_second": 778019.9439317142,
            "samples_per_second_per_gpu": 97252.49299146427,
            "loss_sequences_lower_95": 3.122991886138916,
            "loss_sequences_upper_95": 3.9003549448649086,
            "loss_tokens_lower_95": 2.76311870680915,
            "loss_tokens_upper_95": 3.641484589046902,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1049172818660735,
            "data_time": 0.06990612298250198,
            "batch_time": 0.10377789288759232,
            "samples_per_second": 723711.9403722803,
            "samples_per_second_per_gpu": 90463.99254653504,
            "loss_sequences_lower_95": 2.902206859588623,
            "loss_sequences_upper_95": 3.699855677286784,
            "loss_tokens_lower_95": 2.363685912228702,
            "loss_tokens_upper_95": 3.37085164959511,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.180590499307691,
            "data_time": 0.0030140593305905475,
            "batch_time": 0.031774591599723186,
            "samples_per_second": 1101152.0577622105,
            "samples_per_second_per_gpu": 137644.0072202763,
            "loss_sequences_lower_95": 3.1524182650266934,
            "loss_sequences_upper_95": 3.209103748619293,
            "loss_tokens_lower_95": 3.152324211558818,
            "loss_tokens_upper_95": 3.209787040339654,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.6988275945514613,
            "data_time": 0.001274954951500736,
            "batch_time": 0.03005200480997432,
            "samples_per_second": 1103597.2693695317,
            "samples_per_second_per_gpu": 137949.65867119146,
            "loss_sequences_lower_95": 0.8224561441762335,
            "loss_sequences_upper_95": 0.8448424203576985,
            "loss_tokens_lower_95": 0.5597532474493404,
            "loss_tokens_upper_95": 0.5707549297463487,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.9088890655772892,
            "data_time": 0.036610737442970276,
            "batch_time": 0.06808317452669144,
            "samples_per_second": 952319.7371417735,
            "samples_per_second_per_gpu": 119039.96714272168,
            "loss_sequences_lower_95": 1.8222486661175104,
            "loss_sequences_upper_95": 2.0692818769319787,
            "loss_tokens_lower_95": 1.7052525227264876,
            "loss_tokens_upper_95": 1.8383230825455585,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.621377619537147,
            "data_time": 0.10777178264799572,
            "batch_time": 0.14144855453854516,
            "samples_per_second": 578695.5284024206,
            "samples_per_second_per_gpu": 72336.94105030257,
            "loss_sequences_lower_95": 3.213616737159523,
            "loss_sequences_upper_95": 4.0962180988208665,
            "loss_tokens_lower_95": 3.1123498092462984,
            "loss_tokens_upper_95": 4.054351053120177,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.7572339471520446,
            "data_time": 0.02763310216722034,
            "batch_time": 0.05816333918344407,
            "samples_per_second": 970186.3996312723,
            "samples_per_second_per_gpu": 121273.29995390904,
            "loss_sequences_lower_95": 1.7010245532524295,
            "loss_sequences_upper_95": 1.9097056179511838,
            "loss_tokens_lower_95": 1.5846639401216884,
            "loss_tokens_upper_95": 1.693040435765349,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.8090246399728263,
            "data_time": 0.028059661388397217,
            "batch_time": 0.057829802944546656,
            "samples_per_second": 984707.3592702961,
            "samples_per_second_per_gpu": 123088.41990878701,
            "loss_sequences_lower_95": 1.7815178103563263,
            "loss_sequences_upper_95": 1.9716943275637742,
            "loss_tokens_lower_95": 1.6308570243555398,
            "loss_tokens_upper_95": 1.7219542204404026,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.7504900112384703,
            "data_time": 0.02996028604961577,
            "batch_time": 0.05962246656417847,
            "samples_per_second": 990437.5088720819,
            "samples_per_second_per_gpu": 123804.68860901023,
            "loss_sequences_lower_95": 1.5999307167239305,
            "loss_sequences_upper_95": 1.8358831917367329,
            "loss_tokens_lower_95": 1.646609597099177,
            "loss_tokens_upper_95": 1.7902633939000503,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.8832380844325554,
            "data_time": 0.02696990966796875,
            "batch_time": 0.05720564581099011,
            "samples_per_second": 966641.3634149943,
            "samples_per_second_per_gpu": 120830.17042687429,
            "loss_sequences_lower_95": 1.8475342215561286,
            "loss_sequences_upper_95": 2.024034532686559,
            "loss_tokens_lower_95": 1.7111723712671583,
            "loss_tokens_upper_95": 1.7982039311964564,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.5330417941075674,
            "data_time": 0.030408205809416593,
            "batch_time": 0.059896133564136644,
            "samples_per_second": 1006460.6518828659,
            "samples_per_second_per_gpu": 125807.58148535824,
            "loss_sequences_lower_95": 1.4841888001246482,
            "loss_sequences_upper_95": 1.60213008904309,
            "loss_tokens_lower_95": 1.466313300901214,
            "loss_tokens_upper_95": 1.5327987213000878,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.4446307885937575,
            "data_time": 0.02850492795308431,
            "batch_time": 0.059359638463883174,
            "samples_per_second": 966278.3835669046,
            "samples_per_second_per_gpu": 120784.79794586307,
            "loss_sequences_lower_95": 1.4211321225980433,
            "loss_sequences_upper_95": 1.5463149303343238,
            "loss_tokens_lower_95": 1.3021243972272452,
            "loss_tokens_upper_95": 1.3576717048815174,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=576_l=24_h=8-1.0/params.txt",
    "uuid": "0ba085d6-bb1d-4f73-9f69-aa7389392c11",
    "creation_date": "2023_12_13-16_18_38"
}