{
    "name": "rpj-d=1024_l=24_h=8-0.5",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 4116162560,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.5
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "823232512",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=1024_l=24_h=8-0.5",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 2.786628646651904,
            "data_time": 0.04576977342367172,
            "batch_time": 0.4397192746400833,
            "samples_per_second": 684695.4515854771,
            "samples_per_second_per_gpu": 85586.93144818464,
            "loss_sequences_lower_95": 2.718042704264323,
            "loss_sequences_upper_95": 2.8520909372965497,
            "loss_tokens_lower_95": 2.7748177019755045,
            "loss_tokens_upper_95": 2.798236624399821,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2737747143040807,
            "data_time": 0.0011196463449622334,
            "batch_time": 0.0368466827136428,
            "samples_per_second": 896308.2146873642,
            "samples_per_second_per_gpu": 112038.52683592052,
            "loss_sequences_lower_95": 3.2711976115160177,
            "loss_sequences_upper_95": 3.2763236001305898,
            "loss_tokens_lower_95": 3.2632078020833335,
            "loss_tokens_upper_95": 3.284327364583333,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.692258158995181,
            "data_time": 0.00991139030456543,
            "batch_time": 0.04530858898162842,
            "samples_per_second": 866340.5577531905,
            "samples_per_second_per_gpu": 108292.56971914881,
            "loss_sequences_lower_95": 2.6688710364516903,
            "loss_sequences_upper_95": 2.7159134783063617,
            "loss_tokens_lower_95": 2.6809313645833335,
            "loss_tokens_upper_95": 2.703574666666667,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1332588141726463,
            "data_time": 0.0016678624639385625,
            "batch_time": 0.03697662722123297,
            "samples_per_second": 905620.7804031699,
            "samples_per_second_per_gpu": 113202.59755039624,
            "loss_sequences_lower_95": 3.1229066426385312,
            "loss_sequences_upper_95": 3.1434773175740975,
            "loss_tokens_lower_95": 3.12264446875,
            "loss_tokens_upper_95": 3.1437170260416667,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2746284182834042,
            "data_time": 0.009884283362156842,
            "batch_time": 0.045326313649515705,
            "samples_per_second": 869927.8064901665,
            "samples_per_second_per_gpu": 108740.97581127081,
            "loss_sequences_lower_95": 3.2424522943749206,
            "loss_sequences_upper_95": 3.3065837417260693,
            "loss_tokens_lower_95": 3.2638124739583336,
            "loss_tokens_upper_95": 3.2850962447916667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0669541289253677,
            "data_time": 0.0038428844317145968,
            "batch_time": 0.0392718480333038,
            "samples_per_second": 899648.4252848547,
            "samples_per_second_per_gpu": 112456.05316060684,
            "loss_sequences_lower_95": 3.025749558190113,
            "loss_sequences_upper_95": 3.1089303852343217,
            "loss_tokens_lower_95": 3.055902234375,
            "loss_tokens_upper_95": 3.07784534375,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.7445352035639237,
            "data_time": 0.0016205112463498388,
            "batch_time": 0.036957052656332395,
            "samples_per_second": 907307.6051963605,
            "samples_per_second_per_gpu": 113413.45064954506,
            "loss_sequences_lower_95": 1.7224343660315689,
            "loss_sequences_upper_95": 1.7666448600924747,
            "loss_tokens_lower_95": 1.7346371822916666,
            "loss_tokens_upper_95": 1.7545382916666667,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.648027144526936,
            "data_time": 0.001753275387459647,
            "batch_time": 0.03744958983660243,
            "samples_per_second": 902114.8693868808,
            "samples_per_second_per_gpu": 112764.3586733601,
            "loss_sequences_lower_95": 3.639521361665576,
            "loss_sequences_upper_95": 3.6570277221040577,
            "loss_tokens_lower_95": 3.637311229166667,
            "loss_tokens_upper_95": 3.6585545625,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.453845448125669,
            "data_time": 0.011068627947852724,
            "batch_time": 0.047260485944293794,
            "samples_per_second": 860114.8990266715,
            "samples_per_second_per_gpu": 107514.36237833394,
            "loss_sequences_lower_95": 3.413097350577998,
            "loss_sequences_upper_95": 3.500239290066851,
            "loss_tokens_lower_95": 3.4429092968750004,
            "loss_tokens_upper_95": 3.4649652343749997,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.089437219936386,
            "data_time": 0.009871430695056915,
            "batch_time": 0.0454191816970706,
            "samples_per_second": 872378.4398809419,
            "samples_per_second_per_gpu": 109047.30498511773,
            "loss_sequences_lower_95": 4.064521095969461,
            "loss_sequences_upper_95": 4.117386817178236,
            "loss_tokens_lower_95": 4.0775108229166666,
            "loss_tokens_upper_95": 4.101692333333333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2022814206330765,
            "data_time": 0.0013155381821318502,
            "batch_time": 0.036658169243818574,
            "samples_per_second": 908309.5876826736,
            "samples_per_second_per_gpu": 113538.6984603342,
            "loss_sequences_lower_95": 3.194223011168974,
            "loss_sequences_upper_95": 3.210354881500959,
            "loss_tokens_lower_95": 3.1916911979166667,
            "loss_tokens_upper_95": 3.212866317708333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1108086084336453,
            "data_time": 0.00274352824856697,
            "batch_time": 0.03827541416431843,
            "samples_per_second": 900733.9711788514,
            "samples_per_second_per_gpu": 112591.74639735643,
            "loss_sequences_lower_95": 3.1007056691059716,
            "loss_sequences_upper_95": 3.1206839526906105,
            "loss_tokens_lower_95": 3.1005401979166667,
            "loss_tokens_upper_95": 3.121246640625,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6067991885403834,
            "data_time": 0.010404424705053036,
            "batch_time": 0.04603021229679877,
            "samples_per_second": 859182.3728561411,
            "samples_per_second_per_gpu": 107397.79660701763,
            "loss_sequences_lower_95": 3.57156117034974,
            "loss_sequences_upper_95": 3.6429328036356647,
            "loss_tokens_lower_95": 3.5957738333333333,
            "loss_tokens_upper_95": 3.6177439479166664,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0120594477216485,
            "data_time": 0.010127536804077636,
            "batch_time": 0.045441678795681534,
            "samples_per_second": 868805.7199547206,
            "samples_per_second_per_gpu": 108600.71499434007,
            "loss_sequences_lower_95": 2.9505875348559223,
            "loss_sequences_upper_95": 3.0720192592644158,
            "loss_tokens_lower_95": 3.0011545520833334,
            "loss_tokens_upper_95": 3.0228566927083333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.770829677581787,
            "data_time": 0.08584347793034144,
            "batch_time": 0.12054744788578578,
            "samples_per_second": 511820.9091325855,
            "samples_per_second_per_gpu": 63977.61364157319,
            "loss_sequences_lower_95": 3.710606323588978,
            "loss_sequences_upper_95": 3.830117433721369,
            "loss_tokens_lower_95": 3.7510012886740944,
            "loss_tokens_upper_95": 3.7914430098100143,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6962899623042293,
            "data_time": 0.014235649596561085,
            "batch_time": 0.050000706856900994,
            "samples_per_second": 850014.0200882506,
            "samples_per_second_per_gpu": 106251.75251103133,
            "loss_sequences_lower_95": 2.601753782044347,
            "loss_sequences_upper_95": 2.7900291909976884,
            "loss_tokens_lower_95": 2.6857879583333335,
            "loss_tokens_upper_95": 2.7067819218749998,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.589430343507147,
            "data_time": 0.01329264665643374,
            "batch_time": 0.04941981161634127,
            "samples_per_second": 856262.197292489,
            "samples_per_second_per_gpu": 107032.77466156112,
            "loss_sequences_lower_95": 5.540071705840822,
            "loss_sequences_upper_95": 5.637207804254618,
            "loss_tokens_lower_95": 5.577537833333333,
            "loss_tokens_upper_95": 5.601246489583334,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3851047871542757,
            "data_time": 0.03590521588921547,
            "batch_time": 0.07271859049797058,
            "samples_per_second": 759106.9445947317,
            "samples_per_second_per_gpu": 94888.36807434146,
            "loss_sequences_lower_95": 3.323258315539751,
            "loss_sequences_upper_95": 3.4721702763291655,
            "loss_tokens_lower_95": 3.3732948803510823,
            "loss_tokens_upper_95": 3.3970758844594484,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.961389425337068,
            "data_time": 0.0016318395360455699,
            "batch_time": 0.03709279890076718,
            "samples_per_second": 899954.6946205666,
            "samples_per_second_per_gpu": 112494.33682757082,
            "loss_sequences_lower_95": 4.939403497097992,
            "loss_sequences_upper_95": 4.984138293823886,
            "loss_tokens_lower_95": 4.938873029349452,
            "loss_tokens_upper_95": 4.983626519993591,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0317133359523485,
            "data_time": 0.0017768480120950443,
            "batch_time": 0.03731989333773874,
            "samples_per_second": 896978.9077552282,
            "samples_per_second_per_gpu": 112122.36346940353,
            "loss_sequences_lower_95": 3.0207917663780868,
            "loss_sequences_upper_95": 3.0457850562792523,
            "loss_tokens_lower_95": 3.015298673031809,
            "loss_tokens_upper_95": 3.034447231472163,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.947654130457283,
            "data_time": 0.0031304770657524087,
            "batch_time": 0.038790563498882616,
            "samples_per_second": 892357.109777965,
            "samples_per_second_per_gpu": 111544.63872224563,
            "loss_sequences_lower_95": 4.219047870014318,
            "loss_sequences_upper_95": 4.504981552794343,
            "loss_tokens_lower_95": 3.3861169494672536,
            "loss_tokens_upper_95": 3.5946278999043213,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.036080919951201,
            "data_time": 0.003692200684801061,
            "batch_time": 0.039205489482017276,
            "samples_per_second": 890811.6389981403,
            "samples_per_second_per_gpu": 111351.45487476754,
            "loss_sequences_lower_95": 4.121960009765625,
            "loss_sequences_upper_95": 4.3151470703125,
            "loss_tokens_lower_95": 3.783726169418239,
            "loss_tokens_upper_95": 3.923031747494104,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8304197014598254,
            "data_time": 0.004780104077599527,
            "batch_time": 0.04028570238461501,
            "samples_per_second": 889434.8013218648,
            "samples_per_second_per_gpu": 111179.3501652331,
            "loss_sequences_lower_95": 2.8768510542284638,
            "loss_sequences_upper_95": 2.936384768771619,
            "loss_tokens_lower_95": 2.7370810167820157,
            "loss_tokens_upper_95": 2.767203084416239,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.0829167420213874,
            "data_time": 0.02438182064465114,
            "batch_time": 0.06014972712312426,
            "samples_per_second": 834430.8360921165,
            "samples_per_second_per_gpu": 104303.85451151457,
            "loss_sequences_lower_95": 2.06326490575617,
            "loss_sequences_upper_95": 2.1641936007413,
            "loss_tokens_lower_95": 2.019108002714653,
            "loss_tokens_upper_95": 2.062616435616749,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1886796542576383,
            "data_time": 0.021150020882487297,
            "batch_time": 0.05661090649664402,
            "samples_per_second": 821291.8656587522,
            "samples_per_second_per_gpu": 102661.48320734402,
            "loss_sequences_lower_95": 3.1787564211475607,
            "loss_sequences_upper_95": 3.3664009374501753,
            "loss_tokens_lower_95": 3.0689680703072106,
            "loss_tokens_upper_95": 3.158813479868399,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.294028436342875,
            "data_time": 0.01697745231481699,
            "batch_time": 0.05195589860280355,
            "samples_per_second": 839803.3300987578,
            "samples_per_second_per_gpu": 104975.41626234472,
            "loss_sequences_lower_95": 3.2636807963053385,
            "loss_sequences_upper_95": 3.359476226806641,
            "loss_tokens_lower_95": 3.1629155822678463,
            "loss_tokens_upper_95": 3.372979042232405,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.576499938212535,
            "data_time": 0.001392955632674836,
            "batch_time": 0.036899074843145474,
            "samples_per_second": 899509.4860129901,
            "samples_per_second_per_gpu": 112438.68575162377,
            "loss_sequences_lower_95": 5.585805189982039,
            "loss_sequences_upper_95": 5.665121518380001,
            "loss_tokens_lower_95": 5.436010328073487,
            "loss_tokens_upper_95": 5.516771370242845,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.170657716365374,
            "data_time": 0.00291761375913684,
            "batch_time": 0.038518846635050424,
            "samples_per_second": 892484.0470743871,
            "samples_per_second_per_gpu": 111560.50588429839,
            "loss_sequences_lower_95": 4.634122217865504,
            "loss_sequences_upper_95": 4.914391226720329,
            "loss_tokens_lower_95": 3.511323640218764,
            "loss_tokens_upper_95": 3.6400859614483987,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8613334893163156,
            "data_time": 0.005097487488308469,
            "batch_time": 0.040986608411814715,
            "samples_per_second": 885824.648518694,
            "samples_per_second_per_gpu": 110728.08106483675,
            "loss_sequences_lower_95": 4.2415394232948485,
            "loss_sequences_upper_95": 4.554288740451018,
            "loss_tokens_lower_95": 3.4765046042448167,
            "loss_tokens_upper_95": 3.6251317209180383,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.31051042417413,
            "data_time": 0.022786661982536316,
            "batch_time": 0.05808532450880323,
            "samples_per_second": 834258.3176137509,
            "samples_per_second_per_gpu": 104282.28970171887,
            "loss_sequences_lower_95": 5.231724945486409,
            "loss_sequences_upper_95": 5.389907920523865,
            "loss_tokens_lower_95": 5.230981570727204,
            "loss_tokens_upper_95": 5.387480100866866,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.329074420928955,
            "data_time": 0.04636099705329308,
            "batch_time": 0.08216962447533241,
            "samples_per_second": 748189.6884084672,
            "samples_per_second_per_gpu": 93523.7110510584,
            "loss_sequences_lower_95": 3.1924891357421874,
            "loss_sequences_upper_95": 3.5635330352783203,
            "loss_tokens_lower_95": 3.030052138654405,
            "loss_tokens_upper_95": 3.4824662810787954,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.546434528507986,
            "data_time": 0.003329544710966707,
            "batch_time": 0.03880224832484083,
            "samples_per_second": 897794.2027621166,
            "samples_per_second_per_gpu": 112224.27534526457,
            "loss_sequences_lower_95": 4.495724765305143,
            "loss_sequences_upper_95": 4.597704603337705,
            "loss_tokens_lower_95": 4.495088275452597,
            "loss_tokens_upper_95": 4.597025532561413,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.129778462096768,
            "data_time": 0.004992936213681593,
            "batch_time": 0.04037578032028422,
            "samples_per_second": 892382.2537293928,
            "samples_per_second_per_gpu": 111547.7817161741,
            "loss_sequences_lower_95": 5.0731578580089325,
            "loss_sequences_upper_95": 5.185508664292844,
            "loss_tokens_lower_95": 5.07038929132435,
            "loss_tokens_upper_95": 5.186302471238994,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4692901985170534,
            "data_time": 0.003656198087572794,
            "batch_time": 0.03904342625620041,
            "samples_per_second": 891446.5720407134,
            "samples_per_second_per_gpu": 111430.82150508917,
            "loss_sequences_lower_95": 3.6219405247934238,
            "loss_sequences_upper_95": 3.7506949639553864,
            "loss_tokens_lower_95": 3.294013079567073,
            "loss_tokens_upper_95": 3.349426827799194,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.37798446726799,
            "data_time": 0.010903526097536087,
            "batch_time": 0.04659164138138294,
            "samples_per_second": 856922.7837125843,
            "samples_per_second_per_gpu": 107115.34796407304,
            "loss_sequences_lower_95": 5.573949487304687,
            "loss_sequences_upper_95": 6.129416687011719,
            "loss_tokens_lower_95": 4.771356504125993,
            "loss_tokens_upper_95": 5.127648877308591,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.698574185371399,
            "data_time": 0.16572006046772003,
            "batch_time": 0.20359990000724792,
            "samples_per_second": 488269.5043916917,
            "samples_per_second_per_gpu": 61033.68804896146,
            "loss_sequences_lower_95": 3.4658967018127442,
            "loss_sequences_upper_95": 3.971192365884781,
            "loss_tokens_lower_95": 3.2650339323898843,
            "loss_tokens_upper_95": 4.052488006942573,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.518801822059456,
            "data_time": 0.027406867514265344,
            "batch_time": 0.06249951301737035,
            "samples_per_second": 777103.6444098747,
            "samples_per_second_per_gpu": 97137.95555123434,
            "loss_sequences_lower_95": 4.917792563602842,
            "loss_sequences_upper_95": 5.677363981049636,
            "loss_tokens_lower_95": 3.2794829324509407,
            "loss_tokens_upper_95": 3.711405881174253,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.3898580122223168,
            "data_time": 0.002933284060822593,
            "batch_time": 0.038370823280678854,
            "samples_per_second": 893150.0739368687,
            "samples_per_second_per_gpu": 111643.75924210859,
            "loss_sequences_lower_95": 2.363782922187295,
            "loss_sequences_upper_95": 2.4161410657116917,
            "loss_tokens_lower_95": 2.3631371547687308,
            "loss_tokens_upper_95": 2.4159951786226244,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6602646292519214,
            "data_time": 0.002344599887757603,
            "batch_time": 0.03786167508185627,
            "samples_per_second": 897825.952613787,
            "samples_per_second_per_gpu": 112228.24407672338,
            "loss_sequences_lower_95": 2.633081348433255,
            "loss_sequences_upper_95": 2.7771778215511596,
            "loss_tokens_lower_95": 2.5085712584447912,
            "loss_tokens_upper_95": 2.6496238694589067,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1652021484497266,
            "data_time": 0.01891306373808119,
            "batch_time": 0.05397928092214796,
            "samples_per_second": 820639.9238993684,
            "samples_per_second_per_gpu": 102579.99048742105,
            "loss_sequences_lower_95": 3.026908673296918,
            "loss_sequences_upper_95": 3.435490344819568,
            "loss_tokens_lower_95": 2.8944273321414635,
            "loss_tokens_upper_95": 3.193722999989159,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.591691732406616,
            "data_time": 0.00494176484644413,
            "batch_time": 0.04035382345318794,
            "samples_per_second": 886202.736168279,
            "samples_per_second_per_gpu": 110775.34202103487,
            "loss_sequences_lower_95": 3.633126462531447,
            "loss_sequences_upper_95": 3.7878665192445244,
            "loss_tokens_lower_95": 3.4460928717667887,
            "loss_tokens_upper_95": 3.5900870123917885,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7693843638024678,
            "data_time": 0.03186779930478051,
            "batch_time": 0.06748477617899577,
            "samples_per_second": 810081.0621467619,
            "samples_per_second_per_gpu": 101260.13276834524,
            "loss_sequences_lower_95": 2.6099575740535084,
            "loss_sequences_upper_95": 3.038473785214308,
            "loss_tokens_lower_95": 2.5158916728531557,
            "loss_tokens_upper_95": 2.8591831922920066,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.080908082680455,
            "data_time": 0.0019344285255520524,
            "batch_time": 0.037459195017385996,
            "samples_per_second": 896335.9938005389,
            "samples_per_second_per_gpu": 112041.99922506737,
            "loss_sequences_lower_95": 4.062525669196339,
            "loss_sequences_upper_95": 4.098679462454991,
            "loss_tokens_lower_95": 4.0628448736622325,
            "loss_tokens_upper_95": 4.0986028651042705,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.8413361348573444,
            "data_time": 0.04520546306263317,
            "batch_time": 0.08048074462197044,
            "samples_per_second": 746071.9074770231,
            "samples_per_second_per_gpu": 93258.98843462788,
            "loss_sequences_lower_95": 0.7927115745914792,
            "loss_sequences_upper_95": 0.9205420651482147,
            "loss_tokens_lower_95": 0.7108294890139217,
            "loss_tokens_upper_95": 0.8920793375192914,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.4434502129034925,
            "data_time": 0.0012105907823294028,
            "batch_time": 0.03669189950896229,
            "samples_per_second": 899822.5140918813,
            "samples_per_second_per_gpu": 112477.81426148517,
            "loss_sequences_lower_95": 4.789689250442216,
            "loss_sequences_upper_95": 4.834448579992138,
            "loss_tokens_lower_95": 3.90395521639265,
            "loss_tokens_upper_95": 3.9468551257253384,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.382092129230499,
            "data_time": 0.0060051438354310534,
            "batch_time": 0.04146701097488403,
            "samples_per_second": 884243.127310612,
            "samples_per_second_per_gpu": 110530.3909138265,
            "loss_sequences_lower_95": 6.400866967773438,
            "loss_sequences_upper_95": 6.684414819335937,
            "loss_tokens_lower_95": 6.066383809077122,
            "loss_tokens_upper_95": 6.331256334816681,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.238495978065159,
            "data_time": 0.023731975232140493,
            "batch_time": 0.05946493148803711,
            "samples_per_second": 824942.5905296468,
            "samples_per_second_per_gpu": 103117.82381620586,
            "loss_sequences_lower_95": 5.06908715289572,
            "loss_sequences_upper_95": 5.405696596891984,
            "loss_tokens_lower_95": 5.073325487219769,
            "loss_tokens_upper_95": 5.40262278681216,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.718417846434044,
            "data_time": 0.0046636559158922675,
            "batch_time": 0.04017577437033136,
            "samples_per_second": 888207.5649958733,
            "samples_per_second_per_gpu": 111025.94562448416,
            "loss_sequences_lower_95": 7.613905547170928,
            "loss_sequences_upper_95": 7.821577962239584,
            "loss_tokens_lower_95": 7.614228238192472,
            "loss_tokens_upper_95": 7.823560088186553,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.8363973379135132,
            "data_time": 0.004362602500205344,
            "batch_time": 0.03998391171719166,
            "samples_per_second": 891375.9632082559,
            "samples_per_second_per_gpu": 111421.99540103199,
            "loss_sequences_lower_95": 0.8506054850260417,
            "loss_sequences_upper_95": 0.8861184143066406,
            "loss_tokens_lower_95": 0.7901787687731342,
            "loss_tokens_upper_95": 0.838818279264831,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.057832855270022,
            "data_time": 0.023416691592761447,
            "batch_time": 0.058753481933048794,
            "samples_per_second": 799914.299210892,
            "samples_per_second_per_gpu": 99989.2874013615,
            "loss_sequences_lower_95": 5.691496146065848,
            "loss_sequences_upper_95": 6.425909365699405,
            "loss_tokens_lower_95": 5.691775512695313,
            "loss_tokens_upper_95": 6.427430100213914,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.2096133902668953,
            "data_time": 0.14767856895923615,
            "batch_time": 0.186521515250206,
            "samples_per_second": 492028.41072395165,
            "samples_per_second_per_gpu": 61503.551340493956,
            "loss_sequences_lower_95": 1.9792989492416382,
            "loss_sequences_upper_95": 2.9264571607112884,
            "loss_tokens_lower_95": 1.7022637939453125,
            "loss_tokens_upper_95": 2.1977031338091972,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.553303086280823,
            "data_time": 0.005873270451076447,
            "batch_time": 0.04155240314347403,
            "samples_per_second": 880785.3105077529,
            "samples_per_second_per_gpu": 110098.16381346912,
            "loss_sequences_lower_95": 7.484214636230468,
            "loss_sequences_upper_95": 7.820668090820312,
            "loss_tokens_lower_95": 7.271739111575983,
            "loss_tokens_upper_95": 7.568919101463357,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.750390594482422,
            "data_time": 0.005974174018890139,
            "batch_time": 0.04134610106074621,
            "samples_per_second": 886502.7899755185,
            "samples_per_second_per_gpu": 110812.84874693981,
            "loss_sequences_lower_95": 6.837738891601562,
            "loss_sequences_upper_95": 7.048505920410156,
            "loss_tokens_lower_95": 6.534071468530922,
            "loss_tokens_upper_95": 6.708480433051922,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.055715684798034,
            "data_time": 0.003643701865920255,
            "batch_time": 0.03910721408882269,
            "samples_per_second": 891772.1320039941,
            "samples_per_second_per_gpu": 111471.51650049926,
            "loss_sequences_lower_95": 5.01116735884858,
            "loss_sequences_upper_95": 5.099894511280066,
            "loss_tokens_lower_95": 5.011862574445295,
            "loss_tokens_upper_95": 5.09982051612785,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.826600901725288,
            "data_time": 0.00832272152165992,
            "batch_time": 0.04376865657794872,
            "samples_per_second": 871853.1437980642,
            "samples_per_second_per_gpu": 108981.64297475803,
            "loss_sequences_lower_95": 4.713238862912227,
            "loss_sequences_upper_95": 4.9399438907840105,
            "loss_tokens_lower_95": 4.70914810110347,
            "loss_tokens_upper_95": 4.938657970985143,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.032277168750763,
            "data_time": 0.006073589362795391,
            "batch_time": 0.04144268187265548,
            "samples_per_second": 886569.4947077609,
            "samples_per_second_per_gpu": 110821.18683847011,
            "loss_sequences_lower_95": 2.9553443420410157,
            "loss_sequences_upper_95": 3.110456671142578,
            "loss_tokens_lower_95": 2.9549406005859375,
            "loss_tokens_upper_95": 3.1126785522460936,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8836804104279885,
            "data_time": 0.0016831541331819415,
            "batch_time": 0.037172157101910595,
            "samples_per_second": 898128.0012244685,
            "samples_per_second_per_gpu": 112266.00015305856,
            "loss_sequences_lower_95": 3.3312862168874173,
            "loss_sequences_upper_95": 3.4227351972711686,
            "loss_tokens_lower_95": 2.313927552501013,
            "loss_tokens_upper_95": 2.3752624217516956,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.1659938577395765,
            "data_time": 0.01899402652468,
            "batch_time": 0.054640744413648336,
            "samples_per_second": 818222.7742827075,
            "samples_per_second_per_gpu": 102277.84678533844,
            "loss_sequences_lower_95": 4.972576539907883,
            "loss_sequences_upper_95": 5.358799242617479,
            "loss_tokens_lower_95": 4.972081824914733,
            "loss_tokens_upper_95": 5.359059655488427,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.355536162619497,
            "data_time": 0.01061226986348629,
            "batch_time": 0.04634890705347061,
            "samples_per_second": 873658.492826567,
            "samples_per_second_per_gpu": 109207.31160332088,
            "loss_sequences_lower_95": 5.216232395546109,
            "loss_sequences_upper_95": 5.490886098824296,
            "loss_tokens_lower_95": 5.220551901424633,
            "loss_tokens_upper_95": 5.487564541685815,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.937075137890447,
            "data_time": 0.0019662485148913265,
            "batch_time": 0.037392736464984776,
            "samples_per_second": 898161.3400530722,
            "samples_per_second_per_gpu": 112270.16750663402,
            "loss_sequences_lower_95": 3.2027539172597237,
            "loss_sequences_upper_95": 3.288984869214268,
            "loss_tokens_lower_95": 2.4913784106115044,
            "loss_tokens_upper_95": 2.5572806946442976,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.1383281864186445,
            "data_time": 0.027261696755886078,
            "batch_time": 0.06324847042560577,
            "samples_per_second": 824396.1706426814,
            "samples_per_second_per_gpu": 103049.52133033518,
            "loss_sequences_lower_95": 4.960682331065022,
            "loss_sequences_upper_95": 5.308998568095858,
            "loss_tokens_lower_95": 4.959362389297081,
            "loss_tokens_upper_95": 5.307292700066138,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.511383901969373,
            "data_time": 0.0033797935397342593,
            "batch_time": 0.038942729742681294,
            "samples_per_second": 891435.7895027426,
            "samples_per_second_per_gpu": 111429.47368784282,
            "loss_sequences_lower_95": 4.485631502950593,
            "loss_sequences_upper_95": 4.5369292634269875,
            "loss_tokens_lower_95": 4.485433286147744,
            "loss_tokens_upper_95": 4.53792586487003,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.364027702692643,
            "data_time": 0.024564261869950726,
            "batch_time": 0.05946256247433749,
            "samples_per_second": 804206.274337842,
            "samples_per_second_per_gpu": 100525.78429223025,
            "loss_sequences_lower_95": 5.150302642525978,
            "loss_sequences_upper_95": 5.572029706343864,
            "loss_tokens_lower_95": 5.1507249109953355,
            "loss_tokens_upper_95": 5.575781575915882,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.1004198014736177,
            "data_time": 0.0790865421295166,
            "batch_time": 0.1156008169054985,
            "samples_per_second": 641891.2003687496,
            "samples_per_second_per_gpu": 80236.4000460937,
            "loss_sequences_lower_95": 1.8569999122619627,
            "loss_sequences_upper_95": 2.532890853881836,
            "loss_tokens_lower_95": 1.6833179447386,
            "loss_tokens_upper_95": 2.435174740685357,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.2468236764272054,
            "data_time": 0.08135389536619186,
            "batch_time": 0.11849325150251389,
            "samples_per_second": 649431.4841269028,
            "samples_per_second_per_gpu": 81178.93551586285,
            "loss_sequences_lower_95": 2.0465394051869708,
            "loss_sequences_upper_95": 2.690056997934977,
            "loss_tokens_lower_95": 1.721828319249528,
            "loss_tokens_upper_95": 2.5409049387728233,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.278167811970999,
            "data_time": 0.003250395290426397,
            "batch_time": 0.038763131023225116,
            "samples_per_second": 893556.6408748177,
            "samples_per_second_per_gpu": 111694.58010935222,
            "loss_sequences_lower_95": 4.248704019583027,
            "loss_sequences_upper_95": 4.306578917468244,
            "loss_tokens_lower_95": 4.249056840539857,
            "loss_tokens_upper_95": 4.307454005200663,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.6209235067497718,
            "data_time": 0.0011352524568919802,
            "batch_time": 0.03661094632007883,
            "samples_per_second": 900297.245518674,
            "samples_per_second_per_gpu": 112537.15568983425,
            "loss_sequences_lower_95": 0.7262903490675007,
            "loss_sequences_upper_95": 0.7427248865229433,
            "loss_tokens_lower_95": 0.5077526014959154,
            "loss_tokens_upper_95": 0.5166477600501291,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.6033670723907592,
            "data_time": 0.04038914665579796,
            "batch_time": 0.07739121839404106,
            "samples_per_second": 791765.7925003809,
            "samples_per_second_per_gpu": 98970.72406254761,
            "loss_sequences_lower_95": 1.5238515087938684,
            "loss_sequences_upper_95": 1.7507302832415723,
            "loss_tokens_lower_95": 1.4264019554998388,
            "loss_tokens_upper_95": 1.5479280371069761,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6668982336650022,
            "data_time": 0.11774243627275739,
            "batch_time": 0.1548517317998977,
            "samples_per_second": 512586.6707769385,
            "samples_per_second_per_gpu": 64073.33384711731,
            "loss_sequences_lower_95": 3.272856480366475,
            "loss_sequences_upper_95": 4.09248050998997,
            "loss_tokens_lower_95": 3.0918527438316814,
            "loss_tokens_upper_95": 4.177630586977358,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.4708573596506584,
            "data_time": 0.03149802911849249,
            "batch_time": 0.06712167603628975,
            "samples_per_second": 810237.5367156349,
            "samples_per_second_per_gpu": 101279.69208945436,
            "loss_sequences_lower_95": 1.4173320328317038,
            "loss_sequences_upper_95": 1.605942726135254,
            "loss_tokens_lower_95": 1.3206388928130257,
            "loss_tokens_upper_95": 1.4173147446974874,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.5295530463137277,
            "data_time": 0.03221470401400611,
            "batch_time": 0.06795031967617217,
            "samples_per_second": 812275.5043362823,
            "samples_per_second_per_gpu": 101534.43804203528,
            "loss_sequences_lower_95": 1.5016481725181021,
            "loss_sequences_upper_95": 1.6749905795585815,
            "loss_tokens_lower_95": 1.374508071099522,
            "loss_tokens_upper_95": 1.4566788136672482,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.456468295760271,
            "data_time": 0.031314997445969356,
            "batch_time": 0.06681039787474133,
            "samples_per_second": 811371.658845442,
            "samples_per_second_per_gpu": 101421.45735568025,
            "loss_sequences_lower_95": 1.3344246445632564,
            "loss_sequences_upper_95": 1.5413358292928556,
            "loss_tokens_lower_95": 1.366139888927949,
            "loss_tokens_upper_95": 1.4952808803494733,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.598510947169327,
            "data_time": 0.032273828983306885,
            "batch_time": 0.06794632616497222,
            "samples_per_second": 807555.0237572491,
            "samples_per_second_per_gpu": 100944.37796965614,
            "loss_sequences_lower_95": 1.5570515632629396,
            "loss_sequences_upper_95": 1.7174826877873117,
            "loss_tokens_lower_95": 1.4454371086904936,
            "loss_tokens_upper_95": 1.5243777670221537,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.3404138762017954,
            "data_time": 0.03084784378240138,
            "batch_time": 0.06709016693962945,
            "samples_per_second": 813351.8892804526,
            "samples_per_second_per_gpu": 101668.98616005658,
            "loss_sequences_lower_95": 1.2916189608366593,
            "loss_sequences_upper_95": 1.400902443642942,
            "loss_tokens_lower_95": 1.2769795256059886,
            "loss_tokens_upper_95": 1.3376193040042776,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.263330513747727,
            "data_time": 0.03163018964585804,
            "batch_time": 0.06789678619021461,
            "samples_per_second": 801522.3204424958,
            "samples_per_second_per_gpu": 100190.29005531197,
            "loss_sequences_lower_95": 1.2510244276465439,
            "loss_sequences_upper_95": 1.373045656157703,
            "loss_tokens_lower_95": 1.131842865854807,
            "loss_tokens_upper_95": 1.1848567119688065,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-0.5/params.txt",
    "uuid": "36ad2294-092f-43fb-b840-7e9eefde7dc0",
    "creation_date": "2023_12_14-07_45_51"
}