{
    "name": "rw_original-d=1024_l=24_h=8-2.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 16464650240,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 2.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "3292930048",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=1024_l=24_h=8-2.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.0336712698141732,
            "data_time": 0.04575300216674805,
            "batch_time": 0.42537275701761246,
            "samples_per_second": 691049.1941417519,
            "samples_per_second_per_gpu": 86381.14926771898,
            "loss_sequences_lower_95": 2.972948112487793,
            "loss_sequences_upper_95": 3.0948647435506182,
            "loss_tokens_lower_95": 3.0207778422037763,
            "loss_tokens_upper_95": 3.0465834617614744,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.047896150256647,
            "data_time": 0.0010497637800033538,
            "batch_time": 0.03675580058936901,
            "samples_per_second": 897549.4205943698,
            "samples_per_second_per_gpu": 112193.67757429622,
            "loss_sequences_lower_95": 3.045476049718232,
            "loss_sequences_upper_95": 3.050351185197767,
            "loss_tokens_lower_95": 3.0375991250000003,
            "loss_tokens_upper_95": 3.058148307291667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7237196679017983,
            "data_time": 0.009442041397094727,
            "batch_time": 0.044947259902954104,
            "samples_per_second": 868044.5373519193,
            "samples_per_second_per_gpu": 108505.56716898992,
            "loss_sequences_lower_95": 2.674774755361129,
            "loss_sequences_upper_95": 2.7849409640565215,
            "loss_tokens_lower_95": 2.7118300104166666,
            "loss_tokens_upper_95": 2.7358063229166665,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1172040552453897,
            "data_time": 0.0015683431962603017,
            "batch_time": 0.03683884383032197,
            "samples_per_second": 906926.9029445056,
            "samples_per_second_per_gpu": 113365.8628680632,
            "loss_sequences_lower_95": 3.0822855388208765,
            "loss_sequences_upper_95": 3.1535065943137885,
            "loss_tokens_lower_95": 3.1055834322916667,
            "loss_tokens_upper_95": 3.1289063802083334,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1126984680742935,
            "data_time": 0.009828816372084902,
            "batch_time": 0.04508755406535479,
            "samples_per_second": 868939.5663880341,
            "samples_per_second_per_gpu": 108617.44579850427,
            "loss_sequences_lower_95": 3.059505510815778,
            "loss_sequences_upper_95": 3.179828508614037,
            "loss_tokens_lower_95": 3.10203,
            "loss_tokens_upper_95": 3.1231396614583335,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1880079354662865,
            "data_time": 0.003831635674704676,
            "batch_time": 0.039424501683401024,
            "samples_per_second": 896386.7446852872,
            "samples_per_second_per_gpu": 112048.3430856609,
            "loss_sequences_lower_95": 3.144333962467072,
            "loss_sequences_upper_95": 3.23609927013405,
            "loss_tokens_lower_95": 3.1762612291666668,
            "loss_tokens_upper_95": 3.1997133489583334,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7612978046159355,
            "data_time": 0.0016027377830244004,
            "batch_time": 0.036944142465295836,
            "samples_per_second": 907832.0895918345,
            "samples_per_second_per_gpu": 113479.01119897931,
            "loss_sequences_lower_95": 2.7332968401227675,
            "loss_sequences_upper_95": 2.788782395966199,
            "loss_tokens_lower_95": 2.7472486354166668,
            "loss_tokens_upper_95": 2.7758093541666664,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.594647717301134,
            "data_time": 0.0018131688068921173,
            "batch_time": 0.037180223708270434,
            "samples_per_second": 906785.4621646613,
            "samples_per_second_per_gpu": 113348.18277058266,
            "loss_sequences_lower_95": 3.5730017895124346,
            "loss_sequences_upper_95": 3.618123496809555,
            "loss_tokens_lower_95": 3.5834422916666666,
            "loss_tokens_upper_95": 3.6058551145833335,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1994429831582356,
            "data_time": 0.01089193896641807,
            "batch_time": 0.050145681888338116,
            "samples_per_second": 854051.1724389995,
            "samples_per_second_per_gpu": 106756.39655487494,
            "loss_sequences_lower_95": 3.120314050689945,
            "loss_sequences_upper_95": 3.2949536269273216,
            "loss_tokens_lower_95": 3.1882099739583336,
            "loss_tokens_upper_95": 3.210814822916667,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.247157636838468,
            "data_time": 0.009977688081562519,
            "batch_time": 0.04563077539205551,
            "samples_per_second": 872615.0808694838,
            "samples_per_second_per_gpu": 109076.88510868547,
            "loss_sequences_lower_95": 4.146139405744349,
            "loss_sequences_upper_95": 4.371761469784461,
            "loss_tokens_lower_95": 4.234012885416667,
            "loss_tokens_upper_95": 4.260387927083333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2494639703733097,
            "data_time": 0.001347374412973602,
            "batch_time": 0.036707771350403755,
            "samples_per_second": 907796.6144279279,
            "samples_per_second_per_gpu": 113474.57680349099,
            "loss_sequences_lower_95": 3.237671932279533,
            "loss_sequences_upper_95": 3.2616986620143553,
            "loss_tokens_lower_95": 3.2387172604166667,
            "loss_tokens_upper_95": 3.2606011770833336,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0852636719981317,
            "data_time": 0.002629314632241077,
            "batch_time": 0.03814425436682149,
            "samples_per_second": 900514.1324553164,
            "samples_per_second_per_gpu": 112564.26655691455,
            "loss_sequences_lower_95": 3.0618318773021005,
            "loss_sequences_upper_95": 3.1100706787617787,
            "loss_tokens_lower_95": 3.07427340625,
            "loss_tokens_upper_95": 3.0961153125000003,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6272593511770994,
            "data_time": 0.009947816373802456,
            "batch_time": 0.04522682838289163,
            "samples_per_second": 864207.5149790238,
            "samples_per_second_per_gpu": 108025.93937237798,
            "loss_sequences_lower_95": 3.5487219510649086,
            "loss_sequences_upper_95": 3.7233175281578825,
            "loss_tokens_lower_95": 3.6147612291666666,
            "loss_tokens_upper_95": 3.6396779583333334,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8462968625264837,
            "data_time": 0.00970101926431238,
            "batch_time": 0.04496504871018854,
            "samples_per_second": 869703.8653636402,
            "samples_per_second_per_gpu": 108712.98317045503,
            "loss_sequences_lower_95": 2.7657234766818357,
            "loss_sequences_upper_95": 2.9385767484148264,
            "loss_tokens_lower_95": 2.835071161458333,
            "loss_tokens_upper_95": 2.85736453125,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.679426605051214,
            "data_time": 0.0819058929170881,
            "batch_time": 0.11627886976514544,
            "samples_per_second": 524721.6952912367,
            "samples_per_second_per_gpu": 65590.21191140459,
            "loss_sequences_lower_95": 3.603927638313987,
            "loss_sequences_upper_95": 3.7693971113725144,
            "loss_tokens_lower_95": 3.6585550915111194,
            "loss_tokens_upper_95": 3.7008785681291063,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1045867117778903,
            "data_time": 0.013768727129155939,
            "batch_time": 0.049344971776008606,
            "samples_per_second": 848148.1657175526,
            "samples_per_second_per_gpu": 106018.52071469408,
            "loss_sequences_lower_95": 3.0445639029188687,
            "loss_sequences_upper_95": 3.163486687574025,
            "loss_tokens_lower_95": 3.092270510416667,
            "loss_tokens_upper_95": 3.11664965625,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.159437222342378,
            "data_time": 0.012961900482575098,
            "batch_time": 0.048598130544026695,
            "samples_per_second": 861537.2998767028,
            "samples_per_second_per_gpu": 107692.16248458785,
            "loss_sequences_lower_95": 5.081367536962504,
            "loss_sequences_upper_95": 5.259140280368775,
            "loss_tokens_lower_95": 5.147659625,
            "loss_tokens_upper_95": 5.170933458333334,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3400734272159514,
            "data_time": 0.03799763694405556,
            "batch_time": 0.08213307335972786,
            "samples_per_second": 770128.5429694238,
            "samples_per_second_per_gpu": 96266.06787117798,
            "loss_sequences_lower_95": 3.1991236764876567,
            "loss_sequences_upper_95": 3.586816662647685,
            "loss_tokens_lower_95": 3.3269495604468178,
            "loss_tokens_upper_95": 3.353477659381804,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.9557638379977718,
            "data_time": 0.0015565170853447535,
            "batch_time": 0.03704494631643447,
            "samples_per_second": 899373.3574624577,
            "samples_per_second_per_gpu": 112421.66968280722,
            "loss_sequences_lower_95": 1.9469333663351462,
            "loss_sequences_upper_95": 1.9647740163994625,
            "loss_tokens_lower_95": 1.9467439304575114,
            "loss_tokens_upper_95": 1.9646515461984582,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7093580240772153,
            "data_time": 0.0017964837087947093,
            "batch_time": 0.037264291410613215,
            "samples_per_second": 899044.8973992981,
            "samples_per_second_per_gpu": 112380.61217491227,
            "loss_sequences_lower_95": 2.707317732321126,
            "loss_sequences_upper_95": 2.7317369440926234,
            "loss_tokens_lower_95": 2.687782657491336,
            "loss_tokens_upper_95": 2.7058022121002394,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6615526658792117,
            "data_time": 0.0032631692517826176,
            "batch_time": 0.03882257990068585,
            "samples_per_second": 894540.2261308427,
            "samples_per_second_per_gpu": 111817.52826635534,
            "loss_sequences_lower_95": 3.9306094912464573,
            "loss_sequences_upper_95": 4.224670404390057,
            "loss_tokens_lower_95": 3.0859702149600907,
            "loss_tokens_upper_95": 3.295885901806722,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.486611946936697,
            "data_time": 0.00412190594571702,
            "batch_time": 0.039585483993621585,
            "samples_per_second": 891840.2580189618,
            "samples_per_second_per_gpu": 111480.03225237022,
            "loss_sequences_lower_95": 3.5489694986979163,
            "loss_sequences_upper_95": 3.7452302408854163,
            "loss_tokens_lower_95": 3.2876788092079403,
            "loss_tokens_upper_95": 3.4269834721403303,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.556290614415155,
            "data_time": 0.004753169251063651,
            "batch_time": 0.04033371873570783,
            "samples_per_second": 888633.7532145627,
            "samples_per_second_per_gpu": 111079.21915182033,
            "loss_sequences_lower_95": 2.597515952433484,
            "loss_sequences_upper_95": 2.6512899981564515,
            "loss_tokens_lower_95": 2.469495260270464,
            "loss_tokens_upper_95": 2.498608481148969,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.128776123306968,
            "data_time": 0.02459296371255602,
            "batch_time": 0.060710012912750244,
            "samples_per_second": 831460.410324177,
            "samples_per_second_per_gpu": 103932.55129052213,
            "loss_sequences_lower_95": 2.1087850085171786,
            "loss_sequences_upper_95": 2.2052257537841795,
            "loss_tokens_lower_95": 2.066144350561826,
            "loss_tokens_upper_95": 2.1107957089904765,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.963799242097504,
            "data_time": 0.021491577848792076,
            "batch_time": 0.056792570278048515,
            "samples_per_second": 819784.6856402862,
            "samples_per_second_per_gpu": 102473.08570503577,
            "loss_sequences_lower_95": 2.9504300175880895,
            "loss_sequences_upper_95": 3.1222562206034756,
            "loss_tokens_lower_95": 2.851660506675271,
            "loss_tokens_upper_95": 2.9379987940081245,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.075262421766917,
            "data_time": 0.0165632168451945,
            "batch_time": 0.05192448237003424,
            "samples_per_second": 832481.2423869343,
            "samples_per_second_per_gpu": 104060.1552983668,
            "loss_sequences_lower_95": 3.0563463745117185,
            "loss_sequences_upper_95": 3.149438293457031,
            "loss_tokens_lower_95": 2.9384701043919965,
            "loss_tokens_upper_95": 3.134496481840042,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.740363858841985,
            "data_time": 0.0014546437240493964,
            "batch_time": 0.03692435110756854,
            "samples_per_second": 900181.9653161787,
            "samples_per_second_per_gpu": 112522.74566452234,
            "loss_sequences_lower_95": 4.7442469843388615,
            "loss_sequences_upper_95": 4.821872635604793,
            "loss_tokens_lower_95": 4.613691397851068,
            "loss_tokens_upper_95": 4.692501433417745,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.729375371351989,
            "data_time": 0.0031377471133366528,
            "batch_time": 0.03865868753234812,
            "samples_per_second": 895906.884890385,
            "samples_per_second_per_gpu": 111988.36061129812,
            "loss_sequences_lower_95": 4.167441248010706,
            "loss_sequences_upper_95": 4.450204005385889,
            "loss_tokens_lower_95": 3.1242871470715543,
            "loss_tokens_upper_95": 3.249961743541028,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.493007759897375,
            "data_time": 0.0052777275040343,
            "batch_time": 0.04066387422986933,
            "samples_per_second": 886080.5015327968,
            "samples_per_second_per_gpu": 110760.0626915996,
            "loss_sequences_lower_95": 3.821998955528077,
            "loss_sequences_upper_95": 4.134639961320793,
            "loss_tokens_lower_95": 3.1416851721951637,
            "loss_tokens_upper_95": 3.286378103319619,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.224951345626622,
            "data_time": 0.024232657892363414,
            "batch_time": 0.06016041338443756,
            "samples_per_second": 822056.8406872888,
            "samples_per_second_per_gpu": 102757.1050859111,
            "loss_sequences_lower_95": 6.147340511513627,
            "loss_sequences_upper_95": 6.301896994952197,
            "loss_tokens_lower_95": 6.1466221221505775,
            "loss_tokens_upper_95": 6.3028446842002,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.046404769420624,
            "data_time": 0.050927354739262506,
            "batch_time": 0.08691022946284367,
            "samples_per_second": 742752.2400733054,
            "samples_per_second_per_gpu": 92844.03000916318,
            "loss_sequences_lower_95": 2.920039337158203,
            "loss_sequences_upper_95": 3.2763591690063474,
            "loss_tokens_lower_95": 2.755831652591821,
            "loss_tokens_upper_95": 3.1822956228512127,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.119532427865871,
            "data_time": 0.0034678577157861126,
            "batch_time": 0.03894514101414593,
            "samples_per_second": 897096.063395222,
            "samples_per_second_per_gpu": 112137.00792440274,
            "loss_sequences_lower_95": 4.083656646822224,
            "loss_sequences_upper_95": 4.1550050952173425,
            "loss_tokens_lower_95": 4.084540925665301,
            "loss_tokens_upper_95": 4.154825731822143,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7776993853844747,
            "data_time": 0.004773191682277263,
            "batch_time": 0.040309305097695085,
            "samples_per_second": 888737.4625426433,
            "samples_per_second_per_gpu": 111092.18281783041,
            "loss_sequences_lower_95": 2.737411796451103,
            "loss_sequences_upper_95": 2.818864717100801,
            "loss_tokens_lower_95": 2.736258951822917,
            "loss_tokens_upper_95": 2.8173427522426544,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0827787027525044,
            "data_time": 0.0033714740374175827,
            "batch_time": 0.038774696333619484,
            "samples_per_second": 891736.1534555894,
            "samples_per_second_per_gpu": 111467.01918194868,
            "loss_sequences_lower_95": 3.21054075155995,
            "loss_sequences_upper_95": 3.3393097687597764,
            "loss_tokens_lower_95": 2.9386344249073817,
            "loss_tokens_upper_95": 2.9955518481376253,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.969910739421844,
            "data_time": 0.01150465290993452,
            "batch_time": 0.046989950351417065,
            "samples_per_second": 859514.8657470581,
            "samples_per_second_per_gpu": 107439.35821838226,
            "loss_sequences_lower_95": 5.135218981933594,
            "loss_sequences_upper_95": 5.6773607055664055,
            "loss_tokens_lower_95": 4.429351172456672,
            "loss_tokens_upper_95": 4.785455754480167,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4513992071151733,
            "data_time": 0.1532658189535141,
            "batch_time": 0.1938709169626236,
            "samples_per_second": 456074.5065078542,
            "samples_per_second_per_gpu": 57009.313313481776,
            "loss_sequences_lower_95": 3.260980725288391,
            "loss_sequences_upper_95": 3.6619844913482664,
            "loss_tokens_lower_95": 3.005671648047436,
            "loss_tokens_upper_95": 3.810701813094917,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.9824543985827217,
            "data_time": 0.02752273640734084,
            "batch_time": 0.06212856414470267,
            "samples_per_second": 785793.7700871315,
            "samples_per_second_per_gpu": 98224.22126089144,
            "loss_sequences_lower_95": 4.274101625639816,
            "loss_sequences_upper_95": 4.831454327462733,
            "loss_tokens_lower_95": 3.0653620946770186,
            "loss_tokens_upper_95": 3.41996615473286,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.3792530673724066,
            "data_time": 0.0029708780348300934,
            "batch_time": 0.03834357547263304,
            "samples_per_second": 894970.7420291494,
            "samples_per_second_per_gpu": 111871.34275364368,
            "loss_sequences_lower_95": 2.3569704334930646,
            "loss_sequences_upper_95": 2.4010865911014636,
            "loss_tokens_lower_95": 2.3566699560056246,
            "loss_tokens_upper_95": 2.4023432167089886,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.253896490779904,
            "data_time": 0.0028458680577365105,
            "batch_time": 0.0383272722427078,
            "samples_per_second": 898382.8871485122,
            "samples_per_second_per_gpu": 112297.86089356402,
            "loss_sequences_lower_95": 2.228584934784652,
            "loss_sequences_upper_95": 2.3564601893011354,
            "loss_tokens_lower_95": 2.12388454969895,
            "loss_tokens_upper_95": 2.248891371929685,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.989476646040822,
            "data_time": 0.01708183354801602,
            "batch_time": 0.05203996764289008,
            "samples_per_second": 826866.6649790796,
            "samples_per_second_per_gpu": 103358.33312238495,
            "loss_sequences_lower_95": 2.8578800634586767,
            "loss_sequences_upper_95": 3.2744700840541294,
            "loss_tokens_lower_95": 2.729285656049448,
            "loss_tokens_upper_95": 3.0140750401555993,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4191484403309147,
            "data_time": 0.004672738909721375,
            "batch_time": 0.04005663581192494,
            "samples_per_second": 887126.9167120276,
            "samples_per_second_per_gpu": 110890.86458900345,
            "loss_sequences_lower_95": 3.4645019916633784,
            "loss_sequences_upper_95": 3.6184640066964286,
            "loss_tokens_lower_95": 3.273496013313687,
            "loss_tokens_upper_95": 3.4138987136099552,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.541853857476537,
            "data_time": 0.02972050224031721,
            "batch_time": 0.06555438893181938,
            "samples_per_second": 805636.8272774934,
            "samples_per_second_per_gpu": 100704.60340968668,
            "loss_sequences_lower_95": 2.402298299277701,
            "loss_sequences_upper_95": 2.8148790917745448,
            "loss_tokens_lower_95": 2.304712153025591,
            "loss_tokens_upper_95": 2.6471380386228076,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.082704832325413,
            "data_time": 0.0017404686294933628,
            "batch_time": 0.037196403639439435,
            "samples_per_second": 898395.9495241384,
            "samples_per_second_per_gpu": 112299.4936905173,
            "loss_sequences_lower_95": 5.070339360840919,
            "loss_sequences_upper_95": 5.0950820632876574,
            "loss_tokens_lower_95": 5.070568664514153,
            "loss_tokens_upper_95": 5.094795404393379,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.2680535709973677,
            "data_time": 0.046244998411698775,
            "batch_time": 0.08219043124805797,
            "samples_per_second": 733312.9500810265,
            "samples_per_second_per_gpu": 91664.11876012832,
            "loss_sequences_lower_95": 1.2127411240512884,
            "loss_sequences_upper_95": 1.3975591492884367,
            "loss_tokens_lower_95": 1.0757055307972028,
            "loss_tokens_upper_95": 1.3340698679491587,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.1534953037647835,
            "data_time": 0.0012299555095423198,
            "batch_time": 0.036731214555291096,
            "samples_per_second": 899255.6944565272,
            "samples_per_second_per_gpu": 112406.9618070659,
            "loss_sequences_lower_95": 4.475070846845519,
            "loss_sequences_upper_95": 4.514278848516117,
            "loss_tokens_lower_95": 3.654803662959381,
            "loss_tokens_upper_95": 3.6947213914410058,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.651291000366211,
            "data_time": 0.0055890073851933555,
            "batch_time": 0.04112717935017177,
            "samples_per_second": 884606.2989968244,
            "samples_per_second_per_gpu": 110575.78737460305,
            "loss_sequences_lower_95": 4.624014318847657,
            "loss_sequences_upper_95": 4.758361303710937,
            "loss_tokens_lower_95": 4.546198062055902,
            "loss_tokens_upper_95": 4.686497645072023,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.944238917723946,
            "data_time": 0.023611434435440323,
            "batch_time": 0.05908892720432605,
            "samples_per_second": 828567.253753328,
            "samples_per_second_per_gpu": 103570.906719166,
            "loss_sequences_lower_95": 3.815399740467901,
            "loss_sequences_upper_95": 4.074277164625085,
            "loss_tokens_lower_95": 3.8151905292013417,
            "loss_tokens_upper_95": 4.069904479980469,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.683675492532325,
            "data_time": 0.004777627177985318,
            "batch_time": 0.04036230387457882,
            "samples_per_second": 887373.1757451651,
            "samples_per_second_per_gpu": 110921.64696814564,
            "loss_sequences_lower_95": 6.603984005089962,
            "loss_sequences_upper_95": 6.761907681551847,
            "loss_tokens_lower_95": 6.603766738429214,
            "loss_tokens_upper_95": 6.76408251213305,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.4566192847092947,
            "data_time": 0.004198492524471689,
            "batch_time": 0.039661405568427226,
            "samples_per_second": 893488.5221996229,
            "samples_per_second_per_gpu": 111686.06527495287,
            "loss_sequences_lower_95": 1.491543436686198,
            "loss_sequences_upper_95": 1.5365419108072915,
            "loss_tokens_lower_95": 1.3767969609718886,
            "loss_tokens_upper_95": 1.454536924144658,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.757201183409918,
            "data_time": 0.023189038038253784,
            "batch_time": 0.058117374777793884,
            "samples_per_second": 804815.4887765022,
            "samples_per_second_per_gpu": 100601.93609706277,
            "loss_sequences_lower_95": 5.4293900553385415,
            "loss_sequences_upper_95": 6.08889660063244,
            "loss_tokens_lower_95": 5.436775367373512,
            "loss_tokens_upper_95": 6.089516863141742,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.9553234465420246,
            "data_time": 0.14789463579654694,
            "batch_time": 0.1854598969221115,
            "samples_per_second": 491640.20661741553,
            "samples_per_second_per_gpu": 61455.02582717694,
            "loss_sequences_lower_95": 1.7850428164005279,
            "loss_sequences_upper_95": 2.564848208427429,
            "loss_tokens_lower_95": 1.5315415750090609,
            "loss_tokens_upper_95": 1.9644688745872263,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.619393096923828,
            "data_time": 0.00555721398383852,
            "batch_time": 0.0409512406303769,
            "samples_per_second": 885878.6923361613,
            "samples_per_second_per_gpu": 110734.83654202016,
            "loss_sequences_lower_95": 7.562984252929687,
            "loss_sequences_upper_95": 7.869786303710938,
            "loss_tokens_lower_95": 7.340183873830108,
            "loss_tokens_upper_95": 7.617030853426396,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.042688012599945,
            "data_time": 0.005973154590243385,
            "batch_time": 0.04141006346732851,
            "samples_per_second": 885924.3350191535,
            "samples_per_second_per_gpu": 110740.5418773942,
            "loss_sequences_lower_95": 7.167147790527344,
            "loss_sequences_upper_95": 7.4025088378906245,
            "loss_tokens_lower_95": 6.764617116103843,
            "loss_tokens_upper_95": 6.96981806938924,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.961845440309878,
            "data_time": 0.003531909866077844,
            "batch_time": 0.039048911416809697,
            "samples_per_second": 891564.3890137642,
            "samples_per_second_per_gpu": 111445.54862672053,
            "loss_sequences_lower_95": 5.941462525928176,
            "loss_sequences_upper_95": 5.982643181151525,
            "loss_tokens_lower_95": 5.941567188023801,
            "loss_tokens_upper_95": 5.982026846793287,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.020561002732788,
            "data_time": 0.008306061033035694,
            "batch_time": 0.04363574938471584,
            "samples_per_second": 874068.3847157357,
            "samples_per_second_per_gpu": 109258.54808946696,
            "loss_sequences_lower_95": 2.940281149868592,
            "loss_sequences_upper_95": 3.100931447112615,
            "loss_tokens_lower_95": 2.9395084410402266,
            "loss_tokens_upper_95": 3.1014071228683635,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.71284103345871,
            "data_time": 0.0055704254006582596,
            "batch_time": 0.040998243623309664,
            "samples_per_second": 885664.7157312618,
            "samples_per_second_per_gpu": 110708.08946640772,
            "loss_sequences_lower_95": 6.6262700073242184,
            "loss_sequences_upper_95": 6.79863671875,
            "loss_tokens_lower_95": 6.627370849609376,
            "loss_tokens_upper_95": 6.798594836425781,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.585934130561273,
            "data_time": 0.0017129287377197035,
            "batch_time": 0.0371928897832427,
            "samples_per_second": 898209.9423551253,
            "samples_per_second_per_gpu": 112276.24279439067,
            "loss_sequences_lower_95": 3.0235034479955063,
            "loss_sequences_upper_95": 3.096424141327164,
            "loss_tokens_lower_95": 2.054322506485145,
            "loss_tokens_upper_95": 2.106038382992791,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.283343212818032,
            "data_time": 0.018533815656389507,
            "batch_time": 0.053787413665226526,
            "samples_per_second": 827516.8545382565,
            "samples_per_second_per_gpu": 103439.60681728207,
            "loss_sequences_lower_95": 3.1761910851322,
            "loss_sequences_upper_95": 3.3928188096231486,
            "loss_tokens_lower_95": 3.173255191632171,
            "loss_tokens_upper_95": 3.3922279073231256,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0465719938278197,
            "data_time": 0.010731472633779049,
            "batch_time": 0.04645534045994282,
            "samples_per_second": 871013.7358275054,
            "samples_per_second_per_gpu": 108876.71697843817,
            "loss_sequences_lower_95": 2.976195062375536,
            "loss_sequences_upper_95": 3.117626564175475,
            "loss_tokens_lower_95": 2.976271493949142,
            "loss_tokens_upper_95": 3.1174312636431525,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5896206021712085,
            "data_time": 0.001980285580142136,
            "batch_time": 0.03746351387298653,
            "samples_per_second": 896913.2814519211,
            "samples_per_second_per_gpu": 112114.16018149014,
            "loss_sequences_lower_95": 4.133055093147156,
            "loss_sequences_upper_95": 4.229637613718214,
            "loss_tokens_lower_95": 2.8674775437525204,
            "loss_tokens_upper_95": 2.941513339993401,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.170979449357936,
            "data_time": 0.026178302864233654,
            "batch_time": 0.062257026632626854,
            "samples_per_second": 816381.7105549931,
            "samples_per_second_per_gpu": 102047.71381937414,
            "loss_sequences_lower_95": 6.100590539357019,
            "loss_sequences_upper_95": 6.237714285068411,
            "loss_tokens_lower_95": 6.099511347372065,
            "loss_tokens_upper_95": 6.238600199058573,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.8768027226130166,
            "data_time": 0.003124017709631914,
            "batch_time": 0.03867234458853474,
            "samples_per_second": 892158.3586394306,
            "samples_per_second_per_gpu": 111519.79482992883,
            "loss_sequences_lower_95": 2.8509013418028477,
            "loss_sequences_upper_95": 2.9028819568879016,
            "loss_tokens_lower_95": 2.8518155775157683,
            "loss_tokens_upper_95": 2.9025545695360284,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.234012328305291,
            "data_time": 0.024204137108542703,
            "batch_time": 0.05972526723688299,
            "samples_per_second": 793283.6345072228,
            "samples_per_second_per_gpu": 99160.45431340285,
            "loss_sequences_lower_95": 3.1169611736408718,
            "loss_sequences_upper_95": 3.3535703788683255,
            "loss_tokens_lower_95": 3.1149615389629477,
            "loss_tokens_upper_95": 3.354606161765682,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.6030899365743,
            "data_time": 0.08051427453756332,
            "batch_time": 0.11657564342021942,
            "samples_per_second": 645132.8906147692,
            "samples_per_second_per_gpu": 80641.61132684615,
            "loss_sequences_lower_95": 1.4644438044230144,
            "loss_sequences_upper_95": 1.8888169447580974,
            "loss_tokens_lower_95": 1.314641727341546,
            "loss_tokens_upper_95": 1.7825995339287652,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.6711729526519776,
            "data_time": 0.07442927360534668,
            "batch_time": 0.10996627062559128,
            "samples_per_second": 654746.2245038371,
            "samples_per_second_per_gpu": 81843.27806297963,
            "loss_sequences_lower_95": 1.5415645122528074,
            "loss_sequences_upper_95": 2.021084928512573,
            "loss_tokens_lower_95": 1.2757937099156755,
            "loss_tokens_upper_95": 1.8822941726513123,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.594668138220314,
            "data_time": 0.003003746990318366,
            "batch_time": 0.03863308779136653,
            "samples_per_second": 891031.961658818,
            "samples_per_second_per_gpu": 111378.99520735224,
            "loss_sequences_lower_95": 4.567290743798325,
            "loss_sequences_upper_95": 4.622320242026418,
            "loss_tokens_lower_95": 4.567184587571337,
            "loss_tokens_upper_95": 4.622306657883836,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.5730935824198231,
            "data_time": 0.0011373271745847664,
            "batch_time": 0.03665790794095273,
            "samples_per_second": 899288.7621664817,
            "samples_per_second_per_gpu": 112411.09527081021,
            "loss_sequences_lower_95": 0.6507990375553494,
            "loss_sequences_upper_95": 0.6658273079812197,
            "loss_tokens_lower_95": 0.48951670085228277,
            "loss_tokens_upper_95": 0.49810039132952577,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8916643153964063,
            "data_time": 0.03857050836086273,
            "batch_time": 0.07586570084095001,
            "samples_per_second": 785846.5244378784,
            "samples_per_second_per_gpu": 98230.8155547348,
            "loss_sequences_lower_95": 3.9459800239623064,
            "loss_sequences_upper_95": 4.291373089167077,
            "loss_tokens_lower_95": 3.6019463663047313,
            "loss_tokens_upper_95": 3.872587475627917,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.54158455616719,
            "data_time": 0.11628939991905576,
            "batch_time": 0.15248541604904903,
            "samples_per_second": 498883.5398053559,
            "samples_per_second_per_gpu": 62360.442475669486,
            "loss_sequences_lower_95": 6.13359698733768,
            "loss_sequences_upper_95": 7.183259170119826,
            "loss_tokens_lower_95": 5.507942218545043,
            "loss_tokens_upper_95": 7.449746685263551,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8128447489040656,
            "data_time": 0.031129456701732817,
            "batch_time": 0.06670060895738147,
            "samples_per_second": 809470.1289732981,
            "samples_per_second_per_gpu": 101183.76612166227,
            "loss_sequences_lower_95": 3.813127322313262,
            "loss_sequences_upper_95": 4.112599470557236,
            "loss_tokens_lower_95": 3.481090423609651,
            "loss_tokens_upper_95": 3.7051669601555925,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.942364965997091,
            "data_time": 0.031947141601925806,
            "batch_time": 0.06761873903728667,
            "samples_per_second": 807093.9811782853,
            "samples_per_second_per_gpu": 100886.74764728567,
            "loss_sequences_lower_95": 3.9259150993533254,
            "loss_sequences_upper_95": 4.189935907503454,
            "loss_tokens_lower_95": 3.644436743212736,
            "loss_tokens_upper_95": 3.8371410471425325,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.894097348538841,
            "data_time": 0.03080088467825027,
            "batch_time": 0.06677536169687907,
            "samples_per_second": 801226.2390287955,
            "samples_per_second_per_gpu": 100153.27987859944,
            "loss_sequences_lower_95": 3.909917347605636,
            "loss_sequences_upper_95": 4.25635644866199,
            "loss_tokens_lower_95": 3.5114472330541977,
            "loss_tokens_upper_95": 3.803342073165253,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.109431255154494,
            "data_time": 0.03145842892783029,
            "batch_time": 0.06806526013783046,
            "samples_per_second": 796396.4116955278,
            "samples_per_second_per_gpu": 99549.55146194098,
            "loss_sequences_lower_95": 4.071225654788133,
            "loss_sequences_upper_95": 4.334979769078696,
            "loss_tokens_lower_95": 3.825577191029011,
            "loss_tokens_upper_95": 4.006883691330193,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6354764425976676,
            "data_time": 0.032175817607361594,
            "batch_time": 0.06934171547124415,
            "samples_per_second": 795203.1884419525,
            "samples_per_second_per_gpu": 99400.39855524406,
            "loss_sequences_lower_95": 3.576000123586714,
            "loss_sequences_upper_95": 3.795594863417726,
            "loss_tokens_lower_95": 3.401461147722287,
            "loss_tokens_upper_95": 3.5553350988958763,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0205204472309206,
            "data_time": 0.030953373227800642,
            "batch_time": 0.06752404996326991,
            "samples_per_second": 796556.3208486206,
            "samples_per_second_per_gpu": 99569.54010607758,
            "loss_sequences_lower_95": 3.0359140442638863,
            "loss_sequences_upper_95": 3.265111792959818,
            "loss_tokens_lower_95": 2.788001784772453,
            "loss_tokens_upper_95": 2.898739551546489,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=1024_l=24_h=8-2.0/params.txt",
    "uuid": "c71591ef-91f1-46f8-a1a5-75a5e8017276",
    "creation_date": "2023_12_14-05_16_42"
}