{
    "name": "c4_original-d=1024_l=24_h=8-0.25",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 2058081280,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.25
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "411616256",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=1024_l=24_h=8-0.25",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 4.257023584842682,
            "data_time": 0.047936711460351944,
            "batch_time": 0.44700104370713234,
            "samples_per_second": 692492.2461386034,
            "samples_per_second_per_gpu": 86561.53076732543,
            "loss_sequences_lower_95": 4.123423239390056,
            "loss_sequences_upper_95": 4.39102222442627,
            "loss_tokens_lower_95": 4.241916694641113,
            "loss_tokens_upper_95": 4.272441368103028,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5279492850157737,
            "data_time": 0.0010804531925008361,
            "batch_time": 0.03680761222869871,
            "samples_per_second": 896250.3886677318,
            "samples_per_second_per_gpu": 112031.29858346647,
            "loss_sequences_lower_95": 3.525283367304321,
            "loss_sequences_upper_95": 3.53053282244497,
            "loss_tokens_lower_95": 3.517132166666667,
            "loss_tokens_upper_95": 3.5387815,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7754573058108893,
            "data_time": 0.010119199752807617,
            "batch_time": 0.045585487365722654,
            "samples_per_second": 870201.0100468767,
            "samples_per_second_per_gpu": 108775.12625585959,
            "loss_sequences_lower_95": 3.753834795270647,
            "loss_sequences_upper_95": 3.798000824597417,
            "loss_tokens_lower_95": 3.761095958333333,
            "loss_tokens_upper_95": 3.7901823541666664,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.546348397952994,
            "data_time": 0.001632013407192732,
            "batch_time": 0.037040336057543755,
            "samples_per_second": 903345.6746535522,
            "samples_per_second_per_gpu": 112918.20933169403,
            "loss_sequences_lower_95": 3.533664032297036,
            "loss_sequences_upper_95": 3.559621063547036,
            "loss_tokens_lower_95": 3.5351819791666665,
            "loss_tokens_upper_95": 3.55725465625,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5510682991711526,
            "data_time": 0.009896410413947239,
            "batch_time": 0.04517003169572686,
            "samples_per_second": 867677.0885834118,
            "samples_per_second_per_gpu": 108459.63607292647,
            "loss_sequences_lower_95": 3.5172360795092925,
            "loss_sequences_upper_95": 3.5859067710985233,
            "loss_tokens_lower_95": 3.54001328125,
            "loss_tokens_upper_95": 3.5618597083333334,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.038709125415068,
            "data_time": 0.003986197321311287,
            "batch_time": 0.03944798202618309,
            "samples_per_second": 899219.9455380803,
            "samples_per_second_per_gpu": 112402.49319226004,
            "loss_sequences_lower_95": 3.9995757164556425,
            "loss_sequences_upper_95": 4.08005245787039,
            "loss_tokens_lower_95": 4.026157697916666,
            "loss_tokens_upper_95": 4.051190989583334,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.059366468020848,
            "data_time": 0.0017110277544615904,
            "batch_time": 0.03702482148445839,
            "samples_per_second": 908374.339911552,
            "samples_per_second_per_gpu": 113546.792488944,
            "loss_sequences_lower_95": 4.025307338169643,
            "loss_sequences_upper_95": 4.09218727080676,
            "loss_tokens_lower_95": 4.0438529375000005,
            "loss_tokens_upper_95": 4.074719635416667,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.13615149937375,
            "data_time": 0.00167596385051196,
            "batch_time": 0.03702491349826942,
            "samples_per_second": 907432.3259934548,
            "samples_per_second_per_gpu": 113429.04074918185,
            "loss_sequences_lower_95": 4.123258303337696,
            "loss_sequences_upper_95": 4.150391351030759,
            "loss_tokens_lower_95": 4.124354416666667,
            "loss_tokens_upper_95": 4.147776958333333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.898109313918323,
            "data_time": 0.009986077982281882,
            "batch_time": 0.04544120459329514,
            "samples_per_second": 863821.4843536122,
            "samples_per_second_per_gpu": 107977.68554420152,
            "loss_sequences_lower_95": 3.8489131958504035,
            "loss_sequences_upper_95": 3.953708828561674,
            "loss_tokens_lower_95": 3.8866575625,
            "loss_tokens_upper_95": 3.9096029479166665,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.948811221970871,
            "data_time": 0.009765351191163063,
            "batch_time": 0.04543852899223566,
            "samples_per_second": 871867.7646473459,
            "samples_per_second_per_gpu": 108983.47058091823,
            "loss_sequences_lower_95": 4.904565489998919,
            "loss_sequences_upper_95": 5.0056764866523595,
            "loss_tokens_lower_95": 4.935912083333333,
            "loss_tokens_upper_95": 4.962198604166667,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.9600663248832966,
            "data_time": 0.0012994982112300138,
            "batch_time": 0.036667056265319006,
            "samples_per_second": 907992.09091594,
            "samples_per_second_per_gpu": 113499.0113644925,
            "loss_sequences_lower_95": 3.9524426709499623,
            "loss_sequences_upper_95": 3.967681582810145,
            "loss_tokens_lower_95": 3.9482094895833333,
            "loss_tokens_upper_95": 3.9717807708333335,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7267401644410016,
            "data_time": 0.002647197018257287,
            "batch_time": 0.038105734778284335,
            "samples_per_second": 903089.2785863294,
            "samples_per_second_per_gpu": 112886.15982329118,
            "loss_sequences_lower_95": 3.7171002154050394,
            "loss_sequences_upper_95": 3.7363152824506716,
            "loss_tokens_lower_95": 3.7152909895833335,
            "loss_tokens_upper_95": 3.738237510416667,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.273225416276325,
            "data_time": 0.010047544132579456,
            "batch_time": 0.045314449566626266,
            "samples_per_second": 868569.8708233084,
            "samples_per_second_per_gpu": 108571.23385291354,
            "loss_sequences_lower_95": 4.226771554695423,
            "loss_sequences_upper_95": 4.3283383135379685,
            "loss_tokens_lower_95": 4.2600739999999995,
            "loss_tokens_upper_95": 4.2861818750000005,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5588763367134297,
            "data_time": 0.010270155758496775,
            "batch_time": 0.04582400625920391,
            "samples_per_second": 868195.6611796875,
            "samples_per_second_per_gpu": 108524.45764746093,
            "loss_sequences_lower_95": 3.4983474700368347,
            "loss_sequences_upper_95": 3.6200271202444787,
            "loss_tokens_lower_95": 3.5467411145833334,
            "loss_tokens_upper_95": 3.5707693541666665,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.9792116988788955,
            "data_time": 0.08725240400859288,
            "batch_time": 0.12253422396523612,
            "samples_per_second": 509985.7881131996,
            "samples_per_second_per_gpu": 63748.22351414995,
            "loss_sequences_lower_95": 4.910740817676891,
            "loss_sequences_upper_95": 5.048819134452126,
            "loss_tokens_lower_95": 4.949672118100253,
            "loss_tokens_upper_95": 5.009783718802712,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.247638743403354,
            "data_time": 0.013840247284282337,
            "batch_time": 0.049618047746745025,
            "samples_per_second": 848072.6089737464,
            "samples_per_second_per_gpu": 106009.0761217183,
            "loss_sequences_lower_95": 4.154344946858487,
            "loss_sequences_upper_95": 4.341408189601175,
            "loss_tokens_lower_95": 4.233462583333333,
            "loss_tokens_upper_95": 4.261276791666666,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.957983749208475,
            "data_time": 0.012893613427877426,
            "batch_time": 0.04846286401152611,
            "samples_per_second": 866308.9678183245,
            "samples_per_second_per_gpu": 108288.62097729056,
            "loss_sequences_lower_95": 5.89305605120898,
            "loss_sequences_upper_95": 6.026991114956093,
            "loss_tokens_lower_95": 5.945942729166666,
            "loss_tokens_upper_95": 5.970229375000001,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.234313316032535,
            "data_time": 0.03674755245447159,
            "batch_time": 0.07238789275288582,
            "samples_per_second": 769714.7612047084,
            "samples_per_second_per_gpu": 96214.34515058855,
            "loss_sequences_lower_95": 4.128578029695104,
            "loss_sequences_upper_95": 4.409061044161437,
            "loss_tokens_lower_95": 4.219839777711962,
            "loss_tokens_upper_95": 4.248824110187468,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.261399821633373,
            "data_time": 0.0015749705272274843,
            "batch_time": 0.03703302257017557,
            "samples_per_second": 900497.9426399345,
            "samples_per_second_per_gpu": 112562.24282999181,
            "loss_sequences_lower_95": 5.238413915619214,
            "loss_sequences_upper_95": 5.285109612568545,
            "loss_tokens_lower_95": 5.238265518733977,
            "loss_tokens_upper_95": 5.284658274061743,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1993190623260404,
            "data_time": 0.0017773572616516405,
            "batch_time": 0.03724328846119012,
            "samples_per_second": 899313.3007858071,
            "samples_per_second_per_gpu": 112414.16259822588,
            "loss_sequences_lower_95": 3.2027131798399844,
            "loss_sequences_upper_95": 3.228603439771709,
            "loss_tokens_lower_95": 3.176918972163164,
            "loss_tokens_upper_95": 3.196295135680748,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.1605435233100385,
            "data_time": 0.003303005378500231,
            "batch_time": 0.04031730080190186,
            "samples_per_second": 899349.4893142373,
            "samples_per_second_per_gpu": 112418.68616427966,
            "loss_sequences_lower_95": 5.390652101108586,
            "loss_sequences_upper_95": 5.68792938535147,
            "loss_tokens_lower_95": 4.657629473061115,
            "loss_tokens_upper_95": 4.872436587424797,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.306119144350291,
            "data_time": 0.0037619076827739148,
            "batch_time": 0.03921359190915493,
            "samples_per_second": 892563.1059133657,
            "samples_per_second_per_gpu": 111570.38823917072,
            "loss_sequences_lower_95": 5.425455875651042,
            "loss_sequences_upper_95": 5.634169921874999,
            "loss_tokens_lower_95": 4.985642958922956,
            "loss_tokens_upper_95": 5.129903535279088,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.721751626645913,
            "data_time": 0.004675115395455339,
            "batch_time": 0.04013378407980342,
            "samples_per_second": 891848.1931351925,
            "samples_per_second_per_gpu": 111481.02414189906,
            "loss_sequences_lower_95": 3.768042140263694,
            "loss_sequences_upper_95": 3.839125034427715,
            "loss_tokens_lower_95": 3.6165127634667504,
            "loss_tokens_upper_95": 3.6517617448395603,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.091900946877219,
            "data_time": 0.023545735648700168,
            "batch_time": 0.05938692390918732,
            "samples_per_second": 830353.3096466291,
            "samples_per_second_per_gpu": 103794.16370582864,
            "loss_sequences_lower_95": 3.9933134807239883,
            "loss_sequences_upper_95": 4.257614468661221,
            "loss_tokens_lower_95": 3.981117369369142,
            "loss_tokens_upper_95": 4.063129180628338,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.9876783283389345,
            "data_time": 0.020067384466528893,
            "batch_time": 0.05527993477880955,
            "samples_per_second": 824106.4811282562,
            "samples_per_second_per_gpu": 103013.31014103202,
            "loss_sequences_lower_95": 3.970378050512197,
            "loss_sequences_upper_95": 4.189204176299426,
            "loss_tokens_lower_95": 3.851802772723426,
            "loss_tokens_upper_95": 3.953451133526574,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.670402990976969,
            "data_time": 0.017050850085723095,
            "batch_time": 0.052706557970780596,
            "samples_per_second": 830618.4995167151,
            "samples_per_second_per_gpu": 103827.31243958938,
            "loss_sequences_lower_95": 4.615616841634115,
            "loss_sequences_upper_95": 4.715830841064453,
            "loss_tokens_lower_95": 4.557358938625039,
            "loss_tokens_upper_95": 4.781342752312847,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.387297905868247,
            "data_time": 0.001362420198380296,
            "batch_time": 0.03677458083133343,
            "samples_per_second": 902591.8085761226,
            "samples_per_second_per_gpu": 112823.97607201533,
            "loss_sequences_lower_95": 6.400663952930466,
            "loss_sequences_upper_95": 6.479427800981743,
            "loss_tokens_lower_95": 6.244042534805175,
            "loss_tokens_upper_95": 6.326214653560848,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.784666905148262,
            "data_time": 0.002950955957374317,
            "batch_time": 0.038322056299888045,
            "samples_per_second": 898844.4198728952,
            "samples_per_second_per_gpu": 112355.5524841119,
            "loss_sequences_lower_95": 5.327220405552925,
            "loss_sequences_upper_95": 5.62682033757168,
            "loss_tokens_lower_95": 4.050338603755109,
            "loss_tokens_upper_95": 4.190118058955637,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.38840780831848,
            "data_time": 0.005029633641242981,
            "batch_time": 0.04059422499424702,
            "samples_per_second": 882564.1282781415,
            "samples_per_second_per_gpu": 110320.51603476769,
            "loss_sequences_lower_95": 4.79976134837284,
            "loss_sequences_upper_95": 5.138416027290422,
            "loss_tokens_lower_95": 3.9793875119116047,
            "loss_tokens_upper_95": 4.145818002657107,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.54024484059582,
            "data_time": 0.024461948445865085,
            "batch_time": 0.061107022421700616,
            "samples_per_second": 808684.4362740916,
            "samples_per_second_per_gpu": 101085.55453426145,
            "loss_sequences_lower_95": 6.470272367294521,
            "loss_sequences_upper_95": 6.6081853161119435,
            "loss_tokens_lower_95": 6.472813993828482,
            "loss_tokens_upper_95": 6.606738587819278,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6555797028541566,
            "data_time": 0.04990178346633911,
            "batch_time": 0.08634672715113713,
            "samples_per_second": 746246.0656900338,
            "samples_per_second_per_gpu": 93280.75821125423,
            "loss_sequences_lower_95": 3.5160825271606444,
            "loss_sequences_upper_95": 3.906146186828613,
            "loss_tokens_lower_95": 3.3323252396421483,
            "loss_tokens_upper_95": 3.802745806808335,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.006185178014858,
            "data_time": 0.003424715898275863,
            "batch_time": 0.038946013265348405,
            "samples_per_second": 897647.6453586527,
            "samples_per_second_per_gpu": 112205.95566983159,
            "loss_sequences_lower_95": 4.951585639633445,
            "loss_sequences_upper_95": 5.061679017800025,
            "loss_tokens_lower_95": 4.948731493291006,
            "loss_tokens_upper_95": 5.062426595385268,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.029946688059214,
            "data_time": 0.0050832630176139695,
            "batch_time": 0.04071500134118035,
            "samples_per_second": 888702.1853806908,
            "samples_per_second_per_gpu": 111087.77317258636,
            "loss_sequences_lower_95": 4.968542850378788,
            "loss_sequences_upper_95": 5.091506935513283,
            "loss_tokens_lower_95": 4.966185003807074,
            "loss_tokens_upper_95": 5.0933736115376735,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.603335699267175,
            "data_time": 0.0034068346538502505,
            "batch_time": 0.038732100511472634,
            "samples_per_second": 894397.5530531673,
            "samples_per_second_per_gpu": 111799.69413164591,
            "loss_sequences_lower_95": 3.7388702292955998,
            "loss_sequences_upper_95": 3.86168637945032,
            "loss_tokens_lower_95": 3.4432776301503165,
            "loss_tokens_upper_95": 3.4985099458841273,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.765544803857804,
            "data_time": 0.010447979904711246,
            "batch_time": 0.04618121776729822,
            "samples_per_second": 858311.7062759204,
            "samples_per_second_per_gpu": 107288.96328449005,
            "loss_sequences_lower_95": 5.956177172851563,
            "loss_sequences_upper_95": 6.499923901367188,
            "loss_tokens_lower_95": 5.131310574665991,
            "loss_tokens_upper_95": 5.493446108744209,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.184752896428108,
            "data_time": 0.1561758816242218,
            "batch_time": 0.1957198530435562,
            "samples_per_second": 484246.89268840477,
            "samples_per_second_per_gpu": 60530.861586050596,
            "loss_sequences_lower_95": 3.926426297426224,
            "loss_sequences_upper_95": 4.510252511501312,
            "loss_tokens_lower_95": 3.6790865141769937,
            "loss_tokens_upper_95": 4.552518235129871,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.171708152211946,
            "data_time": 0.028029190733077677,
            "batch_time": 0.06314950800956563,
            "samples_per_second": 779124.5564613068,
            "samples_per_second_per_gpu": 97390.56955766334,
            "loss_sequences_lower_95": 5.582886803287199,
            "loss_sequences_upper_95": 6.344727860374012,
            "loss_tokens_lower_95": 3.8758262935324477,
            "loss_tokens_upper_95": 4.333999774851148,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.127903774296055,
            "data_time": 0.0031732266975773703,
            "batch_time": 0.038469030211369194,
            "samples_per_second": 896950.6558777023,
            "samples_per_second_per_gpu": 112118.83198471279,
            "loss_sequences_lower_95": 2.1068321084705386,
            "loss_sequences_upper_95": 2.148559154345511,
            "loss_tokens_lower_95": 2.106048256010403,
            "loss_tokens_upper_95": 2.14889056271161,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8239796984880536,
            "data_time": 0.0025807516862590228,
            "batch_time": 0.03803305307654017,
            "samples_per_second": 899762.487059715,
            "samples_per_second_per_gpu": 112470.31088246437,
            "loss_sequences_lower_95": 3.7935110046635456,
            "loss_sequences_upper_95": 3.9774278276155877,
            "loss_tokens_lower_95": 3.618670687673738,
            "loss_tokens_upper_95": 3.7972113408705113,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.416563629230737,
            "data_time": 0.018659656246503193,
            "batch_time": 0.05378125939104292,
            "samples_per_second": 824182.8402539614,
            "samples_per_second_per_gpu": 103022.85503174517,
            "loss_sequences_lower_95": 3.289272741520361,
            "loss_sequences_upper_95": 3.6780261141039947,
            "loss_tokens_lower_95": 3.1500831117021275,
            "loss_tokens_upper_95": 3.4557847156224706,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.7495242118270924,
            "data_time": 0.004832957684993744,
            "batch_time": 0.04031011797487736,
            "samples_per_second": 885986.0746077839,
            "samples_per_second_per_gpu": 110748.25932597299,
            "loss_sequences_lower_95": 3.7767733627355464,
            "loss_sequences_upper_95": 3.9208016483635064,
            "loss_tokens_lower_95": 3.612298474205224,
            "loss_tokens_upper_95": 3.7596308285137208,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.232577490370448,
            "data_time": 0.031406493414015996,
            "batch_time": 0.06746640659513928,
            "samples_per_second": 795891.8054850929,
            "samples_per_second_per_gpu": 99486.47568563661,
            "loss_sequences_lower_95": 3.0613042040569027,
            "loss_sequences_upper_95": 3.547316779159918,
            "loss_tokens_lower_95": 2.9570044363497914,
            "loss_tokens_upper_95": 3.3356252300019755,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.6926723008013695,
            "data_time": 0.001863408317291589,
            "batch_time": 0.03735557566439682,
            "samples_per_second": 897271.8829441142,
            "samples_per_second_per_gpu": 112158.98536801427,
            "loss_sequences_lower_95": 4.680075575662007,
            "loss_sequences_upper_95": 4.704983252509877,
            "loss_tokens_lower_95": 4.680279835263928,
            "loss_tokens_upper_95": 4.704872771429286,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.2341895167110035,
            "data_time": 0.047217880595814096,
            "batch_time": 0.08242503946477717,
            "samples_per_second": 741694.9212474243,
            "samples_per_second_per_gpu": 92711.86515592804,
            "loss_sequences_lower_95": 1.1790333238620203,
            "loss_sequences_upper_95": 1.3375754532304784,
            "loss_tokens_lower_95": 1.0597567695922228,
            "loss_tokens_upper_95": 1.2970194509109918,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.474691574316105,
            "data_time": 0.0012304804725798858,
            "batch_time": 0.036688218060303215,
            "samples_per_second": 900459.3671228889,
            "samples_per_second_per_gpu": 112557.42089036111,
            "loss_sequences_lower_95": 5.92075671309945,
            "loss_sequences_upper_95": 5.974695996298481,
            "loss_tokens_lower_95": 4.796225120889749,
            "loss_tokens_upper_95": 4.8494987185686655,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.994897452354431,
            "data_time": 0.005948957942780994,
            "batch_time": 0.04166405115808759,
            "samples_per_second": 878444.1335481617,
            "samples_per_second_per_gpu": 109805.51669352021,
            "loss_sequences_lower_95": 6.9211989135742185,
            "loss_sequences_upper_95": 7.1583407104492185,
            "loss_tokens_lower_95": 6.8331660035407,
            "loss_tokens_upper_95": 7.061714010688563,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.171193697141565,
            "data_time": 0.02398051245737884,
            "batch_time": 0.05961599188335871,
            "samples_per_second": 827839.8696385473,
            "samples_per_second_per_gpu": 103479.98370481841,
            "loss_sequences_lower_95": 4.971912496815557,
            "loss_sequences_upper_95": 5.365026112432065,
            "loss_tokens_lower_95": 4.975086630116338,
            "loss_tokens_upper_95": 5.366787215523098,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.694927223884698,
            "data_time": 0.004595536424453,
            "batch_time": 0.04002071468226881,
            "samples_per_second": 889885.1644190527,
            "samples_per_second_per_gpu": 111235.64555238158,
            "loss_sequences_lower_95": 6.616002012310606,
            "loss_sequences_upper_95": 6.773269375887784,
            "loss_tokens_lower_95": 6.6140937573982015,
            "loss_tokens_upper_95": 6.774723806670218,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 1.0117280772924424,
            "data_time": 0.004302019768572868,
            "batch_time": 0.039736411990003384,
            "samples_per_second": 893796.9480162624,
            "samples_per_second_per_gpu": 111724.6185020328,
            "loss_sequences_lower_95": 1.0717284749348959,
            "loss_sequences_upper_95": 1.1585462483723958,
            "loss_tokens_lower_95": 0.9164186905230842,
            "loss_tokens_upper_95": 0.9730304739083133,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.532125621750241,
            "data_time": 0.024756680641855513,
            "batch_time": 0.059833843793187826,
            "samples_per_second": 803710.8908480091,
            "samples_per_second_per_gpu": 100463.86135600114,
            "loss_sequences_lower_95": 6.18991199311756,
            "loss_sequences_upper_95": 6.872346467517671,
            "loss_tokens_lower_95": 6.189230375744048,
            "loss_tokens_upper_95": 6.872881469726562,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.745794452726841,
            "data_time": 0.1535397320985794,
            "batch_time": 0.19357427954673767,
            "samples_per_second": 471291.5046753386,
            "samples_per_second_per_gpu": 58911.43808441733,
            "loss_sequences_lower_95": 2.4919939994812013,
            "loss_sequences_upper_95": 3.8278069913387296,
            "loss_tokens_lower_95": 2.1274747994019814,
            "loss_tokens_upper_95": 2.703284597495167,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.524245326519012,
            "data_time": 0.005702059420328292,
            "batch_time": 0.041056048775476126,
            "samples_per_second": 886851.9097568705,
            "samples_per_second_per_gpu": 110856.48871960881,
            "loss_sequences_lower_95": 7.4571251342773435,
            "loss_sequences_upper_95": 7.796858898925781,
            "loss_tokens_lower_95": 7.225268430758248,
            "loss_tokens_upper_95": 7.524780769154505,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.36123841381073,
            "data_time": 0.005771262305123466,
            "batch_time": 0.041153824518597316,
            "samples_per_second": 885853.868883384,
            "samples_per_second_per_gpu": 110731.733610423,
            "loss_sequences_lower_95": 7.46444521484375,
            "loss_sequences_upper_95": 7.688500561523438,
            "loss_tokens_lower_95": 7.10329432136486,
            "loss_tokens_upper_95": 7.29276923197296,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.828794105717131,
            "data_time": 0.0035881748964953983,
            "batch_time": 0.03904569428102627,
            "samples_per_second": 891790.6530708796,
            "samples_per_second_per_gpu": 111473.83163385995,
            "loss_sequences_lower_95": 4.7979175696787,
            "loss_sequences_upper_95": 4.859762040233196,
            "loss_tokens_lower_95": 4.798213836741535,
            "loss_tokens_upper_95": 4.859177633215827,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.414126634597778,
            "data_time": 0.008626516491985033,
            "batch_time": 0.04393839548001477,
            "samples_per_second": 873422.2957495068,
            "samples_per_second_per_gpu": 109177.78696868835,
            "loss_sequences_lower_95": 5.2922719554051465,
            "loss_sequences_upper_95": 5.534071821221558,
            "loss_tokens_lower_95": 5.288231746831797,
            "loss_tokens_upper_95": 5.533486155613959,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.35901778459549,
            "data_time": 0.005933394034703572,
            "batch_time": 0.041434417168299355,
            "samples_per_second": 884304.3970255284,
            "samples_per_second_per_gpu": 110538.04962819105,
            "loss_sequences_lower_95": 7.308433544921876,
            "loss_sequences_upper_95": 7.409909582519531,
            "loss_tokens_lower_95": 7.3093833740234375,
            "loss_tokens_upper_95": 7.409346166992187,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.0952487298194935,
            "data_time": 0.00182550295990221,
            "batch_time": 0.037360145329076985,
            "samples_per_second": 896010.1838260187,
            "samples_per_second_per_gpu": 112001.27297825234,
            "loss_sequences_lower_95": 4.781128433213103,
            "loss_sequences_upper_95": 4.899185933065279,
            "loss_tokens_lower_95": 3.2658869302003795,
            "loss_tokens_upper_95": 3.3401563183173013,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.9443831070145565,
            "data_time": 0.018594765663146974,
            "batch_time": 0.05383737257548741,
            "samples_per_second": 827260.6091423837,
            "samples_per_second_per_gpu": 103407.57614279796,
            "loss_sequences_lower_95": 5.740198960944787,
            "loss_sequences_upper_95": 6.144931907084451,
            "loss_tokens_lower_95": 5.740081456881851,
            "loss_tokens_upper_95": 6.146301463112902,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 6.026083580652872,
            "data_time": 0.010908188298344612,
            "batch_time": 0.046389385126531124,
            "samples_per_second": 876318.589634811,
            "samples_per_second_per_gpu": 109539.82370435138,
            "loss_sequences_lower_95": 5.868962079216452,
            "loss_sequences_upper_95": 6.179108922621783,
            "loss_tokens_lower_95": 5.8742388437308515,
            "loss_tokens_upper_95": 6.174765505323222,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.5141039608296305,
            "data_time": 0.0019746891526348383,
            "batch_time": 0.037374057242738574,
            "samples_per_second": 899116.0958877524,
            "samples_per_second_per_gpu": 112389.51198596905,
            "loss_sequences_lower_95": 4.978986046031411,
            "loss_sequences_upper_95": 5.090454532776682,
            "loss_tokens_lower_95": 3.783449870197419,
            "loss_tokens_upper_95": 3.8677080922686695,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.598432361764252,
            "data_time": 0.02772608647743861,
            "batch_time": 0.06434474140405655,
            "samples_per_second": 810846.3359080445,
            "samples_per_second_per_gpu": 101355.79198850556,
            "loss_sequences_lower_95": 5.455321748925265,
            "loss_sequences_upper_95": 5.736827596028646,
            "loss_tokens_lower_95": 5.454209520450975,
            "loss_tokens_upper_95": 5.73689948187934,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.430593711739286,
            "data_time": 0.003578217911632943,
            "batch_time": 0.03908069448156671,
            "samples_per_second": 892943.600277693,
            "samples_per_second_per_gpu": 111617.95003471163,
            "loss_sequences_lower_95": 4.377210868812117,
            "loss_sequences_upper_95": 4.484175797675364,
            "loss_tokens_lower_95": 4.379102383768157,
            "loss_tokens_upper_95": 4.481858387984996,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.876382745585396,
            "data_time": 0.023275693980130283,
            "batch_time": 0.05855868296189742,
            "samples_per_second": 792727.5347783605,
            "samples_per_second_per_gpu": 99090.94184729506,
            "loss_sequences_lower_95": 5.666937907691141,
            "loss_sequences_upper_95": 6.08547135140132,
            "loss_tokens_lower_95": 5.66587388121966,
            "loss_tokens_upper_95": 6.0865934057143125,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.776269495487213,
            "data_time": 0.0807972326874733,
            "batch_time": 0.11728626489639282,
            "samples_per_second": 640819.6072246205,
            "samples_per_second_per_gpu": 80102.45090307757,
            "loss_sequences_lower_95": 4.473513488769531,
            "loss_sequences_upper_95": 5.26208391825358,
            "loss_tokens_lower_95": 3.9639285087585447,
            "loss_tokens_upper_95": 5.076168409983317,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 4.221232843399048,
            "data_time": 0.08627066016197205,
            "batch_time": 0.12228818982839584,
            "samples_per_second": 646781.7399881572,
            "samples_per_second_per_gpu": 80847.71749851965,
            "loss_sequences_lower_95": 4.023354708353679,
            "loss_sequences_upper_95": 4.8742278289794925,
            "loss_tokens_lower_95": 3.271969613064541,
            "loss_tokens_upper_95": 4.532775527439759,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6378006042542266,
            "data_time": 0.0032336052338329045,
            "batch_time": 0.03876102328510969,
            "samples_per_second": 893453.4731366956,
            "samples_per_second_per_gpu": 111681.68414208695,
            "loss_sequences_lower_95": 3.624575001150589,
            "loss_sequences_upper_95": 3.650660574776786,
            "loss_tokens_lower_95": 3.6248808421161636,
            "loss_tokens_upper_95": 3.650693754890004,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 0.9655687096616281,
            "data_time": 0.0011235184853395175,
            "batch_time": 0.03656066976960431,
            "samples_per_second": 901184.3605898585,
            "samples_per_second_per_gpu": 112648.04507373231,
            "loss_sequences_lower_95": 1.1345686896819223,
            "loss_sequences_upper_95": 1.1571651892566506,
            "loss_tokens_lower_95": 0.7819264441949287,
            "loss_tokens_upper_95": 0.794307574121636,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.135280470209797,
            "data_time": 0.04113325849175453,
            "batch_time": 0.0775754339993,
            "samples_per_second": 789416.6204433425,
            "samples_per_second_per_gpu": 98677.07755541781,
            "loss_sequences_lower_95": 5.1552655557947835,
            "loss_sequences_upper_95": 5.556471204382228,
            "loss_tokens_lower_95": 4.793589962252057,
            "loss_tokens_upper_95": 5.0085972522875215,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 7.967787845714672,
            "data_time": 0.11638219015938896,
            "batch_time": 0.15403308187212264,
            "samples_per_second": 486355.99290695495,
            "samples_per_second_per_gpu": 60794.49911336937,
            "loss_sequences_lower_95": 7.532235470333615,
            "loss_sequences_upper_95": 8.638565269676414,
            "loss_tokens_lower_95": 7.261026867525078,
            "loss_tokens_upper_95": 8.408470662434896,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.022130527147433,
            "data_time": 0.029961111999693372,
            "batch_time": 0.06560114451817103,
            "samples_per_second": 809858.9239238217,
            "samples_per_second_per_gpu": 101232.3654904777,
            "loss_sequences_lower_95": 4.980390223061166,
            "loss_sequences_upper_95": 5.3298514947658635,
            "loss_tokens_lower_95": 4.654962744428091,
            "loss_tokens_upper_95": 4.839650244428091,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.113277118380477,
            "data_time": 0.031168037936801,
            "batch_time": 0.06705606267565772,
            "samples_per_second": 805232.1881215901,
            "samples_per_second_per_gpu": 100654.02351519876,
            "loss_sequences_lower_95": 5.081193607609446,
            "loss_sequences_upper_95": 5.401561234637005,
            "loss_tokens_lower_95": 4.777277141726281,
            "loss_tokens_upper_95": 4.930877573168318,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.234203049322454,
            "data_time": 0.03128517241705032,
            "batch_time": 0.0670289709454491,
            "samples_per_second": 808671.5347243113,
            "samples_per_second_per_gpu": 101083.94184053892,
            "loss_sequences_lower_95": 5.184944162136171,
            "loss_sequences_upper_95": 5.599496329702982,
            "loss_tokens_lower_95": 4.833684232250036,
            "loss_tokens_upper_95": 5.073416567643941,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.19596831100743,
            "data_time": 0.03277328184672764,
            "batch_time": 0.06849912518546694,
            "samples_per_second": 805861.2292135174,
            "samples_per_second_per_gpu": 100732.65365168967,
            "loss_sequences_lower_95": 5.15960012296351,
            "loss_sequences_upper_95": 5.455567950737185,
            "loss_tokens_lower_95": 4.885591453480943,
            "loss_tokens_upper_95": 5.026246410144082,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.224278559595902,
            "data_time": 0.033255915582915886,
            "batch_time": 0.06931292274851858,
            "samples_per_second": 817333.3710298714,
            "samples_per_second_per_gpu": 102166.67137873392,
            "loss_sequences_lower_95": 5.17926669861219,
            "loss_sequences_upper_95": 5.445711958038141,
            "loss_tokens_lower_95": 5.000753761150921,
            "loss_tokens_upper_95": 5.1198037392146825,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 5.322478910771812,
            "data_time": 0.0314620193980989,
            "batch_time": 0.06747325545265562,
            "samples_per_second": 804851.5937044288,
            "samples_per_second_per_gpu": 100606.4492130536,
            "loss_sequences_lower_95": 5.330874382577291,
            "loss_sequences_upper_95": 5.622877939735971,
            "loss_tokens_lower_95": 4.992364799954792,
            "loss_tokens_upper_95": 5.127122148602321,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/params.txt",
    "uuid": "df104fdb-2b70-4495-ad02-be1d8e58c351",
    "creation_date": "2023_12_14-04_59_48"
}