{
    "name": "c4_original-d=576_l=24_h=8-1.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 3073547520,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 1.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "614709504",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=576_l=24_h=8-1.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 4.149683564901352,
            "data_time": 0.03071112185716629,
            "batch_time": 0.3535274937748909,
            "samples_per_second": 852688.344466524,
            "samples_per_second_per_gpu": 106586.0430583155,
            "loss_sequences_lower_95": 4.026166458129882,
            "loss_sequences_upper_95": 4.272037353515625,
            "loss_tokens_lower_95": 4.1343465995788575,
            "loss_tokens_upper_95": 4.1649011039733885,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4326457499105776,
            "data_time": 0.0011786955360962054,
            "batch_time": 0.030285556386549988,
            "samples_per_second": 1095613.6600072142,
            "samples_per_second_per_gpu": 136951.70750090177,
            "loss_sequences_lower_95": 3.4299237435881595,
            "loss_sequences_upper_95": 3.4353092481235077,
            "loss_tokens_lower_95": 3.421942578125,
            "loss_tokens_upper_95": 3.4434462604166667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.717084680284773,
            "data_time": 0.008323954582214356,
            "batch_time": 0.03760473346710205,
            "samples_per_second": 1063010.1240025053,
            "samples_per_second_per_gpu": 132876.26550031317,
            "loss_sequences_lower_95": 3.6914187559789537,
            "loss_sequences_upper_95": 3.7454685569296076,
            "loss_tokens_lower_95": 3.7024118958333334,
            "loss_tokens_upper_95": 3.73201121875,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4547927121526185,
            "data_time": 0.0014178779368337832,
            "batch_time": 0.029573090766605577,
            "samples_per_second": 1132564.6940085283,
            "samples_per_second_per_gpu": 141570.58675106603,
            "loss_sequences_lower_95": 3.4409873751610824,
            "loss_sequences_upper_95": 3.468974267074742,
            "loss_tokens_lower_95": 3.44367409375,
            "loss_tokens_upper_95": 3.4657383854166666,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.456260763700528,
            "data_time": 0.00784852020294068,
            "batch_time": 0.036453138784583346,
            "samples_per_second": 1079579.5215521827,
            "samples_per_second_per_gpu": 134947.44019402284,
            "loss_sequences_lower_95": 3.421471689383513,
            "loss_sequences_upper_95": 3.4923842669018903,
            "loss_tokens_lower_95": 3.4452541927083336,
            "loss_tokens_upper_95": 3.4669431510416664,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.91166024917976,
            "data_time": 0.00312051922082901,
            "batch_time": 0.03169422402330067,
            "samples_per_second": 1115607.8639967614,
            "samples_per_second_per_gpu": 139450.98299959517,
            "loss_sequences_lower_95": 3.8726142100133836,
            "loss_sequences_upper_95": 3.9520045837360853,
            "loss_tokens_lower_95": 3.899207666666667,
            "loss_tokens_upper_95": 3.92387815625,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.708271761767718,
            "data_time": 0.0014673847747665829,
            "batch_time": 0.03068769882398173,
            "samples_per_second": 1097659.7452971868,
            "samples_per_second_per_gpu": 137207.46816214835,
            "loss_sequences_lower_95": 3.6747994957748724,
            "loss_sequences_upper_95": 3.741268126195791,
            "loss_tokens_lower_95": 3.6937990833333334,
            "loss_tokens_upper_95": 3.7230975520833334,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.072242903434794,
            "data_time": 0.0015021051536888718,
            "batch_time": 0.0298646524430119,
            "samples_per_second": 1127644.839599092,
            "samples_per_second_per_gpu": 140955.6049498865,
            "loss_sequences_lower_95": 4.060338187172775,
            "loss_sequences_upper_95": 4.085332501636126,
            "loss_tokens_lower_95": 4.0603454687500005,
            "loss_tokens_upper_95": 4.084106770833333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.82947242066143,
            "data_time": 0.009264762439425029,
            "batch_time": 0.04351606444706992,
            "samples_per_second": 1077270.3851526366,
            "samples_per_second_per_gpu": 134658.79814407957,
            "loss_sequences_lower_95": 3.781271660037157,
            "loss_sequences_upper_95": 3.8831273148699506,
            "loss_tokens_lower_95": 3.8179003958333335,
            "loss_tokens_upper_95": 3.8409116041666667,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.895137766133184,
            "data_time": 0.007849004119634628,
            "batch_time": 0.03741058800369501,
            "samples_per_second": 1056828.3208092307,
            "samples_per_second_per_gpu": 132103.54010115383,
            "loss_sequences_lower_95": 4.851770972451674,
            "loss_sequences_upper_95": 4.949295176630434,
            "loss_tokens_lower_95": 4.8820774375000004,
            "loss_tokens_upper_95": 4.908570041666667,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8170489782958352,
            "data_time": 0.0011436067629788274,
            "batch_time": 0.029292580786715733,
            "samples_per_second": 1136612.1802453876,
            "samples_per_second_per_gpu": 142076.52253067345,
            "loss_sequences_lower_95": 3.8097536939945353,
            "loss_sequences_upper_95": 3.824408876561001,
            "loss_tokens_lower_95": 3.8054079375,
            "loss_tokens_upper_95": 3.828466302083333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.619576950164598,
            "data_time": 0.0023115046514658806,
            "batch_time": 0.030601478833937823,
            "samples_per_second": 1128041.3067423212,
            "samples_per_second_per_gpu": 141005.16334279015,
            "loss_sequences_lower_95": 3.6108198833981935,
            "loss_sequences_upper_95": 3.6281798375839496,
            "loss_tokens_lower_95": 3.6082435208333337,
            "loss_tokens_upper_95": 3.6309896354166664,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.216370867909089,
            "data_time": 0.0088204329192874,
            "batch_time": 0.03764786739123197,
            "samples_per_second": 1070262.0450448333,
            "samples_per_second_per_gpu": 133782.75563060417,
            "loss_sequences_lower_95": 4.170185180416455,
            "loss_sequences_upper_95": 4.269444240856364,
            "loss_tokens_lower_95": 4.2032295,
            "loss_tokens_upper_95": 4.2290800625000005,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4500562455163712,
            "data_time": 0.007907897827634774,
            "batch_time": 0.03690644564381634,
            "samples_per_second": 1076706.2227696516,
            "samples_per_second_per_gpu": 134588.27784620645,
            "loss_sequences_lower_95": 3.390009458584601,
            "loss_sequences_upper_95": 3.5117944907753627,
            "loss_tokens_lower_95": 3.4380249010416666,
            "loss_tokens_upper_95": 3.46202834375,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.860542611642317,
            "data_time": 0.06686683212007795,
            "batch_time": 0.09978896379470825,
            "samples_per_second": 552327.6417589572,
            "samples_per_second_per_gpu": 69040.95521986965,
            "loss_sequences_lower_95": 4.790262629769066,
            "loss_sequences_upper_95": 4.9331200079484425,
            "loss_tokens_lower_95": 4.832482754100453,
            "loss_tokens_upper_95": 4.889662890000777,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.028380831893609,
            "data_time": 0.011582339351827448,
            "batch_time": 0.040357157588005066,
            "samples_per_second": 1064369.5162700608,
            "samples_per_second_per_gpu": 133046.1895337576,
            "loss_sequences_lower_95": 3.9447128607302298,
            "loss_sequences_upper_95": 4.11098118551271,
            "loss_tokens_lower_95": 4.01513746875,
            "loss_tokens_upper_95": 4.041747635416666,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.931061491802689,
            "data_time": 0.010337927689154943,
            "batch_time": 0.03963161259889603,
            "samples_per_second": 1062621.7211049073,
            "samples_per_second_per_gpu": 132827.71513811342,
            "loss_sequences_lower_95": 5.868002907141532,
            "loss_sequences_upper_95": 5.997341476078083,
            "loss_tokens_lower_95": 5.9188262499999995,
            "loss_tokens_upper_95": 5.94327015625,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.116726105330421,
            "data_time": 0.030202358961105347,
            "batch_time": 0.06140398606657982,
            "samples_per_second": 941245.8399841182,
            "samples_per_second_per_gpu": 117655.72999801478,
            "loss_sequences_lower_95": 4.019752433651784,
            "loss_sequences_upper_95": 4.2745055777127625,
            "loss_tokens_lower_95": 4.102723137277072,
            "loss_tokens_upper_95": 4.130904038226018,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.154675372830994,
            "data_time": 0.0016300982046779033,
            "batch_time": 0.030734163176497457,
            "samples_per_second": 1094054.53368453,
            "samples_per_second_per_gpu": 136756.81671056626,
            "loss_sequences_lower_95": 5.134528195547286,
            "loss_sequences_upper_95": 5.175496000836776,
            "loss_tokens_lower_95": 5.133996781975502,
            "loss_tokens_upper_95": 5.175396119120318,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.129341172094104,
            "data_time": 0.00182180035455971,
            "batch_time": 0.0307206916296558,
            "samples_per_second": 1101302.826280866,
            "samples_per_second_per_gpu": 137662.85328510826,
            "loss_sequences_lower_95": 3.1369283044416574,
            "loss_sequences_upper_95": 3.1628188658415906,
            "loss_tokens_lower_95": 3.104675596485232,
            "loss_tokens_upper_95": 3.1238992093232536,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.883587806487793,
            "data_time": 0.002686231120592238,
            "batch_time": 0.031865548458333066,
            "samples_per_second": 1090354.67415184,
            "samples_per_second_per_gpu": 136294.33426898,
            "loss_sequences_lower_95": 5.108422678576701,
            "loss_sequences_upper_95": 5.409739930842584,
            "loss_tokens_lower_95": 4.386750627308197,
            "loss_tokens_upper_95": 4.606782886213367,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.102477855205536,
            "data_time": 0.00357410803120187,
            "batch_time": 0.03224308202241329,
            "samples_per_second": 1100623.1460018263,
            "samples_per_second_per_gpu": 137577.89325022828,
            "loss_sequences_lower_95": 5.2197753499348964,
            "loss_sequences_upper_95": 5.4227724609375,
            "loss_tokens_lower_95": 4.789226365959119,
            "loss_tokens_upper_95": 4.9340476243121065,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.511181473822373,
            "data_time": 0.003917288456567272,
            "batch_time": 0.03314992493274165,
            "samples_per_second": 1083551.532626323,
            "samples_per_second_per_gpu": 135443.94157829037,
            "loss_sequences_lower_95": 3.5557435794162244,
            "loss_sequences_upper_95": 3.626024094958302,
            "loss_tokens_lower_95": 3.4080166728676637,
            "loss_tokens_upper_95": 3.4416239465867293,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7470070633021266,
            "data_time": 0.019786879420280457,
            "batch_time": 0.04909460885184152,
            "samples_per_second": 1026305.3916438819,
            "samples_per_second_per_gpu": 128288.17395548524,
            "loss_sequences_lower_95": 3.67262319391424,
            "loss_sequences_upper_95": 3.8931142009388315,
            "loss_tokens_lower_95": 3.6388665300942256,
            "loss_tokens_upper_95": 3.711362585659579,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9224327749135544,
            "data_time": 0.01708371192216873,
            "batch_time": 0.04641897976398468,
            "samples_per_second": 1001605.2319734221,
            "samples_per_second_per_gpu": 125200.65399667776,
            "loss_sequences_lower_95": 3.918470527493224,
            "loss_sequences_upper_95": 4.1384496260662464,
            "loss_tokens_lower_95": 3.7866794362516925,
            "loss_tokens_upper_95": 3.8888704272808057,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.690178793271382,
            "data_time": 0.013711012326754056,
            "batch_time": 0.04368128073521149,
            "samples_per_second": 999703.6444369195,
            "samples_per_second_per_gpu": 124962.95555461493,
            "loss_sequences_lower_95": 4.635913869222005,
            "loss_sequences_upper_95": 4.746653411865235,
            "loss_tokens_lower_95": 4.566772227057421,
            "loss_tokens_upper_95": 4.80154520041863,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.006831127818228,
            "data_time": 0.0014981285416578968,
            "batch_time": 0.03019836601011045,
            "samples_per_second": 1109458.2937541155,
            "samples_per_second_per_gpu": 138682.28671926443,
            "loss_sequences_lower_95": 6.0173578517912505,
            "loss_sequences_upper_95": 6.0958864521308005,
            "loss_tokens_lower_95": 5.867222579539753,
            "loss_tokens_upper_95": 5.948334277124444,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.764741860606052,
            "data_time": 0.0026919127710713637,
            "batch_time": 0.031868999436397684,
            "samples_per_second": 1090290.0118216851,
            "samples_per_second_per_gpu": 136286.25147771064,
            "loss_sequences_lower_95": 5.341938016230009,
            "loss_sequences_upper_95": 5.658617300939078,
            "loss_tokens_lower_95": 3.9885016085341447,
            "loss_tokens_upper_95": 4.130656412316928,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.339329273118907,
            "data_time": 0.004322521589897774,
            "batch_time": 0.033132786283621915,
            "samples_per_second": 1089585.6310127594,
            "samples_per_second_per_gpu": 136198.20387659493,
            "loss_sequences_lower_95": 4.800214362551327,
            "loss_sequences_upper_95": 5.157981898597483,
            "loss_tokens_lower_95": 3.9115930006403277,
            "loss_tokens_upper_95": 4.078376010280534,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.645338049762326,
            "data_time": 0.0196528754064015,
            "batch_time": 0.049301734992436,
            "samples_per_second": 1006827.1873341406,
            "samples_per_second_per_gpu": 125853.39841676758,
            "loss_sequences_lower_95": 5.549705484468642,
            "loss_sequences_upper_95": 5.741182524101919,
            "loss_tokens_lower_95": 5.550087177154681,
            "loss_tokens_upper_95": 5.7392957713506,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.555635151863098,
            "data_time": 0.041092868034656234,
            "batch_time": 0.07274097662705642,
            "samples_per_second": 871408.2412488057,
            "samples_per_second_per_gpu": 108926.0301561007,
            "loss_sequences_lower_95": 3.425025421142578,
            "loss_sequences_upper_95": 3.7948803558349606,
            "loss_tokens_lower_95": 3.237718137901456,
            "loss_tokens_upper_95": 3.6936475941448177,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.077213654864652,
            "data_time": 0.0028266948181183546,
            "batch_time": 0.03137856953470741,
            "samples_per_second": 1112156.2633661889,
            "samples_per_second_per_gpu": 139019.5329207736,
            "loss_sequences_lower_95": 5.026360765377111,
            "loss_sequences_upper_95": 5.128647465935261,
            "loss_tokens_lower_95": 5.024803725431007,
            "loss_tokens_upper_95": 5.12901506375144,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.464336037147641,
            "data_time": 0.004112839504322842,
            "batch_time": 0.03397916386216927,
            "samples_per_second": 1065265.3540397864,
            "samples_per_second_per_gpu": 133158.1692549733,
            "loss_sequences_lower_95": 5.407886631971105,
            "loss_sequences_upper_95": 5.520276738697149,
            "loss_tokens_lower_95": 5.407120968180538,
            "loss_tokens_upper_95": 5.521929616717214,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.512277449954451,
            "data_time": 0.0031383773935536327,
            "batch_time": 0.03305857794331166,
            "samples_per_second": 1059091.1408297233,
            "samples_per_second_per_gpu": 132386.39260371542,
            "loss_sequences_lower_95": 3.657627687672147,
            "loss_sequences_upper_95": 3.783737119523174,
            "loss_tokens_lower_95": 3.3371087296459234,
            "loss_tokens_upper_95": 3.392691032629102,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.6267815346717835,
            "data_time": 0.00880720280110836,
            "batch_time": 0.03848901577293873,
            "samples_per_second": 1032497.6733905331,
            "samples_per_second_per_gpu": 129062.20917381663,
            "loss_sequences_lower_95": 5.833159130859375,
            "loss_sequences_upper_95": 6.395897741699218,
            "loss_tokens_lower_95": 4.9911847380279,
            "loss_tokens_upper_95": 5.356564619870946,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.082466959953308,
            "data_time": 0.12362989783287048,
            "batch_time": 0.15690931677818298,
            "samples_per_second": 578782.1223989553,
            "samples_per_second_per_gpu": 72347.76529986941,
            "loss_sequences_lower_95": 3.845712113380432,
            "loss_sequences_upper_95": 4.3788384914398195,
            "loss_tokens_lower_95": 3.6193930439565376,
            "loss_tokens_upper_95": 4.401283465856793,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.5565200482291734,
            "data_time": 0.022836094206952033,
            "batch_time": 0.0515638234767508,
            "samples_per_second": 959663.5636414681,
            "samples_per_second_per_gpu": 119957.94545518351,
            "loss_sequences_lower_95": 6.099144701025952,
            "loss_sequences_upper_95": 7.041106283253637,
            "loss_tokens_lower_95": 3.8978770706538013,
            "loss_tokens_upper_95": 4.383817068484886,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.6118588180925295,
            "data_time": 0.002692735857433743,
            "batch_time": 0.03191994151307477,
            "samples_per_second": 1085279.635451286,
            "samples_per_second_per_gpu": 135659.95443141076,
            "loss_sequences_lower_95": 2.5829939202691543,
            "loss_sequences_upper_95": 2.6399490429772827,
            "loss_tokens_lower_95": 2.5821854937592152,
            "loss_tokens_upper_95": 2.6402811446216963,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5019954504260755,
            "data_time": 0.002256962232549014,
            "batch_time": 0.03130797614503564,
            "samples_per_second": 1096235.7007054293,
            "samples_per_second_per_gpu": 137029.46258817866,
            "loss_sequences_lower_95": 3.4713378053439743,
            "loss_sequences_upper_95": 3.6448447525258345,
            "loss_tokens_lower_95": 3.310378753517476,
            "loss_tokens_upper_95": 3.4802111340229676,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.314730481787042,
            "data_time": 0.015097434322039286,
            "batch_time": 0.04379674461152819,
            "samples_per_second": 1012408.3915705153,
            "samples_per_second_per_gpu": 126551.04894631442,
            "loss_sequences_lower_95": 3.161548483153403,
            "loss_sequences_upper_95": 3.551755845066392,
            "loss_tokens_lower_95": 3.0580660189223665,
            "loss_tokens_upper_95": 3.3549396073785123,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.69346554173949,
            "data_time": 0.003878292813897133,
            "batch_time": 0.032945746928453444,
            "samples_per_second": 1080875.954417554,
            "samples_per_second_per_gpu": 135109.49430219425,
            "loss_sequences_lower_95": 3.7304318302338197,
            "loss_sequences_upper_95": 3.881493909394732,
            "loss_tokens_lower_95": 3.547674602611784,
            "loss_tokens_upper_95": 3.6937378524752056,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.138703615200229,
            "data_time": 0.025276978810628254,
            "batch_time": 0.05620776755469186,
            "samples_per_second": 974578.6226460894,
            "samples_per_second_per_gpu": 121822.32783076118,
            "loss_sequences_lower_95": 2.977058024522735,
            "loss_sequences_upper_95": 3.4577873043897673,
            "loss_tokens_lower_95": 2.8601853935403496,
            "loss_tokens_upper_95": 3.245390835088384,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.701225444707281,
            "data_time": 0.0019823735683287234,
            "batch_time": 0.030920921017825485,
            "samples_per_second": 1096922.4715170097,
            "samples_per_second_per_gpu": 137115.3089396262,
            "loss_sequences_lower_95": 4.686897467384101,
            "loss_sequences_upper_95": 4.715641917445989,
            "loss_tokens_lower_95": 4.686752536054086,
            "loss_tokens_upper_95": 4.715666268019229,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.0594510473672627,
            "data_time": 0.038842118870128284,
            "batch_time": 0.06803930455988104,
            "samples_per_second": 913474.6991252991,
            "samples_per_second_per_gpu": 114184.33739066239,
            "loss_sequences_lower_95": 1.0078088519642654,
            "loss_sequences_upper_95": 1.1580323358183926,
            "loss_tokens_lower_95": 0.8978501678123201,
            "loss_tokens_upper_95": 1.1152286931704596,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.194628895772328,
            "data_time": 0.001376082116780117,
            "batch_time": 0.03025020788871033,
            "samples_per_second": 1099653.1170554613,
            "samples_per_second_per_gpu": 137456.63963193266,
            "loss_sequences_lower_95": 5.567223624623297,
            "loss_sequences_upper_95": 5.61635976603446,
            "loss_tokens_lower_95": 4.618660976789168,
            "loss_tokens_upper_95": 4.6682360130560925,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.846238782882691,
            "data_time": 0.0050068748375726125,
            "batch_time": 0.034369267168499175,
            "samples_per_second": 1070024.911102305,
            "samples_per_second_per_gpu": 133753.11388778812,
            "loss_sequences_lower_95": 6.788004138183594,
            "loss_sequences_upper_95": 7.045553271484375,
            "loss_tokens_lower_95": 6.657298067569364,
            "loss_tokens_upper_95": 6.898104218780211,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.536968552547952,
            "data_time": 0.01873249118610964,
            "batch_time": 0.04784898434655141,
            "samples_per_second": 1017348.654126182,
            "samples_per_second_per_gpu": 127168.58176577275,
            "loss_sequences_lower_95": 5.366696910028872,
            "loss_sequences_upper_95": 5.709785939092222,
            "loss_tokens_lower_95": 5.367906520677649,
            "loss_tokens_upper_95": 5.707375329059103,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.423712155313202,
            "data_time": 0.004301415150424084,
            "batch_time": 0.03314765210611274,
            "samples_per_second": 1088518.5074570116,
            "samples_per_second_per_gpu": 136064.81343212645,
            "loss_sequences_lower_95": 7.340991451379025,
            "loss_sequences_upper_95": 7.503075746478457,
            "loss_tokens_lower_95": 7.343988444010417,
            "loss_tokens_upper_95": 7.5028849468809184,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.9804631320635477,
            "data_time": 0.003729826275338518,
            "batch_time": 0.032409949188536784,
            "samples_per_second": 1100538.2844171776,
            "samples_per_second_per_gpu": 137567.2855521472,
            "loss_sequences_lower_95": 1.0290120605468749,
            "loss_sequences_upper_95": 1.1021686279296874,
            "loss_tokens_lower_95": 0.8966199526685674,
            "loss_tokens_upper_95": 0.9509171793717487,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.4731958945592245,
            "data_time": 0.019531807729176114,
            "batch_time": 0.049473968999726434,
            "samples_per_second": 959387.7805107456,
            "samples_per_second_per_gpu": 119923.4725638432,
            "loss_sequences_lower_95": 6.123049984886532,
            "loss_sequences_upper_95": 6.816760733468192,
            "loss_tokens_lower_95": 6.125734674362909,
            "loss_tokens_upper_95": 6.822121029808408,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.524437829852104,
            "data_time": 0.13201910257339478,
            "batch_time": 0.1648981273174286,
            "samples_per_second": 575518.9684164643,
            "samples_per_second_per_gpu": 71939.87105205804,
            "loss_sequences_lower_95": 2.2747721672058105,
            "loss_sequences_upper_95": 3.4096353650093074,
            "loss_tokens_lower_95": 1.9259261298425419,
            "loss_tokens_upper_95": 2.481982598058956,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.527966742038727,
            "data_time": 0.004934621235680958,
            "batch_time": 0.03392040824133252,
            "samples_per_second": 1083828.2312233744,
            "samples_per_second_per_gpu": 135478.5289029218,
            "loss_sequences_lower_95": 7.449228698730469,
            "loss_sequences_upper_95": 7.7959852783203125,
            "loss_tokens_lower_95": 7.260999629864636,
            "loss_tokens_upper_95": 7.56523834073604,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.910069445133209,
            "data_time": 0.0052041226909274145,
            "batch_time": 0.03412200203017583,
            "samples_per_second": 1084407.0973629656,
            "samples_per_second_per_gpu": 135550.8871703707,
            "loss_sequences_lower_95": 6.966675573730469,
            "loss_sequences_upper_95": 7.175926904296875,
            "loss_tokens_lower_95": 6.712306154544317,
            "loss_tokens_upper_95": 6.893788277631291,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.025593313401951,
            "data_time": 0.003712321763054583,
            "batch_time": 0.03248003015550084,
            "samples_per_second": 1094951.415354796,
            "samples_per_second_per_gpu": 136868.9269193495,
            "loss_sequences_lower_95": 4.986632224532769,
            "loss_sequences_upper_95": 5.063185206076621,
            "loss_tokens_lower_95": 4.987409267751635,
            "loss_tokens_upper_95": 5.063553774121061,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.046050169500887,
            "data_time": 0.007748447516173394,
            "batch_time": 0.03684179804476366,
            "samples_per_second": 1060703.897506084,
            "samples_per_second_per_gpu": 132587.9871882605,
            "loss_sequences_lower_95": 4.951606715329781,
            "loss_sequences_upper_95": 5.139282685966902,
            "loss_tokens_lower_95": 4.950189790271577,
            "loss_tokens_upper_95": 5.139716354346678,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.425358522415161,
            "data_time": 0.0050939591158004035,
            "batch_time": 0.03438067861965725,
            "samples_per_second": 1078155.1709871695,
            "samples_per_second_per_gpu": 134769.3963733962,
            "loss_sequences_lower_95": 7.36680361328125,
            "loss_sequences_upper_95": 7.486070947265625,
            "loss_tokens_lower_95": 7.367516455078125,
            "loss_tokens_upper_95": 7.485188598632813,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7316441338921678,
            "data_time": 0.0018031829246275817,
            "batch_time": 0.030620793178536717,
            "samples_per_second": 1101535.1237765176,
            "samples_per_second_per_gpu": 137691.8904720647,
            "loss_sequences_lower_95": 4.3298418098096025,
            "loss_sequences_upper_95": 4.436699643744086,
            "loss_tokens_lower_95": 3.003474692938724,
            "loss_tokens_upper_95": 3.072565954522672,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.760375986348337,
            "data_time": 0.015574020998818534,
            "batch_time": 0.044848590237753734,
            "samples_per_second": 1006938.8937399568,
            "samples_per_second_per_gpu": 125867.3617174946,
            "loss_sequences_lower_95": 5.561421989327046,
            "loss_sequences_upper_95": 5.9588887342766155,
            "loss_tokens_lower_95": 5.56408299688083,
            "loss_tokens_upper_95": 5.955254329852203,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.729654138228472,
            "data_time": 0.009209780022501945,
            "batch_time": 0.03858575038611889,
            "samples_per_second": 1059761.7393465224,
            "samples_per_second_per_gpu": 132470.2174183153,
            "loss_sequences_lower_95": 5.594659280215993,
            "loss_sequences_upper_95": 5.8622657087737435,
            "loss_tokens_lower_95": 5.595967634612439,
            "loss_tokens_upper_95": 5.861348781211703,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.327641762708133,
            "data_time": 0.002214498119254087,
            "batch_time": 0.0309567763883491,
            "samples_per_second": 1102486.3532685842,
            "samples_per_second_per_gpu": 137810.79415857303,
            "loss_sequences_lower_95": 4.867187316504603,
            "loss_sequences_upper_95": 4.983020693876832,
            "loss_tokens_lower_95": 3.5567136819137914,
            "loss_tokens_upper_95": 3.639746254857572,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.160627534149816,
            "data_time": 0.024560605486234028,
            "batch_time": 0.054030473033587136,
            "samples_per_second": 1006868.4389491164,
            "samples_per_second_per_gpu": 125858.55486863955,
            "loss_sequences_lower_95": 5.0262848465530965,
            "loss_sequences_upper_95": 5.2890250877097795,
            "loss_tokens_lower_95": 5.0276336185515875,
            "loss_tokens_upper_95": 5.287225785835711,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.0860175530844876,
            "data_time": 0.0031333302985763084,
            "batch_time": 0.03183447717892527,
            "samples_per_second": 1100272.0912059986,
            "samples_per_second_per_gpu": 137534.01140074982,
            "loss_sequences_lower_95": 5.032061651853975,
            "loss_sequences_upper_95": 5.138813685612576,
            "loss_tokens_lower_95": 5.034671462872707,
            "loss_tokens_upper_95": 5.137940707425459,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.821283458506019,
            "data_time": 0.020973346450112083,
            "batch_time": 0.0506265943700617,
            "samples_per_second": 955601.0883985145,
            "samples_per_second_per_gpu": 119450.13604981432,
            "loss_sequences_lower_95": 5.614524145033752,
            "loss_sequences_upper_95": 6.024518844456348,
            "loss_tokens_lower_95": 5.613096129778519,
            "loss_tokens_upper_95": 6.026330625663683,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.569906351963679,
            "data_time": 0.06938350200653076,
            "batch_time": 0.10141412913799286,
            "samples_per_second": 752615.3175094624,
            "samples_per_second_per_gpu": 94076.9146886828,
            "loss_sequences_lower_95": 3.3170901425679524,
            "loss_sequences_upper_95": 3.9573936080932612,
            "loss_tokens_lower_95": 2.941765276590983,
            "loss_tokens_upper_95": 3.8191569116380477,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.137287799517314,
            "data_time": 0.06879031658172607,
            "batch_time": 0.10460557788610458,
            "samples_per_second": 676159.7792563471,
            "samples_per_second_per_gpu": 84519.97240704339,
            "loss_sequences_lower_95": 2.945353253682454,
            "loss_sequences_upper_95": 3.767053629557292,
            "loss_tokens_lower_95": 2.379034436686655,
            "loss_tokens_upper_95": 3.390734571821234,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.560932418488965,
            "data_time": 0.003152355297373771,
            "batch_time": 0.03230512050967697,
            "samples_per_second": 1087567.937085276,
            "samples_per_second_per_gpu": 135945.9921356595,
            "loss_sequences_lower_95": 4.535911230612573,
            "loss_sequences_upper_95": 4.5857621717944586,
            "loss_tokens_lower_95": 4.53544170396493,
            "loss_tokens_upper_95": 4.586628737976344,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.9104566964437899,
            "data_time": 0.0012380732162254383,
            "batch_time": 0.03011826228521585,
            "samples_per_second": 1100715.508151449,
            "samples_per_second_per_gpu": 137589.43851893113,
            "loss_sequences_lower_95": 1.1054576306909494,
            "loss_sequences_upper_95": 1.1348635984675255,
            "loss_tokens_lower_95": 0.7062024265524985,
            "loss_tokens_upper_95": 0.7203024280444353,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.729631529079647,
            "data_time": 0.03520277515053749,
            "batch_time": 0.07114500179886818,
            "samples_per_second": 959757.0124797301,
            "samples_per_second_per_gpu": 119969.62655996626,
            "loss_sequences_lower_95": 4.745814201775498,
            "loss_sequences_upper_95": 5.115754399337168,
            "loss_tokens_lower_95": 4.373790739280121,
            "loss_tokens_upper_95": 4.580169220457965,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.118748188018799,
            "data_time": 0.10091243471418108,
            "batch_time": 0.13483273415338426,
            "samples_per_second": 562404.5273928574,
            "samples_per_second_per_gpu": 70300.56592410717,
            "loss_sequences_lower_95": 6.67373413910737,
            "loss_sequences_upper_95": 7.826098942112279,
            "loss_tokens_lower_95": 6.396095727991175,
            "loss_tokens_upper_95": 7.528661939832899,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.502861299165866,
            "data_time": 0.02848117124466669,
            "batch_time": 0.05815116280601138,
            "samples_per_second": 984140.2413344163,
            "samples_per_second_per_gpu": 123017.53016680204,
            "loss_sequences_lower_95": 4.455311900813405,
            "loss_sequences_upper_95": 4.794328345322027,
            "loss_tokens_lower_95": 4.159280366247897,
            "loss_tokens_upper_95": 4.331940492929983,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.617257099326064,
            "data_time": 0.025708405744461788,
            "batch_time": 0.05475857995805286,
            "samples_per_second": 1000212.3077177899,
            "samples_per_second_per_gpu": 125026.53846472374,
            "loss_sequences_lower_95": 4.586588045445884,
            "loss_sequences_upper_95": 4.896982537246332,
            "loss_tokens_lower_95": 4.287644543160695,
            "loss_tokens_upper_95": 4.432433556643603,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.72746575169447,
            "data_time": 0.0268832870892116,
            "batch_time": 0.05645565475736346,
            "samples_per_second": 992662.6781928482,
            "samples_per_second_per_gpu": 124082.83477410603,
            "loss_sequences_lower_95": 4.658281530985018,
            "loss_sequences_upper_95": 5.030897242848466,
            "loss_tokens_lower_95": 4.363621207747808,
            "loss_tokens_upper_95": 4.594149553892323,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.697514036806618,
            "data_time": 0.027559518814086914,
            "batch_time": 0.05873953444617135,
            "samples_per_second": 955468.5613783601,
            "samples_per_second_per_gpu": 119433.57017229502,
            "loss_sequences_lower_95": 4.654823656779964,
            "loss_sequences_upper_95": 4.93844134633134,
            "loss_tokens_lower_95": 4.393837208019981,
            "loss_tokens_upper_95": 4.529248056382033,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.889903415063894,
            "data_time": 0.02630652321709527,
            "batch_time": 0.05574025342493882,
            "samples_per_second": 1015573.2407248276,
            "samples_per_second_per_gpu": 126946.65509060345,
            "loss_sequences_lower_95": 4.853556909028048,
            "loss_sequences_upper_95": 5.132475195464139,
            "loss_tokens_lower_95": 4.641999728082041,
            "loss_tokens_upper_95": 4.752995644742615,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.855810063641246,
            "data_time": 0.026636186100187757,
            "batch_time": 0.056209215096064975,
            "samples_per_second": 990296.9880538074,
            "samples_per_second_per_gpu": 123787.12350672593,
            "loss_sequences_lower_95": 4.880504217380431,
            "loss_sequences_upper_95": 5.180960836643125,
            "loss_tokens_lower_95": 4.5152557781419524,
            "loss_tokens_upper_95": 4.63837743463683,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-1.0/params.txt",
    "uuid": "5e7f3760-235c-47b2-875b-23a022203593",
    "creation_date": "2023_12_13-16_18_23"
}