{
    "name": "c4_original-d=512_l=8_h=4-2.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 3156561920,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 2.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "631312384",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=512_l=8_h=4-2.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 4.394328902165095,
            "data_time": 0.03557486832141876,
            "batch_time": 0.34828800335526466,
            "samples_per_second": 1753469.614549602,
            "samples_per_second_per_gpu": 219183.70181870024,
            "loss_sequences_lower_95": 4.2706707127889,
            "loss_sequences_upper_95": 4.51789919535319,
            "loss_tokens_lower_95": 4.378504676818848,
            "loss_tokens_upper_95": 4.410124727884929,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6429770265470234,
            "data_time": 0.0015339291163327551,
            "batch_time": 0.015245993305699302,
            "samples_per_second": 2266058.2473468245,
            "samples_per_second_per_gpu": 283257.28091835306,
            "loss_sequences_lower_95": 3.6403655576095013,
            "loss_sequences_upper_95": 3.64552857790974,
            "loss_tokens_lower_95": 3.6320683125,
            "loss_tokens_upper_95": 3.6538486979166667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9302368986363314,
            "data_time": 0.010491562843322753,
            "batch_time": 0.02440642738342285,
            "samples_per_second": 2185695.2799911653,
            "samples_per_second_per_gpu": 273211.90999889566,
            "loss_sequences_lower_95": 3.8941127107581313,
            "loss_sequences_upper_95": 3.974293792101802,
            "loss_tokens_lower_95": 3.915358729166667,
            "loss_tokens_upper_95": 3.9451768854166667,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.753833370159582,
            "data_time": 0.0016164950242168025,
            "batch_time": 0.014842583160651358,
            "samples_per_second": 2357422.026240859,
            "samples_per_second_per_gpu": 294677.7532801074,
            "loss_sequences_lower_95": 3.7259985905283504,
            "loss_sequences_upper_95": 3.78219168814433,
            "loss_tokens_lower_95": 3.7417918854166667,
            "loss_tokens_upper_95": 3.765552083333333,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.6841476827920574,
            "data_time": 0.010451688234549594,
            "batch_time": 0.02393133611793062,
            "samples_per_second": 2212926.2364502205,
            "samples_per_second_per_gpu": 276615.77955627756,
            "loss_sequences_lower_95": 3.637870505773851,
            "loss_sequences_upper_95": 3.740179499297909,
            "loss_tokens_lower_95": 3.6730148854166664,
            "loss_tokens_upper_95": 3.6951767708333336,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.142165384999268,
            "data_time": 0.00421435191579487,
            "batch_time": 0.01760693574729173,
            "samples_per_second": 2326479.5732164844,
            "samples_per_second_per_gpu": 290809.94665206055,
            "loss_sequences_lower_95": 4.097789581319574,
            "loss_sequences_upper_95": 4.188210115620751,
            "loss_tokens_lower_95": 4.129429614583334,
            "loss_tokens_upper_95": 4.15467296875,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.981905683254709,
            "data_time": 0.0015752285378597688,
            "batch_time": 0.014921926265828566,
            "samples_per_second": 2344823.1068214127,
            "samples_per_second_per_gpu": 293102.8883526766,
            "loss_sequences_lower_95": 3.947224768813775,
            "loss_sequences_upper_95": 4.015953663105868,
            "loss_tokens_lower_95": 3.966587479166667,
            "loss_tokens_upper_95": 3.997076395833333,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.282230415843544,
            "data_time": 0.0015943675597608464,
            "batch_time": 0.014866307907551215,
            "samples_per_second": 2353389.313916202,
            "samples_per_second_per_gpu": 294173.6642395253,
            "loss_sequences_lower_95": 4.264972400605367,
            "loss_sequences_upper_95": 4.301078656740837,
            "loss_tokens_lower_95": 4.270008291666667,
            "loss_tokens_upper_95": 4.294209333333333,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.0669206652214855,
            "data_time": 0.012440932175469778,
            "batch_time": 0.02624676435712784,
            "samples_per_second": 2217820.0882698987,
            "samples_per_second_per_gpu": 277227.51103373733,
            "loss_sequences_lower_95": 3.999818804981263,
            "loss_sequences_upper_95": 4.146542544481231,
            "loss_tokens_lower_95": 4.0550499375,
            "loss_tokens_upper_95": 4.078799875,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.183287985240047,
            "data_time": 0.01018347404897213,
            "batch_time": 0.025236709974706173,
            "samples_per_second": 2236456.097486905,
            "samples_per_second_per_gpu": 279557.0121858631,
            "loss_sequences_lower_95": 5.105898384425951,
            "loss_sequences_upper_95": 5.2791540454970045,
            "loss_tokens_lower_95": 5.169762354166667,
            "loss_tokens_upper_95": 5.196972989583333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.043958395723612,
            "data_time": 0.0013332016077979882,
            "batch_time": 0.014540848061626139,
            "samples_per_second": 2371555.15131233,
            "samples_per_second_per_gpu": 296444.3939140412,
            "loss_sequences_lower_95": 4.035605211795087,
            "loss_sequences_upper_95": 4.052656679596495,
            "loss_tokens_lower_95": 4.0321913333333335,
            "loss_tokens_upper_95": 4.055757249999999,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.881729276018409,
            "data_time": 0.002919142291905183,
            "batch_time": 0.016868570464338293,
            "samples_per_second": 2314667.4370377306,
            "samples_per_second_per_gpu": 289333.4296297163,
            "loss_sequences_lower_95": 3.866715284647022,
            "loss_sequences_upper_95": 3.897364704810496,
            "loss_tokens_lower_95": 3.870139072916667,
            "loss_tokens_upper_95": 3.8935075937500003,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.450882703247225,
            "data_time": 0.010504432346509851,
            "batch_time": 0.024204853495119118,
            "samples_per_second": 2193592.5570068895,
            "samples_per_second_per_gpu": 274199.0696258612,
            "loss_sequences_lower_95": 4.385280784003392,
            "loss_sequences_upper_95": 4.529561616537778,
            "loss_tokens_lower_95": 4.437409697916666,
            "loss_tokens_upper_95": 4.463859239583333,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.696152525861006,
            "data_time": 0.010280892193554882,
            "batch_time": 0.02385371139799931,
            "samples_per_second": 2214284.5091529815,
            "samples_per_second_per_gpu": 276785.5636441227,
            "loss_sequences_lower_95": 3.6237112383250065,
            "loss_sequences_upper_95": 3.776254552153609,
            "loss_tokens_lower_95": 3.683664385416667,
            "loss_tokens_upper_95": 3.708406760416667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.165130257606506,
            "data_time": 0.09036966732570104,
            "batch_time": 0.10739082098007202,
            "samples_per_second": 962453.5025746424,
            "samples_per_second_per_gpu": 120306.6878218303,
            "loss_sequences_lower_95": 5.094282965226607,
            "loss_sequences_upper_95": 5.236352643099698,
            "loss_tokens_lower_95": 5.13643217086792,
            "loss_tokens_upper_95": 5.194521687247536,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.283744524241189,
            "data_time": 0.014664633707566694,
            "batch_time": 0.028458748351443897,
            "samples_per_second": 2150344.3467651126,
            "samples_per_second_per_gpu": 268793.0433456391,
            "loss_sequences_lower_95": 4.1959340087178845,
            "loss_sequences_upper_95": 4.370628023286603,
            "loss_tokens_lower_95": 4.270009927083334,
            "loss_tokens_upper_95": 4.297430510416667,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.0794316332069736,
            "data_time": 0.014778499801953634,
            "batch_time": 0.028288714587688446,
            "samples_per_second": 2223835.7501890804,
            "samples_per_second_per_gpu": 277979.46877363505,
            "loss_sequences_lower_95": 6.007306584582166,
            "loss_sequences_upper_95": 6.1650749266933955,
            "loss_tokens_lower_95": 6.0672977604166665,
            "loss_tokens_upper_95": 6.091593010416666,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.433256493240107,
            "data_time": 0.038996465504169464,
            "batch_time": 0.05329509451985359,
            "samples_per_second": 1870579.9430015963,
            "samples_per_second_per_gpu": 233822.49287519953,
            "loss_sequences_lower_95": 4.311833916335809,
            "loss_sequences_upper_95": 4.637602371465964,
            "loss_tokens_lower_95": 4.418309383705014,
            "loss_tokens_upper_95": 4.448041071657275,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.651550286372424,
            "data_time": 0.0019400464646637033,
            "batch_time": 0.015467469709880803,
            "samples_per_second": 2291275.0144343064,
            "samples_per_second_per_gpu": 286409.3768042883,
            "loss_sequences_lower_95": 4.634065013386181,
            "loss_sequences_upper_95": 4.669442592979989,
            "loss_tokens_lower_95": 4.633998291119943,
            "loss_tokens_upper_95": 4.669013745593576,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.337666615241316,
            "data_time": 0.0020675055540291367,
            "batch_time": 0.015626369625519797,
            "samples_per_second": 2281827.1396603202,
            "samples_per_second_per_gpu": 285228.39245754003,
            "loss_sequences_lower_95": 3.346921377480208,
            "loss_sequences_upper_95": 3.3731998944277786,
            "loss_tokens_lower_95": 3.311461029245079,
            "loss_tokens_upper_95": 3.3308658352596487,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.316797219079567,
            "data_time": 0.003214512311771835,
            "batch_time": 0.016726897866179872,
            "samples_per_second": 2286258.7461697594,
            "samples_per_second_per_gpu": 285782.3432712199,
            "loss_sequences_lower_95": 5.5392005657327585,
            "loss_sequences_upper_95": 5.830178606684725,
            "loss_tokens_lower_95": 4.833876620391053,
            "loss_tokens_upper_95": 5.045555698266619,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.564964528044065,
            "data_time": 0.004012700408063037,
            "batch_time": 0.01755128357004612,
            "samples_per_second": 2266917.890705154,
            "samples_per_second_per_gpu": 283364.7363381442,
            "loss_sequences_lower_95": 5.703402734375,
            "loss_sequences_upper_95": 5.90827236328125,
            "loss_tokens_lower_95": 5.221503193789308,
            "loss_tokens_upper_95": 5.362993317610063,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7165189787146717,
            "data_time": 0.004571268821014358,
            "batch_time": 0.01808313258992421,
            "samples_per_second": 2271380.9067961196,
            "samples_per_second_per_gpu": 283922.61334951496,
            "loss_sequences_lower_95": 3.7611267506308046,
            "loss_sequences_upper_95": 3.8325958737828136,
            "loss_tokens_lower_95": 3.6136441847755947,
            "loss_tokens_upper_95": 3.6477360689308878,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.077647257934917,
            "data_time": 0.023538265909467424,
            "batch_time": 0.03782657640320914,
            "samples_per_second": 2039242.5590402242,
            "samples_per_second_per_gpu": 254905.31988002802,
            "loss_sequences_lower_95": 3.9872249187122693,
            "loss_sequences_upper_95": 4.229672019264915,
            "loss_tokens_lower_95": 3.9679852599390304,
            "loss_tokens_upper_95": 4.046667889625113,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.130867749817518,
            "data_time": 0.020534837618470192,
            "batch_time": 0.03474735841155052,
            "samples_per_second": 2009295.4498029968,
            "samples_per_second_per_gpu": 251161.9312253746,
            "loss_sequences_lower_95": 4.118408209353078,
            "loss_sequences_upper_95": 4.346655422911351,
            "loss_tokens_lower_95": 3.999070328368314,
            "loss_tokens_upper_95": 4.102874346754401,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.753066126505534,
            "data_time": 0.018105841599977933,
            "batch_time": 0.03226039654169327,
            "samples_per_second": 2022388.6506726725,
            "samples_per_second_per_gpu": 252798.58133408407,
            "loss_sequences_lower_95": 4.709861551920573,
            "loss_sequences_upper_95": 4.817673400878906,
            "loss_tokens_lower_95": 4.6041955898365075,
            "loss_tokens_upper_95": 4.857855496212047,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.860658849475363,
            "data_time": 0.001654838039795018,
            "batch_time": 0.015085573893763773,
            "samples_per_second": 2307606.2738313884,
            "samples_per_second_per_gpu": 288450.78422892356,
            "loss_sequences_lower_95": 6.875464497933173,
            "loss_sequences_upper_95": 6.954774848063579,
            "loss_tokens_lower_95": 6.709983661277414,
            "loss_tokens_upper_95": 6.792504998297816,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.073422782850587,
            "data_time": 0.003269092148582407,
            "batch_time": 0.017114602479358646,
            "samples_per_second": 2239262.6845927574,
            "samples_per_second_per_gpu": 279907.8355740947,
            "loss_sequences_lower_95": 5.668195134943182,
            "loss_sequences_upper_95": 5.978256420816235,
            "loss_tokens_lower_95": 4.272594771354877,
            "loss_tokens_upper_95": 4.414957394023898,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.587920560466552,
            "data_time": 0.0050389424369141865,
            "batch_time": 0.01881312478233028,
            "samples_per_second": 2216317.9424168975,
            "samples_per_second_per_gpu": 277039.7428021122,
            "loss_sequences_lower_95": 5.04707849912676,
            "loss_sequences_upper_95": 5.399876106965257,
            "loss_tokens_lower_95": 4.1423776838924855,
            "loss_tokens_upper_95": 4.30941193227654,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.704424085138051,
            "data_time": 0.025357278329985484,
            "batch_time": 0.03965328846658979,
            "samples_per_second": 2006288.5472647143,
            "samples_per_second_per_gpu": 250786.06840808928,
            "loss_sequences_lower_95": 5.626236059458832,
            "loss_sequences_upper_95": 5.781178513618364,
            "loss_tokens_lower_95": 5.627041584171661,
            "loss_tokens_upper_95": 5.783731455345676,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.785153408050537,
            "data_time": 0.052050732649289645,
            "batch_time": 0.06704564736439632,
            "samples_per_second": 1715126.6455260445,
            "samples_per_second_per_gpu": 214390.83069075557,
            "loss_sequences_lower_95": 3.645774826049805,
            "loss_sequences_upper_95": 4.0342228317260735,
            "loss_tokens_lower_95": 3.4685004164366475,
            "loss_tokens_upper_95": 3.9506213138696333,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.137261380192689,
            "data_time": 0.0034763949536594633,
            "batch_time": 0.017045807497145453,
            "samples_per_second": 2277832.7285566595,
            "samples_per_second_per_gpu": 284729.09106958244,
            "loss_sequences_lower_95": 5.09306710593654,
            "loss_sequences_upper_95": 5.182049504322063,
            "loss_tokens_lower_95": 5.0916325985958295,
            "loss_tokens_upper_95": 5.182094021878198,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.17699181652772,
            "data_time": 0.005002125246878744,
            "batch_time": 0.018857917816759516,
            "samples_per_second": 2219822.659678066,
            "samples_per_second_per_gpu": 277477.83245975827,
            "loss_sequences_lower_95": 5.132006585998285,
            "loss_sequences_upper_95": 5.222478683184249,
            "loss_tokens_lower_95": 5.13050452130042,
            "loss_tokens_upper_95": 5.224105957431153,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.750397331047888,
            "data_time": 0.003627685954709805,
            "batch_time": 0.01728607382681663,
            "samples_per_second": 2253287.7339493595,
            "samples_per_second_per_gpu": 281660.96674366994,
            "loss_sequences_lower_95": 3.9064536608342117,
            "loss_sequences_upper_95": 4.031009970448093,
            "loss_tokens_lower_95": 3.5701689771646348,
            "loss_tokens_upper_95": 3.6263334384152297,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.8734401707649235,
            "data_time": 0.010840519331395626,
            "batch_time": 0.024955937638878822,
            "samples_per_second": 2107833.6473844615,
            "samples_per_second_per_gpu": 263479.2059230577,
            "loss_sequences_lower_95": 6.075873559570312,
            "loss_sequences_upper_95": 6.64843388671875,
            "loss_tokens_lower_95": 5.191071853284249,
            "loss_tokens_upper_95": 5.55794554129095,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.284077972173691,
            "data_time": 0.16045644879341125,
            "batch_time": 0.17740404605865479,
            "samples_per_second": 883886.5292154616,
            "samples_per_second_per_gpu": 110485.8161519327,
            "loss_sequences_lower_95": 4.017041397094727,
            "loss_sequences_upper_95": 4.647746968269348,
            "loss_tokens_lower_95": 3.807200894410583,
            "loss_tokens_upper_95": 4.612509418356007,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.786298536706245,
            "data_time": 0.029910176358324415,
            "batch_time": 0.04347534128960143,
            "samples_per_second": 1949415.856562645,
            "samples_per_second_per_gpu": 243676.98207033062,
            "loss_sequences_lower_95": 6.309172864892017,
            "loss_sequences_upper_95": 7.219692449460084,
            "loss_tokens_lower_95": 4.125744429355006,
            "loss_tokens_upper_95": 4.621720285856469,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.9848554534337155,
            "data_time": 0.003008276845018069,
            "batch_time": 0.01662154806156953,
            "samples_per_second": 2262051.004169347,
            "samples_per_second_per_gpu": 282756.3755211684,
            "loss_sequences_lower_95": 2.9575194512563483,
            "loss_sequences_upper_95": 3.0117378151536562,
            "loss_tokens_lower_95": 2.956844160847395,
            "loss_tokens_upper_95": 3.01107131338057,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.033084311193878,
            "data_time": 0.002706694149628692,
            "batch_time": 0.016323914303999796,
            "samples_per_second": 2274571.0099136475,
            "samples_per_second_per_gpu": 284321.37623920594,
            "loss_sequences_lower_95": 4.0012883973080005,
            "loss_sequences_upper_95": 4.1928794702418495,
            "loss_tokens_lower_95": 3.8177335098717187,
            "loss_tokens_upper_95": 4.003884087763169,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5217645338603427,
            "data_time": 0.020432912641101413,
            "batch_time": 0.034427995483080544,
            "samples_per_second": 2032235.4477128983,
            "samples_per_second_per_gpu": 254029.43096411228,
            "loss_sequences_lower_95": 3.3810901068942454,
            "loss_sequences_upper_95": 3.7696654602721495,
            "loss_tokens_lower_95": 3.251220855571736,
            "loss_tokens_upper_95": 3.5540072657243726,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.858313148192938,
            "data_time": 0.005009740218520164,
            "batch_time": 0.018522191047668456,
            "samples_per_second": 2256856.2621042226,
            "samples_per_second_per_gpu": 282107.0327630278,
            "loss_sequences_lower_95": 3.877531325920605,
            "loss_sequences_upper_95": 4.022421898123027,
            "loss_tokens_lower_95": 3.721804101135695,
            "loss_tokens_upper_95": 3.870333864869621,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.311816267850922,
            "data_time": 0.03354609864098685,
            "batch_time": 0.0481450671241397,
            "samples_per_second": 1944021.035119166,
            "samples_per_second_per_gpu": 243002.62938989574,
            "loss_sequences_lower_95": 3.1396000280612855,
            "loss_sequences_upper_95": 3.6125048474567687,
            "loss_tokens_lower_95": 3.036265765589952,
            "loss_tokens_upper_95": 3.4258640606679496,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.26167933710529,
            "data_time": 0.0022377285345811346,
            "batch_time": 0.01579317768430691,
            "samples_per_second": 2281640.342521262,
            "samples_per_second_per_gpu": 285205.04281515774,
            "loss_sequences_lower_95": 4.250228532034532,
            "loss_sequences_upper_95": 4.272830698952291,
            "loss_tokens_lower_95": 4.250194872958967,
            "loss_tokens_upper_95": 4.273071166967768,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.2769370339449169,
            "data_time": 0.05125084790316495,
            "batch_time": 0.0659280473535711,
            "samples_per_second": 1764174.213038826,
            "samples_per_second_per_gpu": 220521.77662985324,
            "loss_sequences_lower_95": 1.2145679196107735,
            "loss_sequences_upper_95": 1.4009243567013046,
            "loss_tokens_lower_95": 1.0865975105652668,
            "loss_tokens_upper_95": 1.3459564958648524,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.237420147413728,
            "data_time": 0.0015924362854505924,
            "batch_time": 0.015225039306350446,
            "samples_per_second": 2271461.431782735,
            "samples_per_second_per_gpu": 283932.6789728419,
            "loss_sequences_lower_95": 6.754689465408805,
            "loss_sequences_upper_95": 6.8122109006485845,
            "loss_tokens_lower_95": 5.450353953094777,
            "loss_tokens_upper_95": 5.5097336315280465,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.091905939102173,
            "data_time": 0.0063861651079995295,
            "batch_time": 0.02003612972441174,
            "samples_per_second": 2247044.996021085,
            "samples_per_second_per_gpu": 280880.6245026356,
            "loss_sequences_lower_95": 7.019354663085938,
            "loss_sequences_upper_95": 7.3029834716796875,
            "loss_tokens_lower_95": 6.890663480943059,
            "loss_tokens_upper_95": 7.15103507694678,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.919743416620338,
            "data_time": 0.026400052894980222,
            "batch_time": 0.040328143006664215,
            "samples_per_second": 2091167.8597506108,
            "samples_per_second_per_gpu": 261395.98246882635,
            "loss_sequences_lower_95": 4.754504288383152,
            "loss_sequences_upper_95": 5.086089265242867,
            "loss_tokens_lower_95": 4.756203984799592,
            "loss_tokens_upper_95": 5.083231015412704,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.854152907385971,
            "data_time": 0.004936714488339712,
            "batch_time": 0.01857308127793921,
            "samples_per_second": 2252721.1727676312,
            "samples_per_second_per_gpu": 281590.1465959539,
            "loss_sequences_lower_95": 5.807749393347538,
            "loss_sequences_upper_95": 5.900739348440459,
            "loss_tokens_lower_95": 5.808455329663826,
            "loss_tokens_upper_95": 5.900631510416667,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.0415206532875696,
            "data_time": 0.004211934640052471,
            "batch_time": 0.018034722893796068,
            "samples_per_second": 2237064.993774977,
            "samples_per_second_per_gpu": 279633.12422187213,
            "loss_sequences_lower_95": 1.0874549092610677,
            "loss_sequences_upper_95": 1.1556963134765625,
            "loss_tokens_lower_95": 0.9594135896546119,
            "loss_tokens_upper_95": 1.0161678773071727,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.132255807377043,
            "data_time": 0.02460832042353494,
            "batch_time": 0.038205904620034356,
            "samples_per_second": 2003602.0361306178,
            "samples_per_second_per_gpu": 250450.25451632723,
            "loss_sequences_lower_95": 5.813999997093564,
            "loss_sequences_upper_95": 6.446844017392113,
            "loss_tokens_lower_95": 5.814336605980283,
            "loss_tokens_upper_95": 6.446824922107515,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.82082686573267,
            "data_time": 0.15783247351646423,
            "batch_time": 0.17413339018821716,
            "samples_per_second": 912923.2616624157,
            "samples_per_second_per_gpu": 114115.40770780196,
            "loss_sequences_lower_95": 2.5669212460517885,
            "loss_sequences_upper_95": 3.8204516172409058,
            "loss_tokens_lower_95": 2.2098336571762243,
            "loss_tokens_upper_95": 2.7711469827239044,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.29997118473053,
            "data_time": 0.006266655429961189,
            "batch_time": 0.01994087913679698,
            "samples_per_second": 2233500.4659258593,
            "samples_per_second_per_gpu": 279187.5582407324,
            "loss_sequences_lower_95": 7.2407195800781246,
            "loss_sequences_upper_95": 7.60423759765625,
            "loss_tokens_lower_95": 6.9751645780456855,
            "loss_tokens_upper_95": 7.291273418497514,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.930707661628723,
            "data_time": 0.006056426536469232,
            "batch_time": 0.019975055304784623,
            "samples_per_second": 2200537.416551511,
            "samples_per_second_per_gpu": 275067.17706893885,
            "loss_sequences_lower_95": 7.0219841552734374,
            "loss_sequences_upper_95": 7.242325610351563,
            "loss_tokens_lower_95": 6.689870853643747,
            "loss_tokens_upper_95": 6.868751772678549,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.491776879026392,
            "data_time": 0.0040598175597430074,
            "batch_time": 0.01766169063223644,
            "samples_per_second": 2262972.0245261234,
            "samples_per_second_per_gpu": 282871.5030657654,
            "loss_sequences_lower_95": 4.459213276273885,
            "loss_sequences_upper_95": 4.524290936400017,
            "loss_tokens_lower_95": 4.459817956047813,
            "loss_tokens_upper_95": 4.5241274202905,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.572354611896333,
            "data_time": 0.008889345964276178,
            "batch_time": 0.022722431540129047,
            "samples_per_second": 2176401.472551934,
            "samples_per_second_per_gpu": 272050.18406899174,
            "loss_sequences_lower_95": 4.476009489607335,
            "loss_sequences_upper_95": 4.666682755196332,
            "loss_tokens_lower_95": 4.477066044726862,
            "loss_tokens_upper_95": 4.665402770152289,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 8.968581511497497,
            "data_time": 0.006284775715025645,
            "batch_time": 0.020085288418663874,
            "samples_per_second": 2211766.531467382,
            "samples_per_second_per_gpu": 276470.81643342273,
            "loss_sequences_lower_95": 8.932962548828124,
            "loss_sequences_upper_95": 9.004195849609374,
            "loss_tokens_lower_95": 8.933218212890624,
            "loss_tokens_upper_95": 9.004229516601562,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.252142413641443,
            "data_time": 0.0019542863553773703,
            "batch_time": 0.015369038410583381,
            "samples_per_second": 2303565.550625174,
            "samples_per_second_per_gpu": 287945.69382814673,
            "loss_sequences_lower_95": 4.9246784180611405,
            "loss_sequences_upper_95": 5.041057477678571,
            "loss_tokens_lower_95": 3.4382037098627523,
            "loss_tokens_upper_95": 3.5136177293728403,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.439542255294857,
            "data_time": 0.01961047989981515,
            "batch_time": 0.03368616955620902,
            "samples_per_second": 2043334.4540943503,
            "samples_per_second_per_gpu": 255416.8067617938,
            "loss_sequences_lower_95": 5.264907734429658,
            "loss_sequences_upper_95": 5.61350600968546,
            "loss_tokens_lower_95": 5.266844837701143,
            "loss_tokens_upper_95": 5.6126174243528455,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.366754758124258,
            "data_time": 0.011590948328375816,
            "batch_time": 0.025537610054016113,
            "samples_per_second": 2164284.6283805827,
            "samples_per_second_per_gpu": 270535.57854757283,
            "loss_sequences_lower_95": 5.233948280484069,
            "loss_sequences_upper_95": 5.498231943167892,
            "loss_tokens_lower_95": 5.2354473517922795,
            "loss_tokens_upper_95": 5.4963422707950365,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.725099541707011,
            "data_time": 0.0023999925433590757,
            "batch_time": 0.015890677293022445,
            "samples_per_second": 2290990.8707594634,
            "samples_per_second_per_gpu": 286373.8588449329,
            "loss_sequences_lower_95": 5.23214604472003,
            "loss_sequences_upper_95": 5.347194049562352,
            "loss_tokens_lower_95": 3.959296014327547,
            "loss_tokens_upper_95": 4.046022526132364,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.279633397147769,
            "data_time": 0.028880099455515545,
            "batch_time": 0.04329614341259003,
            "samples_per_second": 1999464.3790806846,
            "samples_per_second_per_gpu": 249933.04738508558,
            "loss_sequences_lower_95": 5.160569028501158,
            "loss_sequences_upper_95": 5.396850682818701,
            "loss_tokens_lower_95": 5.159614474180514,
            "loss_tokens_upper_95": 5.397781993724681,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.84282953469396,
            "data_time": 0.003727455424447345,
            "batch_time": 0.01717089369474604,
            "samples_per_second": 2283131.5041090427,
            "samples_per_second_per_gpu": 285391.43801363034,
            "loss_sequences_lower_95": 5.809101159331995,
            "loss_sequences_upper_95": 5.87635227028383,
            "loss_tokens_lower_95": 5.809836224197247,
            "loss_tokens_upper_95": 5.87624804388857,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.469700891994735,
            "data_time": 0.02597463781183416,
            "batch_time": 0.040773966095664284,
            "samples_per_second": 1899273.3471006248,
            "samples_per_second_per_gpu": 237409.1683875781,
            "loss_sequences_lower_95": 5.283186414403823,
            "loss_sequences_upper_95": 5.656446142104065,
            "loss_tokens_lower_95": 5.281998999141953,
            "loss_tokens_upper_95": 5.654710047453352,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5176068007946015,
            "data_time": 0.08564118295907974,
            "batch_time": 0.10096968710422516,
            "samples_per_second": 1455340.0305000907,
            "samples_per_second_per_gpu": 181917.50381251134,
            "loss_sequences_lower_95": 3.198499647776286,
            "loss_sequences_upper_95": 3.9771032142639156,
            "loss_tokens_lower_95": 2.9105866856045193,
            "loss_tokens_upper_95": 3.9681184874640567,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.012459139029185,
            "data_time": 0.08346561342477798,
            "batch_time": 0.10271747410297394,
            "samples_per_second": 1258526.4743682074,
            "samples_per_second_per_gpu": 157315.80929602592,
            "loss_sequences_lower_95": 2.8124135017395018,
            "loss_sequences_upper_95": 3.4922417958577476,
            "loss_tokens_lower_95": 2.3155423453684603,
            "loss_tokens_upper_95": 3.287451677643851,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.043070130917039,
            "data_time": 0.00359709575781056,
            "batch_time": 0.01734979227053691,
            "samples_per_second": 2242562.3749599652,
            "samples_per_second_per_gpu": 280320.29686999565,
            "loss_sequences_lower_95": 5.009933639773564,
            "loss_sequences_upper_95": 5.077489328286083,
            "loss_tokens_lower_95": 5.009140179146724,
            "loss_tokens_upper_95": 5.077067364115427,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 0.9563242739989269,
            "data_time": 0.0016020706873101792,
            "batch_time": 0.015075551407081554,
            "samples_per_second": 2295424.761988039,
            "samples_per_second_per_gpu": 286928.0952485049,
            "loss_sequences_lower_95": 1.1318820075020943,
            "loss_sequences_upper_95": 1.1608589435841654,
            "loss_tokens_lower_95": 0.7641219467787423,
            "loss_tokens_upper_95": 0.7783983021205397,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.839343251205805,
            "data_time": 0.04165846109390259,
            "batch_time": 0.05679536238312721,
            "samples_per_second": 1862667.7950561023,
            "samples_per_second_per_gpu": 232833.4743820128,
            "loss_sequences_lower_95": 4.853643342265932,
            "loss_sequences_upper_95": 5.2264965147484,
            "loss_tokens_lower_95": 4.499384940739375,
            "loss_tokens_upper_95": 4.701308164359954,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.498640111974768,
            "data_time": 0.13080464090619767,
            "batch_time": 0.15058350563049316,
            "samples_per_second": 969353.6667079038,
            "samples_per_second_per_gpu": 121169.20833848798,
            "loss_sequences_lower_95": 7.089190776928051,
            "loss_sequences_upper_95": 8.13330030699034,
            "loss_tokens_lower_95": 6.8113116794162325,
            "loss_tokens_upper_95": 7.879021105354215,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.7367593675124935,
            "data_time": 0.033942733492170064,
            "batch_time": 0.048294592471349804,
            "samples_per_second": 1906094.0878898506,
            "samples_per_second_per_gpu": 238261.76098623133,
            "loss_sequences_lower_95": 4.700712864573409,
            "loss_sequences_upper_95": 5.054325671312286,
            "loss_tokens_lower_95": 4.372123797571489,
            "loss_tokens_upper_95": 4.545567572539949,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.861714601516724,
            "data_time": 0.034248837402888706,
            "batch_time": 0.04957371098654611,
            "samples_per_second": 1832859.4582265732,
            "samples_per_second_per_gpu": 229107.43227832165,
            "loss_sequences_lower_95": 4.842245650872951,
            "loss_sequences_upper_95": 5.166126130266887,
            "loss_tokens_lower_95": 4.5158008201888125,
            "loss_tokens_upper_95": 4.659066856565908,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.9023913523045985,
            "data_time": 0.033830943561735605,
            "batch_time": 0.04933404070990426,
            "samples_per_second": 1813818.1857683067,
            "samples_per_second_per_gpu": 226727.27322103834,
            "loss_sequences_lower_95": 4.837313675298923,
            "loss_sequences_upper_95": 5.225599996055045,
            "loss_tokens_lower_95": 4.525482188263729,
            "loss_tokens_upper_95": 4.751332117731096,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.915167362224765,
            "data_time": 0.03370057968866257,
            "batch_time": 0.04823868331455049,
            "samples_per_second": 1915689.267756343,
            "samples_per_second_per_gpu": 239461.15846954286,
            "loss_sequences_lower_95": 4.876742907268245,
            "loss_sequences_upper_95": 5.164000413475967,
            "loss_tokens_lower_95": 4.6105103679906545,
            "loss_tokens_upper_95": 4.743778283425209,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.1341739115507705,
            "data_time": 0.0358052342026322,
            "batch_time": 0.049934060485274705,
            "samples_per_second": 1984527.0287495812,
            "samples_per_second_per_gpu": 248065.87859369765,
            "loss_sequences_lower_95": 5.104179515009341,
            "loss_sequences_upper_95": 5.375934643478867,
            "loss_tokens_lower_95": 4.881984612642586,
            "loss_tokens_upper_95": 4.991740789064785,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.297140229039076,
            "data_time": 0.03346537976037888,
            "batch_time": 0.04806880156199137,
            "samples_per_second": 1927776.0209192056,
            "samples_per_second_per_gpu": 240972.0026149007,
            "loss_sequences_lower_95": 5.318793413115711,
            "loss_sequences_upper_95": 5.626750452925519,
            "loss_tokens_lower_95": 4.959136822351567,
            "loss_tokens_upper_95": 5.0842798866975585,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-2.0/params.txt",
    "uuid": "3beb1f69-eb13-43e5-a648-5b3dd3de9b95",
    "creation_date": "2023_12_14-04_59_27"
}