{
    "name": "c4_original-d=576_l=24_h=8-0.5",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 1536773760,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.5
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "307354752",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=576_l=24_h=8-0.5",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 4.390328129132588,
            "data_time": 0.03717101737856865,
            "batch_time": 0.3760465234518051,
            "samples_per_second": 839747.550025914,
            "samples_per_second_per_gpu": 104968.44375323925,
            "loss_sequences_lower_95": 4.263230857849122,
            "loss_sequences_upper_95": 4.515890795389811,
            "loss_tokens_lower_95": 4.374853719075521,
            "loss_tokens_upper_95": 4.4057217280069985,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6423338113848587,
            "data_time": 0.0012163551931361255,
            "batch_time": 0.030506931069155113,
            "samples_per_second": 1088929.4457558168,
            "samples_per_second_per_gpu": 136116.1807194771,
            "loss_sequences_lower_95": 3.639751221337898,
            "loss_sequences_upper_95": 3.6448429264608997,
            "loss_tokens_lower_95": 3.6314095729166667,
            "loss_tokens_upper_95": 3.653304947916667,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.04209007195064,
            "data_time": 0.009572147369384765,
            "batch_time": 0.039478008270263674,
            "samples_per_second": 1044494.0386301089,
            "samples_per_second_per_gpu": 130561.75482876362,
            "loss_sequences_lower_95": 4.0045003711934,
            "loss_sequences_upper_95": 4.089222380968989,
            "loss_tokens_lower_95": 4.027114093750001,
            "loss_tokens_upper_95": 4.057512177083334,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7657238329562945,
            "data_time": 0.0015746212907527622,
            "batch_time": 0.02998199725621625,
            "samples_per_second": 1123937.4337213691,
            "samples_per_second_per_gpu": 140492.17921517114,
            "loss_sequences_lower_95": 3.735482955460696,
            "loss_sequences_upper_95": 3.796560436130799,
            "loss_tokens_lower_95": 3.7535406145833337,
            "loss_tokens_upper_95": 3.77773421875,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6868489648077007,
            "data_time": 0.01017127758953201,
            "batch_time": 0.039277620999461624,
            "samples_per_second": 1062435.842050648,
            "samples_per_second_per_gpu": 132804.480256331,
            "loss_sequences_lower_95": 3.638382660347187,
            "loss_sequences_upper_95": 3.7468257616590823,
            "loss_tokens_lower_95": 3.675659135416667,
            "loss_tokens_upper_95": 3.6980573020833334,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.171629687217853,
            "data_time": 0.003821940525718357,
            "batch_time": 0.03261528293723646,
            "samples_per_second": 1108619.042284364,
            "samples_per_second_per_gpu": 138577.3802855455,
            "loss_sequences_lower_95": 4.126369527665066,
            "loss_sequences_upper_95": 4.219073341105116,
            "loss_tokens_lower_95": 4.158891322916666,
            "loss_tokens_upper_95": 4.1842825,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.14364842954947,
            "data_time": 0.001596278311961238,
            "batch_time": 0.030551287906204896,
            "samples_per_second": 1108132.9933688934,
            "samples_per_second_per_gpu": 138516.62417111167,
            "loss_sequences_lower_95": 4.1091187021683675,
            "loss_sequences_upper_95": 4.178062171157525,
            "loss_tokens_lower_95": 4.1285398125,
            "loss_tokens_upper_95": 4.1588891979166664,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.26668510456984,
            "data_time": 0.0016496419806901206,
            "batch_time": 0.03026948878576645,
            "samples_per_second": 1121109.6398638082,
            "samples_per_second_per_gpu": 140138.70498297602,
            "loss_sequences_lower_95": 4.247984538612566,
            "loss_sequences_upper_95": 4.287371727748691,
            "loss_tokens_lower_95": 4.254685145833333,
            "loss_tokens_upper_95": 4.278863135416667,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.0657595919399725,
            "data_time": 0.009658954446277922,
            "batch_time": 0.039147891695537264,
            "samples_per_second": 1045267.5695135632,
            "samples_per_second_per_gpu": 130658.4461891954,
            "loss_sequences_lower_95": 3.9940312579395325,
            "loss_sequences_upper_95": 4.151360885496062,
            "loss_tokens_lower_95": 4.053765072916667,
            "loss_tokens_upper_95": 4.077807177083333,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.170642704831753,
            "data_time": 0.009789863601326942,
            "batch_time": 0.039346231147646904,
            "samples_per_second": 1057013.1635352937,
            "samples_per_second_per_gpu": 132126.6454419117,
            "loss_sequences_lower_95": 5.085179855889482,
            "loss_sequences_upper_95": 5.27594166631284,
            "loss_tokens_lower_95": 5.1570895625,
            "loss_tokens_upper_95": 5.184483791666667,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.0688671454086816,
            "data_time": 0.0012562525831009794,
            "batch_time": 0.02975763161236634,
            "samples_per_second": 1123375.7822011888,
            "samples_per_second_per_gpu": 140421.9727751486,
            "loss_sequences_lower_95": 4.0605167363299985,
            "loss_sequences_upper_95": 4.077339387119706,
            "loss_tokens_lower_95": 4.05696078125,
            "loss_tokens_upper_95": 4.081038364583333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8917971515298038,
            "data_time": 0.0025643782254361194,
            "batch_time": 0.03170505312459852,
            "samples_per_second": 1100118.9682046324,
            "samples_per_second_per_gpu": 137514.87102557905,
            "loss_sequences_lower_95": 3.875791164082023,
            "loss_sequences_upper_95": 3.908530218528738,
            "loss_tokens_lower_95": 3.8799671250000003,
            "loss_tokens_upper_95": 3.9036150104166665,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.4482446761450465,
            "data_time": 0.009775767684454033,
            "batch_time": 0.03881281637862737,
            "samples_per_second": 1059866.5249091492,
            "samples_per_second_per_gpu": 132483.31561364364,
            "loss_sequences_lower_95": 4.376844648318649,
            "loss_sequences_upper_95": 4.534313085838457,
            "loss_tokens_lower_95": 4.43473415625,
            "loss_tokens_upper_95": 4.461586875,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7206067378554954,
            "data_time": 0.010044861599743604,
            "batch_time": 0.03862208579166002,
            "samples_per_second": 1078099.7063821347,
            "samples_per_second_per_gpu": 134762.46329776684,
            "loss_sequences_lower_95": 3.6455747274177384,
            "loss_sequences_upper_95": 3.804587755378055,
            "loss_tokens_lower_95": 3.707938947916667,
            "loss_tokens_upper_95": 3.73310028125,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.161750945177945,
            "data_time": 0.07849613257816859,
            "batch_time": 0.11090442112513951,
            "samples_per_second": 550033.6597044402,
            "samples_per_second_per_gpu": 68754.20746305502,
            "loss_sequences_lower_95": 5.093347341364081,
            "loss_sequences_upper_95": 5.2313047062266955,
            "loss_tokens_lower_95": 5.132502937316894,
            "loss_tokens_upper_95": 5.191256696527654,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.3530039043537725,
            "data_time": 0.014087834141471169,
            "batch_time": 0.04333322292024439,
            "samples_per_second": 1041782.1323011146,
            "samples_per_second_per_gpu": 130222.76653763933,
            "loss_sequences_lower_95": 4.263855736735263,
            "loss_sequences_upper_95": 4.4418683911203996,
            "loss_tokens_lower_95": 4.3392695833333335,
            "loss_tokens_upper_95": 4.366466677083333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.05997873925282,
            "data_time": 0.012398649007081985,
            "batch_time": 0.04242316633462906,
            "samples_per_second": 1033239.2068860686,
            "samples_per_second_per_gpu": 129154.90086075857,
            "loss_sequences_lower_95": 5.982358697976789,
            "loss_sequences_upper_95": 6.153349445111519,
            "loss_tokens_lower_95": 6.047703145833333,
            "loss_tokens_upper_95": 6.0721982187500005,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.429670232241271,
            "data_time": 0.035969752818346024,
            "batch_time": 0.06672095134854317,
            "samples_per_second": 908111.1951135751,
            "samples_per_second_per_gpu": 113513.89938919689,
            "loss_sequences_lower_95": 4.300645146604444,
            "loss_sequences_upper_95": 4.651671312676101,
            "loss_tokens_lower_95": 4.415083413045914,
            "loss_tokens_upper_95": 4.4443566494300715,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.9626125352468575,
            "data_time": 0.00163427378838046,
            "batch_time": 0.030677381809588716,
            "samples_per_second": 1097192.9578342948,
            "samples_per_second_per_gpu": 137149.11972928685,
            "loss_sequences_lower_95": 4.945064110124982,
            "loss_sequences_upper_95": 4.980450695903362,
            "loss_tokens_lower_95": 4.944773951026385,
            "loss_tokens_upper_95": 4.980308349505056,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.32872884479136,
            "data_time": 0.001819996080201143,
            "batch_time": 0.030996908522715236,
            "samples_per_second": 1092093.64254046,
            "samples_per_second_per_gpu": 136511.7053175575,
            "loss_sequences_lower_95": 3.3363566769567816,
            "loss_sequences_upper_95": 3.362430594238573,
            "loss_tokens_lower_95": 3.3041941320502297,
            "loss_tokens_upper_95": 3.3236388990976944,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.380537952237046,
            "data_time": 0.0030560482233678472,
            "batch_time": 0.03168476987401907,
            "samples_per_second": 1109521.1475738445,
            "samples_per_second_per_gpu": 138690.14344673057,
            "loss_sequences_lower_95": 5.5956536741261225,
            "loss_sequences_upper_95": 5.88827566832487,
            "loss_tokens_lower_95": 4.903485911625864,
            "loss_tokens_upper_95": 5.119527562004855,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.502604344209035,
            "data_time": 0.004397891461849213,
            "batch_time": 0.033525449322893267,
            "samples_per_second": 1088434.831388302,
            "samples_per_second_per_gpu": 136054.35392353774,
            "loss_sequences_lower_95": 5.6190870605468755,
            "loss_sequences_upper_95": 5.818564664713542,
            "loss_tokens_lower_95": 5.201776361045598,
            "loss_tokens_upper_95": 5.341405672661163,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.816958912942697,
            "data_time": 0.004503411585029614,
            "batch_time": 0.034073882512916806,
            "samples_per_second": 1071892.1077615533,
            "samples_per_second_per_gpu": 133986.51347019416,
            "loss_sequences_lower_95": 3.8628242683555247,
            "loss_sequences_upper_95": 3.9350607778738627,
            "loss_tokens_lower_95": 3.713241624116736,
            "loss_tokens_upper_95": 3.748189365592634,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.076904904842377,
            "data_time": 0.023885145783424377,
            "batch_time": 0.054295016186577935,
            "samples_per_second": 997807.4403707244,
            "samples_per_second_per_gpu": 124725.93004634055,
            "loss_sequences_lower_95": 3.9988825988769534,
            "loss_sequences_upper_95": 4.2299822096391155,
            "loss_tokens_lower_95": 3.9657120373752894,
            "loss_tokens_upper_95": 4.040921698642044,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.145977569112972,
            "data_time": 0.02196609601378441,
            "batch_time": 0.05265853367745876,
            "samples_per_second": 959349.3321806343,
            "samples_per_second_per_gpu": 119918.66652257928,
            "loss_sequences_lower_95": 4.139763283242985,
            "loss_sequences_upper_95": 4.359843351403061,
            "loss_tokens_lower_95": 4.013078214920447,
            "loss_tokens_upper_95": 4.115883276648189,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.875662147204081,
            "data_time": 0.016500670176285964,
            "batch_time": 0.04607298740973839,
            "samples_per_second": 1008487.4854209945,
            "samples_per_second_per_gpu": 126060.93567762431,
            "loss_sequences_lower_95": 4.826634134928385,
            "loss_sequences_upper_95": 4.93365712483724,
            "loss_tokens_lower_95": 4.741918990579612,
            "loss_tokens_upper_95": 4.983176990787237,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.049031466079195,
            "data_time": 0.0015523667387600526,
            "batch_time": 0.030833867529857912,
            "samples_per_second": 1090167.8318914669,
            "samples_per_second_per_gpu": 136270.97898643336,
            "loss_sequences_lower_95": 7.065256154347719,
            "loss_sequences_upper_95": 7.145465474447616,
            "loss_tokens_lower_95": 6.894577929398204,
            "loss_tokens_upper_95": 6.978407260111568,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.0106995435876875,
            "data_time": 0.002978422497742928,
            "batch_time": 0.03242999695291455,
            "samples_per_second": 1085184.0339919003,
            "samples_per_second_per_gpu": 135648.00424898753,
            "loss_sequences_lower_95": 5.584973010952625,
            "loss_sequences_upper_95": 5.890733804927531,
            "loss_tokens_lower_95": 4.238310453742336,
            "loss_tokens_upper_95": 4.381119093171691,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.540314823897625,
            "data_time": 0.005077926693735896,
            "batch_time": 0.03408309817314148,
            "samples_per_second": 1082604.539695704,
            "samples_per_second_per_gpu": 135325.567461963,
            "loss_sequences_lower_95": 4.985713065606335,
            "loss_sequences_upper_95": 5.3351606661956055,
            "loss_tokens_lower_95": 4.110101623633631,
            "loss_tokens_upper_95": 4.276704194523435,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.414055682752775,
            "data_time": 0.022372066974639893,
            "batch_time": 0.0520364556993757,
            "samples_per_second": 1004215.5250029574,
            "samples_per_second_per_gpu": 125526.94062536968,
            "loss_sequences_lower_95": 5.349288062526755,
            "loss_sequences_upper_95": 5.477540706826127,
            "loss_tokens_lower_95": 5.34969067159853,
            "loss_tokens_upper_95": 5.475953416519514,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7359618234634397,
            "data_time": 0.04683011770248413,
            "batch_time": 0.07765024900436401,
            "samples_per_second": 883853.0813853815,
            "samples_per_second_per_gpu": 110481.63517317269,
            "loss_sequences_lower_95": 3.5951727828979494,
            "loss_sequences_upper_95": 3.978802703857422,
            "loss_tokens_lower_95": 3.4166144792423694,
            "loss_tokens_upper_95": 3.8896353569781414,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.968792514156783,
            "data_time": 0.00322092383917124,
            "batch_time": 0.032107904888613335,
            "samples_per_second": 1101099.502628414,
            "samples_per_second_per_gpu": 137637.43782855175,
            "loss_sequences_lower_95": 4.928470245330092,
            "loss_sequences_upper_95": 5.009289938107727,
            "loss_tokens_lower_95": 4.927722180463152,
            "loss_tokens_upper_95": 5.009361893371769,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.941261968682967,
            "data_time": 0.004709838848518508,
            "batch_time": 0.034311713830098435,
            "samples_per_second": 1070867.9113098448,
            "samples_per_second_per_gpu": 133858.4889137306,
            "loss_sequences_lower_95": 4.898025120290746,
            "loss_sequences_upper_95": 4.983918750959767,
            "loss_tokens_lower_95": 4.8972921585470415,
            "loss_tokens_upper_95": 4.985920794062884,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7008554783038656,
            "data_time": 0.003426970724925624,
            "batch_time": 0.032329519933037315,
            "samples_per_second": 1093506.1945842244,
            "samples_per_second_per_gpu": 136688.27432302805,
            "loss_sequences_lower_95": 3.8438235742612554,
            "loss_sequences_upper_95": 3.965829902790057,
            "loss_tokens_lower_95": 3.5392538963741274,
            "loss_tokens_upper_95": 3.59577709626649,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.765856772899627,
            "data_time": 0.01058980543166399,
            "batch_time": 0.040411463007330894,
            "samples_per_second": 1031056.4166452688,
            "samples_per_second_per_gpu": 128882.0520806586,
            "loss_sequences_lower_95": 5.935533166503907,
            "loss_sequences_upper_95": 6.472109729003907,
            "loss_tokens_lower_95": 5.152022812525852,
            "loss_tokens_upper_95": 5.5148947755522,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.3393232226371765,
            "data_time": 0.14563968777656555,
            "batch_time": 0.1820572465658188,
            "samples_per_second": 529214.4148899425,
            "samples_per_second_per_gpu": 66151.80186124281,
            "loss_sequences_lower_95": 4.048432731628418,
            "loss_sequences_upper_95": 4.7032711625099175,
            "loss_tokens_lower_95": 3.8427413238876165,
            "loss_tokens_upper_95": 4.654074228220972,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.453585170466324,
            "data_time": 0.02719790884789,
            "batch_time": 0.05797591361593693,
            "samples_per_second": 906950.8818315599,
            "samples_per_second_per_gpu": 113368.86022894499,
            "loss_sequences_lower_95": 5.901217019969019,
            "loss_sequences_upper_95": 6.70198992148213,
            "loss_tokens_lower_95": 4.003963032650437,
            "loss_tokens_upper_95": 4.4753499303084485,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9182319169275726,
            "data_time": 0.0029333411819405025,
            "batch_time": 0.031540547808011375,
            "samples_per_second": 1105412.8496079992,
            "samples_per_second_per_gpu": 138176.6062009999,
            "loss_sequences_lower_95": 2.894255138578869,
            "loss_sequences_upper_95": 2.941222536579497,
            "loss_tokens_lower_95": 2.894596466157779,
            "loss_tokens_upper_95": 2.9420490576534513,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.0999657304388375,
            "data_time": 0.002496677172197868,
            "batch_time": 0.03136423090237909,
            "samples_per_second": 1102369.8012439078,
            "samples_per_second_per_gpu": 137796.22515548847,
            "loss_sequences_lower_95": 4.067186893557151,
            "loss_sequences_upper_95": 4.262534605145061,
            "loss_tokens_lower_95": 3.87758449953575,
            "loss_tokens_upper_95": 4.065661287325123,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5203120014169715,
            "data_time": 0.018603588143984478,
            "batch_time": 0.048824545409944325,
            "samples_per_second": 969164.5837650017,
            "samples_per_second_per_gpu": 121145.57297062522,
            "loss_sequences_lower_95": 3.3463174896799166,
            "loss_sequences_upper_95": 3.752317860362294,
            "loss_tokens_lower_95": 3.2500017954837825,
            "loss_tokens_upper_95": 3.5594312810765496,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.837038586246939,
            "data_time": 0.004599591344594955,
            "batch_time": 0.03334731236100197,
            "samples_per_second": 1090702.8423996188,
            "samples_per_second_per_gpu": 136337.85529995235,
            "loss_sequences_lower_95": 3.8696656718626676,
            "loss_sequences_upper_95": 4.018202446724546,
            "loss_tokens_lower_95": 3.69850336749086,
            "loss_tokens_upper_95": 3.8446119735853714,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4459808614195846,
            "data_time": 0.029124532427106584,
            "batch_time": 0.06118191991533552,
            "samples_per_second": 943450.6706529222,
            "samples_per_second_per_gpu": 117931.33383161528,
            "loss_sequences_lower_95": 3.246084501685166,
            "loss_sequences_upper_95": 3.742811100657393,
            "loss_tokens_lower_95": 3.143780442902159,
            "loss_tokens_upper_95": 3.5540520871832944,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.254007760442049,
            "data_time": 0.001980803130009248,
            "batch_time": 0.031120733456567816,
            "samples_per_second": 1091060.2434975565,
            "samples_per_second_per_gpu": 136382.53043719457,
            "loss_sequences_lower_95": 4.240138408541083,
            "loss_sequences_upper_95": 4.2677129468862525,
            "loss_tokens_lower_95": 4.240301566172609,
            "loss_tokens_upper_95": 4.267612047800185,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.316839382197093,
            "data_time": 0.045275419408624826,
            "batch_time": 0.07663481452248314,
            "samples_per_second": 842189.9473563245,
            "samples_per_second_per_gpu": 105273.74341954057,
            "loss_sequences_lower_95": 1.2437108641689265,
            "loss_sequences_upper_95": 1.4448025823796837,
            "loss_tokens_lower_95": 1.1311266864944505,
            "loss_tokens_upper_95": 1.388197181412727,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.704150811400053,
            "data_time": 0.0012792271420621912,
            "batch_time": 0.03047068990386042,
            "samples_per_second": 1091955.8964035504,
            "samples_per_second_per_gpu": 136494.4870504438,
            "loss_sequences_lower_95": 6.13067294778564,
            "loss_sequences_upper_95": 6.185263958497117,
            "loss_tokens_lower_95": 5.045586871373308,
            "loss_tokens_upper_95": 5.09922699468085,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.241027806758881,
            "data_time": 0.005491800724513947,
            "batch_time": 0.03512867670210581,
            "samples_per_second": 1061381.0622592478,
            "samples_per_second_per_gpu": 132672.63278240597,
            "loss_sequences_lower_95": 7.19425927734375,
            "loss_sequences_upper_95": 7.490310314941406,
            "loss_tokens_lower_95": 7.010381867658787,
            "loss_tokens_upper_95": 7.276428987365865,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.251301756112472,
            "data_time": 0.021539033469507248,
            "batch_time": 0.05110476178638006,
            "samples_per_second": 1012600.5260305187,
            "samples_per_second_per_gpu": 126575.06575381484,
            "loss_sequences_lower_95": 5.083906342879585,
            "loss_sequences_upper_95": 5.418461967136548,
            "loss_tokens_lower_95": 5.084387273373811,
            "loss_tokens_upper_95": 5.419990260912025,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.597252913316091,
            "data_time": 0.0045803910996540485,
            "batch_time": 0.033850343112485956,
            "samples_per_second": 1078277.9240506187,
            "samples_per_second_per_gpu": 134784.74050632733,
            "loss_sequences_lower_95": 7.499998020981297,
            "loss_sequences_upper_95": 7.693732336795692,
            "loss_tokens_lower_95": 7.502439038825758,
            "loss_tokens_upper_95": 7.6916180234966856,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.0743510819673538,
            "data_time": 0.004119012584077551,
            "batch_time": 0.033298355467776035,
            "samples_per_second": 1085834.0851937055,
            "samples_per_second_per_gpu": 135729.2606492132,
            "loss_sequences_lower_95": 1.1303843872070312,
            "loss_sequences_upper_95": 1.211190106201172,
            "loss_tokens_lower_95": 0.9823802196659914,
            "loss_tokens_upper_95": 1.0368246321966286,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.96875946692058,
            "data_time": 0.023247823119163513,
            "batch_time": 0.0538424551486969,
            "samples_per_second": 944720.7158893141,
            "samples_per_second_per_gpu": 118090.08948616426,
            "loss_sequences_lower_95": 5.668680855887277,
            "loss_sequences_upper_95": 6.26328617640904,
            "loss_tokens_lower_95": 5.669683067685082,
            "loss_tokens_upper_95": 6.274383312406994,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.674790073186159,
            "data_time": 0.15361975133419037,
            "batch_time": 0.18975096940994263,
            "samples_per_second": 544869.0687881007,
            "samples_per_second_per_gpu": 68108.63359851259,
            "loss_sequences_lower_95": 2.4467554569244383,
            "loss_sequences_upper_95": 3.707599425315857,
            "loss_tokens_lower_95": 2.097237498489852,
            "loss_tokens_upper_95": 2.6745818210877093,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.2920911531448365,
            "data_time": 0.006055183353878203,
            "batch_time": 0.036322816969856384,
            "samples_per_second": 1044195.0654218394,
            "samples_per_second_per_gpu": 130524.38317772992,
            "loss_sequences_lower_95": 7.218004052734375,
            "loss_sequences_upper_95": 7.556781652832031,
            "loss_tokens_lower_95": 7.010103559413335,
            "loss_tokens_upper_95": 7.311334941108819,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.014060652256012,
            "data_time": 0.005855928810815963,
            "batch_time": 0.03535840955991593,
            "samples_per_second": 1065958.001846371,
            "samples_per_second_per_gpu": 133244.75023079637,
            "loss_sequences_lower_95": 7.07323671875,
            "loss_sequences_upper_95": 7.292611181640624,
            "loss_tokens_lower_95": 6.802120074643215,
            "loss_tokens_upper_95": 7.0014910427164185,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.328608967735989,
            "data_time": 0.003682111497706793,
            "batch_time": 0.03344588758156052,
            "samples_per_second": 1066753.2543503675,
            "samples_per_second_per_gpu": 133344.15679379593,
            "loss_sequences_lower_95": 4.3001736893175915,
            "loss_sequences_upper_95": 4.3568543442135645,
            "loss_tokens_lower_95": 4.300406707344221,
            "loss_tokens_upper_95": 4.356882261198877,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.985420164974054,
            "data_time": 0.007950008455720192,
            "batch_time": 0.037783522620301956,
            "samples_per_second": 1042162.4821834919,
            "samples_per_second_per_gpu": 130270.31027293649,
            "loss_sequences_lower_95": 4.880145638695876,
            "loss_sequences_upper_95": 5.0879303587929625,
            "loss_tokens_lower_95": 4.877782143682196,
            "loss_tokens_upper_95": 5.087848819199428,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 8.013762755870818,
            "data_time": 0.005630725432956029,
            "batch_time": 0.03463191172433278,
            "samples_per_second": 1084203.4760018406,
            "samples_per_second_per_gpu": 135525.43450023007,
            "loss_sequences_lower_95": 7.978907263183594,
            "loss_sequences_upper_95": 8.049423950195312,
            "loss_tokens_lower_95": 7.979024365234375,
            "loss_tokens_upper_95": 8.048579357910157,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.0539970549977715,
            "data_time": 0.0019410020687629685,
            "batch_time": 0.031164381634760895,
            "samples_per_second": 1088370.6164406743,
            "samples_per_second_per_gpu": 136046.3270550843,
            "loss_sequences_lower_95": 4.6478669454381505,
            "loss_sequences_upper_95": 4.754073697670294,
            "loss_tokens_lower_95": 3.3157214440344664,
            "loss_tokens_upper_95": 3.3852854696764494,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.580619896525767,
            "data_time": 0.018651981013161796,
            "batch_time": 0.04814261879239764,
            "samples_per_second": 1000250.7613310771,
            "samples_per_second_per_gpu": 125031.34516638464,
            "loss_sequences_lower_95": 5.403927828660652,
            "loss_sequences_upper_95": 5.755683853377157,
            "loss_tokens_lower_95": 5.404927393215806,
            "loss_tokens_upper_95": 5.75512063325341,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.65097586407381,
            "data_time": 0.010099961422383785,
            "batch_time": 0.040288301184773445,
            "samples_per_second": 1042958.4897120392,
            "samples_per_second_per_gpu": 130369.8112140049,
            "loss_sequences_lower_95": 5.521173658183977,
            "loss_sequences_upper_95": 5.774549883674173,
            "loss_tokens_lower_95": 5.5215936877680765,
            "loss_tokens_upper_95": 5.776505629595588,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.521622615665119,
            "data_time": 0.001956805851376632,
            "batch_time": 0.03173676077262495,
            "samples_per_second": 1071812.9456561902,
            "samples_per_second_per_gpu": 133976.61820702377,
            "loss_sequences_lower_95": 4.9858533123121,
            "loss_sequences_upper_95": 5.090764774560003,
            "loss_tokens_lower_95": 3.8249026767056495,
            "loss_tokens_upper_95": 3.906340435050592,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.8041275947813,
            "data_time": 0.02609655261039734,
            "batch_time": 0.05648469179868698,
            "samples_per_second": 987891.2380109412,
            "samples_per_second_per_gpu": 123486.40475136765,
            "loss_sequences_lower_95": 4.694226542477885,
            "loss_sequences_upper_95": 4.910555610454903,
            "loss_tokens_lower_95": 4.693816508944073,
            "loss_tokens_upper_95": 4.9120353819831974,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.492897249507612,
            "data_time": 0.003217983304071485,
            "batch_time": 0.032775466258709245,
            "samples_per_second": 1074374.3507293623,
            "samples_per_second_per_gpu": 134296.7938411703,
            "loss_sequences_lower_95": 4.468985942875573,
            "loss_sequences_upper_95": 4.516635353951644,
            "loss_tokens_lower_95": 4.469528136647076,
            "loss_tokens_upper_95": 4.516184806240443,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.76501468200128,
            "data_time": 0.02364434762434526,
            "batch_time": 0.05404602397571911,
            "samples_per_second": 936383.1098004547,
            "samples_per_second_per_gpu": 117047.88872505684,
            "loss_sequences_lower_95": 5.587152218124242,
            "loss_sequences_upper_95": 5.943292680758875,
            "loss_tokens_lower_95": 5.5857783641630006,
            "loss_tokens_upper_95": 5.944009977174036,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.368278896808624,
            "data_time": 0.07502010464668274,
            "batch_time": 0.1058306097984314,
            "samples_per_second": 763754.7448606708,
            "samples_per_second_per_gpu": 95469.34310758385,
            "loss_sequences_lower_95": 3.9617058436075845,
            "loss_sequences_upper_95": 4.996642087300619,
            "loss_tokens_lower_95": 3.5263926400078667,
            "loss_tokens_upper_95": 4.740724404652913,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6025883833567303,
            "data_time": 0.07551609724760056,
            "batch_time": 0.1070488691329956,
            "samples_per_second": 748901.8555885605,
            "samples_per_second_per_gpu": 93612.73194857006,
            "loss_sequences_lower_95": 3.3133229128519694,
            "loss_sequences_upper_95": 4.2731622950236,
            "loss_tokens_lower_95": 2.72947207675891,
            "loss_tokens_upper_95": 3.913914352588439,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.350650858703523,
            "data_time": 0.003399590171737626,
            "batch_time": 0.03262332834307764,
            "samples_per_second": 1085401.3017446008,
            "samples_per_second_per_gpu": 135675.1627180751,
            "loss_sequences_lower_95": 4.33904338583855,
            "loss_sequences_upper_95": 4.362141059347386,
            "loss_tokens_lower_95": 4.3390764580840395,
            "loss_tokens_upper_95": 4.362077014681517,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.0626251552634376,
            "data_time": 0.0012126329153189252,
            "batch_time": 0.030152051124713614,
            "samples_per_second": 1099465.720994139,
            "samples_per_second_per_gpu": 137433.2151242674,
            "loss_sequences_lower_95": 1.2640240365402533,
            "loss_sequences_upper_95": 1.2945314937566248,
            "loss_tokens_lower_95": 0.8376992937612724,
            "loss_tokens_upper_95": 0.8528368807356685,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.912084710879589,
            "data_time": 0.0386793278157711,
            "batch_time": 0.06966101750731468,
            "samples_per_second": 941243.685308611,
            "samples_per_second_per_gpu": 117655.46066357638,
            "loss_sequences_lower_95": 4.903951065183625,
            "loss_sequences_upper_95": 5.273469158983606,
            "loss_tokens_lower_95": 4.584917098935305,
            "loss_tokens_upper_95": 4.799533548076333,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.505179894937052,
            "data_time": 0.11015403838384719,
            "batch_time": 0.14439707710629418,
            "samples_per_second": 531916.2194753009,
            "samples_per_second_per_gpu": 66489.52743441261,
            "loss_sequences_lower_95": 7.043108924659522,
            "loss_sequences_upper_95": 8.20006664379223,
            "loss_tokens_lower_95": 6.754172242717979,
            "loss_tokens_upper_95": 7.944007308394821,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.828251870667062,
            "data_time": 0.028724849224090576,
            "batch_time": 0.05832390558151972,
            "samples_per_second": 979835.1795841699,
            "samples_per_second_per_gpu": 122479.39744802123,
            "loss_sequences_lower_95": 4.777420788276486,
            "loss_sequences_upper_95": 5.125817275628811,
            "loss_tokens_lower_95": 4.4747479088914,
            "loss_tokens_upper_95": 4.656932222127312,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.930955027661672,
            "data_time": 0.029474227201370967,
            "batch_time": 0.06010066611426217,
            "samples_per_second": 957910.7335917361,
            "samples_per_second_per_gpu": 119738.84169896701,
            "loss_sequences_lower_95": 4.889524031848442,
            "loss_sequences_upper_95": 5.209073694740853,
            "loss_tokens_lower_95": 4.599114084763839,
            "loss_tokens_upper_95": 4.753836183043568,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.00307592676907,
            "data_time": 0.029119244643620083,
            "batch_time": 0.0599357769602821,
            "samples_per_second": 950214.7189518688,
            "samples_per_second_per_gpu": 118776.8398689836,
            "loss_sequences_lower_95": 4.9231842878388195,
            "loss_sequences_upper_95": 5.311249895793636,
            "loss_tokens_lower_95": 4.6237057879438614,
            "loss_tokens_upper_95": 4.863989903612888,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.976124023518911,
            "data_time": 0.03195404154913766,
            "batch_time": 0.062082912240709574,
            "samples_per_second": 976692.7882941517,
            "samples_per_second_per_gpu": 122086.59853676896,
            "loss_sequences_lower_95": 4.919477248773342,
            "loss_sequences_upper_95": 5.210304213733208,
            "loss_tokens_lower_95": 4.676150835934458,
            "loss_tokens_upper_95": 4.8178283216054565,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.354359064042939,
            "data_time": 0.030752243819060148,
            "batch_time": 0.06143488707365813,
            "samples_per_second": 974450.6069689156,
            "samples_per_second_per_gpu": 121806.32587111444,
            "loss_sequences_lower_95": 5.3202449348402325,
            "loss_sequences_upper_95": 5.597093124863524,
            "loss_tokens_lower_95": 5.094054788041094,
            "loss_tokens_upper_95": 5.210353527621381,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.371333287983406,
            "data_time": 0.029731795901343935,
            "batch_time": 0.060124754905700684,
            "samples_per_second": 965094.6186286729,
            "samples_per_second_per_gpu": 120636.82732858411,
            "loss_sequences_lower_95": 5.3946803395341085,
            "loss_sequences_upper_95": 5.69973058002751,
            "loss_tokens_lower_95": 5.026868171714889,
            "loss_tokens_upper_95": 5.157397269627788,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-0.5/params.txt",
    "uuid": "32f41c14-d809-4a03-849c-13b4eb31371f",
    "creation_date": "2023_12_14-04_59_37"
}