{
    "name": "c4_original-d=96_l=8_h=4-32.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 6764359680,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 32.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "1352871936",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=96_l=8_h=4-32.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 5.351786935329438,
            "data_time": 0.12793052196502686,
            "batch_time": 1.2812219709157944,
            "samples_per_second": 375730.6830333232,
            "samples_per_second_per_gpu": 46966.3353791654,
            "loss_sequences_lower_95": 5.203305473327637,
            "loss_sequences_upper_95": 5.507621243794759,
            "loss_tokens_lower_95": 5.335351244608561,
            "loss_tokens_upper_95": 5.368166325887044,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.4177652325641015,
            "data_time": 0.018522912206719706,
            "batch_time": 0.06370097834670828,
            "samples_per_second": 4672653.490660825,
            "samples_per_second_per_gpu": 584081.6863326031,
            "loss_sequences_lower_95": 4.415408159213638,
            "loss_sequences_upper_95": 4.4201520753185335,
            "loss_tokens_lower_95": 4.406130739583333,
            "loss_tokens_upper_95": 4.429313385416666,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.824019961454431,
            "data_time": 0.0822785273194313,
            "batch_time": 0.1272999495267868,
            "samples_per_second": 4136141.4288190724,
            "samples_per_second_per_gpu": 517017.67860238405,
            "loss_sequences_lower_95": 4.785340513891104,
            "loss_sequences_upper_95": 4.871242339465082,
            "loss_tokens_lower_95": 4.808642208333334,
            "loss_tokens_upper_95": 4.839543885416666,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.564181003373923,
            "data_time": 0.012085026032046267,
            "batch_time": 0.055913411472973074,
            "samples_per_second": 5382824.038848573,
            "samples_per_second_per_gpu": 672853.0048560717,
            "loss_sequences_lower_95": 4.5349427754510305,
            "loss_sequences_upper_95": 4.593615123630799,
            "loss_tokens_lower_95": 4.551830395833333,
            "loss_tokens_upper_95": 4.5762672604166665,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.456387112922435,
            "data_time": 0.08692947030067444,
            "batch_time": 0.1317460536956787,
            "samples_per_second": 3927904.556301407,
            "samples_per_second_per_gpu": 490988.0695376759,
            "loss_sequences_lower_95": 4.4081429227063715,
            "loss_sequences_upper_95": 4.511731827186227,
            "loss_tokens_lower_95": 4.44442721875,
            "loss_tokens_upper_95": 4.468156916666667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.083907529336915,
            "data_time": 0.029812609155972798,
            "batch_time": 0.07262979944547017,
            "samples_per_second": 5021788.010146723,
            "samples_per_second_per_gpu": 627723.5012683404,
            "loss_sequences_lower_95": 5.038674032598998,
            "loss_sequences_upper_95": 5.130916518179597,
            "loss_tokens_lower_95": 5.0704232395833335,
            "loss_tokens_upper_95": 5.0973114375,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.451512906113449,
            "data_time": 0.011843965947628021,
            "batch_time": 0.05422553494572639,
            "samples_per_second": 5203172.435656721,
            "samples_per_second_per_gpu": 650396.5544570901,
            "loss_sequences_lower_95": 5.418166194993622,
            "loss_sequences_upper_95": 5.485364118303572,
            "loss_tokens_lower_95": 5.4345771874999995,
            "loss_tokens_upper_95": 5.468417697916667,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.920692401806098,
            "data_time": 0.011541250504945455,
            "batch_time": 0.05481971565045809,
            "samples_per_second": 5368882.654181064,
            "samples_per_second_per_gpu": 671110.331772633,
            "loss_sequences_lower_95": 4.902673695189791,
            "loss_sequences_upper_95": 4.940395829924738,
            "loss_tokens_lower_95": 4.908194052083334,
            "loss_tokens_upper_95": 4.9332238125,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.821269483101077,
            "data_time": 0.0779968872666359,
            "batch_time": 0.12354231625795364,
            "samples_per_second": 4110504.08367288,
            "samples_per_second_per_gpu": 513813.01045911,
            "loss_sequences_lower_95": 4.753185650585143,
            "loss_sequences_upper_95": 4.902173087267371,
            "loss_tokens_lower_95": 4.80869284375,
            "loss_tokens_upper_95": 4.833602249999999,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.887320998157908,
            "data_time": 0.08369544893503189,
            "batch_time": 0.12870624661445618,
            "samples_per_second": 4276064.541521263,
            "samples_per_second_per_gpu": 534508.0676901579,
            "loss_sequences_lower_95": 5.808774298264575,
            "loss_sequences_upper_95": 5.981736085914341,
            "loss_tokens_lower_95": 5.873446239583333,
            "loss_tokens_upper_95": 5.90088771875,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.9608385083576385,
            "data_time": 0.00937585121598737,
            "batch_time": 0.05294975535622958,
            "samples_per_second": 5352973.69724821,
            "samples_per_second_per_gpu": 669121.7121560263,
            "loss_sequences_lower_95": 4.951238522680554,
            "loss_sequences_upper_95": 4.970752147179491,
            "loss_tokens_lower_95": 4.947989072916667,
            "loss_tokens_upper_95": 4.9737279999999995,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.729094983537412,
            "data_time": 0.020979869365692138,
            "batch_time": 0.06417526751756668,
            "samples_per_second": 4928435.840668626,
            "samples_per_second_per_gpu": 616054.4800835783,
            "loss_sequences_lower_95": 4.7104146465665355,
            "loss_sequences_upper_95": 4.7481658236477,
            "loss_tokens_lower_95": 4.716293416666667,
            "loss_tokens_upper_95": 4.741790895833333,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.1767856721703955,
            "data_time": 0.08011870086193085,
            "batch_time": 0.12487740069627762,
            "samples_per_second": 4032004.8323971806,
            "samples_per_second_per_gpu": 504000.6040496476,
            "loss_sequences_lower_95": 5.105283997943871,
            "loss_sequences_upper_95": 5.261058666546495,
            "loss_tokens_lower_95": 5.163351979166667,
            "loss_tokens_upper_95": 5.19054265625,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.646609373345152,
            "data_time": 0.08411628007888794,
            "batch_time": 0.12905946373939514,
            "samples_per_second": 4062666.9645542265,
            "samples_per_second_per_gpu": 507833.3705692783,
            "loss_sequences_lower_95": 4.575132549464581,
            "loss_sequences_upper_95": 4.726387288079971,
            "loss_tokens_lower_95": 4.63365415625,
            "loss_tokens_upper_95": 4.659924177083333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.141356533223933,
            "data_time": 0.11593224108219147,
            "batch_time": 0.1391405463218689,
            "samples_per_second": 1063391.1375134836,
            "samples_per_second_per_gpu": 132923.89218918546,
            "loss_sequences_lower_95": 6.071816080266779,
            "loss_sequences_upper_95": 6.210992535677823,
            "loss_tokens_lower_95": 6.109897145357999,
            "loss_tokens_upper_95": 6.172295466336337,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.4245808673669575,
            "data_time": 0.08096612989902496,
            "batch_time": 0.11627895385026932,
            "samples_per_second": 3282522.1599138286,
            "samples_per_second_per_gpu": 410315.2699892286,
            "loss_sequences_lower_95": 5.3143053149342885,
            "loss_sequences_upper_95": 5.534324516985924,
            "loss_tokens_lower_95": 5.4098747708333335,
            "loss_tokens_upper_95": 5.4394025416666665,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.733242179598846,
            "data_time": 0.07973627746105194,
            "batch_time": 0.11588025093078613,
            "samples_per_second": 3807144.861452408,
            "samples_per_second_per_gpu": 475893.107681551,
            "loss_sequences_lower_95": 6.6607448285991095,
            "loss_sequences_upper_95": 6.819562121871908,
            "loss_tokens_lower_95": 6.7206406875,
            "loss_tokens_upper_95": 6.74592409375,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.532762937858457,
            "data_time": 0.12492434680461884,
            "batch_time": 0.15503869950771332,
            "samples_per_second": 2262435.0895356573,
            "samples_per_second_per_gpu": 282804.38619195716,
            "loss_sequences_lower_95": 5.415562889224193,
            "loss_sequences_upper_95": 5.732335487740938,
            "loss_tokens_lower_95": 5.517599499811891,
            "loss_tokens_upper_95": 5.547667043717182,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.287388857435491,
            "data_time": 0.026462676579302007,
            "batch_time": 0.07086069773543965,
            "samples_per_second": 4503307.431987767,
            "samples_per_second_per_gpu": 562913.4289984709,
            "loss_sequences_lower_95": 5.268311916927788,
            "loss_sequences_upper_95": 5.306676149008332,
            "loss_tokens_lower_95": 5.267500470129789,
            "loss_tokens_upper_95": 5.3064695005385625,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.112525345366971,
            "data_time": 0.02704544849693775,
            "batch_time": 0.07086212225258351,
            "samples_per_second": 4496783.475275912,
            "samples_per_second_per_gpu": 562097.934409489,
            "loss_sequences_lower_95": 4.130682714629804,
            "loss_sequences_upper_95": 4.156512812204366,
            "loss_tokens_lower_95": 4.100761857145802,
            "loss_tokens_upper_95": 4.121746894996617,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.457914599732628,
            "data_time": 0.04908646974298689,
            "batch_time": 0.09094711310333675,
            "samples_per_second": 4395826.821451335,
            "samples_per_second_per_gpu": 549478.3526814169,
            "loss_sequences_lower_95": 6.912847712989342,
            "loss_sequences_upper_95": 7.184458263831483,
            "loss_tokens_lower_95": 6.320639731039284,
            "loss_tokens_upper_95": 6.525753735462086,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.3519649998346965,
            "data_time": 0.04047293836871783,
            "batch_time": 0.08415023485819499,
            "samples_per_second": 4546779.987432393,
            "samples_per_second_per_gpu": 568347.4984290492,
            "loss_sequences_lower_95": 6.736893733723958,
            "loss_sequences_upper_95": 6.927457063802083,
            "loss_tokens_lower_95": 6.254856783117138,
            "loss_tokens_upper_95": 6.389141153203616,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.551302936783878,
            "data_time": 0.058841737608114876,
            "batch_time": 0.09904553244511287,
            "samples_per_second": 4032613.2552797417,
            "samples_per_second_per_gpu": 504076.6569099677,
            "loss_sequences_lower_95": 4.658803685024166,
            "loss_sequences_upper_95": 4.733329341445934,
            "loss_tokens_lower_95": 4.523811311727648,
            "loss_tokens_upper_95": 4.561219418497726,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.166127712076361,
            "data_time": 0.3227042257785797,
            "batch_time": 0.3645321726799011,
            "samples_per_second": 2037090.5186474337,
            "samples_per_second_per_gpu": 254636.31483092922,
            "loss_sequences_lower_95": 5.118823880282315,
            "loss_sequences_upper_95": 5.363361705433238,
            "loss_tokens_lower_95": 5.115005739506701,
            "loss_tokens_upper_95": 5.199280711478384,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.802196703151781,
            "data_time": 0.306198388338089,
            "batch_time": 0.3524400144815445,
            "samples_per_second": 2739898.7716795825,
            "samples_per_second_per_gpu": 342487.3464599478,
            "loss_sequences_lower_95": 4.8409312470105235,
            "loss_sequences_upper_95": 5.07084173708546,
            "loss_tokens_lower_95": 4.753321410058395,
            "loss_tokens_upper_95": 4.865289014895057,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.438020841280619,
            "data_time": 0.16971419006586075,
            "batch_time": 0.2011427953839302,
            "samples_per_second": 2413404.455989155,
            "samples_per_second_per_gpu": 301675.5569986444,
            "loss_sequences_lower_95": 5.417222473144531,
            "loss_sequences_upper_95": 5.516296040852865,
            "loss_tokens_lower_95": 5.325596967586334,
            "loss_tokens_upper_95": 5.549653404811109,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.723850624139287,
            "data_time": 0.023527375236153603,
            "batch_time": 0.06820296458899974,
            "samples_per_second": 4501727.342926765,
            "samples_per_second_per_gpu": 562715.9178658456,
            "loss_sequences_lower_95": 8.83249567104227,
            "loss_sequences_upper_95": 8.908878438561095,
            "loss_tokens_lower_95": 8.661836185735105,
            "loss_tokens_upper_95": 8.743398626009364,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.228993184036678,
            "data_time": 0.04774082750082016,
            "batch_time": 0.08991912007331848,
            "samples_per_second": 4353614.909412712,
            "samples_per_second_per_gpu": 544201.863676589,
            "loss_sequences_lower_95": 6.381327999641598,
            "loss_sequences_upper_95": 6.66926831589002,
            "loss_tokens_lower_95": 5.086652936251135,
            "loss_tokens_upper_95": 5.231659284794364,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.03123016365559,
            "data_time": 0.07535015046596527,
            "batch_time": 0.11714142858982086,
            "samples_per_second": 4344669.952996967,
            "samples_per_second_per_gpu": 543083.7441246209,
            "loss_sequences_lower_95": 5.738086406200005,
            "loss_sequences_upper_95": 6.06702919396931,
            "loss_tokens_lower_95": 4.9307505319256215,
            "loss_tokens_upper_95": 5.104563294737214,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.855405870637937,
            "data_time": 0.35033954679965973,
            "batch_time": 0.3927341401576996,
            "samples_per_second": 1440124.3679543552,
            "samples_per_second_per_gpu": 180015.5459942944,
            "loss_sequences_lower_95": 5.756654573902147,
            "loss_sequences_upper_95": 5.954281198161922,
            "loss_tokens_lower_95": 5.7553075642346245,
            "loss_tokens_upper_95": 5.958186653868792,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.652305207252502,
            "data_time": 0.2803222984075546,
            "batch_time": 0.3062235116958618,
            "samples_per_second": 1455094.005570629,
            "samples_per_second_per_gpu": 181886.7506963286,
            "loss_sequences_lower_95": 4.590050003051759,
            "loss_sequences_upper_95": 5.025936264038086,
            "loss_tokens_lower_95": 4.393003034506372,
            "loss_tokens_upper_95": 4.885822275670142,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.915253819560322,
            "data_time": 0.05090057849884033,
            "batch_time": 0.09418265335261822,
            "samples_per_second": 4517745.919447006,
            "samples_per_second_per_gpu": 564718.2399308757,
            "loss_sequences_lower_95": 4.866746822423554,
            "loss_sequences_upper_95": 4.9643380765417096,
            "loss_tokens_lower_95": 4.8660344915477864,
            "loss_tokens_upper_95": 4.9638654507780515,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.711968319617169,
            "data_time": 0.07544924914836884,
            "batch_time": 0.11873867809772491,
            "samples_per_second": 4447569.871571427,
            "samples_per_second_per_gpu": 555946.2339464284,
            "loss_sequences_lower_95": 5.660586705313268,
            "loss_sequences_upper_95": 5.76188184033643,
            "loss_tokens_lower_95": 5.660703186984925,
            "loss_tokens_upper_95": 5.763600172438063,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.474680111120263,
            "data_time": 0.05167175643146038,
            "batch_time": 0.09286536462605,
            "samples_per_second": 4174755.5169847496,
            "samples_per_second_per_gpu": 521844.4396230937,
            "loss_sequences_lower_95": 4.70619442413629,
            "loss_sequences_upper_95": 4.821935304593988,
            "loss_tokens_lower_95": 4.43578245857218,
            "loss_tokens_upper_95": 4.495008010213022,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.212608186721802,
            "data_time": 0.17535632103681564,
            "batch_time": 0.22056350111961365,
            "samples_per_second": 3581696.0812642886,
            "samples_per_second_per_gpu": 447712.0101580361,
            "loss_sequences_lower_95": 6.790781896972656,
            "loss_sequences_upper_95": 7.311509753417969,
            "loss_tokens_lower_95": 5.977564058234406,
            "loss_tokens_upper_95": 6.329107492321931,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.829734846949577,
            "data_time": 0.12094169855117798,
            "batch_time": 0.13968051970005035,
            "samples_per_second": 912135.2628991223,
            "samples_per_second_per_gpu": 114016.90786239029,
            "loss_sequences_lower_95": 4.52557624578476,
            "loss_sequences_upper_95": 5.270858299732208,
            "loss_tokens_lower_95": 4.28599580786694,
            "loss_tokens_upper_95": 5.185615609706134,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.160243672886114,
            "data_time": 0.30422142148017883,
            "batch_time": 0.3394334316253662,
            "samples_per_second": 2576819.038006589,
            "samples_per_second_per_gpu": 322102.3797508236,
            "loss_sequences_lower_95": 6.6192347910212375,
            "loss_sequences_upper_95": 7.3919718599867545,
            "loss_tokens_lower_95": 4.833908985916361,
            "loss_tokens_upper_95": 5.29646278217922,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.88700861866269,
            "data_time": 0.049763171209229365,
            "batch_time": 0.09471655057536231,
            "samples_per_second": 4512064.292723215,
            "samples_per_second_per_gpu": 564008.0365904019,
            "loss_sequences_lower_95": 4.865279203442415,
            "loss_sequences_upper_95": 4.908727968681739,
            "loss_tokens_lower_95": 4.865840620221713,
            "loss_tokens_upper_95": 4.90833551982648,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.621392044460868,
            "data_time": 0.03010545458112444,
            "batch_time": 0.07337019698960441,
            "samples_per_second": 4417837.0310736345,
            "samples_per_second_per_gpu": 552229.6288842043,
            "loss_sequences_lower_95": 5.739042563191345,
            "loss_sequences_upper_95": 5.968437757738211,
            "loss_tokens_lower_95": 5.4790893510185015,
            "loss_tokens_upper_95": 5.704281192324782,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.104624501951448,
            "data_time": 0.16865261644124985,
            "batch_time": 0.1984042227268219,
            "samples_per_second": 1848972.6771527808,
            "samples_per_second_per_gpu": 231121.5846440976,
            "loss_sequences_lower_95": 4.01472426194411,
            "loss_sequences_upper_95": 4.396302678034856,
            "loss_tokens_lower_95": 3.907602959137951,
            "loss_tokens_upper_95": 4.2364605503982125,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.451062147318141,
            "data_time": 0.07442604005336761,
            "batch_time": 0.11975980997085571,
            "samples_per_second": 4400122.966847801,
            "samples_per_second_per_gpu": 550015.3708559751,
            "loss_sequences_lower_95": 4.530604347283198,
            "loss_sequences_upper_95": 4.672437467627763,
            "loss_tokens_lower_95": 4.3655107804357876,
            "loss_tokens_upper_95": 4.5179796581489535,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.25818339789786,
            "data_time": 0.29231560230255127,
            "batch_time": 0.32645896077156067,
            "samples_per_second": 2400119.0380126806,
            "samples_per_second_per_gpu": 300014.8797515851,
            "loss_sequences_lower_95": 4.115804607112233,
            "loss_sequences_upper_95": 4.6141663621111615,
            "loss_tokens_lower_95": 4.070022070232845,
            "loss_tokens_upper_95": 4.4664855658327385,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.700992173398822,
            "data_time": 0.028637957423458277,
            "batch_time": 0.07256714116816984,
            "samples_per_second": 4397998.9999532085,
            "samples_per_second_per_gpu": 549749.8749941511,
            "loss_sequences_lower_95": 4.692183602736172,
            "loss_sequences_upper_95": 4.709775333973044,
            "loss_tokens_lower_95": 4.692159506119975,
            "loss_tokens_upper_95": 4.709740834885727,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.4583620446399577,
            "data_time": 0.2832271009683609,
            "batch_time": 0.31027789413928986,
            "samples_per_second": 1677508.6873875714,
            "samples_per_second_per_gpu": 209688.58592344643,
            "loss_sequences_lower_95": 2.363383991278491,
            "loss_sequences_upper_95": 2.6683002583031517,
            "loss_tokens_lower_95": 2.2557942979045387,
            "loss_tokens_upper_95": 2.5981197985802362,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.257565630088313,
            "data_time": 0.022542394399642944,
            "batch_time": 0.06658732374509176,
            "samples_per_second": 4530404.746434862,
            "samples_per_second_per_gpu": 566300.5933043577,
            "loss_sequences_lower_95": 6.23460754266575,
            "loss_sequences_upper_95": 6.288369765051756,
            "loss_tokens_lower_95": 5.148775628626693,
            "loss_tokens_upper_95": 5.203620599613153,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.153313633441925,
            "data_time": 0.09342505037784576,
            "batch_time": 0.1375957727432251,
            "samples_per_second": 4160667.7469581505,
            "samples_per_second_per_gpu": 520083.4683697688,
            "loss_sequences_lower_95": 7.881629577636718,
            "loss_sequences_upper_95": 8.424368359375,
            "loss_tokens_lower_95": 7.867004970423675,
            "loss_tokens_upper_95": 8.406239841635248,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.248992725040601,
            "data_time": 0.3060591369867325,
            "batch_time": 0.34867730736732483,
            "samples_per_second": 2530215.3086357806,
            "samples_per_second_per_gpu": 316276.9135794726,
            "loss_sequences_lower_95": 5.096485967221468,
            "loss_sequences_upper_95": 5.40100693412449,
            "loss_tokens_lower_95": 5.097386633831522,
            "loss_tokens_upper_95": 5.399275990361753,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.508563067335071,
            "data_time": 0.06245120118061701,
            "batch_time": 0.10283045967419942,
            "samples_per_second": 4014482.381338982,
            "samples_per_second_per_gpu": 501810.29766737274,
            "loss_sequences_lower_95": 7.4019022808652934,
            "loss_sequences_upper_95": 7.61403897372159,
            "loss_tokens_lower_95": 7.404042654326467,
            "loss_tokens_upper_95": 7.615770892518939,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 1.5055597026348113,
            "data_time": 0.08039831121762593,
            "batch_time": 0.1241065189242363,
            "samples_per_second": 4090593.7103787474,
            "samples_per_second_per_gpu": 511324.21379734343,
            "loss_sequences_lower_95": 1.6321141357421876,
            "loss_sequences_upper_95": 1.7165381022135415,
            "loss_tokens_lower_95": 1.4646130092662066,
            "loss_tokens_upper_95": 1.529995982768107,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.96208146186102,
            "data_time": 0.3183056712150574,
            "batch_time": 0.35946857929229736,
            "samples_per_second": 1931168.5048372531,
            "samples_per_second_per_gpu": 241396.06310465664,
            "loss_sequences_lower_95": 5.680419456845238,
            "loss_sequences_upper_95": 6.2522015962146575,
            "loss_tokens_lower_95": 5.6803887067522325,
            "loss_tokens_upper_95": 6.242714524042039,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.496961012482643,
            "data_time": 0.1282053142786026,
            "batch_time": 0.14661866426467896,
            "samples_per_second": 842358.1720225766,
            "samples_per_second_per_gpu": 105294.77150282207,
            "loss_sequences_lower_95": 3.140906661748886,
            "loss_sequences_upper_95": 4.432961821556091,
            "loss_tokens_lower_95": 2.899113643685567,
            "loss_tokens_upper_95": 3.4909376573071027,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.838316488742828,
            "data_time": 0.09813280403614044,
            "batch_time": 0.14268142357468605,
            "samples_per_second": 4235012.117369782,
            "samples_per_second_per_gpu": 529376.5146712228,
            "loss_sequences_lower_95": 7.908462719726562,
            "loss_sequences_upper_95": 8.272016040039063,
            "loss_tokens_lower_95": 7.669698967586718,
            "loss_tokens_upper_95": 7.988988204413864,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.401373952388764,
            "data_time": 0.0862312912940979,
            "batch_time": 0.13087574392557144,
            "samples_per_second": 4252378.3531429665,
            "samples_per_second_per_gpu": 531547.2941428708,
            "loss_sequences_lower_95": 7.704181311035156,
            "loss_sequences_upper_95": 7.959148120117187,
            "loss_tokens_lower_95": 7.273962506259389,
            "loss_tokens_upper_95": 7.4974171462271535,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.762630440267268,
            "data_time": 0.036426848421494164,
            "batch_time": 0.0799258624513944,
            "samples_per_second": 4580045.824923537,
            "samples_per_second_per_gpu": 572505.7281154421,
            "loss_sequences_lower_95": 4.740969808733867,
            "loss_sequences_upper_95": 4.784095846515672,
            "loss_tokens_lower_95": 4.741562056405568,
            "loss_tokens_upper_95": 4.784234187413573,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.235586948658464,
            "data_time": 0.11903452376524608,
            "batch_time": 0.15925395488739014,
            "samples_per_second": 3991362.2994069126,
            "samples_per_second_per_gpu": 498920.2874258641,
            "loss_sequences_lower_95": 5.129754085511473,
            "loss_sequences_upper_95": 5.338092781688028,
            "loss_tokens_lower_95": 5.13301988827285,
            "loss_tokens_upper_95": 5.336969622305828,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.092699654579162,
            "data_time": 0.08521256223320961,
            "batch_time": 0.12952648848295212,
            "samples_per_second": 4392643.773921107,
            "samples_per_second_per_gpu": 549080.4717401384,
            "loss_sequences_lower_95": 9.049043896484374,
            "loss_sequences_upper_95": 9.1370341796875,
            "loss_tokens_lower_95": 9.047667041015625,
            "loss_tokens_upper_95": 9.1365318359375,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.4577878720600195,
            "data_time": 0.026422227422396343,
            "batch_time": 0.07078696219694047,
            "samples_per_second": 4483231.184897347,
            "samples_per_second_per_gpu": 560403.8981121684,
            "loss_sequences_lower_95": 5.5124256999467836,
            "loss_sequences_upper_95": 5.618528384061613,
            "loss_tokens_lower_95": 4.3438573498085775,
            "loss_tokens_upper_95": 4.4152424980937806,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.5099381386344115,
            "data_time": 0.1928877830505371,
            "batch_time": 0.224695691040584,
            "samples_per_second": 1625337.5205837723,
            "samples_per_second_per_gpu": 203167.19007297154,
            "loss_sequences_lower_95": 5.34965042569744,
            "loss_sequences_upper_95": 5.6677287998484145,
            "loss_tokens_lower_95": 5.345362341581885,
            "loss_tokens_upper_95": 5.669562644389138,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.444211731705011,
            "data_time": 0.1528085172176361,
            "batch_time": 0.19824955612421036,
            "samples_per_second": 3942760.1917602536,
            "samples_per_second_per_gpu": 492845.0239700317,
            "loss_sequences_lower_95": 5.32770528157552,
            "loss_sequences_upper_95": 5.560475571576287,
            "loss_tokens_lower_95": 5.329041580499387,
            "loss_tokens_upper_95": 5.559136149088541,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.488725459502482,
            "data_time": 0.03904296411201358,
            "batch_time": 0.08259924128651619,
            "samples_per_second": 4256857.574767423,
            "samples_per_second_per_gpu": 532107.1968459279,
            "loss_sequences_lower_95": 6.3923115795870915,
            "loss_sequences_upper_95": 6.498303866419266,
            "loss_tokens_lower_95": 5.369920700704807,
            "loss_tokens_upper_95": 5.455849107077391,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.832354018297145,
            "data_time": 0.2936439961194992,
            "batch_time": 0.3321036249399185,
            "samples_per_second": 2233476.031100311,
            "samples_per_second_per_gpu": 279184.50388753886,
            "loss_sequences_lower_95": 4.764207942902096,
            "loss_sequences_upper_95": 4.903285354533523,
            "loss_tokens_lower_95": 4.762446634605448,
            "loss_tokens_upper_95": 4.9019419998088205,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.1943722560135965,
            "data_time": 0.043879497509736284,
            "batch_time": 0.08803400855797988,
            "samples_per_second": 4447077.875713961,
            "samples_per_second_per_gpu": 555884.7344642451,
            "loss_sequences_lower_95": 7.1680667347572635,
            "loss_sequences_upper_95": 7.220207721115252,
            "loss_tokens_lower_95": 7.168050324326261,
            "loss_tokens_upper_95": 7.220885267345183,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.123594189153134,
            "data_time": 0.2951606512069702,
            "batch_time": 0.33490951359272003,
            "samples_per_second": 2126425.613666301,
            "samples_per_second_per_gpu": 265803.20170828764,
            "loss_sequences_lower_95": 5.8785802748596785,
            "loss_sequences_upper_95": 6.364418977904088,
            "loss_tokens_lower_95": 5.881195957220874,
            "loss_tokens_upper_95": 6.374069421268204,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.823288802305857,
            "data_time": 0.26401691138744354,
            "batch_time": 0.2834853380918503,
            "samples_per_second": 1191313.0117675606,
            "samples_per_second_per_gpu": 148914.12647094508,
            "loss_sequences_lower_95": 4.578341941833496,
            "loss_sequences_upper_95": 5.437316767374675,
            "loss_tokens_lower_95": 4.117970784505209,
            "loss_tokens_upper_95": 5.3729398091634115,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 3.8321712454160055,
            "data_time": 0.28536413609981537,
            "batch_time": 0.3047465980052948,
            "samples_per_second": 1363036.1936419662,
            "samples_per_second_per_gpu": 170379.52420524578,
            "loss_sequences_lower_95": 3.7264374351501464,
            "loss_sequences_upper_95": 4.760139096577962,
            "loss_tokens_lower_95": 3.1288268528627547,
            "loss_tokens_upper_95": 4.413968006948407,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.974640051008851,
            "data_time": 0.04270450451544353,
            "batch_time": 0.08535002171993256,
            "samples_per_second": 4286410.222251044,
            "samples_per_second_per_gpu": 535801.2777813805,
            "loss_sequences_lower_95": 8.942384135677466,
            "loss_sequences_upper_95": 9.006620130131628,
            "loss_tokens_lower_95": 8.942396705863402,
            "loss_tokens_upper_95": 9.006996559738585,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 2.261351460937044,
            "data_time": 0.02375792227197422,
            "batch_time": 0.06817723472305781,
            "samples_per_second": 4475939.042564106,
            "samples_per_second_per_gpu": 559492.3803205133,
            "loss_sequences_lower_95": 2.977165139837499,
            "loss_sequences_upper_95": 3.0163618462246973,
            "loss_tokens_lower_95": 2.1986260145170986,
            "loss_tokens_upper_95": 2.222940193222407,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.851349770553469,
            "data_time": 0.2853408604860306,
            "batch_time": 0.39838874340057373,
            "samples_per_second": 2343859.8463780778,
            "samples_per_second_per_gpu": 292982.4807972597,
            "loss_sequences_lower_95": 6.002821458230807,
            "loss_sequences_upper_95": 6.449057102954294,
            "loss_tokens_lower_95": 5.682690615596599,
            "loss_tokens_upper_95": 5.930175325472857,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.162897109985352,
            "data_time": 0.19391538202762604,
            "batch_time": 0.2130812555551529,
            "samples_per_second": 1149618.6071579487,
            "samples_per_second_per_gpu": 143702.32589474358,
            "loss_sequences_lower_95": 7.698036709347287,
            "loss_sequences_upper_95": 8.882582752124682,
            "loss_tokens_lower_95": 7.429415253062307,
            "loss_tokens_upper_95": 8.592934427143614,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.884151019701144,
            "data_time": 0.28329986333847046,
            "batch_time": 0.3175386041402817,
            "samples_per_second": 2208662.208444175,
            "samples_per_second_per_gpu": 276082.77605552186,
            "loss_sequences_lower_95": 5.951481098082008,
            "loss_sequences_upper_95": 6.311100434093941,
            "loss_tokens_lower_95": 5.713715996898654,
            "loss_tokens_upper_95": 5.93225420849979,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.964750537058202,
            "data_time": 0.3180414289236069,
            "batch_time": 0.352032408118248,
            "samples_per_second": 1560807.099037091,
            "samples_per_second_per_gpu": 195100.88737963638,
            "loss_sequences_lower_95": 6.048082891324672,
            "loss_sequences_upper_95": 6.388522562166539,
            "loss_tokens_lower_95": 5.817487042537675,
            "loss_tokens_upper_95": 5.998888583231939,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.987880456738356,
            "data_time": 0.30968981981277466,
            "batch_time": 0.3445441275835037,
            "samples_per_second": 1933398.5789587556,
            "samples_per_second_per_gpu": 241674.82236984445,
            "loss_sequences_lower_95": 6.04085296072611,
            "loss_sequences_upper_95": 6.464184402837986,
            "loss_tokens_lower_95": 5.793664796466181,
            "loss_tokens_upper_95": 6.07432537725974,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.964818576487099,
            "data_time": 0.2843712717294693,
            "batch_time": 0.3184913694858551,
            "samples_per_second": 2236444.5369301517,
            "samples_per_second_per_gpu": 279555.56711626897,
            "loss_sequences_lower_95": 5.994309178794302,
            "loss_sequences_upper_95": 6.312749834758479,
            "loss_tokens_lower_95": 5.82556802624854,
            "loss_tokens_upper_95": 5.996314712476879,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.386865852782445,
            "data_time": 0.32386305928230286,
            "batch_time": 0.36037902534008026,
            "samples_per_second": 2123032.321383048,
            "samples_per_second_per_gpu": 265379.040172881,
            "loss_sequences_lower_95": 6.464481178425854,
            "loss_sequences_upper_95": 6.744994207346661,
            "loss_tokens_lower_95": 6.271785507001316,
            "loss_tokens_upper_95": 6.407191373117139,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.158739936061021,
            "data_time": 0.31176960468292236,
            "batch_time": 0.34708067774772644,
            "samples_per_second": 2081528.9036705114,
            "samples_per_second_per_gpu": 260191.11295881393,
            "loss_sequences_lower_95": 6.393670319347847,
            "loss_sequences_upper_95": 6.693980370498285,
            "loss_tokens_lower_95": 6.022659899600663,
            "loss_tokens_upper_95": 6.175695755161241,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-32.0/params.txt",
    "uuid": "33d0a2c8-7db2-41d9-ad67-3a5369e24126",
    "creation_date": "2023_12_14-04_59_20"
}