{
    "name": "c4_original-d=96_l=8_h=4-1.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 211386240,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 1.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "42277248",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=96_l=8_h=4-1.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 6.570902502536773,
            "data_time": 0.14334045350551605,
            "batch_time": 1.3271092623472214,
            "samples_per_second": 361167.71438471926,
            "samples_per_second_per_gpu": 45145.96429808991,
            "loss_sequences_lower_95": 6.373697280883789,
            "loss_sequences_upper_95": 6.770363527933756,
            "loss_tokens_lower_95": 6.555113080342611,
            "loss_tokens_upper_95": 6.586601816813151,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.220676313056785,
            "data_time": 0.01954542409434626,
            "batch_time": 0.06456476504875767,
            "samples_per_second": 4652740.70667297,
            "samples_per_second_per_gpu": 581592.5883341213,
            "loss_sequences_lower_95": 5.218366098424688,
            "loss_sequences_upper_95": 5.22294346845257,
            "loss_tokens_lower_95": 5.2091661875,
            "loss_tokens_upper_95": 5.23215378125,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.6072997579769215,
            "data_time": 0.0949527844786644,
            "batch_time": 0.13904529809951782,
            "samples_per_second": 4180913.9924895233,
            "samples_per_second_per_gpu": 522614.2490611904,
            "loss_sequences_lower_95": 6.559032854352679,
            "loss_sequences_upper_95": 6.668899087711257,
            "loss_tokens_lower_95": 6.593374958333333,
            "loss_tokens_upper_95": 6.621491333333333,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.450534525114237,
            "data_time": 0.015042721440917567,
            "batch_time": 0.05863170325756073,
            "samples_per_second": 5313727.410191156,
            "samples_per_second_per_gpu": 664215.9262738945,
            "loss_sequences_lower_95": 5.4123257893041234,
            "loss_sequences_upper_95": 5.489844454735825,
            "loss_tokens_lower_95": 5.437595375,
            "loss_tokens_upper_95": 5.463435760416667,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.25401688884572,
            "data_time": 0.09498504549264908,
            "batch_time": 0.14071307331323624,
            "samples_per_second": 4066685.177527363,
            "samples_per_second_per_gpu": 508335.64719092037,
            "loss_sequences_lower_95": 5.195674807670889,
            "loss_sequences_upper_95": 5.325135433406791,
            "loss_tokens_lower_95": 5.24210415625,
            "loss_tokens_upper_95": 5.266121354166666,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.225994518547136,
            "data_time": 0.03493134925762812,
            "batch_time": 0.07790984213352203,
            "samples_per_second": 4936230.052612,
            "samples_per_second_per_gpu": 617028.7565765,
            "loss_sequences_lower_95": 6.159458996988656,
            "loss_sequences_upper_95": 6.295477058415831,
            "loss_tokens_lower_95": 6.212488635416667,
            "loss_tokens_upper_95": 6.239556333333333,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.26658504563935,
            "data_time": 0.01335657611489296,
            "batch_time": 0.05585228651762009,
            "samples_per_second": 5198349.056242385,
            "samples_per_second_per_gpu": 649793.6320302981,
            "loss_sequences_lower_95": 8.234755779655611,
            "loss_sequences_upper_95": 8.298034658003827,
            "loss_tokens_lower_95": 8.251550541666667,
            "loss_tokens_upper_95": 8.282081979166666,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.498430369312226,
            "data_time": 0.013298893445416501,
            "batch_time": 0.05674328929499576,
            "samples_per_second": 5337421.714166352,
            "samples_per_second_per_gpu": 667177.714270794,
            "loss_sequences_lower_95": 5.473977237401832,
            "loss_sequences_upper_95": 5.5246591132198954,
            "loss_tokens_lower_95": 5.485677552083334,
            "loss_tokens_upper_95": 5.511190854166666,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.627867550384708,
            "data_time": 0.10196593403816223,
            "batch_time": 0.14681607484817505,
            "samples_per_second": 3958506.86580416,
            "samples_per_second_per_gpu": 494813.35822552,
            "loss_sequences_lower_95": 5.541738817168445,
            "loss_sequences_upper_95": 5.729558557029661,
            "loss_tokens_lower_95": 5.615322458333334,
            "loss_tokens_upper_95": 5.64058421875,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.456159978044834,
            "data_time": 0.10438289493322372,
            "batch_time": 0.14994069933891296,
            "samples_per_second": 4120519.5495429267,
            "samples_per_second_per_gpu": 515064.94369286584,
            "loss_sequences_lower_95": 6.347782626547833,
            "loss_sequences_upper_95": 6.586068327530571,
            "loss_tokens_lower_95": 6.442173322916667,
            "loss_tokens_upper_95": 6.46976184375,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.074690526358736,
            "data_time": 0.009903551175676543,
            "batch_time": 0.05307896281110829,
            "samples_per_second": 5414766.36888833,
            "samples_per_second_per_gpu": 676845.7961110412,
            "loss_sequences_lower_95": 6.061076188737666,
            "loss_sequences_upper_95": 6.0886313510689325,
            "loss_tokens_lower_95": 6.061856614583333,
            "loss_tokens_upper_95": 6.087873833333333,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.7449150587905695,
            "data_time": 0.022231820225715637,
            "batch_time": 0.0670392394065857,
            "samples_per_second": 5051903.957354848,
            "samples_per_second_per_gpu": 631487.994669356,
            "loss_sequences_lower_95": 5.718206515107638,
            "loss_sequences_upper_95": 5.772166301150562,
            "loss_tokens_lower_95": 5.73197365625,
            "loss_tokens_upper_95": 5.757398239583333,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.788394740338742,
            "data_time": 0.09467014670372009,
            "batch_time": 0.1387776955962181,
            "samples_per_second": 4155731.6792112626,
            "samples_per_second_per_gpu": 519466.4599014078,
            "loss_sequences_lower_95": 5.692090017074988,
            "loss_sequences_upper_95": 5.900942726986404,
            "loss_tokens_lower_95": 5.774741072916667,
            "loss_tokens_upper_95": 5.801955125,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.043584723579422,
            "data_time": 0.09428706765174866,
            "batch_time": 0.13843415677547455,
            "samples_per_second": 4207138.119591324,
            "samples_per_second_per_gpu": 525892.2649489155,
            "loss_sequences_lower_95": 5.957571815133338,
            "loss_sequences_upper_95": 6.142611949167038,
            "loss_tokens_lower_95": 6.03116690625,
            "loss_tokens_upper_95": 6.056660489583333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.419178485870361,
            "data_time": 0.15380842983722687,
            "batch_time": 0.17496663331985474,
            "samples_per_second": 1155354.9316740022,
            "samples_per_second_per_gpu": 144419.36645925028,
            "loss_sequences_lower_95": 7.346101674166593,
            "loss_sequences_upper_95": 7.511305080760609,
            "loss_tokens_lower_95": 7.391172773187811,
            "loss_tokens_upper_95": 7.447875387018377,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.932516139728335,
            "data_time": 0.09309398382902145,
            "batch_time": 0.1286555677652359,
            "samples_per_second": 3239186.790769178,
            "samples_per_second_per_gpu": 404898.34884614724,
            "loss_sequences_lower_95": 6.762180695380831,
            "loss_sequences_upper_95": 7.105136686719889,
            "loss_tokens_lower_95": 6.91723425,
            "loss_tokens_upper_95": 6.947600958333334,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.103448401025858,
            "data_time": 0.09234149008989334,
            "batch_time": 0.12878502905368805,
            "samples_per_second": 3730325.5062012845,
            "samples_per_second_per_gpu": 466290.68827516056,
            "loss_sequences_lower_95": 7.012193052749835,
            "loss_sequences_upper_95": 7.217240128731035,
            "loss_tokens_lower_95": 7.0912504375,
            "loss_tokens_upper_95": 7.1158650833333335,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.787644472278532,
            "data_time": 0.15592698752880096,
            "batch_time": 0.18523390591144562,
            "samples_per_second": 2231924.12035213,
            "samples_per_second_per_gpu": 278990.51504401624,
            "loss_sequences_lower_95": 6.6518748799308405,
            "loss_sequences_upper_95": 7.0254722970430965,
            "loss_tokens_lower_95": 6.773073890560963,
            "loss_tokens_upper_95": 6.802208359515081,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.371811585554692,
            "data_time": 0.02802430499683727,
            "batch_time": 0.07250162254680287,
            "samples_per_second": 4489316.763969551,
            "samples_per_second_per_gpu": 561164.5954961939,
            "loss_sequences_lower_95": 5.353533484368324,
            "loss_sequences_upper_95": 5.38967752156477,
            "loss_tokens_lower_95": 5.353352637400299,
            "loss_tokens_upper_95": 5.389808420719983,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.847207935700211,
            "data_time": 0.028138500452041627,
            "batch_time": 0.0722134079784155,
            "samples_per_second": 4486210.057420184,
            "samples_per_second_per_gpu": 560776.257177523,
            "loss_sequences_lower_95": 4.86537504395601,
            "loss_sequences_upper_95": 4.891372884276663,
            "loss_tokens_lower_95": 4.8348642140631926,
            "loss_tokens_upper_95": 4.856606858571218,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.4245568121386185,
            "data_time": 0.04810392525460985,
            "batch_time": 0.09002518985006544,
            "samples_per_second": 4400574.677336814,
            "samples_per_second_per_gpu": 550071.8346671017,
            "loss_sequences_lower_95": 7.903057789250709,
            "loss_sequences_upper_95": 8.170934603064477,
            "loss_tokens_lower_95": 7.293583004787646,
            "loss_tokens_upper_95": 7.48992867859483,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.051316994825999,
            "data_time": 0.0404164952536424,
            "batch_time": 0.08420909071962039,
            "samples_per_second": 4589330.387751488,
            "samples_per_second_per_gpu": 573666.298468936,
            "loss_sequences_lower_95": 7.438157438151042,
            "loss_sequences_upper_95": 7.613811197916666,
            "loss_tokens_lower_95": 6.954556308962264,
            "loss_tokens_upper_95": 7.077548570165094,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.113396712556945,
            "data_time": 0.06591776758432388,
            "batch_time": 0.10598357766866684,
            "samples_per_second": 3994865.230996328,
            "samples_per_second_per_gpu": 499358.153874541,
            "loss_sequences_lower_95": 6.165792307589793,
            "loss_sequences_upper_95": 6.235700300446598,
            "loss_tokens_lower_95": 6.091844277336818,
            "loss_tokens_upper_95": 6.128531089681541,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.433322501182556,
            "data_time": 0.32673271000385284,
            "batch_time": 0.36799220740795135,
            "samples_per_second": 2564338.7015608796,
            "samples_per_second_per_gpu": 320542.33769510995,
            "loss_sequences_lower_95": 7.277062086625533,
            "loss_sequences_upper_95": 7.624883020574396,
            "loss_tokens_lower_95": 7.385280850045349,
            "loss_tokens_upper_95": 7.47627572936612,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.9292602286046865,
            "data_time": 0.3827188163995743,
            "batch_time": 0.4278854727745056,
            "samples_per_second": 2245271.029883866,
            "samples_per_second_per_gpu": 280658.8787354833,
            "loss_sequences_lower_95": 5.878569996113679,
            "loss_sequences_upper_95": 6.081272097217794,
            "loss_tokens_lower_95": 5.893040117742891,
            "loss_tokens_upper_95": 5.99535359893365,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.766766538619995,
            "data_time": 0.1856141835451126,
            "batch_time": 0.21653318405151367,
            "samples_per_second": 2548536.880356766,
            "samples_per_second_per_gpu": 318567.1100445957,
            "loss_sequences_lower_95": 5.680044118245442,
            "loss_sequences_upper_95": 5.790167510986328,
            "loss_tokens_lower_95": 5.683198809535306,
            "loss_tokens_upper_95": 5.860098172295079,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.13919925365877,
            "data_time": 0.02478579245507717,
            "batch_time": 0.06891806442290545,
            "samples_per_second": 4516335.991369499,
            "samples_per_second_per_gpu": 564541.9989211874,
            "loss_sequences_lower_95": 9.213587006667979,
            "loss_sequences_upper_95": 9.28655143539934,
            "loss_tokens_lower_95": 9.084528625800326,
            "loss_tokens_upper_95": 9.162093081071719,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.1185550352539675,
            "data_time": 0.04561722427606583,
            "batch_time": 0.08805650174617767,
            "samples_per_second": 4380903.736995531,
            "samples_per_second_per_gpu": 547612.9671244414,
            "loss_sequences_lower_95": 7.288897643426452,
            "loss_sequences_upper_95": 7.572075820049453,
            "loss_tokens_lower_95": 5.973261301870459,
            "loss_tokens_upper_95": 6.121773130605699,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.86096028419078,
            "data_time": 0.0795128345489502,
            "batch_time": 0.12192018330097198,
            "samples_per_second": 4339535.87339277,
            "samples_per_second_per_gpu": 542441.9841740963,
            "loss_sequences_lower_95": 6.532731487807967,
            "loss_sequences_upper_95": 6.847836334957605,
            "loss_tokens_lower_95": 5.761738397845071,
            "loss_tokens_upper_95": 5.933216738262343,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.532914187810192,
            "data_time": 0.3569287955760956,
            "batch_time": 0.3981366455554962,
            "samples_per_second": 2455640.609071293,
            "samples_per_second_per_gpu": 306955.0761339116,
            "loss_sequences_lower_95": 5.498248597584903,
            "loss_sequences_upper_95": 5.566508337569563,
            "loss_tokens_lower_95": 5.498536215307506,
            "loss_tokens_upper_95": 5.566054378143728,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.24308168888092,
            "data_time": 0.30159156024456024,
            "batch_time": 0.32728415727615356,
            "samples_per_second": 1882662.6760278216,
            "samples_per_second_per_gpu": 235332.8345034777,
            "loss_sequences_lower_95": 5.163019302368164,
            "loss_sequences_upper_95": 5.598568267822266,
            "loss_tokens_lower_95": 4.985000484787287,
            "loss_tokens_upper_95": 5.498255202510063,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.1971668360055165,
            "data_time": 0.055730439722537994,
            "batch_time": 0.09965976141393185,
            "samples_per_second": 4410485.773340192,
            "samples_per_second_per_gpu": 551310.721667524,
            "loss_sequences_lower_95": 5.141126025540558,
            "loss_sequences_upper_95": 5.254287429231704,
            "loss_tokens_lower_95": 5.140674152979465,
            "loss_tokens_upper_95": 5.253674153979017,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.744392815229836,
            "data_time": 0.08194625675678253,
            "batch_time": 0.12600089907646178,
            "samples_per_second": 4220246.63724873,
            "samples_per_second_per_gpu": 527530.8296560913,
            "loss_sequences_lower_95": 5.688827047262106,
            "loss_sequences_upper_95": 5.79889220941467,
            "loss_tokens_lower_95": 5.688010655808507,
            "loss_tokens_upper_95": 5.8005617033937344,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.225961967443356,
            "data_time": 0.0515447985380888,
            "batch_time": 0.09258734248578548,
            "samples_per_second": 4277370.486778826,
            "samples_per_second_per_gpu": 534671.3108473533,
            "loss_sequences_lower_95": 5.412043504849871,
            "loss_sequences_upper_95": 5.525355783821579,
            "loss_tokens_lower_95": 5.200076024665824,
            "loss_tokens_upper_95": 5.259599759224261,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.846031008720398,
            "data_time": 0.18364626169204712,
            "batch_time": 0.22876544296741486,
            "samples_per_second": 3717184.240785768,
            "samples_per_second_per_gpu": 464648.030098221,
            "loss_sequences_lower_95": 7.521087646484375,
            "loss_sequences_upper_95": 8.051597363281251,
            "loss_tokens_lower_95": 6.611930397786028,
            "loss_tokens_upper_95": 6.963103969329087,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.205128878355026,
            "data_time": 0.1573571264743805,
            "batch_time": 0.17444442212581635,
            "samples_per_second": 848362.1533364464,
            "samples_per_second_per_gpu": 106045.2691670558,
            "loss_sequences_lower_95": 4.91269314289093,
            "loss_sequences_upper_95": 5.637881314754486,
            "loss_tokens_lower_95": 4.63974372600687,
            "loss_tokens_upper_95": 5.609585378361843,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.846506724412414,
            "data_time": 0.3569062650203705,
            "batch_time": 0.39232222735881805,
            "samples_per_second": 2107571.5540585527,
            "samples_per_second_per_gpu": 263446.4442573191,
            "loss_sequences_lower_95": 6.623423749551006,
            "loss_sequences_upper_95": 7.195709474059357,
            "loss_tokens_lower_95": 5.568002893905898,
            "loss_tokens_upper_95": 5.984889388057532,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.009256508111849,
            "data_time": 0.047262544433275856,
            "batch_time": 0.0920351892709732,
            "samples_per_second": 4543035.608074192,
            "samples_per_second_per_gpu": 567879.451009274,
            "loss_sequences_lower_95": 4.967838531000846,
            "loss_sequences_upper_95": 5.050889593401868,
            "loss_tokens_lower_95": 4.967464096715951,
            "loss_tokens_upper_95": 5.050552692138352,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.182307282952506,
            "data_time": 0.033473714476539976,
            "batch_time": 0.07695854277837844,
            "samples_per_second": 4381942.186542536,
            "samples_per_second_per_gpu": 547742.773317817,
            "loss_sequences_lower_95": 8.20325204977683,
            "loss_sequences_upper_95": 8.365204924619153,
            "loss_tokens_lower_95": 8.09246416304001,
            "loss_tokens_upper_95": 8.254351007821045,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.58548902067946,
            "data_time": 0.1887606978416443,
            "batch_time": 0.21870262175798416,
            "samples_per_second": 1795762.2035624275,
            "samples_per_second_per_gpu": 224470.27544530344,
            "loss_sequences_lower_95": 4.444565881302942,
            "loss_sequences_upper_95": 4.8123788910471035,
            "loss_tokens_lower_95": 4.381312988732944,
            "loss_tokens_upper_95": 4.726125881255059,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.885242645817661,
            "data_time": 0.08148763179779053,
            "batch_time": 0.12628336250782013,
            "samples_per_second": 4404886.318800114,
            "samples_per_second_per_gpu": 550610.7898500143,
            "loss_sequences_lower_95": 4.922570251609363,
            "loss_sequences_upper_95": 5.053889620689867,
            "loss_tokens_lower_95": 4.804236588804316,
            "loss_tokens_upper_95": 4.959604916530089,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.933935278799476,
            "data_time": 0.3109317570924759,
            "batch_time": 0.3446085900068283,
            "samples_per_second": 2301270.885305386,
            "samples_per_second_per_gpu": 287658.86066317325,
            "loss_sequences_lower_95": 6.746637595572123,
            "loss_sequences_upper_95": 7.316601748582793,
            "loss_tokens_lower_95": 6.766007500254894,
            "loss_tokens_upper_95": 7.148652068942445,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.163801260770953,
            "data_time": 0.032934359248529034,
            "batch_time": 0.0772192493127805,
            "samples_per_second": 4310529.412371761,
            "samples_per_second_per_gpu": 538816.1765464701,
            "loss_sequences_lower_95": 5.1556850139559165,
            "loss_sequences_upper_95": 5.171928321211118,
            "loss_tokens_lower_95": 5.155473067269704,
            "loss_tokens_upper_95": 5.1721316528930785,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.103094073175226,
            "data_time": 0.3215636610984802,
            "batch_time": 0.3480236679315567,
            "samples_per_second": 1455152.1452739344,
            "samples_per_second_per_gpu": 181894.0181592418,
            "loss_sequences_lower_95": 5.966056542257661,
            "loss_sequences_upper_95": 6.326198911203921,
            "loss_tokens_lower_95": 5.845995456535472,
            "loss_tokens_upper_95": 6.2729777809541325,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.610740389239114,
            "data_time": 0.022340795695781707,
            "batch_time": 0.06651952842871348,
            "samples_per_second": 4522291.020916308,
            "samples_per_second_per_gpu": 565286.3776145385,
            "loss_sequences_lower_95": 7.074041433274371,
            "loss_sequences_upper_95": 7.11677329009434,
            "loss_tokens_lower_95": 6.546444934719536,
            "loss_tokens_upper_95": 6.586991344294004,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.450593379497528,
            "data_time": 0.10432282090187073,
            "batch_time": 0.14920280128717422,
            "samples_per_second": 4315627.053592003,
            "samples_per_second_per_gpu": 539453.3816990004,
            "loss_sequences_lower_95": 9.2375765625,
            "loss_sequences_upper_95": 9.711332739257813,
            "loss_tokens_lower_95": 9.198758017177832,
            "loss_tokens_upper_95": 9.672066894908884,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.048798627438752,
            "data_time": 0.33877702057361603,
            "batch_time": 0.38146618008613586,
            "samples_per_second": 2654415.2508221203,
            "samples_per_second_per_gpu": 331801.90635276504,
            "loss_sequences_lower_95": 4.914726257324219,
            "loss_sequences_upper_95": 5.182288420304008,
            "loss_tokens_lower_95": 4.915852382494056,
            "loss_tokens_upper_95": 5.178984003481657,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 12.261035136743025,
            "data_time": 0.07253403713305791,
            "batch_time": 0.11217244217793147,
            "samples_per_second": 3844871.7370455638,
            "samples_per_second_per_gpu": 480608.96713069547,
            "loss_sequences_lower_95": 12.069526737097537,
            "loss_sequences_upper_95": 12.453187958688448,
            "loss_tokens_lower_95": 12.068983320756393,
            "loss_tokens_upper_95": 12.455833222360322,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.5519423510233565,
            "data_time": 0.0653637523452441,
            "batch_time": 0.10944865892330806,
            "samples_per_second": 4463747.341747509,
            "samples_per_second_per_gpu": 557968.4177184387,
            "loss_sequences_lower_95": 4.661680558268229,
            "loss_sequences_upper_95": 4.753468953450521,
            "loss_tokens_lower_95": 4.498418382978191,
            "loss_tokens_upper_95": 4.590117890906362,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.360343072527931,
            "data_time": 0.3764711916446686,
            "batch_time": 0.4179193526506424,
            "samples_per_second": 1597049.3168048314,
            "samples_per_second_per_gpu": 199631.16460060392,
            "loss_sequences_lower_95": 6.043649640764508,
            "loss_sequences_upper_95": 6.680675978887648,
            "loss_tokens_lower_95": 6.0487846156529015,
            "loss_tokens_upper_95": 6.682875220889136,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.5338733196258545,
            "data_time": 0.1548876017332077,
            "batch_time": 0.17243359982967377,
            "samples_per_second": 907671.6920706525,
            "samples_per_second_per_gpu": 113458.96150883156,
            "loss_sequences_lower_95": 6.20062484741211,
            "loss_sequences_upper_95": 7.693376660346985,
            "loss_tokens_lower_95": 6.120879673908666,
            "loss_tokens_upper_95": 6.723726907317171,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.574715667247772,
            "data_time": 0.09395470470190048,
            "batch_time": 0.13853060826659203,
            "samples_per_second": 4319768.074091513,
            "samples_per_second_per_gpu": 539971.0092614391,
            "loss_sequences_lower_95": 7.636034155273437,
            "loss_sequences_upper_95": 7.958809680175781,
            "loss_tokens_lower_95": 7.4257328349725045,
            "loss_tokens_upper_95": 7.706170995102315,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.1593892984390255,
            "data_time": 0.10022879019379616,
            "batch_time": 0.14526429399847984,
            "samples_per_second": 4179306.1142946323,
            "samples_per_second_per_gpu": 522413.26428682904,
            "loss_sequences_lower_95": 7.421682971191406,
            "loss_sequences_upper_95": 7.6521970703125,
            "loss_tokens_lower_95": 7.061935016782987,
            "loss_tokens_upper_95": 7.232187673600244,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.732850976330779,
            "data_time": 0.041364350666602455,
            "batch_time": 0.08506632223725319,
            "samples_per_second": 4553875.407715897,
            "samples_per_second_per_gpu": 569234.4259644871,
            "loss_sequences_lower_95": 5.71428539392495,
            "loss_sequences_upper_95": 5.751626567476115,
            "loss_tokens_lower_95": 5.714361787106101,
            "loss_tokens_upper_95": 5.751954074390294,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.315296025137015,
            "data_time": 0.12070101996262868,
            "batch_time": 0.1606858422358831,
            "samples_per_second": 4044662.7079267255,
            "samples_per_second_per_gpu": 505582.8384908407,
            "loss_sequences_lower_95": 5.227311376053067,
            "loss_sequences_upper_95": 5.40089833249328,
            "loss_tokens_lower_95": 5.22855204920615,
            "loss_tokens_upper_95": 5.400186855708765,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.255346482276916,
            "data_time": 0.09968984872102737,
            "batch_time": 0.14412083476781845,
            "samples_per_second": 4170532.776749731,
            "samples_per_second_per_gpu": 521316.5970937164,
            "loss_sequences_lower_95": 8.20133525390625,
            "loss_sequences_upper_95": 8.308609960937499,
            "loss_tokens_lower_95": 8.20088369140625,
            "loss_tokens_upper_95": 8.30759208984375,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.0223459175203695,
            "data_time": 0.027905898434775218,
            "batch_time": 0.07169682213238307,
            "samples_per_second": 4493180.534781856,
            "samples_per_second_per_gpu": 561647.566847732,
            "loss_sequences_lower_95": 7.646342426974929,
            "loss_sequences_upper_95": 7.724030680877483,
            "loss_tokens_lower_95": 6.930653323353454,
            "loss_tokens_upper_95": 6.990227443293307,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.156043298208892,
            "data_time": 0.20093147243772233,
            "batch_time": 0.23343076876231603,
            "samples_per_second": 1986094.0603314992,
            "samples_per_second_per_gpu": 248261.7575414374,
            "loss_sequences_lower_95": 5.007203913446682,
            "loss_sequences_upper_95": 5.299906670869286,
            "loss_tokens_lower_95": 5.006142049760961,
            "loss_tokens_upper_95": 5.301436193665461,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.2230322155297975,
            "data_time": 0.19744179397821426,
            "batch_time": 0.24344703555107117,
            "samples_per_second": 3645703.8932839055,
            "samples_per_second_per_gpu": 455712.9866604882,
            "loss_sequences_lower_95": 5.116784656001073,
            "loss_sequences_upper_95": 5.325717964920343,
            "loss_tokens_lower_95": 5.118072581571691,
            "loss_tokens_upper_95": 5.327419529335171,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.872819043543798,
            "data_time": 0.034182590432465076,
            "batch_time": 0.07791963219642639,
            "samples_per_second": 4365457.599976755,
            "samples_per_second_per_gpu": 545682.1999970943,
            "loss_sequences_lower_95": 7.2790506975761,
            "loss_sequences_upper_95": 7.365430580510929,
            "loss_tokens_lower_95": 6.788773394251567,
            "loss_tokens_upper_95": 6.8614059292169225,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.852000178483428,
            "data_time": 0.32634247839450836,
            "batch_time": 0.364572674036026,
            "samples_per_second": 2264353.7630307833,
            "samples_per_second_per_gpu": 283044.2203788479,
            "loss_sequences_lower_95": 5.773739898520172,
            "loss_sequences_upper_95": 5.933261172985905,
            "loss_tokens_lower_95": 5.7724632949425425,
            "loss_tokens_upper_95": 5.930999901181175,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.251589640346142,
            "data_time": 0.04508562042162968,
            "batch_time": 0.08915635141042563,
            "samples_per_second": 4468403.011118619,
            "samples_per_second_per_gpu": 558550.3763898273,
            "loss_sequences_lower_95": 9.233179218630543,
            "loss_sequences_upper_95": 9.269639717125383,
            "loss_tokens_lower_95": 9.232760282277333,
            "loss_tokens_upper_95": 9.269406238054282,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.987552936794688,
            "data_time": 0.3660612851381302,
            "batch_time": 0.4060111939907074,
            "samples_per_second": 1756401.4326102012,
            "samples_per_second_per_gpu": 219550.17907627515,
            "loss_sequences_lower_95": 4.8283460598547485,
            "loss_sequences_upper_95": 5.145532108047634,
            "loss_tokens_lower_95": 4.828189568380708,
            "loss_tokens_upper_95": 5.1433090061817355,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.317288668950399,
            "data_time": 0.3009014278650284,
            "batch_time": 0.32068127393722534,
            "samples_per_second": 1312198.0684281709,
            "samples_per_second_per_gpu": 164024.75855352136,
            "loss_sequences_lower_95": 7.011642405192057,
            "loss_sequences_upper_95": 7.839444249471029,
            "loss_tokens_lower_95": 6.634622160593669,
            "loss_tokens_upper_95": 7.888281355963812,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.894447271029154,
            "data_time": 0.2994207739830017,
            "batch_time": 0.3197387754917145,
            "samples_per_second": 1254468.645918681,
            "samples_per_second_per_gpu": 156808.58073983513,
            "loss_sequences_lower_95": 6.637410748799642,
            "loss_sequences_upper_95": 7.646418291727702,
            "loss_tokens_lower_95": 6.090697967872192,
            "loss_tokens_upper_95": 7.426557305153836,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.545616533057503,
            "data_time": 0.04278971041951861,
            "batch_time": 0.08596687551055636,
            "samples_per_second": 4296145.854743961,
            "samples_per_second_per_gpu": 537018.2318429952,
            "loss_sequences_lower_95": 9.534235749953977,
            "loss_sequences_upper_95": 9.557159583831922,
            "loss_tokens_lower_95": 9.534438828930412,
            "loss_tokens_upper_95": 9.557188664971465,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 6.496716225986106,
            "data_time": 0.023995388738453128,
            "batch_time": 0.0682105617882383,
            "samples_per_second": 4485610.603657463,
            "samples_per_second_per_gpu": 560701.3254571828,
            "loss_sequences_lower_95": 7.109861070744717,
            "loss_sequences_upper_95": 7.143477046006291,
            "loss_tokens_lower_95": 6.428784425062772,
            "loss_tokens_upper_95": 6.459978715033419,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.939734789330189,
            "data_time": 0.35688522458076477,
            "batch_time": 0.4474702775478363,
            "samples_per_second": 1661277.4312267439,
            "samples_per_second_per_gpu": 207659.67890334298,
            "loss_sequences_lower_95": 7.9625627893162525,
            "loss_sequences_upper_95": 8.335786017470474,
            "loss_tokens_lower_95": 7.818030327890875,
            "loss_tokens_upper_95": 8.042364965226604,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 9.951234173130345,
            "data_time": 0.22127825021743774,
            "batch_time": 0.23863330483436584,
            "samples_per_second": 1044878.6983944596,
            "samples_per_second_per_gpu": 130609.83729930744,
            "loss_sequences_lower_95": 9.563006385597022,
            "loss_sequences_upper_95": 10.551281676421294,
            "loss_tokens_lower_95": 9.434622418438947,
            "loss_tokens_upper_95": 10.289777421362606,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.05360164584183,
            "data_time": 0.34086012840270996,
            "batch_time": 0.3750476539134979,
            "samples_per_second": 2462061.785556428,
            "samples_per_second_per_gpu": 307757.7231945535,
            "loss_sequences_lower_95": 8.00718882490949,
            "loss_sequences_upper_95": 8.287772313559927,
            "loss_tokens_lower_95": 7.928428005414214,
            "loss_tokens_upper_95": 8.11648784101661,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.991283957551166,
            "data_time": 0.3358445465564728,
            "batch_time": 0.37000417709350586,
            "samples_per_second": 2084857.9871022434,
            "samples_per_second_per_gpu": 260607.24838778042,
            "loss_sequences_lower_95": 7.929739584573886,
            "loss_sequences_upper_95": 8.202810948069503,
            "loss_tokens_lower_95": 7.897440574791838,
            "loss_tokens_upper_95": 8.057997367736307,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.17589935151542,
            "data_time": 0.32609823346138,
            "batch_time": 0.36082175374031067,
            "samples_per_second": 2389291.226699315,
            "samples_per_second_per_gpu": 298661.4033374144,
            "loss_sequences_lower_95": 8.219458919618189,
            "loss_sequences_upper_95": 8.574193126399342,
            "loss_tokens_lower_95": 8.018694405010063,
            "loss_tokens_upper_95": 8.26378011676071,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 7.93794531647752,
            "data_time": 0.32148630917072296,
            "batch_time": 0.3551800698041916,
            "samples_per_second": 2241929.1557058687,
            "samples_per_second_per_gpu": 280241.1444632336,
            "loss_sequences_lower_95": 7.8588559778725235,
            "loss_sequences_upper_95": 8.121582366199027,
            "loss_tokens_lower_95": 7.848916578441394,
            "loss_tokens_upper_95": 7.997119045554663,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.150322677185816,
            "data_time": 0.32388362288475037,
            "batch_time": 0.35817648470401764,
            "samples_per_second": 2004773.0760446326,
            "samples_per_second_per_gpu": 250596.63450557907,
            "loss_sequences_lower_95": 8.018780290117917,
            "loss_sequences_upper_95": 8.194022630016255,
            "loss_tokens_lower_95": 8.101524385785318,
            "loss_tokens_upper_95": 8.215948239616846,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 8.056899460350595,
            "data_time": 0.33455537259578705,
            "batch_time": 0.36912694573402405,
            "samples_per_second": 2154425.5259492532,
            "samples_per_second_per_gpu": 269303.19074365665,
            "loss_sequences_lower_95": 8.100825965695265,
            "loss_sequences_upper_95": 8.338615250005954,
            "loss_tokens_lower_95": 7.9570070684523815,
            "loss_tokens_upper_95": 8.084019599532851,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/checkpoints/epoch_1.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=96_l=8_h=4-1.0/params.txt",
    "uuid": "6125019e-4169-462b-91a4-8e819c5fe94d",
    "creation_date": "2023_12_13-16_17_45"
}