{
    "name": "c4_original-d=576_l=24_h=8-16.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 49176760320,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp",
            "--fsdp-limit-all-gathers"
        ],
        "chinchilla_multiplier": 16.0,
        "seed": 124
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--workers",
        "2",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--logs",
        "logs/186",
        "--train-num-samples",
        "9835352064",
        "--dataset-manifest",
        "<scrub>/openlm/scrub/datasets/original_c4/manifest.jsonl",
        "--data-key",
        "txt",
        "--name",
        "c4_original-d=576_l=24_h=8-16.0",
        "--fsdp",
        "--fsdp-amp",
        "--fsdp-limit-all-gathers",
        "--val-data",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-data-key",
        "json",
        "txt",
        "--val-tok-ci",
        "--val-seq-ci",
        "--val-max-pop-ci",
        "300000",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/openlm/scrub/experiments/154m_16x_c4_original/"
    ],
    "results": [
        {
            "loss": 3.779083851973216,
            "data_time": 0.009232227007548015,
            "batch_time": 0.15153383413950602,
            "samples_per_second": 213890.81668012135,
            "samples_per_second_per_gpu": 106945.40834006068,
            "loss_sequences_lower_95": 3.649570630391439,
            "loss_sequences_upper_95": 3.9165514755249022,
            "loss_tokens_lower_95": 3.7629030418395994,
            "loss_tokens_upper_95": 3.795049171447754,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.990391744989669,
            "data_time": 0.0009676473449258243,
            "batch_time": 0.029829101747625014,
            "samples_per_second": 276615.5633124517,
            "samples_per_second_per_gpu": 138307.78165622585,
            "loss_sequences_lower_95": 2.9874386251970613,
            "loss_sequences_upper_95": 2.99327625361769,
            "loss_tokens_lower_95": 2.9800361406249998,
            "loss_tokens_upper_95": 3.000667328125,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6253610353080594,
            "data_time": 0.0032511794470190033,
            "batch_time": 0.03651642993213684,
            "samples_per_second": 249436.90478943655,
            "samples_per_second_per_gpu": 124718.45239471828,
            "loss_sequences_lower_95": 3.6045662331094546,
            "loss_sequences_upper_95": 3.6456178813077966,
            "loss_tokens_lower_95": 3.6084961770833335,
            "loss_tokens_upper_95": 3.6421633541666667,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9406101865375165,
            "data_time": 0.0012579897859326494,
            "batch_time": 0.036429532873463455,
            "samples_per_second": 226362.67377161438,
            "samples_per_second_per_gpu": 113181.33688580719,
            "loss_sequences_lower_95": 2.9303813224871136,
            "loss_sequences_upper_95": 2.9509304023034795,
            "loss_tokens_lower_95": 2.930597765625,
            "loss_tokens_upper_95": 2.950837083333333,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9983120756091024,
            "data_time": 0.003564124165276284,
            "batch_time": 0.03638155643756573,
            "samples_per_second": 243414.95557592314,
            "samples_per_second_per_gpu": 121707.47778796157,
            "loss_sequences_lower_95": 2.964492300620147,
            "loss_sequences_upper_95": 3.0327791109104503,
            "loss_tokens_lower_95": 2.9877966510416667,
            "loss_tokens_upper_95": 3.008911442708333,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5082531153628973,
            "data_time": 0.0014949709824893785,
            "batch_time": 0.029571685454119808,
            "samples_per_second": 285153.90803851554,
            "samples_per_second_per_gpu": 142576.95401925777,
            "loss_sequences_lower_95": 3.471623623119901,
            "loss_sequences_upper_95": 3.5459153248613826,
            "loss_tokens_lower_95": 3.4953383437500003,
            "loss_tokens_upper_95": 3.521222197916667,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3459087754755603,
            "data_time": 0.0010916942678890259,
            "batch_time": 0.03320217677080612,
            "samples_per_second": 253704.80007340974,
            "samples_per_second_per_gpu": 126852.40003670487,
            "loss_sequences_lower_95": 3.3134673449457908,
            "loss_sequences_upper_95": 3.378188028140944,
            "loss_tokens_lower_95": 3.32964471875,
            "loss_tokens_upper_95": 3.3622773645833335,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.74631672699414,
            "data_time": 0.0012624309689954697,
            "batch_time": 0.03391438123568818,
            "samples_per_second": 251954.68903803497,
            "samples_per_second_per_gpu": 125977.34451901748,
            "loss_sequences_lower_95": 3.7380790657722516,
            "loss_sequences_upper_95": 3.7543587000981673,
            "loss_tokens_lower_95": 3.73373653125,
            "loss_tokens_upper_95": 3.758682947916667,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.406948372115934,
            "data_time": 0.00339889814776759,
            "batch_time": 0.0363379341940726,
            "samples_per_second": 253046.69029474887,
            "samples_per_second_per_gpu": 126523.34514737444,
            "loss_sequences_lower_95": 3.3647007143594383,
            "loss_sequences_upper_95": 3.45033555689866,
            "loss_tokens_lower_95": 3.395653895833333,
            "loss_tokens_upper_95": 3.4186380677083332,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.458827390029968,
            "data_time": 0.003492589071979673,
            "batch_time": 0.038823532307241844,
            "samples_per_second": 223420.9103833131,
            "samples_per_second_per_gpu": 111710.45519165655,
            "loss_sequences_lower_95": 4.432206032492897,
            "loss_sequences_upper_95": 4.483818747780539,
            "loss_tokens_lower_95": 4.444603739583333,
            "loss_tokens_upper_95": 4.47271340625,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3665981197644705,
            "data_time": 0.0009584348474211417,
            "batch_time": 0.031761828629276005,
            "samples_per_second": 263539.9272288027,
            "samples_per_second_per_gpu": 131769.96361440135,
            "loss_sequences_lower_95": 3.3590676846006917,
            "loss_sequences_upper_95": 3.374311595036916,
            "loss_tokens_lower_95": 3.3554604270833335,
            "loss_tokens_upper_95": 3.3777313593750002,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.164786269097763,
            "data_time": 0.001150366269380822,
            "batch_time": 0.029776396699789463,
            "samples_per_second": 280659.7449251143,
            "samples_per_second_per_gpu": 140329.87246255716,
            "loss_sequences_lower_95": 3.155683911000625,
            "loss_sequences_upper_95": 3.1741279675135363,
            "loss_tokens_lower_95": 3.153282229166667,
            "loss_tokens_upper_95": 3.1762760885416665,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9320306453936964,
            "data_time": 0.00305496012010882,
            "batch_time": 0.03124262248316119,
            "samples_per_second": 280968.46164214524,
            "samples_per_second_per_gpu": 140484.23082107262,
            "loss_sequences_lower_95": 3.8960089074187056,
            "loss_sequences_upper_95": 3.969199314078743,
            "loss_tokens_lower_95": 3.917825802083333,
            "loss_tokens_upper_95": 3.9461918854166664,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9998419867513624,
            "data_time": 0.0033250356975354648,
            "batch_time": 0.03582140694745639,
            "samples_per_second": 245839.5960083105,
            "samples_per_second_per_gpu": 122919.79800415525,
            "loss_sequences_lower_95": 2.942874008769173,
            "loss_sequences_upper_95": 3.0565777563029055,
            "loss_tokens_lower_95": 2.987909807291667,
            "loss_tokens_upper_95": 3.01137659375,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.65679966319691,
            "data_time": 0.025317182143529255,
            "batch_time": 0.06191322207450867,
            "samples_per_second": 186912.81827515658,
            "samples_per_second_per_gpu": 93456.40913757829,
            "loss_sequences_lower_95": 4.580347503315319,
            "loss_sequences_upper_95": 4.733015173131769,
            "loss_tokens_lower_95": 4.62261266708374,
            "loss_tokens_upper_95": 4.6923961119218305,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.647947792409113,
            "data_time": 0.004758886126584785,
            "batch_time": 0.04016308729038682,
            "samples_per_second": 222780.79872764024,
            "samples_per_second_per_gpu": 111390.39936382012,
            "loss_sequences_lower_95": 3.5542792228498543,
            "loss_sequences_upper_95": 3.7425621544306895,
            "loss_tokens_lower_95": 3.6339272291666664,
            "loss_tokens_upper_95": 3.6625470208333333,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.791282205908783,
            "data_time": 0.004059480747003206,
            "batch_time": 0.03232786418255711,
            "samples_per_second": 280952.9276410756,
            "samples_per_second_per_gpu": 140476.4638205378,
            "loss_sequences_lower_95": 5.728748106138688,
            "loss_sequences_upper_95": 5.851027451970646,
            "loss_tokens_lower_95": 5.778129000000001,
            "loss_tokens_upper_95": 5.804072989583333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5873838330878587,
            "data_time": 0.011951538824266004,
            "batch_time": 0.044579436702113,
            "samples_per_second": 242908.9266964119,
            "samples_per_second_per_gpu": 121454.46334820594,
            "loss_sequences_lower_95": 3.5451338783639375,
            "loss_sequences_upper_95": 3.6309777932088885,
            "loss_tokens_lower_95": 3.571767144125016,
            "loss_tokens_upper_95": 3.6031133307785286,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.999490766303863,
            "data_time": 0.0011667741462057792,
            "batch_time": 0.02946741385393352,
            "samples_per_second": 281564.31963893503,
            "samples_per_second_per_gpu": 140782.15981946752,
            "loss_sequences_lower_95": 4.976733819189753,
            "loss_sequences_upper_95": 5.022781655034895,
            "loss_tokens_lower_95": 4.975813199352834,
            "loss_tokens_upper_95": 5.022850867041732,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.7210340858146838,
            "data_time": 0.0013112386637406253,
            "batch_time": 0.02956179022931426,
            "samples_per_second": 281619.4111155964,
            "samples_per_second_per_gpu": 140809.7055577982,
            "loss_sequences_lower_95": 2.7266275198852323,
            "loss_sequences_upper_95": 2.7521525510823293,
            "loss_tokens_lower_95": 2.6985344454872693,
            "loss_tokens_upper_95": 2.7167515243797724,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9222502812609448,
            "data_time": 0.0019018955950467092,
            "batch_time": 0.030069504593903163,
            "samples_per_second": 281874.70829098637,
            "samples_per_second_per_gpu": 140937.35414549318,
            "loss_sequences_lower_95": 4.17229127302787,
            "loss_sequences_upper_95": 4.472513525183042,
            "loss_tokens_lower_95": 3.3630119041968225,
            "loss_tokens_upper_95": 3.5776303200202526,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.938011462117235,
            "data_time": 0.0018170175552368164,
            "batch_time": 0.03418368180592855,
            "samples_per_second": 249798.95917501012,
            "samples_per_second_per_gpu": 124899.47958750506,
            "loss_sequences_lower_95": 4.061658536783854,
            "loss_sequences_upper_95": 4.268973852539062,
            "loss_tokens_lower_95": 3.6902902785966982,
            "loss_tokens_upper_95": 3.835551407724057,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.8047256299814913,
            "data_time": 0.0025187318975275212,
            "batch_time": 0.03475405772527059,
            "samples_per_second": 250556.57424159668,
            "samples_per_second_per_gpu": 125278.28712079834,
            "loss_sequences_lower_95": 2.845055925602078,
            "loss_sequences_upper_95": 2.908303465131018,
            "loss_tokens_lower_95": 2.7202326558618073,
            "loss_tokens_upper_95": 2.752149792392637,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5125998941334813,
            "data_time": 0.009683540889195033,
            "batch_time": 0.03792901337146759,
            "samples_per_second": 270051.7217414049,
            "samples_per_second_per_gpu": 135025.86087070245,
            "loss_sequences_lower_95": 3.405697285045277,
            "loss_sequences_upper_95": 3.7063190876353866,
            "loss_tokens_lower_95": 3.424425246271289,
            "loss_tokens_upper_95": 3.5113947237478587,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.460432868101159,
            "data_time": 0.00736484412224062,
            "batch_time": 0.03913621364101287,
            "samples_per_second": 244217.42272027937,
            "samples_per_second_per_gpu": 122108.71136013968,
            "loss_sequences_lower_95": 3.4373106851383133,
            "loss_sequences_upper_95": 3.6433714325573976,
            "loss_tokens_lower_95": 3.350464730027082,
            "loss_tokens_upper_95": 3.4543045579087677,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.687084935506185,
            "data_time": 0.007102847099304199,
            "batch_time": 0.038784453743382505,
            "samples_per_second": 245345.0195514052,
            "samples_per_second_per_gpu": 122672.5097757026,
            "loss_sequences_lower_95": 3.6633544108072913,
            "loss_sequences_upper_95": 3.767169392903646,
            "loss_tokens_lower_95": 3.541803152039555,
            "loss_tokens_upper_95": 3.776390017213574,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.210491529031593,
            "data_time": 0.0010968273449291361,
            "batch_time": 0.03207080088055001,
            "samples_per_second": 259916.81296059053,
            "samples_per_second_per_gpu": 129958.40648029526,
            "loss_sequences_lower_95": 5.212067008113528,
            "loss_sequences_upper_95": 5.299297351723586,
            "loss_tokens_lower_95": 5.069222262620049,
            "loss_tokens_upper_95": 5.1559830446091555,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.094254793875195,
            "data_time": 0.0018922762838678327,
            "batch_time": 0.033843399499000525,
            "samples_per_second": 251032.4796629578,
            "samples_per_second_per_gpu": 125516.2398314789,
            "loss_sequences_lower_95": 4.594908589064473,
            "loss_sequences_upper_95": 4.906724872011127,
            "loss_tokens_lower_95": 3.3180225729184407,
            "loss_tokens_upper_95": 3.457430805757195,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.826223971290393,
            "data_time": 0.0025985735614283555,
            "batch_time": 0.034194856273884675,
            "samples_per_second": 252619.58374175287,
            "samples_per_second_per_gpu": 126309.79187087643,
            "loss_sequences_lower_95": 4.158053203491628,
            "loss_sequences_upper_95": 4.517230203778264,
            "loss_tokens_lower_95": 3.3648024138961157,
            "loss_tokens_upper_95": 3.5298180155771015,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.998522196730522,
            "data_time": 0.008941319611695435,
            "batch_time": 0.042427357252653654,
            "samples_per_second": 231049.27677219396,
            "samples_per_second_per_gpu": 115524.63838609698,
            "loss_sequences_lower_95": 5.911631433495647,
            "loss_sequences_upper_95": 6.084767910334618,
            "loss_tokens_lower_95": 5.91207393837846,
            "loss_tokens_upper_95": 6.085117691945812,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.0458194851875304,
            "data_time": 0.017993596883920524,
            "batch_time": 0.05043633626057552,
            "samples_per_second": 227073.52209457368,
            "samples_per_second_per_gpu": 113536.76104728684,
            "loss_sequences_lower_95": 2.9499831466674804,
            "loss_sequences_upper_95": 3.2991029815673825,
            "loss_tokens_lower_95": 2.7884558662319012,
            "loss_tokens_upper_95": 3.2158619070309005,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.600646996083254,
            "data_time": 0.0022294067897679617,
            "batch_time": 0.03383491546342222,
            "samples_per_second": 253199.55488984456,
            "samples_per_second_per_gpu": 126599.77744492228,
            "loss_sequences_lower_95": 4.55596136680767,
            "loss_sequences_upper_95": 4.644857641276707,
            "loss_tokens_lower_95": 4.557485384047946,
            "loss_tokens_upper_95": 4.645240282353505,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.088633343589589,
            "data_time": 0.0025529880928837397,
            "batch_time": 0.030705189393236746,
            "samples_per_second": 281272.34449323267,
            "samples_per_second_per_gpu": 140636.17224661633,
            "loss_sequences_lower_95": 5.030713150561783,
            "loss_sequences_upper_95": 5.147693380969748,
            "loss_tokens_lower_95": 5.029857368687935,
            "loss_tokens_upper_95": 5.146348348081747,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.056561842706699,
            "data_time": 0.001995260300843612,
            "batch_time": 0.0376485306283702,
            "samples_per_second": 222007.02215864224,
            "samples_per_second_per_gpu": 111003.51107932112,
            "loss_sequences_lower_95": 3.2119961776982455,
            "loss_sequences_upper_95": 3.3397961837552708,
            "loss_tokens_lower_95": 2.8762465111767503,
            "loss_tokens_upper_95": 2.9288234882287414,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.140991049766541,
            "data_time": 0.004926587854112897,
            "batch_time": 0.03317872967038836,
            "samples_per_second": 276812.06883407076,
            "samples_per_second_per_gpu": 138406.03441703538,
            "loss_sequences_lower_95": 5.249015808105469,
            "loss_sequences_upper_95": 5.829962158203125,
            "loss_tokens_lower_95": 4.471573540455927,
            "loss_tokens_upper_95": 4.8405968535427695,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.654801517724991,
            "data_time": 0.03946022689342499,
            "batch_time": 0.07236069440841675,
            "samples_per_second": 217704.91604703479,
            "samples_per_second_per_gpu": 108852.45802351739,
            "loss_sequences_lower_95": 3.4431023418903353,
            "loss_sequences_upper_95": 3.9613146901130674,
            "loss_tokens_lower_95": 3.158751941549367,
            "loss_tokens_upper_95": 4.039218008107152,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.421604912171419,
            "data_time": 0.012686225500973787,
            "batch_time": 0.04095824740149758,
            "samples_per_second": 267971.7344283965,
            "samples_per_second_per_gpu": 133985.86721419825,
            "loss_sequences_lower_95": 4.894678769166442,
            "loss_sequences_upper_95": 5.687700468918373,
            "loss_tokens_lower_95": 3.133522888665269,
            "loss_tokens_upper_95": 3.5648388369107593,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.038362449253944,
            "data_time": 0.0019028120165829054,
            "batch_time": 0.030361252997119354,
            "samples_per_second": 281468.4082503011,
            "samples_per_second_per_gpu": 140734.20412515054,
            "loss_sequences_lower_95": 2.0182523488269304,
            "loss_sequences_upper_95": 2.0586149323189713,
            "loss_tokens_lower_95": 2.018846472343955,
            "loss_tokens_upper_95": 2.0586423968060563,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.497383940316509,
            "data_time": 0.0014597202022731605,
            "batch_time": 0.029618021032816688,
            "samples_per_second": 282414.5972762663,
            "samples_per_second_per_gpu": 141207.29863813316,
            "loss_sequences_lower_95": 2.465538491875182,
            "loss_sequences_upper_95": 2.60536311523627,
            "loss_tokens_lower_95": 2.3583748422831983,
            "loss_tokens_upper_95": 2.497193984901909,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.964319706836463,
            "data_time": 0.007902295049959726,
            "batch_time": 0.03631972222432603,
            "samples_per_second": 274922.5522129813,
            "samples_per_second_per_gpu": 137461.27610649064,
            "loss_sequences_lower_95": 2.893274356157352,
            "loss_sequences_upper_95": 3.291041799692007,
            "loss_tokens_lower_95": 2.772550440672699,
            "loss_tokens_upper_95": 3.0608355244258063,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4057302743125923,
            "data_time": 0.0023804123946062224,
            "batch_time": 0.030855624506792686,
            "samples_per_second": 278217.1654594223,
            "samples_per_second_per_gpu": 139108.58272971114,
            "loss_sequences_lower_95": 3.4752029045604775,
            "loss_sequences_upper_95": 3.631158818197514,
            "loss_tokens_lower_95": 3.2876583856661834,
            "loss_tokens_upper_95": 3.4309572282368883,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.5775285210551284,
            "data_time": 0.011901211170923142,
            "batch_time": 0.04429543302172706,
            "samples_per_second": 234206.25761230924,
            "samples_per_second_per_gpu": 117103.12880615462,
            "loss_sequences_lower_95": 2.536224202411931,
            "loss_sequences_upper_95": 2.9786505722418064,
            "loss_tokens_lower_95": 2.4194077907145313,
            "loss_tokens_upper_95": 2.7611563453861003,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.052190800170036,
            "data_time": 0.0012858451843261718,
            "batch_time": 0.033070500040054324,
            "samples_per_second": 254032.6517183221,
            "samples_per_second_per_gpu": 127016.32585916105,
            "loss_sequences_lower_95": 4.03211147112235,
            "loss_sequences_upper_95": 4.0722262225882675,
            "loss_tokens_lower_95": 4.032408376206492,
            "loss_tokens_upper_95": 4.07225581835117,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.7238753790994292,
            "data_time": 0.022294392952552207,
            "batch_time": 0.054487210053663984,
            "samples_per_second": 229252.05192220717,
            "samples_per_second_per_gpu": 114626.02596110359,
            "loss_sequences_lower_95": 0.6959647280498616,
            "loss_sequences_upper_95": 0.8024117719779894,
            "loss_tokens_lower_95": 0.6204546831769367,
            "loss_tokens_upper_95": 0.7814458122761003,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.145624432973642,
            "data_time": 0.0011461829989211365,
            "batch_time": 0.031970447314360356,
            "samples_per_second": 260921.83642793092,
            "samples_per_second_per_gpu": 130460.91821396546,
            "loss_sequences_lower_95": 4.483731246724319,
            "loss_sequences_upper_95": 4.526020733015592,
            "loss_tokens_lower_95": 3.644786212524178,
            "loss_tokens_upper_95": 3.6872116175048357,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.672805711746216,
            "data_time": 0.0025329556465148928,
            "batch_time": 0.03074836254119873,
            "samples_per_second": 282132.1413653933,
            "samples_per_second_per_gpu": 141066.07068269665,
            "loss_sequences_lower_95": 6.661955944824219,
            "loss_sequences_upper_95": 6.881992431640625,
            "loss_tokens_lower_95": 6.453684389803267,
            "loss_tokens_upper_95": 6.667402052971529,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.322021090466043,
            "data_time": 0.006371060322070944,
            "batch_time": 0.034601760321650014,
            "samples_per_second": 277448.69583941053,
            "samples_per_second_per_gpu": 138724.34791970527,
            "loss_sequences_lower_95": 5.145397378672724,
            "loss_sequences_upper_95": 5.497930709175441,
            "loss_tokens_lower_95": 5.142758842136549,
            "loss_tokens_upper_95": 5.4981785251783295,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.508215571172309,
            "data_time": 0.0020079493522644045,
            "batch_time": 0.03021016735019106,
            "samples_per_second": 282145.4208140183,
            "samples_per_second_per_gpu": 141072.71040700914,
            "loss_sequences_lower_95": 5.465840990471118,
            "loss_sequences_upper_95": 5.5508409627278645,
            "loss_tokens_lower_95": 5.466170561819365,
            "loss_tokens_upper_95": 5.549781466397373,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.8029523259798685,
            "data_time": 0.002179100475412734,
            "batch_time": 0.03029768073812444,
            "samples_per_second": 281515.8214430833,
            "samples_per_second_per_gpu": 140757.91072154164,
            "loss_sequences_lower_95": 0.819846512858073,
            "loss_sequences_upper_95": 0.8555975565592449,
            "loss_tokens_lower_95": 0.754037884685124,
            "loss_tokens_upper_95": 0.7992856478528911,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.608112941469465,
            "data_time": 0.00712610865538975,
            "batch_time": 0.03543131981255873,
            "samples_per_second": 276047.4260562136,
            "samples_per_second_per_gpu": 138023.7130281068,
            "loss_sequences_lower_95": 5.272675345284599,
            "loss_sequences_upper_95": 5.942916303362165,
            "loss_tokens_lower_95": 5.270621047247024,
            "loss_tokens_upper_95": 5.958167593819755,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.0639961510896683,
            "data_time": 0.04095733165740967,
            "batch_time": 0.07831835746765137,
            "samples_per_second": 197862.21121949068,
            "samples_per_second_per_gpu": 98931.10560974534,
            "loss_sequences_lower_95": 1.930142369866371,
            "loss_sequences_upper_95": 2.7054183006286623,
            "loss_tokens_lower_95": 1.639371290698494,
            "loss_tokens_upper_95": 2.0880309964209487,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 7.436180048942566,
            "data_time": 0.0027560839653015135,
            "batch_time": 0.03473363637924194,
            "samples_per_second": 251743.8729234492,
            "samples_per_second_per_gpu": 125871.9364617246,
            "loss_sequences_lower_95": 7.355734216308593,
            "loss_sequences_upper_95": 7.675392663574218,
            "loss_tokens_lower_95": 7.149121796015757,
            "loss_tokens_upper_95": 7.431673714110089,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.899201239585876,
            "data_time": 0.0027598490715026857,
            "batch_time": 0.03446708393096924,
            "samples_per_second": 252830.05103973908,
            "samples_per_second_per_gpu": 126415.02551986954,
            "loss_sequences_lower_95": 6.9543475097656255,
            "loss_sequences_upper_95": 7.149378369140625,
            "loss_tokens_lower_95": 6.64186503143387,
            "loss_tokens_upper_95": 6.837339040787744,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.579829060387939,
            "data_time": 0.0017310700531619483,
            "batch_time": 0.030539599084981963,
            "samples_per_second": 277733.3995891531,
            "samples_per_second_per_gpu": 138866.69979457656,
            "loss_sequences_lower_95": 4.532776103977225,
            "loss_sequences_upper_95": 4.6266765414169875,
            "loss_tokens_lower_95": 4.532472593721191,
            "loss_tokens_upper_95": 4.626382033999958,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.806132412908997,
            "data_time": 0.003591999730568049,
            "batch_time": 0.03578231166991254,
            "samples_per_second": 248457.1800999167,
            "samples_per_second_per_gpu": 124228.59004995835,
            "loss_sequences_lower_95": 4.6971160185501875,
            "loss_sequences_upper_95": 4.914267047466038,
            "loss_tokens_lower_95": 4.696808573873728,
            "loss_tokens_upper_95": 4.914714863626272,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.1008356666564945,
            "data_time": 0.002797018051147461,
            "batch_time": 0.03704781866073609,
            "samples_per_second": 242763.691777258,
            "samples_per_second_per_gpu": 121381.845888629,
            "loss_sequences_lower_95": 5.0459141967773435,
            "loss_sequences_upper_95": 5.157067407226562,
            "loss_tokens_lower_95": 5.0456921630859375,
            "loss_tokens_upper_95": 5.156013476562499,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 2.9349387341438877,
            "data_time": 0.001307818369120303,
            "batch_time": 0.029591011631186603,
            "samples_per_second": 281200.4320880284,
            "samples_per_second_per_gpu": 140600.2160440142,
            "loss_sequences_lower_95": 3.4517053442082544,
            "loss_sequences_upper_95": 3.5489212171830653,
            "loss_tokens_lower_95": 2.3003927661642067,
            "loss_tokens_upper_95": 2.362766920695943,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.267833199074019,
            "data_time": 0.005743147695765775,
            "batch_time": 0.03803558559978709,
            "samples_per_second": 244943.51984158394,
            "samples_per_second_per_gpu": 122471.75992079197,
            "loss_sequences_lower_95": 5.080087166401878,
            "loss_sequences_upper_95": 5.4502683269443795,
            "loss_tokens_lower_95": 5.076328095393395,
            "loss_tokens_upper_95": 5.446676123320167,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.287433101149166,
            "data_time": 0.0033908095210790634,
            "batch_time": 0.03184439241886139,
            "samples_per_second": 279592.8475517525,
            "samples_per_second_per_gpu": 139796.42377587626,
            "loss_sequences_lower_95": 5.152973058363971,
            "loss_sequences_upper_95": 5.422343989353554,
            "loss_tokens_lower_95": 5.15224853515625,
            "loss_tokens_upper_95": 5.421820164100796,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.036604589176823,
            "data_time": 0.001335889817717558,
            "batch_time": 0.03355195300134724,
            "samples_per_second": 250785.13363783245,
            "samples_per_second_per_gpu": 125392.56681891622,
            "loss_sequences_lower_95": 3.3600396937241634,
            "loss_sequences_upper_95": 3.4476061435237377,
            "loss_tokens_lower_95": 2.496392450599406,
            "loss_tokens_upper_95": 2.5631613859661986,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.3419043361825285,
            "data_time": 0.0077919090787569685,
            "batch_time": 0.03628545751174291,
            "samples_per_second": 272446.72180021874,
            "samples_per_second_per_gpu": 136223.36090010937,
            "loss_sequences_lower_95": 4.163824325642258,
            "loss_sequences_upper_95": 4.510547141423301,
            "loss_tokens_lower_95": 4.167379397437686,
            "loss_tokens_upper_95": 4.513658836405114,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.407402290481311,
            "data_time": 0.0017507357527399414,
            "batch_time": 0.037669117992839486,
            "samples_per_second": 220569.14330444127,
            "samples_per_second_per_gpu": 110284.57165222063,
            "loss_sequences_lower_95": 3.3807293533185208,
            "loss_sequences_upper_95": 3.435007219693712,
            "loss_tokens_lower_95": 3.3811275414516437,
            "loss_tokens_upper_95": 3.4347506704534596,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 5.19397855036467,
            "data_time": 0.007348936337691087,
            "batch_time": 0.03552010426154503,
            "samples_per_second": 276872.58147873677,
            "samples_per_second_per_gpu": 138436.29073936839,
            "loss_sequences_lower_95": 5.006328634613926,
            "loss_sequences_upper_95": 5.375909483085557,
            "loss_tokens_lower_95": 5.00454505994482,
            "loss_tokens_upper_95": 5.37929204551919,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.705801792939504,
            "data_time": 0.02015836536884308,
            "batch_time": 0.049296699464321136,
            "samples_per_second": 245083.5516663476,
            "samples_per_second_per_gpu": 122541.7758331738,
            "loss_sequences_lower_95": 1.537157440185547,
            "loss_sequences_upper_95": 1.9622919782002768,
            "loss_tokens_lower_95": 1.3801755984624227,
            "loss_tokens_upper_95": 1.8743525822957356,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 1.533272925267617,
            "data_time": 0.02198442816734314,
            "batch_time": 0.05015411972999573,
            "samples_per_second": 251812.87812287093,
            "samples_per_second_per_gpu": 125906.43906143546,
            "loss_sequences_lower_95": 1.3835504849751792,
            "loss_sequences_upper_95": 1.8111457188924152,
            "loss_tokens_lower_95": 1.1347622496358465,
            "loss_tokens_upper_95": 1.6569738580939475,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.920382236767237,
            "data_time": 0.0015856398772744306,
            "batch_time": 0.029695630143711748,
            "samples_per_second": 282649.2454769809,
            "samples_per_second_per_gpu": 141324.62273849046,
            "loss_sequences_lower_95": 4.897204241071429,
            "loss_sequences_upper_95": 4.943617222017673,
            "loss_tokens_lower_95": 4.897327325340575,
            "loss_tokens_upper_95": 4.943747684439433,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 0.696786818951995,
            "data_time": 0.0011694795845373976,
            "batch_time": 0.02959453726199576,
            "samples_per_second": 279225.3388272102,
            "samples_per_second_per_gpu": 139612.6694136051,
            "loss_sequences_lower_95": 0.8125975160064625,
            "loss_sequences_upper_95": 0.8310315456467338,
            "loss_tokens_lower_95": 0.5751673096730558,
            "loss_tokens_upper_95": 0.5851931132744811,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.523519422125629,
            "data_time": 0.01266353577375412,
            "batch_time": 0.04550381377339363,
            "samples_per_second": 243776.3967360782,
            "samples_per_second_per_gpu": 121888.1983680391,
            "loss_sequences_lower_95": 4.49431009367695,
            "loss_sequences_upper_95": 4.885369044026052,
            "loss_tokens_lower_95": 4.0977015398550725,
            "loss_tokens_upper_95": 4.323127895384273,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 6.90464059726612,
            "data_time": 0.03255209922790527,
            "batch_time": 0.06269539594650268,
            "samples_per_second": 236728.11891924273,
            "samples_per_second_per_gpu": 118364.05945962136,
            "loss_sequences_lower_95": 6.484084567508182,
            "loss_sequences_upper_95": 7.595294045113228,
            "loss_tokens_lower_95": 6.161628977457682,
            "loss_tokens_upper_95": 7.409983411247347,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.208926118001705,
            "data_time": 0.008109878926050095,
            "batch_time": 0.03643195969717843,
            "samples_per_second": 271063.9449608041,
            "samples_per_second_per_gpu": 135531.97248040204,
            "loss_sequences_lower_95": 4.24855532762481,
            "loss_sequences_upper_95": 4.598330269790277,
            "loss_tokens_lower_95": 3.8896741205188183,
            "loss_tokens_upper_95": 4.069502051684714,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.29672529348513,
            "data_time": 0.008655233042580741,
            "batch_time": 0.0373257767586481,
            "samples_per_second": 268021.7473220647,
            "samples_per_second_per_gpu": 134010.87366103235,
            "loss_sequences_lower_95": 4.329913888326505,
            "loss_sequences_upper_95": 4.645976136370403,
            "loss_tokens_lower_95": 3.9997159184612117,
            "loss_tokens_upper_95": 4.152138084357043,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.507444165101865,
            "data_time": 0.008672132378532774,
            "batch_time": 0.038233739989144463,
            "samples_per_second": 261391.86064281245,
            "samples_per_second_per_gpu": 130695.93032140622,
            "loss_sequences_lower_95": 4.5442742324456935,
            "loss_sequences_upper_95": 4.986667279499334,
            "loss_tokens_lower_95": 4.126317910122556,
            "loss_tokens_upper_95": 4.368330671980575,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.332449168693729,
            "data_time": 0.008329104809533982,
            "batch_time": 0.04042170445124308,
            "samples_per_second": 243977.8128029484,
            "samples_per_second_per_gpu": 121988.9064014742,
            "loss_sequences_lower_95": 4.33268389818145,
            "loss_sequences_upper_95": 4.63019366380645,
            "loss_tokens_lower_95": 4.070569532162676,
            "loss_tokens_upper_95": 4.208872638833114,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.5359507184591354,
            "data_time": 0.009222528080881379,
            "batch_time": 0.03834925463170181,
            "samples_per_second": 270581.53292399977,
            "samples_per_second_per_gpu": 135290.76646199988,
            "loss_sequences_lower_95": 4.523480622664741,
            "loss_sequences_upper_95": 4.8149053348517565,
            "loss_tokens_lower_95": 4.278309538607781,
            "loss_tokens_upper_95": 4.391607526414887,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.54960622147816,
            "data_time": 0.008279769193558465,
            "batch_time": 0.03651786134356544,
            "samples_per_second": 271365.9769858846,
            "samples_per_second_per_gpu": 135682.9884929423,
            "loss_sequences_lower_95": 4.614130690039658,
            "loss_sequences_upper_95": 4.928929863906489,
            "loss_tokens_lower_95": 4.238566307357595,
            "loss_tokens_upper_95": 4.3799470807903855,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/params.txt",
    "uuid": "2a925d34-8650-497b-adcf-ea8439b0c51a",
    "creation_date": "2024_01_28-13_33_33"
}