{
    "name": "rpj-d=512_l=8_h=4-0.5",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 789140480,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.5
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "157828096",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=512_l=8_h=4-0.5",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.813268295923869,
            "data_time": 0.036028388887643814,
            "batch_time": 0.31641849875450134,
            "samples_per_second": 1733008.7198971356,
            "samples_per_second_per_gpu": 216626.08998714195,
            "loss_sequences_lower_95": 3.7380720520019532,
            "loss_sequences_upper_95": 3.884352111816406,
            "loss_tokens_lower_95": 3.8002506637573243,
            "loss_tokens_upper_95": 3.8266700808207195,
            "sequences": 120,
            "tokens": 245760,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/openlm/shard_00000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.164305552063085,
            "data_time": 0.001472043903581401,
            "batch_time": 0.015272201797422167,
            "samples_per_second": 2254148.6419679997,
            "samples_per_second_per_gpu": 281768.58024599997,
            "loss_sequences_lower_95": 4.161867602854151,
            "loss_sequences_upper_95": 4.166769664201932,
            "loss_tokens_lower_95": 4.153028322916667,
            "loss_tokens_upper_95": 4.17590465625,
            "sequences": 84999,
            "tokens": 174077952,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/c4_val/shard-{0000000..0000010}.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.3650199554404434,
            "data_time": 0.010818622589111327,
            "batch_time": 0.024685433387756348,
            "samples_per_second": 2174679.5923664724,
            "samples_per_second_per_gpu": 271834.94904580904,
            "loss_sequences_lower_95": 3.33632746482382,
            "loss_sequences_upper_95": 3.3932312696807236,
            "loss_tokens_lower_95": 3.3528329062499997,
            "loss_tokens_upper_95": 3.3777661354166666,
            "sequences": 490,
            "tokens": 1003520,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.01544637832445,
            "data_time": 0.0016997079119870538,
            "batch_time": 0.015121264971400561,
            "samples_per_second": 2326186.6545541077,
            "samples_per_second_per_gpu": 290773.33181926346,
            "loss_sequences_lower_95": 4.0023806882248705,
            "loss_sequences_upper_95": 4.028151427593428,
            "loss_tokens_lower_95": 4.004140395833334,
            "loss_tokens_upper_95": 4.026965875,
            "sequences": 4850,
            "tokens": 9932800,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.151032785776919,
            "data_time": 0.011056848731173937,
            "batch_time": 0.02468319528131371,
            "samples_per_second": 2200182.2526739743,
            "samples_per_second_per_gpu": 275022.7815842468,
            "loss_sequences_lower_95": 4.116951122400474,
            "loss_sequences_upper_95": 4.1839052530510115,
            "loss_tokens_lower_95": 4.139660604166666,
            "loss_tokens_upper_95": 4.162480979166666,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.939759904393365,
            "data_time": 0.004155853844207266,
            "batch_time": 0.017587195595969326,
            "samples_per_second": 2314938.7453216226,
            "samples_per_second_per_gpu": 289367.3431652028,
            "loss_sequences_lower_95": 3.8967654105678537,
            "loss_sequences_upper_95": 3.981646077086697,
            "loss_tokens_lower_95": 3.928016229166667,
            "loss_tokens_upper_95": 3.9514289687499997,
            "sequences": 1471,
            "tokens": 3012608,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.584224600889245,
            "data_time": 0.0016825474300353406,
            "batch_time": 0.01505147652369339,
            "samples_per_second": 2340521.0355010587,
            "samples_per_second_per_gpu": 292565.12943763233,
            "loss_sequences_lower_95": 2.558994235291773,
            "loss_sequences_upper_95": 2.60879405990912,
            "loss_tokens_lower_95": 2.5725766302083333,
            "loss_tokens_upper_95": 2.5963755208333335,
            "sequences": 4900,
            "tokens": 10035200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.341766286720156,
            "data_time": 0.0017214673974433239,
            "batch_time": 0.015014511528796902,
            "samples_per_second": 2349508.0527828876,
            "samples_per_second_per_gpu": 293688.50659786095,
            "loss_sequences_lower_95": 4.332165330497382,
            "loss_sequences_upper_95": 4.351205845058901,
            "loss_tokens_lower_95": 4.3306919375,
            "loss_tokens_upper_95": 4.35260378125,
            "sequences": 4775,
            "tokens": 9779200,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.2915129458032,
            "data_time": 0.01151712073220147,
            "batch_time": 0.025663447758508106,
            "samples_per_second": 2138850.4710040223,
            "samples_per_second_per_gpu": 267356.3088755028,
            "loss_sequences_lower_95": 4.249122309490917,
            "loss_sequences_upper_95": 4.336730597271182,
            "loss_tokens_lower_95": 4.279867958333333,
            "loss_tokens_upper_95": 4.303175416666667,
            "sequences": 492,
            "tokens": 1007616,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.83272132571978,
            "data_time": 0.011685413308441639,
            "batch_time": 0.025900710374116898,
            "samples_per_second": 2156405.856938327,
            "samples_per_second_per_gpu": 269550.7321172909,
            "loss_sequences_lower_95": 4.789738482448895,
            "loss_sequences_upper_95": 4.868949558423913,
            "loss_tokens_lower_95": 4.8204310625,
            "loss_tokens_upper_95": 4.844861708333333,
            "sequences": 506,
            "tokens": 1036288,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_gab/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.127651421553601,
            "data_time": 0.0013280318645164718,
            "batch_time": 0.014592374570077659,
            "samples_per_second": 2359648.2743390044,
            "samples_per_second_per_gpu": 294956.03429237555,
            "loss_sequences_lower_95": 4.11974520511426,
            "loss_sequences_upper_95": 4.135488658652529,
            "loss_tokens_lower_95": 4.115784572916667,
            "loss_tokens_upper_95": 4.13940003125,
            "sequences": 7297,
            "tokens": 14944256,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.082082838775813,
            "data_time": 0.002888017054104388,
            "batch_time": 0.01651968686010915,
            "samples_per_second": 2346064.501140065,
            "samples_per_second_per_gpu": 293258.0626425081,
            "loss_sequences_lower_95": 4.0714894794812055,
            "loss_sequences_upper_95": 4.092423282291233,
            "loss_tokens_lower_95": 4.0706075833333335,
            "loss_tokens_upper_95": 4.09360475,
            "sequences": 2401,
            "tokens": 4917248,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.359437460831648,
            "data_time": 0.010632170990050547,
            "batch_time": 0.02863099169825377,
            "samples_per_second": 2222279.691756632,
            "samples_per_second_per_gpu": 277784.961469579,
            "loss_sequences_lower_95": 4.322115438898485,
            "loss_sequences_upper_95": 4.395441602354842,
            "loss_tokens_lower_95": 4.347691583333334,
            "loss_tokens_upper_95": 4.371074260416666,
            "sequences": 493,
            "tokens": 1009664,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9272436891699516,
            "data_time": 0.010505384657962389,
            "batch_time": 0.02454708676889123,
            "samples_per_second": 2160641.2095252206,
            "samples_per_second_per_gpu": 270080.15119065257,
            "loss_sequences_lower_95": 3.865893144471582,
            "loss_sequences_upper_95": 3.9877236919830383,
            "loss_tokens_lower_95": 3.9150411562499996,
            "loss_tokens_upper_95": 3.939395447916667,
            "sequences": 491,
            "tokens": 1005568,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_mc4/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.833607283505526,
            "data_time": 0.08822963918958392,
            "batch_time": 0.10499869925635201,
            "samples_per_second": 1053532.31813147,
            "samples_per_second_per_gpu": 131691.53976643374,
            "loss_sequences_lower_95": 4.7606531663374465,
            "loss_sequences_upper_95": 4.904738053408536,
            "loss_tokens_lower_95": 4.811300971291282,
            "loss_tokens_upper_95": 4.85658219944347,
            "sequences": 44,
            "tokens": 90112,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_ptb/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5880343218934083,
            "data_time": 0.015012440356341276,
            "batch_time": 0.02892363884232261,
            "samples_per_second": 2144386.1413173187,
            "samples_per_second_per_gpu": 268048.26766466483,
            "loss_sequences_lower_95": 3.479749362600788,
            "loss_sequences_upper_95": 3.6949433440717248,
            "loss_tokens_lower_95": 3.5762103645833334,
            "loss_tokens_upper_95": 3.5996613229166665,
            "sequences": 343,
            "tokens": 702464,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.132923205483872,
            "data_time": 0.01369548092285792,
            "batch_time": 0.027515141914288204,
            "samples_per_second": 2196328.061114096,
            "samples_per_second_per_gpu": 274541.007639262,
            "loss_sequences_lower_95": 6.071649314860241,
            "loss_sequences_upper_95": 6.187701029513316,
            "loss_tokens_lower_95": 6.121291947916667,
            "loss_tokens_upper_95": 6.144327208333333,
            "sequences": 379,
            "tokens": 776192,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.539943433198773,
            "data_time": 0.04060777649283409,
            "batch_time": 0.05587315186858177,
            "samples_per_second": 1824591.790100214,
            "samples_per_second_per_gpu": 228073.97376252676,
            "loss_sequences_lower_95": 4.457135872762712,
            "loss_sequences_upper_95": 4.600632526835457,
            "loss_tokens_lower_95": 4.526664471235431,
            "loss_tokens_upper_95": 4.552971236432185,
            "sequences": 122,
            "tokens": 249856,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.383866251049821,
            "data_time": 0.002125867203319263,
            "batch_time": 0.015771276485132466,
            "samples_per_second": 2269791.082795158,
            "samples_per_second_per_gpu": 283723.88534939475,
            "loss_sequences_lower_95": 5.365984245922945,
            "loss_sequences_upper_95": 5.402049529425118,
            "loss_tokens_lower_95": 5.365839924423159,
            "loss_tokens_upper_95": 5.401973474108924,
            "sequences": 14042,
            "tokens": 14042,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/mmlu/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.9814870348728837,
            "data_time": 0.002310235693955877,
            "batch_time": 0.015824983216774693,
            "samples_per_second": 2289120.0198674817,
            "samples_per_second_per_gpu": 286140.0024834352,
            "loss_sequences_lower_95": 3.9687414130184475,
            "loss_sequences_upper_95": 3.99510841574761,
            "loss_tokens_lower_95": 3.9676157098917026,
            "loss_tokens_upper_95": 3.9886776322116626,
            "sequences": 10042,
            "tokens": 291143,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/hellaswag/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.84458388091823,
            "data_time": 0.003278836565799695,
            "batch_time": 0.017120672103709256,
            "samples_per_second": 2238449.8779794215,
            "samples_per_second_per_gpu": 279806.2347474277,
            "loss_sequences_lower_95": 6.044358425653932,
            "loss_sequences_upper_95": 6.339814296284541,
            "loss_tokens_lower_95": 5.35817058455072,
            "loss_tokens_upper_95": 5.565536499750566,
            "sequences": 2117,
            "tokens": 4197,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/jeopardy_all/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.004958555539449,
            "data_time": 0.004516936521580878,
            "batch_time": 0.0180777777382668,
            "samples_per_second": 2261018.012008267,
            "samples_per_second_per_gpu": 282627.2515010334,
            "loss_sequences_lower_95": 6.1807013671875,
            "loss_sequences_upper_95": 6.3871537109375005,
            "loss_tokens_lower_95": 5.6197534763168235,
            "loss_tokens_upper_95": 5.75646694428066,
            "sequences": 3000,
            "tokens": 7950,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/triviaqa_sm_sub/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.071256545929407,
            "data_time": 0.0048627961275264685,
            "batch_time": 0.018482181281525626,
            "samples_per_second": 2254195.677694696,
            "samples_per_second_per_gpu": 281774.459711837,
            "loss_sequences_lower_95": 4.116675054566078,
            "loss_sequences_upper_95": 4.186959318286817,
            "loss_tokens_lower_95": 3.970052908872568,
            "loss_tokens_upper_95": 4.004121312423369,
            "sequences": 1319,
            "tokens": 123972,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/gsm8k/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.8501383835619145,
            "data_time": 0.024299155388559614,
            "batch_time": 0.03871585002967289,
            "samples_per_second": 2010351.0670280438,
            "samples_per_second_per_gpu": 251293.88337850547,
            "loss_sequences_lower_95": 2.824028639359908,
            "loss_sequences_upper_95": 2.948504763516513,
            "loss_tokens_lower_95": 2.7831984326186636,
            "loss_tokens_upper_95": 2.833468000478686,
            "sequences": 220,
            "tokens": 49615,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_math/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.197960213252476,
            "data_time": 0.02174505591392517,
            "batch_time": 0.03567987307906151,
            "samples_per_second": 2030066.479111304,
            "samples_per_second_per_gpu": 253758.309888913,
            "loss_sequences_lower_95": 4.189766384825414,
            "loss_sequences_upper_95": 4.399967302594866,
            "loss_tokens_lower_95": 4.046121645173916,
            "loss_tokens_upper_95": 4.146328138223595,
            "sequences": 245,
            "tokens": 14770,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/aqua/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.706191379229228,
            "data_time": 0.01797485045897655,
            "batch_time": 0.03221469353406857,
            "samples_per_second": 2030787.8074089591,
            "samples_per_second_per_gpu": 253848.4759261199,
            "loss_sequences_lower_95": 4.665720326741536,
            "loss_sequences_upper_95": 4.776738728841146,
            "loss_tokens_lower_95": 4.563550000844986,
            "loss_tokens_upper_95": 4.80074231273901,
            "sequences": 300,
            "tokens": 3236,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/svamp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.967026936099213,
            "data_time": 0.0017086993098176847,
            "batch_time": 0.015349134264610823,
            "samples_per_second": 2273614.238119067,
            "samples_per_second_per_gpu": 284201.7797648834,
            "loss_sequences_lower_95": 7.986574022101521,
            "loss_sequences_upper_95": 8.062270692110378,
            "loss_tokens_lower_95": 7.809030681112332,
            "loss_tokens_upper_95": 7.8889074557133645,
            "sequences": 20321,
            "tokens": 20929,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.767981608206977,
            "data_time": 0.0030062678676323605,
            "batch_time": 0.016530525964378512,
            "samples_per_second": 2281735.307030253,
            "samples_per_second_per_gpu": 285216.91337878164,
            "loss_sequences_lower_95": 6.418031162043613,
            "loss_sequences_upper_95": 6.751123231830019,
            "loss_tokens_lower_95": 4.899091557391008,
            "loss_tokens_upper_95": 5.048623532495316,
            "sequences": 2376,
            "tokens": 8808,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_easy/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.084352142574844,
            "data_time": 0.005366945991644988,
            "batch_time": 0.018996237903027922,
            "samples_per_second": 2236673.353498362,
            "samples_per_second_per_gpu": 279584.16918729525,
            "loss_sequences_lower_95": 5.589851374186754,
            "loss_sequences_upper_95": 5.952918605348763,
            "loss_tokens_lower_95": 4.617911366317764,
            "loss_tokens_upper_95": 4.7809987770122415,
            "sequences": 1172,
            "tokens": 6198,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/arc_challenge/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.866865926681588,
            "data_time": 0.025022992065974643,
            "batch_time": 0.03929541153567178,
            "samples_per_second": 2005500.9655454548,
            "samples_per_second_per_gpu": 250687.62069318185,
            "loss_sequences_lower_95": 5.797756330934289,
            "loss_sequences_upper_95": 5.936840235043878,
            "loss_tokens_lower_95": 5.7977858451947775,
            "loss_tokens_upper_95": 5.932820052752212,
            "sequences": 219,
            "tokens": 219,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_misconceptions/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.319280619621277,
            "data_time": 0.05332187505868765,
            "batch_time": 0.06803938517203698,
            "samples_per_second": 1714300.5644458125,
            "samples_per_second_per_gpu": 214287.57055572656,
            "loss_sequences_lower_95": 4.16803254699707,
            "loss_sequences_upper_95": 4.556712715148925,
            "loss_tokens_lower_95": 4.0045477196654184,
            "loss_tokens_upper_95": 4.498692231698624,
            "sequences": 100,
            "tokens": 559,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/copa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.522795488060802,
            "data_time": 0.003532733897733786,
            "batch_time": 0.017439434864769683,
            "samples_per_second": 2228660.0617342577,
            "samples_per_second_per_gpu": 278582.5077167822,
            "loss_sequences_lower_95": 5.480021687783873,
            "loss_sequences_upper_95": 5.565732851682446,
            "loss_tokens_lower_95": 5.4796159820400465,
            "loss_tokens_upper_95": 5.5662077764161655,
            "sequences": 1954,
            "tokens": 1954,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/siqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.713397872340572,
            "data_time": 0.005331319463583618,
            "batch_time": 0.019008150302955416,
            "samples_per_second": 2248914.3717822544,
            "samples_per_second_per_gpu": 281114.2964727818,
            "loss_sequences_lower_95": 5.666756084920147,
            "loss_sequences_upper_95": 5.759445432870725,
            "loss_tokens_lower_95": 5.665630318706491,
            "loss_tokens_upper_95": 5.761162825200911,
            "sequences": 1221,
            "tokens": 1221,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/commonsense_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.552247360429774,
            "data_time": 0.003643195263510389,
            "batch_time": 0.017431020479222868,
            "samples_per_second": 2229963.179763761,
            "samples_per_second_per_gpu": 278745.3974704701,
            "loss_sequences_lower_95": 4.697484793678591,
            "loss_sequences_upper_95": 4.81870301820508,
            "loss_tokens_lower_95": 4.392497299290971,
            "loss_tokens_upper_95": 4.451923204038273,
            "sequences": 1838,
            "tokens": 39949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/piqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.435616019248962,
            "data_time": 0.011922762729227543,
            "batch_time": 0.025774147361516953,
            "samples_per_second": 2142451.3042602027,
            "samples_per_second_per_gpu": 267806.41303252534,
            "loss_sequences_lower_95": 6.626215197753906,
            "loss_sequences_upper_95": 7.186505737304688,
            "loss_tokens_lower_95": 5.744400632341992,
            "loss_tokens_upper_95": 6.1089066086976755,
            "sequences": 500,
            "tokens": 1511,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/openbook_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.983166962862015,
            "data_time": 0.16788075864315033,
            "batch_time": 0.18520590662956238,
            "samples_per_second": 912522.5857732175,
            "samples_per_second_per_gpu": 114065.32322165219,
            "loss_sequences_lower_95": 4.692932844161987,
            "loss_sequences_upper_95": 5.418889582157135,
            "loss_tokens_lower_95": 4.478969416125067,
            "loss_tokens_upper_95": 5.281989042786345,
            "sequences": 32,
            "tokens": 174,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_novel_concepts/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.787569907889969,
            "data_time": 0.028593324600382053,
            "batch_time": 0.04267629663994972,
            "samples_per_second": 1863479.7976213258,
            "samples_per_second_per_gpu": 232934.97470266573,
            "loss_sequences_lower_95": 6.1620536716505026,
            "loss_sequences_upper_95": 6.919761587559491,
            "loss_tokens_lower_95": 4.468830701275366,
            "loss_tokens_upper_95": 4.9469988593926155,
            "sequences": 174,
            "tokens": 887,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strange_stories/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.8473276039980133,
            "data_time": 0.0032101846817466947,
            "batch_time": 0.01682866012884511,
            "samples_per_second": 2258714.1838148898,
            "samples_per_second_per_gpu": 282339.2729768612,
            "loss_sequences_lower_95": 3.8294491189179225,
            "loss_sequences_upper_95": 3.8652357082275284,
            "loss_tokens_lower_95": 3.8290050901560453,
            "loss_tokens_upper_95": 3.86540760925213,
            "sequences": 2289,
            "tokens": 2289,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_strategy_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.996408110769137,
            "data_time": 0.002603382577625966,
            "batch_time": 0.01609095409483608,
            "samples_per_second": 2293791.53594031,
            "samples_per_second_per_gpu": 286723.94199253875,
            "loss_sequences_lower_95": 4.972209292144745,
            "loss_sequences_upper_95": 5.181491297924146,
            "loss_tokens_lower_95": 4.731658639605359,
            "loss_tokens_upper_95": 4.940182476203632,
            "sequences": 5153,
            "tokens": 5486,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/lambada_openai/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.935950370935293,
            "data_time": 0.01996579931841956,
            "batch_time": 0.03425304591655731,
            "samples_per_second": 2004243.7320980595,
            "samples_per_second_per_gpu": 250530.46651225744,
            "loss_sequences_lower_95": 3.7854153364132612,
            "loss_sequences_upper_95": 4.178240430224073,
            "loss_tokens_lower_95": 3.6503135208285156,
            "loss_tokens_upper_95": 3.966741034325133,
            "sequences": 273,
            "tokens": 1081,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winograd_wsc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.306921946786009,
            "data_time": 0.005061116069555283,
            "batch_time": 0.018797487020492554,
            "samples_per_second": 2228077.3931831685,
            "samples_per_second_per_gpu": 278509.67414789606,
            "loss_sequences_lower_95": 4.333409864129094,
            "loss_sequences_upper_95": 4.475932069942531,
            "loss_tokens_lower_95": 4.163098569284228,
            "loss_tokens_upper_95": 4.314756328860418,
            "sequences": 1267,
            "tokens": 5949,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogrande/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.7325327774373496,
            "data_time": 0.03270786149161203,
            "batch_time": 0.04723226172583444,
            "samples_per_second": 1911330.6814888816,
            "samples_per_second_per_gpu": 238916.3351861102,
            "loss_sequences_lower_95": 3.542155596105064,
            "loss_sequences_upper_95": 4.049667637522627,
            "loss_tokens_lower_95": 3.4591475668778036,
            "loss_tokens_upper_95": 3.8669024966863788,
            "sequences": 164,
            "tokens": 1226,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conlang_translation/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.709258880107778,
            "data_time": 0.002204816245194487,
            "batch_time": 0.015655078318325747,
            "samples_per_second": 2295685.0522093037,
            "samples_per_second_per_gpu": 286960.63152616296,
            "loss_sequences_lower_95": 4.698865085517103,
            "loss_sequences_upper_95": 4.719397883482946,
            "loss_tokens_lower_95": 4.69911181455041,
            "loss_tokens_upper_95": 4.719467663063862,
            "sequences": 9998,
            "tokens": 9998,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_language_identification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.991250857566167,
            "data_time": 0.049404902891679245,
            "batch_time": 0.06464835080233487,
            "samples_per_second": 1712244.005391262,
            "samples_per_second_per_gpu": 214030.50067390775,
            "loss_sequences_lower_95": 1.8840035225581198,
            "loss_sequences_upper_95": 2.173334740203561,
            "loss_tokens_lower_95": 1.7393524326101906,
            "loss_tokens_upper_95": 2.070723312830852,
            "sequences": 103,
            "tokens": 977,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.543595051253117,
            "data_time": 0.001616963671718639,
            "batch_time": 0.015199108937659987,
            "samples_per_second": 2277270.1006642417,
            "samples_per_second_per_gpu": 284658.7625830302,
            "loss_sequences_lower_95": 5.915340455892951,
            "loss_sequences_upper_95": 5.965800494627883,
            "loss_tokens_lower_95": 4.954757410541586,
            "loss_tokens_upper_95": 5.003086363636363,
            "sequences": 38160,
            "tokens": 64625,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.715963775634766,
            "data_time": 0.0061809638189891025,
            "batch_time": 0.01971884568532308,
            "samples_per_second": 2255263.3249382535,
            "samples_per_second_per_gpu": 281907.9156172817,
            "loss_sequences_lower_95": 6.7122170410156246,
            "loss_sequences_upper_95": 7.043018420410156,
            "loss_tokens_lower_95": 6.3862379270289535,
            "loss_tokens_upper_95": 6.677582603743716,
            "sequences": 1000,
            "tokens": 1293,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_dyck_languages/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.346513300356658,
            "data_time": 0.023731904514765336,
            "batch_time": 0.037873084262265995,
            "samples_per_second": 2029987.5841550895,
            "samples_per_second_per_gpu": 253748.44801938618,
            "loss_sequences_lower_95": 5.185517790421195,
            "loss_sequences_upper_95": 5.5051920086404555,
            "loss_tokens_lower_95": 5.188302081564198,
            "loss_tokens_upper_95": 5.504719861901324,
            "sequences": 230,
            "tokens": 230,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.6920339407342855,
            "data_time": 0.0047990661787699505,
            "batch_time": 0.018303877977003533,
            "samples_per_second": 2272222.269108316,
            "samples_per_second_per_gpu": 284027.7836385395,
            "loss_sequences_lower_95": 7.594098288796165,
            "loss_sequences_upper_95": 7.789382139263731,
            "loss_tokens_lower_95": 7.593256429036458,
            "loss_tokens_upper_95": 7.789609245531487,
            "sequences": 1320,
            "tokens": 1320,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.2905868932803473,
            "data_time": 0.0044279326783849835,
            "batch_time": 0.017942739610976362,
            "samples_per_second": 2276373.8746629134,
            "samples_per_second_per_gpu": 284546.7343328642,
            "loss_sequences_lower_95": 1.351638484700521,
            "loss_sequences_upper_95": 1.434244091796875,
            "loss_tokens_lower_95": 1.1850259967268157,
            "loss_tokens_upper_95": 1.257291320825205,
            "sequences": 1500,
            "tokens": 12495,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_logical_deduction/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.856872359911601,
            "data_time": 0.025649832827704295,
            "batch_time": 0.03977698300565992,
            "samples_per_second": 1956515.8305808331,
            "samples_per_second_per_gpu": 244564.47882260414,
            "loss_sequences_lower_95": 5.510077441987537,
            "loss_sequences_upper_95": 6.2007436843145465,
            "loss_tokens_lower_95": 5.516217680431548,
            "loss_tokens_upper_95": 6.200204729352678,
            "sequences": 210,
            "tokens": 210,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_operators/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.5460682958364487,
            "data_time": 0.1760566681623459,
            "batch_time": 0.1921704113483429,
            "samples_per_second": 932112.4527046613,
            "samples_per_second_per_gpu": 116514.05658808266,
            "loss_sequences_lower_95": 3.2582438409328462,
            "loss_sequences_upper_95": 4.675864040851593,
            "loss_tokens_lower_95": 2.88037353515625,
            "loss_tokens_upper_95": 3.498138163458441,
            "sequences": 32,
            "tokens": 485,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.40654901266098,
            "data_time": 0.0062651071283552386,
            "batch_time": 0.020296116669972736,
            "samples_per_second": 2176165.9784921375,
            "samples_per_second_per_gpu": 272020.7473115172,
            "loss_sequences_lower_95": 7.363071899414063,
            "loss_sequences_upper_95": 7.712778076171875,
            "loss_tokens_lower_95": 7.083311893572864,
            "loss_tokens_upper_95": 7.393597381127062,
            "sequences": 1000,
            "tokens": 1182,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.571199296951294,
            "data_time": 0.0063500286094726075,
            "batch_time": 0.02001016102139912,
            "samples_per_second": 2224862.6649158173,
            "samples_per_second_per_gpu": 278107.83311447717,
            "loss_sequences_lower_95": 6.644726684570313,
            "loss_sequences_upper_95": 6.884134753417969,
            "loss_tokens_lower_95": 6.320107272725495,
            "loss_tokens_upper_95": 6.514853591422289,
            "sequences": 1000,
            "tokens": 1997,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.629331079058466,
            "data_time": 0.004057153331794867,
            "batch_time": 0.017843209939656848,
            "samples_per_second": 2233102.8164071403,
            "samples_per_second_per_gpu": 279137.85205089254,
            "loss_sequences_lower_95": 4.612298238193513,
            "loss_sequences_upper_95": 4.646611437795948,
            "loss_tokens_lower_95": 4.612844931499853,
            "loss_tokens_upper_95": 4.646519526994113,
            "sequences": 2983,
            "tokens": 2983,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/math_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.403420707963396,
            "data_time": 0.009139707801565302,
            "batch_time": 0.022818764891149056,
            "samples_per_second": 2195039.676732253,
            "samples_per_second_per_gpu": 274379.9595915316,
            "loss_sequences_lower_95": 5.316837396343366,
            "loss_sequences_upper_95": 5.487911626344086,
            "loss_tokens_lower_95": 5.316434329922115,
            "loss_tokens_upper_95": 5.486605567756336,
            "sequences": 651,
            "tokens": 651,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/logi_qa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 7.855982986450195,
            "data_time": 0.006303843524720933,
            "batch_time": 0.02009154170278519,
            "samples_per_second": 2221995.040941407,
            "samples_per_second_per_gpu": 277749.3801176759,
            "loss_sequences_lower_95": 7.792486853027343,
            "loss_sequences_upper_95": 7.9221840087890625,
            "loss_tokens_lower_95": 7.7930734375,
            "loss_tokens_upper_95": 7.921228198242187,
            "sequences": 1000,
            "tokens": 1000,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/pubmed_qa_labeled/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.595287557097122,
            "data_time": 0.0021362697244366536,
            "batch_time": 0.015658493663953697,
            "samples_per_second": 2286094.1859297757,
            "samples_per_second_per_gpu": 285761.77324122196,
            "loss_sequences_lower_95": 5.233045387520695,
            "loss_sequences_upper_95": 5.347535828849337,
            "loss_tokens_lower_95": 3.829037394224822,
            "loss_tokens_upper_95": 3.9027629183018386,
            "sequences": 10570,
            "tokens": 46886,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/squad/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.947337050936115,
            "data_time": 0.019872588770730153,
            "batch_time": 0.033932862962995254,
            "samples_per_second": 2030464.2958066352,
            "samples_per_second_per_gpu": 253808.0369758294,
            "loss_sequences_lower_95": 5.796755298215952,
            "loss_sequences_upper_95": 6.096977154176626,
            "loss_tokens_lower_95": 5.798570524756588,
            "loss_tokens_upper_95": 6.095117813793581,
            "sequences": 268,
            "tokens": 268,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.135018423491833,
            "data_time": 0.011600564233958721,
            "batch_time": 0.02551484201103449,
            "samples_per_second": 2178749.3699250007,
            "samples_per_second_per_gpu": 272343.6712406251,
            "loss_sequences_lower_95": 6.025648145488664,
            "loss_sequences_upper_95": 6.241437605315563,
            "loss_tokens_lower_95": 6.030198735255821,
            "loss_tokens_upper_95": 6.2406106986251535,
            "sequences": 510,
            "tokens": 510,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.260170686607834,
            "data_time": 0.002221748810405879,
            "batch_time": 0.01586873217146526,
            "samples_per_second": 2270901.0386991906,
            "samples_per_second_per_gpu": 283862.6298373988,
            "loss_sequences_lower_95": 5.715242815788238,
            "loss_sequences_upper_95": 5.833788744441313,
            "loss_tokens_lower_95": 4.506242982870184,
            "loss_tokens_upper_95": 4.59810523174286,
            "sequences": 7983,
            "tokens": 27277,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/coqa/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.973916644141788,
            "data_time": 0.03011622776587804,
            "batch_time": 0.04448870817820231,
            "samples_per_second": 1968366.2549890534,
            "samples_per_second_per_gpu": 246045.78187363167,
            "loss_sequences_lower_95": 4.8858121034329525,
            "loss_sequences_upper_95": 5.060290866427952,
            "loss_tokens_lower_95": 4.885188390338231,
            "loss_tokens_upper_95": 5.059425402444507,
            "sequences": 189,
            "tokens": 189,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bigbench_understanding_fables/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.372528010114618,
            "data_time": 0.003682512940067948,
            "batch_time": 0.017232821101234072,
            "samples_per_second": 2272627.1791690416,
            "samples_per_second_per_gpu": 284078.3973961302,
            "loss_sequences_lower_95": 6.349995191848241,
            "loss_sequences_upper_95": 6.395179364965597,
            "loss_tokens_lower_95": 6.349620365061162,
            "loss_tokens_upper_95": 6.395510395761659,
            "sequences": 3270,
            "tokens": 3270,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/boolq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.890994453893124,
            "data_time": 0.025453430956060237,
            "batch_time": 0.039250516891479494,
            "samples_per_second": 1982109.1162865267,
            "samples_per_second_per_gpu": 247763.63953581583,
            "loss_sequences_lower_95": 5.735042653500455,
            "loss_sequences_upper_95": 6.045522537972164,
            "loss_tokens_lower_95": 5.736802984441368,
            "loss_tokens_upper_95": 6.046253063609299,
            "sequences": 206,
            "tokens": 206,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/agi_eval_sat_en/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.198180902004242,
            "data_time": 0.08577575534582138,
            "batch_time": 0.10047376155853271,
            "samples_per_second": 1557319.4047620597,
            "samples_per_second_per_gpu": 194664.92559525746,
            "loss_sequences_lower_95": 3.8737059529622395,
            "loss_sequences_upper_95": 4.679138145446777,
            "loss_tokens_lower_95": 3.486383194393582,
            "loss_tokens_upper_95": 4.684806230333116,
            "sequences": 60,
            "tokens": 72,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_female/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.090985735257466,
            "data_time": 0.08678988367319107,
            "batch_time": 0.10209072381258011,
            "samples_per_second": 1435601.3099233452,
            "samples_per_second_per_gpu": 179450.16374041815,
            "loss_sequences_lower_95": 3.7678252474466962,
            "loss_sequences_upper_95": 4.717545458475748,
            "loss_tokens_lower_95": 3.159876276937763,
            "loss_tokens_upper_95": 4.469012271152454,
            "sequences": 60,
            "tokens": 89,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/winogender_mc_male/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.443822445820287,
            "data_time": 0.0034751778423540588,
            "batch_time": 0.01727535389254134,
            "samples_per_second": 2236990.002364093,
            "samples_per_second_per_gpu": 279623.7502955116,
            "loss_sequences_lower_95": 6.422491068552099,
            "loss_sequences_upper_95": 6.464971494155007,
            "loss_tokens_lower_95": 6.42272373205081,
            "loss_tokens_upper_95": 6.464676943344992,
            "sequences": 3395,
            "tokens": 3395,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/enterprise_pii_classification/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 1.6561363895033552,
            "data_time": 0.0016292170244442295,
            "batch_time": 0.015113273101575526,
            "samples_per_second": 2291935.5171251344,
            "samples_per_second_per_gpu": 286491.9396406418,
            "loss_sequences_lower_95": 1.9641548537086269,
            "loss_sequences_upper_95": 2.00246284681559,
            "loss_tokens_lower_95": 1.33639505771033,
            "loss_tokens_upper_95": 1.357056264808855,
            "sequences": 58492,
            "tokens": 141385,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/bbq/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.9058022423992007,
            "data_time": 0.04379874840378761,
            "batch_time": 0.059411220252513885,
            "samples_per_second": 1923572.01136575,
            "samples_per_second_per_gpu": 240446.50142071876,
            "loss_sequences_lower_95": 2.8107716177392197,
            "loss_sequences_upper_95": 3.1442487821804255,
            "loss_tokens_lower_95": 2.6137096085498035,
            "loss_tokens_upper_95": 2.7790225211480593,
            "sequences": 127,
            "tokens": 4071,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_complex/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.781178525976233,
            "data_time": 0.13003617241269066,
            "batch_time": 0.14662798245747885,
            "samples_per_second": 955779.49570513,
            "samples_per_second_per_gpu": 119472.43696314126,
            "loss_sequences_lower_95": 4.355575819273253,
            "loss_sequences_upper_95": 5.312546054736988,
            "loss_tokens_lower_95": 4.221793167679398,
            "loss_tokens_upper_95": 5.16577212486738,
            "sequences": 37,
            "tokens": 162,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval_return_simple/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.7654510780078607,
            "data_time": 0.03248150859560285,
            "batch_time": 0.0469147108850025,
            "samples_per_second": 1934475.9152084973,
            "samples_per_second_per_gpu": 241809.48940106216,
            "loss_sequences_lower_95": 2.685733227613496,
            "loss_sequences_upper_95": 2.9806176301909657,
            "loss_tokens_lower_95": 2.5070028565890454,
            "loss_tokens_upper_95": 2.6492832409062235,
            "sequences": 164,
            "tokens": 5945,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.5/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.8146359302648682,
            "data_time": 0.03297237839017596,
            "batch_time": 0.04804244495573498,
            "samples_per_second": 1853101.5584755067,
            "samples_per_second_per_gpu": 231637.69480943834,
            "loss_sequences_lower_95": 2.7826078507958387,
            "loss_sequences_upper_95": 3.0447660957894676,
            "loss_tokens_lower_95": 2.5461538474752987,
            "loss_tokens_upper_95": 2.6624507939394717,
            "sequences": 164,
            "tokens": 8527,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.25/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.8195239647132593,
            "data_time": 0.034412168321155366,
            "batch_time": 0.048827304726555235,
            "samples_per_second": 1911009.6772843164,
            "samples_per_second_per_gpu": 238876.20966053955,
            "loss_sequences_lower_95": 2.6186176393090226,
            "loss_sequences_upper_95": 2.945585162465165,
            "loss_tokens_lower_95": 2.644933155448084,
            "loss_tokens_upper_95": 2.8312583883854763,
            "sequences": 164,
            "tokens": 3478,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval-0.75/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.915357349849329,
            "data_time": 0.032267476831163676,
            "batch_time": 0.04688917171387445,
            "samples_per_second": 1905605.0809749975,
            "samples_per_second_per_gpu": 238200.6351218747,
            "loss_sequences_lower_95": 2.8895671751441023,
            "loss_sequences_upper_95": 3.1349572344524104,
            "loss_tokens_lower_95": 2.6492095662054616,
            "loss_tokens_upper_95": 2.758704953772999,
            "sequences": 164,
            "tokens": 10272,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/human_eval/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.3850367691205894,
            "data_time": 0.03443751217406473,
            "batch_time": 0.0492705504099528,
            "samples_per_second": 1936096.030808672,
            "samples_per_second_per_gpu": 242012.003851084,
            "loss_sequences_lower_95": 2.3301272990540687,
            "loss_sequences_upper_95": 2.473991654674459,
            "loss_tokens_lower_95": 2.288605569291094,
            "loss_tokens_upper_95": 2.3723772884523986,
            "sequences": 161,
            "tokens": 17095,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_cpp/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 2.1184562982582467,
            "data_time": 0.03378179527464367,
            "batch_time": 0.04814480883734567,
            "samples_per_second": 1954616.6859125446,
            "samples_per_second_per_gpu": 244327.08573906808,
            "loss_sequences_lower_95": 2.0995378866428283,
            "loss_sequences_upper_95": 2.2653581619262697,
            "loss_tokens_lower_95": 1.923811260313065,
            "loss_tokens_upper_95": 1.9956416351152804,
            "sequences": 164,
            "tokens": 16590,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "/admin/<scrub>/scrub/training/eval_data/processed_human_eval_js/shard-0000000.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/params.txt",
    "uuid": "0480edba-6fa4-4e44-9c78-febf279c7b96",
    "creation_date": "2023_12_14-06_30_54"
}