{
    "name": "rpj-d=1024_l=24_h=8-32.0",
    "dataset_name": "rpj",
    "dataset_uuid": "67db6b77-c7c4-48ae-b431-57254587ed43",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 263434403840,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 2,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp",
            "--fsdp-limit-all-gathers"
        ],
        "chinchilla_multiplier": 32.0,
        "seed": 124
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--workers",
        "2",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--accum-freq",
        "2",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--logs",
        "logs/27127",
        "--train-num-samples",
        "52686880768",
        "--dataset-manifest",
        "<scrub>/openlm/scrub/datasets/rpj_original/manifest.jsonl",
        "--data-key",
        "json.gz",
        "--name",
        "rpj-d=1024_l=24_h=8-32.0",
        "--fsdp",
        "--fsdp-amp",
        "--fsdp-limit-all-gathers",
        "--val-data",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-data-key",
        "json",
        "txt",
        "json.gz",
        "--val-tok-ci",
        "--val-seq-ci",
        "--val-num-samples",
        "245760",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/openlm/scrub/experiments/411m_32x_rpj_original"
    ],
    "results": [
        {
            "loss": 2.5859934464097023,
            "data_time": 0.05160880088806152,
            "batch_time": 0.5063364505767822,
            "samples_per_second": 993635.4650233433,
            "samples_per_second_per_gpu": 124204.43312791792,
            "loss_sequences_lower_95": 2.527633327245712,
            "loss_sequences_upper_95": 2.642466354370117,
            "loss_tokens_lower_95": 2.575577125,
            "loss_tokens_upper_95": 2.5966258697916667,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.5509848333895206,
            "data_time": 0.04917493090033531,
            "batch_time": 0.16365930438041687,
            "samples_per_second": 1021351.8059376691,
            "samples_per_second_per_gpu": 127668.97574220864,
            "loss_sequences_lower_95": 2.4872345209121702,
            "loss_sequences_upper_95": 2.6124478459358214,
            "loss_tokens_lower_95": 2.540697885416667,
            "loss_tokens_upper_95": 2.5614535104166665,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_010.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6093563586473465,
            "data_time": 0.046611107885837555,
            "batch_time": 0.1608918309211731,
            "samples_per_second": 1024262.6035064808,
            "samples_per_second_per_gpu": 128032.8254383101,
            "loss_sequences_lower_95": 2.5527053833007813,
            "loss_sequences_upper_95": 2.665910416841507,
            "loss_tokens_lower_95": 2.59885284375,
            "loss_tokens_upper_95": 2.6198729687499998,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_020.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6129791997373104,
            "data_time": 0.0467141717672348,
            "batch_time": 0.1611723192036152,
            "samples_per_second": 1021362.3364563758,
            "samples_per_second_per_gpu": 127670.29205704697,
            "loss_sequences_lower_95": 2.5576188445091246,
            "loss_sequences_upper_95": 2.667377781867981,
            "loss_tokens_lower_95": 2.60246290625,
            "loss_tokens_upper_95": 2.6234332760416668,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_030.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6008135471493006,
            "data_time": 0.04673510417342186,
            "batch_time": 0.16139161214232445,
            "samples_per_second": 1020173.6182154852,
            "samples_per_second_per_gpu": 127521.70227693566,
            "loss_sequences_lower_95": 2.547404134273529,
            "loss_sequences_upper_95": 2.6540302455425264,
            "loss_tokens_lower_95": 2.5902631822916664,
            "loss_tokens_upper_95": 2.6115929947916663,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_040.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.630616333335638,
            "data_time": 0.046836454421281815,
            "batch_time": 0.16147257760167122,
            "samples_per_second": 1021554.9887623225,
            "samples_per_second_per_gpu": 127694.37359529031,
            "loss_sequences_lower_95": 2.5796908020973204,
            "loss_sequences_upper_95": 2.679863780736923,
            "loss_tokens_lower_95": 2.619983588541667,
            "loss_tokens_upper_95": 2.6411583385416666,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_050.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.682670619338751,
            "data_time": 0.047156475484371185,
            "batch_time": 0.16150449961423874,
            "samples_per_second": 1023302.192104592,
            "samples_per_second_per_gpu": 127912.774013074,
            "loss_sequences_lower_95": 2.6311958193778993,
            "loss_sequences_upper_95": 2.732639807462692,
            "loss_tokens_lower_95": 2.671890197916667,
            "loss_tokens_upper_95": 2.693214505208333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_060.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.699781384319067,
            "data_time": 0.04691166430711746,
            "batch_time": 0.16124945878982544,
            "samples_per_second": 1024188.789188327,
            "samples_per_second_per_gpu": 128023.59864854088,
            "loss_sequences_lower_95": 2.651801770925522,
            "loss_sequences_upper_95": 2.7483743369579314,
            "loss_tokens_lower_95": 2.6893118541666667,
            "loss_tokens_upper_95": 2.710717567708333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_070.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.6905221305787563,
            "data_time": 0.04749887436628342,
            "batch_time": 0.16188474372029305,
            "samples_per_second": 1022587.0758256074,
            "samples_per_second_per_gpu": 127823.38447820093,
            "loss_sequences_lower_95": 2.6457138121128083,
            "loss_sequences_upper_95": 2.7342768728733065,
            "loss_tokens_lower_95": 2.6797724218749996,
            "loss_tokens_upper_95": 2.701259380208333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_080.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.690047701820731,
            "data_time": 0.04787244275212288,
            "batch_time": 0.1622590385377407,
            "samples_per_second": 1021761.2063213527,
            "samples_per_second_per_gpu": 127720.1507901691,
            "loss_sequences_lower_95": 2.6473310351371766,
            "loss_sequences_upper_95": 2.7329026401042937,
            "loss_tokens_lower_95": 2.67939871875,
            "loss_tokens_upper_95": 2.7005464895833335,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_090.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.7140957675874233,
            "data_time": 0.04642908275127411,
            "batch_time": 0.16076669842004776,
            "samples_per_second": 1023821.8851101229,
            "samples_per_second_per_gpu": 127977.73563876536,
            "loss_sequences_lower_95": 2.671124219894409,
            "loss_sequences_upper_95": 2.7573738038539886,
            "loss_tokens_lower_95": 2.703414270833333,
            "loss_tokens_upper_95": 2.724670421875,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_100.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=1024_l=24_h=8-32.0/params.txt",
    "uuid": "f17f8e15-c808-4630-862c-ffd777d49feb",
    "creation_date": "2024_01_26-08_20_56"
}