{
    "name": "rpj-d=96_l=8_h=4-16.0",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=96_l=8_h=4.json",
        "tokens": 3382179840,
        "warmup": 100,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 64,
        "acc": 1,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 10569312,
        "params_no_embed": 5727840,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 16.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "676435968",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "64",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "100",
        "--model",
        "training/open_lm_configs/d=96_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "1",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=96_l=8_h=4-16.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 4.439933031797409,
            "data_time": 0.07418341189622879,
            "batch_time": 0.5944393947720528,
            "samples_per_second": 4218729.022485053,
            "samples_per_second_per_gpu": 527341.1278106316,
            "loss_sequences_lower_95": 4.375822126865387,
            "loss_sequences_upper_95": 4.502169191837311,
            "loss_tokens_lower_95": 4.4275615625,
            "loss_tokens_upper_95": 4.452712593749999,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_000.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.546804890036583,
            "data_time": 0.08344615995883942,
            "batch_time": 0.1282683163881302,
            "samples_per_second": 4269375.158047148,
            "samples_per_second_per_gpu": 533671.8947558935,
            "loss_sequences_lower_95": 4.465212416648865,
            "loss_sequences_upper_95": 4.626812744140625,
            "loss_tokens_lower_95": 4.533905572916667,
            "loss_tokens_upper_95": 4.559170291666667,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_010.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.709052190184593,
            "data_time": 0.09313441067934036,
            "batch_time": 0.13845976442098618,
            "samples_per_second": 4228694.262115326,
            "samples_per_second_per_gpu": 528586.7827644157,
            "loss_sequences_lower_95": 4.634310019016266,
            "loss_sequences_upper_95": 4.786081075668335,
            "loss_tokens_lower_95": 4.696517083333333,
            "loss_tokens_upper_95": 4.721801500000001,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_020.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.8311930149793625,
            "data_time": 0.08479568362236023,
            "batch_time": 0.12951208651065826,
            "samples_per_second": 4336762.363548653,
            "samples_per_second_per_gpu": 542095.2954435817,
            "loss_sequences_lower_95": 4.749635136127472,
            "loss_sequences_upper_95": 4.913470506668091,
            "loss_tokens_lower_95": 4.818924364583333,
            "loss_tokens_upper_95": 4.84360925,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_030.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 4.954751148819923,
            "data_time": 0.09276996552944183,
            "batch_time": 0.13741455227136612,
            "samples_per_second": 4341043.264877939,
            "samples_per_second_per_gpu": 542630.4081097423,
            "loss_sequences_lower_95": 4.865317094326019,
            "loss_sequences_upper_95": 5.043069887161255,
            "loss_tokens_lower_95": 4.941838729166666,
            "loss_tokens_upper_95": 4.967050250000001,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_040.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.105965122580528,
            "data_time": 0.08721298724412918,
            "batch_time": 0.13228625059127808,
            "samples_per_second": 4332711.985341871,
            "samples_per_second_per_gpu": 541588.9981677339,
            "loss_sequences_lower_95": 5.02001428604126,
            "loss_sequences_upper_95": 5.192256700992584,
            "loss_tokens_lower_95": 5.0934385312499995,
            "loss_tokens_upper_95": 5.118388260416666,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_050.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.303593769669533,
            "data_time": 0.08927329629659653,
            "batch_time": 0.13367357105016708,
            "samples_per_second": 4317635.553603305,
            "samples_per_second_per_gpu": 539704.4442004131,
            "loss_sequences_lower_95": 5.217483770847321,
            "loss_sequences_upper_95": 5.387184071540832,
            "loss_tokens_lower_95": 5.290944104166667,
            "loss_tokens_upper_95": 5.3162377916666665,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_060.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.423750519752502,
            "data_time": 0.08763265609741211,
            "batch_time": 0.1336667761206627,
            "samples_per_second": 4177680.6353573375,
            "samples_per_second_per_gpu": 522210.0794196672,
            "loss_sequences_lower_95": 5.342403995990753,
            "loss_sequences_upper_95": 5.502926397323608,
            "loss_tokens_lower_95": 5.41130675,
            "loss_tokens_upper_95": 5.43595878125,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_070.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.548287749290466,
            "data_time": 0.09185604006052017,
            "batch_time": 0.1365254446864128,
            "samples_per_second": 4336386.239282856,
            "samples_per_second_per_gpu": 542048.279910357,
            "loss_sequences_lower_95": 5.4701862096786495,
            "loss_sequences_upper_95": 5.624940943717957,
            "loss_tokens_lower_95": 5.535629635416667,
            "loss_tokens_upper_95": 5.560560791666666,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_080.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.642346754670143,
            "data_time": 0.08849015831947327,
            "batch_time": 0.13380296528339386,
            "samples_per_second": 4221711.701137855,
            "samples_per_second_per_gpu": 527713.9626422318,
            "loss_sequences_lower_95": 5.572579503059387,
            "loss_sequences_upper_95": 5.709129476547242,
            "loss_tokens_lower_95": 5.630091531250001,
            "loss_tokens_upper_95": 5.654549697916666,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_090.tar"
            ],
            "model": "d=96_l=8_h=4"
        },
        {
            "loss": 5.847593039274216,
            "data_time": 0.09089977294206619,
            "batch_time": 0.1362130418419838,
            "samples_per_second": 4214061.162345065,
            "samples_per_second_per_gpu": 526757.6452931331,
            "loss_sequences_lower_95": 5.7840402841567995,
            "loss_sequences_upper_95": 5.9078766226768495,
            "loss_tokens_lower_95": 5.835585166666666,
            "loss_tokens_upper_95": 5.859470989583333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_100.tar"
            ],
            "model": "d=96_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=96_l=8_h=4-16.0/params.txt",
    "uuid": "1084ea36-5383-4043-8c42-00f5a786b48e",
    "creation_date": "2023_12_14-06_02_11"
}