{
    "name": "rpj-d=512_l=8_h=4-0.5",
    "dataset_name": "rpj",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf6",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 789140480,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.5
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "157828096",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/rpj_tokenized_upsampled_eleutherai/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rpj-d=512_l=8_h=4-0.5",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.9899745546281338,
            "data_time": 0.008252399042248726,
            "batch_time": 0.08230332564562559,
            "samples_per_second": 2241942.1114205215,
            "samples_per_second_per_gpu": 280242.7639275652,
            "loss_sequences_lower_95": 3.926061135530472,
            "loss_sequences_upper_95": 4.0530879378318785,
            "loss_tokens_lower_95": 3.9780180416666666,
            "loss_tokens_upper_95": 4.002357989583333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.0736039825715125,
            "data_time": 0.008952825330197811,
            "batch_time": 0.022916062735021114,
            "samples_per_second": 2212500.0971440305,
            "samples_per_second_per_gpu": 276562.5121430038,
            "loss_sequences_lower_95": 3.9952697157859802,
            "loss_sequences_upper_95": 4.151147520542144,
            "loss_tokens_lower_95": 4.061323135416666,
            "loss_tokens_upper_95": 4.0857859479166665,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_010.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.221406331285834,
            "data_time": 0.00914106983691454,
            "batch_time": 0.02260405942797661,
            "samples_per_second": 2284415.7718893965,
            "samples_per_second_per_gpu": 285551.97148617456,
            "loss_sequences_lower_95": 4.147596096992492,
            "loss_sequences_upper_95": 4.295156276226043,
            "loss_tokens_lower_95": 4.208847447916667,
            "loss_tokens_upper_95": 4.233848875,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_020.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.319381681270897,
            "data_time": 0.008975711651146412,
            "batch_time": 0.02285716775804758,
            "samples_per_second": 2217508.412390526,
            "samples_per_second_per_gpu": 277188.55154881574,
            "loss_sequences_lower_95": 4.240851259231567,
            "loss_sequences_upper_95": 4.394251358509064,
            "loss_tokens_lower_95": 4.307224104166667,
            "loss_tokens_upper_95": 4.33165340625,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_030.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.4283766923472285,
            "data_time": 0.008450202643871307,
            "batch_time": 0.022822247818112373,
            "samples_per_second": 2175148.723305137,
            "samples_per_second_per_gpu": 271893.5904131421,
            "loss_sequences_lower_95": 4.345486855506897,
            "loss_sequences_upper_95": 4.511222970485687,
            "loss_tokens_lower_95": 4.4157162083333334,
            "loss_tokens_upper_95": 4.440724229166666,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_040.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.554925155825913,
            "data_time": 0.009267346002161503,
            "batch_time": 0.022792152129113674,
            "samples_per_second": 2290808.2256615073,
            "samples_per_second_per_gpu": 286351.0282076884,
            "loss_sequences_lower_95": 4.473655879497528,
            "loss_sequences_upper_95": 4.633584797382355,
            "loss_tokens_lower_95": 4.542748708333334,
            "loss_tokens_upper_95": 4.567531520833334,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_050.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.732027578167617,
            "data_time": 0.008764879778027534,
            "batch_time": 0.022757981903851032,
            "samples_per_second": 2228494.394997299,
            "samples_per_second_per_gpu": 278561.7993746624,
            "loss_sequences_lower_95": 4.651060545444489,
            "loss_sequences_upper_95": 4.811895394325257,
            "loss_tokens_lower_95": 4.7198135937500005,
            "loss_tokens_upper_95": 4.744382947916667,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_060.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.840252116322517,
            "data_time": 0.008671626448631287,
            "batch_time": 0.022181793116033077,
            "samples_per_second": 2277534.307695716,
            "samples_per_second_per_gpu": 284691.7884619645,
            "loss_sequences_lower_95": 4.7624705791473385,
            "loss_sequences_upper_95": 4.915029394626617,
            "loss_tokens_lower_95": 4.8279860625,
            "loss_tokens_upper_95": 4.8526405,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_070.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.940921317785978,
            "data_time": 0.00875498354434967,
            "batch_time": 0.022588087245821953,
            "samples_per_second": 2222617.2112816214,
            "samples_per_second_per_gpu": 277827.15141020267,
            "loss_sequences_lower_95": 4.8679158091545105,
            "loss_sequences_upper_95": 5.011430120468139,
            "loss_tokens_lower_95": 4.9285685,
            "loss_tokens_upper_95": 4.953331895833333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_080.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.021685889922082,
            "data_time": 0.009394191205501556,
            "batch_time": 0.022993555292487144,
            "samples_per_second": 2257732.385764694,
            "samples_per_second_per_gpu": 282216.54822058673,
            "loss_sequences_lower_95": 4.95437262058258,
            "loss_sequences_upper_95": 5.086564815044403,
            "loss_tokens_lower_95": 5.009488125,
            "loss_tokens_upper_95": 5.033758135416667,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_090.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.202870168723166,
            "data_time": 0.009233411401510239,
            "batch_time": 0.02295506838709116,
            "samples_per_second": 2270942.4044225058,
            "samples_per_second_per_gpu": 283867.8005528132,
            "loss_sequences_lower_95": 5.139790034294128,
            "loss_sequences_upper_95": 5.264601576328277,
            "loss_tokens_lower_95": 5.191077270833333,
            "loss_tokens_upper_95": 5.21503296875,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/checkpoints/epoch_5.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_100.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rpj-d=512_l=8_h=4-0.5/params.txt",
    "uuid": "0480edba-6fa4-4e44-9c78-febf279c7b96",
    "creation_date": "2023_12_14-06_30_54"
}