{
    "name": "rw_original-d=512_l=8_h=4-2.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 3156561920,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 2.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "631312384",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=512_l=8_h=4-2.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.585495828650892,
            "data_time": 0.008139227516949177,
            "batch_time": 0.08262541424483061,
            "samples_per_second": 2241677.1228557006,
            "samples_per_second_per_gpu": 280209.64035696257,
            "loss_sequences_lower_95": 3.5225554049015044,
            "loss_sequences_upper_95": 3.6487263977527618,
            "loss_tokens_lower_95": 3.5735393229166665,
            "loss_tokens_upper_95": 3.5977059583333335,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.62714654346928,
            "data_time": 0.009310257621109486,
            "batch_time": 0.02308223955333233,
            "samples_per_second": 2238673.913759993,
            "samples_per_second_per_gpu": 279834.2392199991,
            "loss_sequences_lower_95": 3.5557835698127747,
            "loss_sequences_upper_95": 3.698338806629181,
            "loss_tokens_lower_95": 3.6151107395833333,
            "loss_tokens_upper_95": 3.6391629791666666,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_010.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.767040796112269,
            "data_time": 0.009438770823180676,
            "batch_time": 0.022969303652644157,
            "samples_per_second": 2284162.710678188,
            "samples_per_second_per_gpu": 285520.3388347735,
            "loss_sequences_lower_95": 3.700102871656418,
            "loss_sequences_upper_95": 3.8340927779674527,
            "loss_tokens_lower_95": 3.7550733333333337,
            "loss_tokens_upper_95": 3.7788617083333333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_020.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.854221375659108,
            "data_time": 0.009042483754456043,
            "batch_time": 0.022919930517673492,
            "samples_per_second": 2223290.172441482,
            "samples_per_second_per_gpu": 277911.27155518526,
            "loss_sequences_lower_95": 3.7827142477035522,
            "loss_sequences_upper_95": 3.923028790950775,
            "loss_tokens_lower_95": 3.8422953229166663,
            "loss_tokens_upper_95": 3.8660470937499998,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_030.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.952559331431985,
            "data_time": 0.009084356017410755,
            "batch_time": 0.0229610875248909,
            "samples_per_second": 2237027.8559904047,
            "samples_per_second_per_gpu": 279628.4819988006,
            "loss_sequences_lower_95": 3.8758158326148986,
            "loss_sequences_upper_95": 4.030240762233734,
            "loss_tokens_lower_95": 3.9404918645833336,
            "loss_tokens_upper_95": 3.9642145833333333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_040.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.084096108563244,
            "data_time": 0.008649105206131935,
            "batch_time": 0.02249837853014469,
            "samples_per_second": 2218173.342807745,
            "samples_per_second_per_gpu": 277271.66785096814,
            "loss_sequences_lower_95": 4.009812533855438,
            "loss_sequences_upper_95": 4.156556296348572,
            "loss_tokens_lower_95": 4.0722708125,
            "loss_tokens_upper_95": 4.0962071770833335,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_050.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.24359274096787,
            "data_time": 0.008742463774979115,
            "batch_time": 0.0226196413859725,
            "samples_per_second": 2225583.7379270755,
            "samples_per_second_per_gpu": 278197.96724088443,
            "loss_sequences_lower_95": 4.167984688282013,
            "loss_sequences_upper_95": 4.318029534816742,
            "loss_tokens_lower_95": 4.23166490625,
            "loss_tokens_upper_95": 4.255607520833333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_060.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.33060020301491,
            "data_time": 0.008856839500367641,
            "batch_time": 0.022666454315185547,
            "samples_per_second": 2242476.076867908,
            "samples_per_second_per_gpu": 280309.5096084885,
            "loss_sequences_lower_95": 4.2570389866828915,
            "loss_sequences_upper_95": 4.401836693286896,
            "loss_tokens_lower_95": 4.31836565625,
            "loss_tokens_upper_95": 4.342781020833334,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_070.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.4398222249001265,
            "data_time": 0.009281249716877937,
            "batch_time": 0.023067588917911053,
            "samples_per_second": 2245138.068584001,
            "samples_per_second_per_gpu": 280642.25857300014,
            "loss_sequences_lower_95": 4.373601448535919,
            "loss_sequences_upper_95": 4.505112171173096,
            "loss_tokens_lower_95": 4.427712791666667,
            "loss_tokens_upper_95": 4.4519471875,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_080.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.492805204354227,
            "data_time": 0.009371524676680565,
            "batch_time": 0.023184075951576233,
            "samples_per_second": 2240950.127981767,
            "samples_per_second_per_gpu": 280118.7659977209,
            "loss_sequences_lower_95": 4.4298504114151,
            "loss_sequences_upper_95": 4.552693736553192,
            "loss_tokens_lower_95": 4.480783802083334,
            "loss_tokens_upper_95": 4.504751822916666,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_090.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.74529692158103,
            "data_time": 0.00911820586770773,
            "batch_time": 0.022777329199016094,
            "samples_per_second": 2243133.4687551316,
            "samples_per_second_per_gpu": 280391.68359439145,
            "loss_sequences_lower_95": 4.6657912015914915,
            "loss_sequences_upper_95": 4.836609411239624,
            "loss_tokens_lower_95": 4.733077604166667,
            "loss_tokens_upper_95": 4.757773479166667,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_100.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=512_l=8_h=4-2.0/params.txt",
    "uuid": "f859cec4-21b6-4818-a08b-3315a17437e2",
    "creation_date": "2023_12_14-05_01_12"
}