{
    "name": "rw_original-d=576_l=24_h=8-2.0",
    "dataset_name": "rw_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf7",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 6147095040,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 2.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "1229419008",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/refined_web_tokenized/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "json.gz",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "rw_original-d=576_l=24_h=8-2.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.22856519324705,
            "data_time": 0.008855888620018959,
            "batch_time": 0.10475687123835087,
            "samples_per_second": 1088498.0335490291,
            "samples_per_second_per_gpu": 136062.25419362864,
            "loss_sequences_lower_95": 3.1699576675891876,
            "loss_sequences_upper_95": 3.2859211921691895,
            "loss_tokens_lower_95": 3.2169815520833334,
            "loss_tokens_upper_95": 3.2404733958333334,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2509562380146235,
            "data_time": 0.009545974433422089,
            "batch_time": 0.0386585732921958,
            "samples_per_second": 1089225.8870350325,
            "samples_per_second_per_gpu": 136153.23587937906,
            "loss_sequences_lower_95": 3.182426381111145,
            "loss_sequences_upper_95": 3.318020248413086,
            "loss_tokens_lower_95": 3.239375796875,
            "loss_tokens_upper_95": 3.2624743333333335,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_010.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.3791203116998076,
            "data_time": 0.009273635223507881,
            "batch_time": 0.0378982974216342,
            "samples_per_second": 1106116.9412047414,
            "samples_per_second_per_gpu": 138264.61765059267,
            "loss_sequences_lower_95": 3.3159045219421386,
            "loss_sequences_upper_95": 3.442087525129318,
            "loss_tokens_lower_95": 3.3674553958333333,
            "loss_tokens_upper_95": 3.390683802083333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_020.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4496305249631405,
            "data_time": 0.009159822016954422,
            "batch_time": 0.038405136205255985,
            "samples_per_second": 1083671.516118744,
            "samples_per_second_per_gpu": 135458.939514843,
            "loss_sequences_lower_95": 3.3831608295440674,
            "loss_sequences_upper_95": 3.513298326730728,
            "loss_tokens_lower_95": 3.437788078125,
            "loss_tokens_upper_95": 3.4612421875000003,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_030.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5275856517255306,
            "data_time": 0.009082664735615253,
            "batch_time": 0.03749471064656973,
            "samples_per_second": 1112198.4418299107,
            "samples_per_second_per_gpu": 139024.80522873884,
            "loss_sequences_lower_95": 3.4581508338451385,
            "loss_sequences_upper_95": 3.5973754286766053,
            "loss_tokens_lower_95": 3.516047895833333,
            "loss_tokens_upper_95": 3.5392741979166664,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_040.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6361481053754687,
            "data_time": 0.009412484243512154,
            "batch_time": 0.03796699829399586,
            "samples_per_second": 1110980.1178354556,
            "samples_per_second_per_gpu": 138872.51472943195,
            "loss_sequences_lower_95": 3.5695124506950378,
            "loss_sequences_upper_95": 3.7013760983943937,
            "loss_tokens_lower_95": 3.6246419479166665,
            "loss_tokens_upper_95": 3.6481630208333335,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_050.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.782292613759637,
            "data_time": 0.00907643511891365,
            "batch_time": 0.038061702623963356,
            "samples_per_second": 1091941.783504472,
            "samples_per_second_per_gpu": 136492.722938059,
            "loss_sequences_lower_95": 3.71342511177063,
            "loss_sequences_upper_95": 3.849409079551697,
            "loss_tokens_lower_95": 3.7705762291666667,
            "loss_tokens_upper_95": 3.7941176770833334,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_060.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.8524490064010024,
            "data_time": 0.009337973780930042,
            "batch_time": 0.038452024571597576,
            "samples_per_second": 1085484.1763250567,
            "samples_per_second_per_gpu": 135685.5220406321,
            "loss_sequences_lower_95": 3.784052789211273,
            "loss_sequences_upper_95": 3.918353855609894,
            "loss_tokens_lower_95": 3.8405082604166667,
            "loss_tokens_upper_95": 3.864432604166667,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_070.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.936952672433108,
            "data_time": 0.009432383812963963,
            "batch_time": 0.039060838520526886,
            "samples_per_second": 1074998.7158475253,
            "samples_per_second_per_gpu": 134374.83948094066,
            "loss_sequences_lower_95": 3.876441776752472,
            "loss_sequences_upper_95": 3.996352857351303,
            "loss_tokens_lower_95": 3.9250107291666665,
            "loss_tokens_upper_95": 3.94906390625,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_080.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.9755279514938593,
            "data_time": 0.009284510277211666,
            "batch_time": 0.0379322050139308,
            "samples_per_second": 1102200.6334948647,
            "samples_per_second_per_gpu": 137775.0791868581,
            "loss_sequences_lower_95": 3.9189385414123534,
            "loss_sequences_upper_95": 4.0306608319282535,
            "loss_tokens_lower_95": 3.9636141250000003,
            "loss_tokens_upper_95": 3.98721059375,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_090.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.197738328948617,
            "data_time": 0.00918971560895443,
            "batch_time": 0.038015454076230526,
            "samples_per_second": 1106938.94383754,
            "samples_per_second_per_gpu": 138367.3679796925,
            "loss_sequences_lower_95": 4.12305349111557,
            "loss_sequences_upper_95": 4.284753918647766,
            "loss_tokens_lower_95": 4.185958020833334,
            "loss_tokens_upper_95": 4.210173802083333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/checkpoints/epoch_7.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_100.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/rw_original-d=576_l=24_h=8-2.0/params.txt",
    "uuid": "fff6723e-b3cf-425b-a488-fdbacacc0773",
    "creation_date": "2023_12_14-05_03_42"
}