{
    "name": "c4_original-d=1024_l=24_h=8-8.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 65858600960,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 8.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "13171720192",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=1024_l=24_h=8-8.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 2.9311233586631715,
            "data_time": 0.011208887211978436,
            "batch_time": 0.12812560331076384,
            "samples_per_second": 880605.7479044019,
            "samples_per_second_per_gpu": 110075.71848805023,
            "loss_sequences_lower_95": 2.8789745032787324,
            "loss_sequences_upper_95": 2.981768012046814,
            "loss_tokens_lower_95": 2.9195222760416666,
            "loss_tokens_upper_95": 2.9429565989583333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.921550034545362,
            "data_time": 0.010359046049416065,
            "batch_time": 0.046058233827352524,
            "samples_per_second": 880961.1163130734,
            "samples_per_second_per_gpu": 110120.13953913418,
            "loss_sequences_lower_95": 2.863480734825134,
            "loss_sequences_upper_95": 2.979804527759552,
            "loss_tokens_lower_95": 2.909907,
            "loss_tokens_upper_95": 2.9330515,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_010.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 2.988629949744791,
            "data_time": 0.01065461989492178,
            "batch_time": 0.046172778122127056,
            "samples_per_second": 883474.6158171335,
            "samples_per_second_per_gpu": 110434.32697714168,
            "loss_sequences_lower_95": 2.9343353629112245,
            "loss_sequences_upper_95": 3.0431387066841125,
            "loss_tokens_lower_95": 2.9771598229166667,
            "loss_tokens_upper_95": 2.9998928541666667,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_020.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.017097958829254,
            "data_time": 0.010268723592162132,
            "batch_time": 0.04595402721315622,
            "samples_per_second": 882333.6160004798,
            "samples_per_second_per_gpu": 110291.70200005997,
            "loss_sequences_lower_95": 2.964470440149307,
            "loss_sequences_upper_95": 3.0691692769527434,
            "loss_tokens_lower_95": 3.0056996979166666,
            "loss_tokens_upper_95": 3.028424390625,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_030.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.0327209271490574,
            "data_time": 0.010003837756812572,
            "batch_time": 0.04555748216807842,
            "samples_per_second": 882335.3501820815,
            "samples_per_second_per_gpu": 110291.91877276018,
            "loss_sequences_lower_95": 2.979351657629013,
            "loss_sequences_upper_95": 3.0860198080539702,
            "loss_tokens_lower_95": 3.021230109375,
            "loss_tokens_upper_95": 3.0438517552083333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_040.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1005311398766935,
            "data_time": 0.010503319092094898,
            "batch_time": 0.04624045081436634,
            "samples_per_second": 881087.8800889459,
            "samples_per_second_per_gpu": 110135.98501111823,
            "loss_sequences_lower_95": 3.050862556695938,
            "loss_sequences_upper_95": 3.149190014600754,
            "loss_tokens_lower_95": 3.089165229166667,
            "loss_tokens_upper_95": 3.1120339895833333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_050.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.1635805889964104,
            "data_time": 0.009876802563667297,
            "batch_time": 0.045329032465815544,
            "samples_per_second": 886001.7683228146,
            "samples_per_second_per_gpu": 110750.22104035183,
            "loss_sequences_lower_95": 3.1113265097141265,
            "loss_sequences_upper_95": 3.2154844045639037,
            "loss_tokens_lower_95": 3.1522392291666668,
            "loss_tokens_upper_95": 3.1749434322916668,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_060.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.199618529062718,
            "data_time": 0.010167363099753857,
            "batch_time": 0.04598442558199167,
            "samples_per_second": 880136.9534987324,
            "samples_per_second_per_gpu": 110017.11918734155,
            "loss_sequences_lower_95": 3.150152009725571,
            "loss_sequences_upper_95": 3.2476491808891295,
            "loss_tokens_lower_95": 3.1879865781250003,
            "loss_tokens_upper_95": 3.210938078125,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_070.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.2440827023237944,
            "data_time": 0.010433435440063477,
            "batch_time": 0.04641058295965195,
            "samples_per_second": 875205.5021164585,
            "samples_per_second_per_gpu": 109400.68776455731,
            "loss_sequences_lower_95": 3.1981602609157562,
            "loss_sequences_upper_95": 3.2899386882781982,
            "loss_tokens_lower_95": 3.232528557291667,
            "loss_tokens_upper_95": 3.2556417187499997,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_080.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.233315106946975,
            "data_time": 0.010096265003085136,
            "batch_time": 0.04567021410912275,
            "samples_per_second": 884166.3626024332,
            "samples_per_second_per_gpu": 110520.79532530415,
            "loss_sequences_lower_95": 3.188864368200302,
            "loss_sequences_upper_95": 3.2771695256233215,
            "loss_tokens_lower_95": 3.221864932291667,
            "loss_tokens_upper_95": 3.244882651041667,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_090.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.322265474591404,
            "data_time": 0.009846691973507404,
            "batch_time": 0.045433335937559605,
            "samples_per_second": 883140.0951416426,
            "samples_per_second_per_gpu": 110392.51189270533,
            "loss_sequences_lower_95": 3.278827279806137,
            "loss_sequences_upper_95": 3.364972746372223,
            "loss_tokens_lower_95": 3.310794088541667,
            "loss_tokens_upper_95": 3.333868171875,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_100.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/params.txt",
    "uuid": "06ca6ffb-ba45-4139-a432-2a66ba95b592",
    "creation_date": "2023_12_14-13_35_23"
}