{
    "name": "c4_original-d=512_l=8_h=4-4.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 6313123840,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 4.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "1262624768",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=512_l=8_h=4-4.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.6333569260314107,
            "data_time": 0.008441929705440998,
            "batch_time": 0.09312698617577553,
            "samples_per_second": 2234738.2735108314,
            "samples_per_second_per_gpu": 279342.2841888539,
            "loss_sequences_lower_95": 3.579640144109726,
            "loss_sequences_upper_95": 3.6863079786300657,
            "loss_tokens_lower_95": 3.621011,
            "loss_tokens_upper_95": 3.6458725729166663,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.647206110879779,
            "data_time": 0.008951657451689243,
            "batch_time": 0.023185575380921364,
            "samples_per_second": 2167475.6446006885,
            "samples_per_second_per_gpu": 270934.45557508606,
            "loss_sequences_lower_95": 3.582366406917572,
            "loss_sequences_upper_95": 3.711468017101288,
            "loss_tokens_lower_95": 3.63496578125,
            "loss_tokens_upper_95": 3.659584697916667,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_010.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.752702488563955,
            "data_time": 0.008881553076207638,
            "batch_time": 0.02262055967003107,
            "samples_per_second": 2243061.7159276414,
            "samples_per_second_per_gpu": 280382.7144909552,
            "loss_sequences_lower_95": 3.6936244308948516,
            "loss_sequences_upper_95": 3.8121499478816987,
            "loss_tokens_lower_95": 3.7405174687500002,
            "loss_tokens_upper_95": 3.7648400208333332,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_020.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.788985423743725,
            "data_time": 0.008885260671377182,
            "batch_time": 0.02264903299510479,
            "samples_per_second": 2244205.8103239774,
            "samples_per_second_per_gpu": 280525.7262904972,
            "loss_sequences_lower_95": 3.7270530462265015,
            "loss_sequences_upper_95": 3.847899353504181,
            "loss_tokens_lower_95": 3.7767379895833333,
            "loss_tokens_upper_95": 3.8008986875,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_030.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.836777938529849,
            "data_time": 0.00936056300997734,
            "batch_time": 0.023500315845012665,
            "samples_per_second": 2181190.0743656806,
            "samples_per_second_per_gpu": 272648.7592957101,
            "loss_sequences_lower_95": 3.774627071619034,
            "loss_sequences_upper_95": 3.899297606945038,
            "loss_tokens_lower_95": 3.82458684375,
            "loss_tokens_upper_95": 3.84875596875,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_040.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 3.94356751954183,
            "data_time": 0.008932131342589855,
            "batch_time": 0.022863631136715412,
            "samples_per_second": 2229440.6369267637,
            "samples_per_second_per_gpu": 278680.07961584546,
            "loss_sequences_lower_95": 3.8836659491062164,
            "loss_sequences_upper_95": 4.002013254165649,
            "loss_tokens_lower_95": 3.9315691145833336,
            "loss_tokens_upper_95": 3.9557911875,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_050.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.03720894176513,
            "data_time": 0.009188790805637836,
            "batch_time": 0.022702001966536045,
            "samples_per_second": 2274755.8967494117,
            "samples_per_second_per_gpu": 284344.48709367646,
            "loss_sequences_lower_95": 3.9738366544246673,
            "loss_sequences_upper_95": 4.099692010879517,
            "loss_tokens_lower_95": 4.025074635416667,
            "loss_tokens_upper_95": 4.049176510416667,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_060.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.101603830233216,
            "data_time": 0.00899230595678091,
            "batch_time": 0.023008679039776325,
            "samples_per_second": 2208599.4814330637,
            "samples_per_second_per_gpu": 276074.93517913297,
            "loss_sequences_lower_95": 4.042573261260986,
            "loss_sequences_upper_95": 4.158416080474853,
            "loss_tokens_lower_95": 4.089449770833333,
            "loss_tokens_upper_95": 4.113691010416666,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_070.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.17176432069391,
            "data_time": 0.009381856769323349,
            "batch_time": 0.023137973621487617,
            "samples_per_second": 2248436.5431684796,
            "samples_per_second_per_gpu": 281054.56789605995,
            "loss_sequences_lower_95": 4.115781509876252,
            "loss_sequences_upper_95": 4.22663197517395,
            "loss_tokens_lower_95": 4.159628656250001,
            "loss_tokens_upper_95": 4.183911927083334,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_080.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.183858385309577,
            "data_time": 0.009241754189133644,
            "batch_time": 0.023116111755371094,
            "samples_per_second": 2232429.1567752124,
            "samples_per_second_per_gpu": 279053.64459690155,
            "loss_sequences_lower_95": 4.131803715229035,
            "loss_sequences_upper_95": 4.235291957855225,
            "loss_tokens_lower_95": 4.172000302083333,
            "loss_tokens_upper_95": 4.195689625,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_090.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 4.334569435566664,
            "data_time": 0.008903918787837029,
            "batch_time": 0.022925865836441517,
            "samples_per_second": 2214237.827881509,
            "samples_per_second_per_gpu": 276779.7284851886,
            "loss_sequences_lower_95": 4.284093570709229,
            "loss_sequences_upper_95": 4.384160375595092,
            "loss_tokens_lower_95": 4.3225371875,
            "loss_tokens_upper_95": 4.346674645833333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_100.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-4.0/params.txt",
    "uuid": "cfb61889-570f-4a32-93f9-64803af43400",
    "creation_date": "2023_12_14-04_59_29"
}