{
    "name": "c4_original-d=1024_l=24_h=8-0.5",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=1024_l=24_h=8.json",
        "tokens": 4116162560,
        "warmup": 2000,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 411616256,
        "params_no_embed": 359973888,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.5
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "823232512",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "2000",
        "--model",
        "training/open_lm_configs/d=1024_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=1024_l=24_h=8-0.5",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.268487725406885,
            "data_time": 0.011498968116939068,
            "batch_time": 0.13203256949782372,
            "samples_per_second": 880378.9525699415,
            "samples_per_second_per_gpu": 110047.36907124269,
            "loss_sequences_lower_95": 3.215274530649185,
            "loss_sequences_upper_95": 3.3206111073493956,
            "loss_tokens_lower_95": 3.2565876562500002,
            "loss_tokens_upper_95": 3.2806010677083335,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_000.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.262610549107194,
            "data_time": 0.00974537804722786,
            "batch_time": 0.04539656639099121,
            "samples_per_second": 886589.7469179712,
            "samples_per_second_per_gpu": 110823.7183647464,
            "loss_sequences_lower_95": 3.2004975080490112,
            "loss_sequences_upper_95": 3.3246654272079468,
            "loss_tokens_lower_95": 3.250778328125,
            "loss_tokens_upper_95": 3.2744760572916665,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_010.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3541545644402504,
            "data_time": 0.010041466914117336,
            "batch_time": 0.04554453305900097,
            "samples_per_second": 886940.1166357122,
            "samples_per_second_per_gpu": 110867.51457946403,
            "loss_sequences_lower_95": 3.2977225124835967,
            "loss_sequences_upper_95": 3.4112503826618195,
            "loss_tokens_lower_95": 3.3422480885416666,
            "loss_tokens_upper_95": 3.3658221249999998,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_020.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.3894361313432455,
            "data_time": 0.009496554732322693,
            "batch_time": 0.045183450914919376,
            "samples_per_second": 881816.9769365129,
            "samples_per_second_per_gpu": 110227.12211706411,
            "loss_sequences_lower_95": 3.3312223672866823,
            "loss_sequences_upper_95": 3.4452390909194945,
            "loss_tokens_lower_95": 3.377509744791667,
            "loss_tokens_upper_95": 3.400890375,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_030.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.4227245757356286,
            "data_time": 0.00980648398399353,
            "batch_time": 0.04545002244412899,
            "samples_per_second": 886537.0112906263,
            "samples_per_second_per_gpu": 110817.12641132828,
            "loss_sequences_lower_95": 3.36434069275856,
            "loss_sequences_upper_95": 3.481045186519623,
            "loss_tokens_lower_95": 3.4111010312500003,
            "loss_tokens_upper_95": 3.434301734375,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_040.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5129170729778707,
            "data_time": 0.009931877255439758,
            "batch_time": 0.04544846620410681,
            "samples_per_second": 886159.9259428976,
            "samples_per_second_per_gpu": 110769.9907428622,
            "loss_sequences_lower_95": 3.4570028424263,
            "loss_sequences_upper_95": 3.566692715883255,
            "loss_tokens_lower_95": 3.5012449375,
            "loss_tokens_upper_95": 3.5246540833333335,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_050.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.5977753894403577,
            "data_time": 0.009891904890537262,
            "batch_time": 0.04561514873057604,
            "samples_per_second": 883912.8896567873,
            "samples_per_second_per_gpu": 110489.11120709841,
            "loss_sequences_lower_95": 3.5387950778007506,
            "loss_sequences_upper_95": 3.6559614837169647,
            "loss_tokens_lower_95": 3.5859920208333333,
            "loss_tokens_upper_95": 3.6095629687499997,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_060.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.6519958144053817,
            "data_time": 0.01005645003169775,
            "batch_time": 0.04579588118940592,
            "samples_per_second": 880537.2582277532,
            "samples_per_second_per_gpu": 110067.15727846915,
            "loss_sequences_lower_95": 3.596957778930664,
            "loss_sequences_upper_95": 3.705037271976471,
            "loss_tokens_lower_95": 3.6401256979166665,
            "loss_tokens_upper_95": 3.663787322916667,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_070.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.707444384228438,
            "data_time": 0.009933969005942345,
            "batch_time": 0.04562277067452669,
            "samples_per_second": 882480.8091720319,
            "samples_per_second_per_gpu": 110310.10114650399,
            "loss_sequences_lower_95": 3.6564437687397002,
            "loss_sequences_upper_95": 3.7575317919254303,
            "loss_tokens_lower_95": 3.6956255312499997,
            "loss_tokens_upper_95": 3.71925884375,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_080.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.71201703697443,
            "data_time": 0.009666020050644875,
            "batch_time": 0.04523248225450516,
            "samples_per_second": 887288.6305546475,
            "samples_per_second_per_gpu": 110911.07881933094,
            "loss_sequences_lower_95": 3.663793647289276,
            "loss_sequences_upper_95": 3.7598956763744353,
            "loss_tokens_lower_95": 3.70047625,
            "loss_tokens_upper_95": 3.723496354166667,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_090.tar"
            ],
            "model": "d=1024_l=24_h=8"
        },
        {
            "loss": 3.8321622228249907,
            "data_time": 0.01009007915854454,
            "batch_time": 0.04576942231506109,
            "samples_per_second": 883451.5194509153,
            "samples_per_second_per_gpu": 110431.43993136441,
            "loss_sequences_lower_95": 3.7856821775436402,
            "loss_sequences_upper_95": 3.8778163373470305,
            "loss_tokens_lower_95": 3.8204483333333332,
            "loss_tokens_upper_95": 3.8438454687499997,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_100.tar"
            ],
            "model": "d=1024_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/params.txt",
    "uuid": "3de0077a-6fe5-46cc-a473-9cdcff47681d",
    "creation_date": "2023_12_14-04_59_54"
}