{
    "name": "c4_original-d=576_l=24_h=8-2.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 6147095040,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 2.0
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "1229419008",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=576_l=24_h=8-2.0",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 3.4089447548612952,
            "data_time": 0.009821842424571514,
            "batch_time": 0.12083166278898716,
            "samples_per_second": 1027086.3606080583,
            "samples_per_second_per_gpu": 128385.79507600729,
            "loss_sequences_lower_95": 3.355893391370773,
            "loss_sequences_upper_95": 3.4611397683620453,
            "loss_tokens_lower_95": 3.3969772812500003,
            "loss_tokens_upper_95": 3.4211974375,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.410223683807999,
            "data_time": 0.010095465928316116,
            "batch_time": 0.03943422343581915,
            "samples_per_second": 1083850.182477155,
            "samples_per_second_per_gpu": 135481.27280964438,
            "loss_sequences_lower_95": 3.3472325921058657,
            "loss_sequences_upper_95": 3.472765153646469,
            "loss_tokens_lower_95": 3.3982672239583334,
            "loss_tokens_upper_95": 3.4220568697916667,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_010.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.509175057988614,
            "data_time": 0.009770800359547138,
            "batch_time": 0.03899097815155983,
            "samples_per_second": 1087912.505398626,
            "samples_per_second_per_gpu": 135989.06317482825,
            "loss_sequences_lower_95": 3.4517655611038207,
            "loss_sequences_upper_95": 3.5671283841133117,
            "loss_tokens_lower_95": 3.497009875,
            "loss_tokens_upper_95": 3.5209368020833334,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_020.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5473941364325583,
            "data_time": 0.009767846204340458,
            "batch_time": 0.03962613921612501,
            "samples_per_second": 1060958.1073031048,
            "samples_per_second_per_gpu": 132619.7634128881,
            "loss_sequences_lower_95": 3.488450437784195,
            "loss_sequences_upper_95": 3.60377950668335,
            "loss_tokens_lower_95": 3.535415572916667,
            "loss_tokens_upper_95": 3.559133447916667,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_030.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5842559346929193,
            "data_time": 0.009918228723108768,
            "batch_time": 0.038565040566027164,
            "samples_per_second": 1099627.9664047284,
            "samples_per_second_per_gpu": 137453.49580059104,
            "loss_sequences_lower_95": 3.5247812449932097,
            "loss_sequences_upper_95": 3.643685591220856,
            "loss_tokens_lower_95": 3.57200665625,
            "loss_tokens_upper_95": 3.595993125,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_040.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.6803811406716704,
            "data_time": 0.009963578544557095,
            "batch_time": 0.039032816886901855,
            "samples_per_second": 1085143.9800228474,
            "samples_per_second_per_gpu": 135642.99750285593,
            "loss_sequences_lower_95": 3.6229705035686495,
            "loss_sequences_upper_95": 3.7357026100158692,
            "loss_tokens_lower_95": 3.668504614583333,
            "loss_tokens_upper_95": 3.6923816979166664,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_050.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.7713947715237737,
            "data_time": 0.009851453825831413,
            "batch_time": 0.03910491522401571,
            "samples_per_second": 1086813.139491374,
            "samples_per_second_per_gpu": 135851.64243642174,
            "loss_sequences_lower_95": 3.7116197466850283,
            "loss_sequences_upper_95": 3.830821031332016,
            "loss_tokens_lower_95": 3.75942690625,
            "loss_tokens_upper_95": 3.7832502187499997,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_060.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.825910489074886,
            "data_time": 0.010072358883917332,
            "batch_time": 0.039720335975289345,
            "samples_per_second": 1069685.647257272,
            "samples_per_second_per_gpu": 133710.705907159,
            "loss_sequences_lower_95": 3.769791340827942,
            "loss_sequences_upper_95": 3.880027961730957,
            "loss_tokens_lower_95": 3.8138621041666667,
            "loss_tokens_upper_95": 3.8379527291666666,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_070.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.890181826427579,
            "data_time": 0.009732105769217014,
            "batch_time": 0.038627900183200836,
            "samples_per_second": 1094562.6420337204,
            "samples_per_second_per_gpu": 136820.33025421505,
            "loss_sequences_lower_95": 3.8374401926994324,
            "loss_sequences_upper_95": 3.9424254298210144,
            "loss_tokens_lower_95": 3.87815928125,
            "loss_tokens_upper_95": 3.90209459375,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_080.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.896591981872916,
            "data_time": 0.00974350143224001,
            "batch_time": 0.039194715209305286,
            "samples_per_second": 1072527.3806366948,
            "samples_per_second_per_gpu": 134065.92257958686,
            "loss_sequences_lower_95": 3.8468204736709595,
            "loss_sequences_upper_95": 3.945695918798447,
            "loss_tokens_lower_95": 3.884712260416667,
            "loss_tokens_upper_95": 3.9084445,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_090.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 4.037737767212093,
            "data_time": 0.010014118626713753,
            "batch_time": 0.04008801840245724,
            "samples_per_second": 1053600.6515480066,
            "samples_per_second_per_gpu": 131700.08144350082,
            "loss_sequences_lower_95": 3.9899464905261994,
            "loss_sequences_upper_95": 4.083815097808838,
            "loss_tokens_lower_95": 4.0260376354166665,
            "loss_tokens_upper_95": 4.049700947916667,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_100.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-2.0/params.txt",
    "uuid": "1d14144e-e426-4241-b6b4-c1b7102fd847",
    "creation_date": "2023_12_14-04_59_41"
}