{
    "name": "c4_original-d=512_l=8_h=4-0.25",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=512_l=8_h=4.json",
        "tokens": 394570240,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 78914048,
        "params_no_embed": 53092864,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp"
        ],
        "chinchilla_multiplier": 0.25
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--train-num-samples",
        "78914048",
        "--workers",
        "2",
        "--dataset-manifest",
        "<scrub>/original_c4/manifest.jsonl",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=512_l=8_h=4.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--data-key",
        "txt",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--name",
        "c4_original-d=512_l=8_h=4-0.25",
        "--logs",
        "/admin/<scrub>/scrub_logs",
        "--val-data",
        "/admin/<scrub>/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/admin/<scrub>/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-batch-size",
        "8",
        "--val-data-key",
        "json",
        "txt",
        "--val-num-samples",
        "245760",
        "--fsdp",
        "--fsdp-amp",
        "--report-to",
        "wandb",
        "--wandb-project-name",
        "scrub",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/scrub_experiments_v3"
    ],
    "results": [
        {
            "loss": 5.4619203470647335,
            "data_time": 0.008300391025841236,
            "batch_time": 0.08201826177537441,
            "samples_per_second": 2261664.7658910267,
            "samples_per_second_per_gpu": 282708.09573637834,
            "loss_sequences_lower_95": 5.402981543540955,
            "loss_sequences_upper_95": 5.5211389541625975,
            "loss_tokens_lower_95": 5.449795218749999,
            "loss_tokens_upper_95": 5.47404821875,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_000.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.566077908501029,
            "data_time": 0.009491626173257828,
            "batch_time": 0.023642122745513916,
            "samples_per_second": 2184225.9686703905,
            "samples_per_second_per_gpu": 273028.2460837988,
            "loss_sequences_lower_95": 5.499389100074768,
            "loss_sequences_upper_95": 5.633108961582184,
            "loss_tokens_lower_95": 5.554528739583334,
            "loss_tokens_upper_95": 5.57833709375,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_010.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.6772514041513205,
            "data_time": 0.009240287356078625,
            "batch_time": 0.02307172305881977,
            "samples_per_second": 2241252.608057385,
            "samples_per_second_per_gpu": 280156.5760071731,
            "loss_sequences_lower_95": 5.612606918811798,
            "loss_sequences_upper_95": 5.74262216091156,
            "loss_tokens_lower_95": 5.665235239583334,
            "loss_tokens_upper_95": 5.689247552083333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_020.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.75347950309515,
            "data_time": 0.009559370577335358,
            "batch_time": 0.023297879844903946,
            "samples_per_second": 2237726.355483554,
            "samples_per_second_per_gpu": 279715.79443544423,
            "loss_sequences_lower_95": 5.68703978061676,
            "loss_sequences_upper_95": 5.822046875953674,
            "loss_tokens_lower_95": 5.741663677083333,
            "loss_tokens_upper_95": 5.764671052083334,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_030.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.8793552946299314,
            "data_time": 0.009015309624373913,
            "batch_time": 0.02310111466795206,
            "samples_per_second": 2205097.688969357,
            "samples_per_second_per_gpu": 275637.2111211696,
            "loss_sequences_lower_95": 5.813455784320832,
            "loss_sequences_upper_95": 5.943593776226043,
            "loss_tokens_lower_95": 5.868066479166666,
            "loss_tokens_upper_95": 5.890434197916666,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_040.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 5.991062398999929,
            "data_time": 0.009030306711792946,
            "batch_time": 0.022984036244452,
            "samples_per_second": 2216765.899287164,
            "samples_per_second_per_gpu": 277095.7374108955,
            "loss_sequences_lower_95": 5.921110236644745,
            "loss_sequences_upper_95": 6.059158897399902,
            "loss_tokens_lower_95": 5.979516927083334,
            "loss_tokens_upper_95": 6.00243815625,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_050.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.130825862288475,
            "data_time": 0.00872735120356083,
            "batch_time": 0.022656497545540333,
            "samples_per_second": 2229537.9362288397,
            "samples_per_second_per_gpu": 278692.24202860496,
            "loss_sequences_lower_95": 6.067860567569733,
            "loss_sequences_upper_95": 6.193981993198395,
            "loss_tokens_lower_95": 6.119802395833333,
            "loss_tokens_upper_95": 6.1418115208333335,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_060.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.173796635121107,
            "data_time": 0.00898056197911501,
            "batch_time": 0.02287142537534237,
            "samples_per_second": 2223496.9911697432,
            "samples_per_second_per_gpu": 277937.1238962179,
            "loss_sequences_lower_95": 6.114237689971924,
            "loss_sequences_upper_95": 6.232593941688537,
            "loss_tokens_lower_95": 6.16310603125,
            "loss_tokens_upper_95": 6.184580677083334,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_070.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.302291529253125,
            "data_time": 0.009124337695538998,
            "batch_time": 0.02344007696956396,
            "samples_per_second": 2154307.7692578915,
            "samples_per_second_per_gpu": 269288.47115723643,
            "loss_sequences_lower_95": 6.248738873004913,
            "loss_sequences_upper_95": 6.3546130657196045,
            "loss_tokens_lower_95": 6.29183621875,
            "loss_tokens_upper_95": 6.31285709375,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_080.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.390161009505391,
            "data_time": 0.008653904311358929,
            "batch_time": 0.022582399658858776,
            "samples_per_second": 2229751.303397899,
            "samples_per_second_per_gpu": 278718.9129247374,
            "loss_sequences_lower_95": 6.347072052955627,
            "loss_sequences_upper_95": 6.431352603435516,
            "loss_tokens_lower_95": 6.379757395833334,
            "loss_tokens_upper_95": 6.400320072916666,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_090.tar"
            ],
            "model": "d=512_l=8_h=4"
        },
        {
            "loss": 6.573248477652669,
            "data_time": 0.008648786693811417,
            "batch_time": 0.022754866629838943,
            "samples_per_second": 2206398.4696107185,
            "samples_per_second_per_gpu": 275799.8087013398,
            "loss_sequences_lower_95": 6.538716673851013,
            "loss_sequences_upper_95": 6.613109540939331,
            "loss_tokens_lower_95": 6.563079104166667,
            "loss_tokens_upper_95": 6.58327396875,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_100.tar"
            ],
            "model": "d=512_l=8_h=4"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=512_l=8_h=4-0.25/params.txt",
    "uuid": "1fc79445-01d1-4a2a-9030-63c4aea3df94",
    "creation_date": "2023_12_14-04_59_25"
}