{
    "name": "c4_original-d=576_l=24_h=8-16.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 49176760320,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 8,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp",
            "--fsdp-limit-all-gathers"
        ],
        "chinchilla_multiplier": 16.0,
        "seed": 124
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--workers",
        "2",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--accum-freq",
        "8",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--logs",
        "logs/186",
        "--train-num-samples",
        "9835352064",
        "--dataset-manifest",
        "<scrub>/openlm/scrub/datasets/original_c4/manifest.jsonl",
        "--data-key",
        "txt",
        "--name",
        "c4_original-d=576_l=24_h=8-16.0",
        "--fsdp",
        "--fsdp-amp",
        "--fsdp-limit-all-gathers",
        "--val-data",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "--val-frequency",
        "5",
        "--val-data-key",
        "json",
        "txt",
        "--val-tok-ci",
        "--val-seq-ci",
        "--val-max-pop-ci",
        "300000",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/openlm/scrub/experiments/154m_16x_c4_original/"
    ],
    "results": [
        {
            "loss": 3.1269922177307308,
            "data_time": 0.008851471357047558,
            "batch_time": 0.10329339746385813,
            "samples_per_second": 1084787.8030967852,
            "samples_per_second_per_gpu": 135598.47538709815,
            "loss_sequences_lower_95": 3.0740019559860228,
            "loss_sequences_upper_95": 3.1786400973796844,
            "loss_tokens_lower_95": 3.114939875,
            "loss_tokens_upper_95": 3.139195755208333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1145771325100213,
            "data_time": 0.009042885154485703,
            "batch_time": 0.03750546928495169,
            "samples_per_second": 1106646.751122795,
            "samples_per_second_per_gpu": 138330.84389034938,
            "loss_sequences_lower_95": 3.0534465312957764,
            "loss_sequences_upper_95": 3.175242030620575,
            "loss_tokens_lower_95": 3.102647453125,
            "loss_tokens_upper_95": 3.12642246875,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_010.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.1959328637458384,
            "data_time": 0.009478253312408924,
            "batch_time": 0.03834274969995022,
            "samples_per_second": 1096212.2440399264,
            "samples_per_second_per_gpu": 137026.5305049908,
            "loss_sequences_lower_95": 3.139625072479248,
            "loss_sequences_upper_95": 3.2534295320510864,
            "loss_tokens_lower_95": 3.1840052187500003,
            "loss_tokens_upper_95": 3.207681015625,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_020.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2338662850670516,
            "data_time": 0.009166626259684563,
            "batch_time": 0.03924662992358208,
            "samples_per_second": 1063942.2815184705,
            "samples_per_second_per_gpu": 132992.7851898088,
            "loss_sequences_lower_95": 3.178235912322998,
            "loss_sequences_upper_95": 3.287841236591339,
            "loss_tokens_lower_95": 3.222043921875,
            "loss_tokens_upper_95": 3.245625328125,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_030.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2572774048894644,
            "data_time": 0.008855720050632954,
            "batch_time": 0.03855908662080765,
            "samples_per_second": 1073873.6024157945,
            "samples_per_second_per_gpu": 134234.2003019743,
            "loss_sequences_lower_95": 3.201279306411743,
            "loss_sequences_upper_95": 3.313831865787506,
            "loss_tokens_lower_95": 3.245445166666667,
            "loss_tokens_upper_95": 3.268840630208333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_040.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.338151892647147,
            "data_time": 0.009088963270187378,
            "batch_time": 0.03767870645970106,
            "samples_per_second": 1107266.3516772897,
            "samples_per_second_per_gpu": 138408.2939596612,
            "loss_sequences_lower_95": 3.2846206843853,
            "loss_sequences_upper_95": 3.3908535063266756,
            "loss_tokens_lower_95": 3.3261666041666667,
            "loss_tokens_upper_95": 3.3500195416666667,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_050.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.41380434948951,
            "data_time": 0.00880990270525217,
            "batch_time": 0.03753245063126087,
            "samples_per_second": 1104833.9154530293,
            "samples_per_second_per_gpu": 138104.23943162867,
            "loss_sequences_lower_95": 3.3586513698101044,
            "loss_sequences_upper_95": 3.469096040725708,
            "loss_tokens_lower_95": 3.4021219895833332,
            "loss_tokens_upper_95": 3.425484265625,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_060.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.458523469977081,
            "data_time": 0.009158056229352951,
            "batch_time": 0.03919478040188551,
            "samples_per_second": 1062449.1488722174,
            "samples_per_second_per_gpu": 132806.14360902717,
            "loss_sequences_lower_95": 3.4062096416950225,
            "loss_sequences_upper_95": 3.5090848982334135,
            "loss_tokens_lower_95": 3.446609244791667,
            "loss_tokens_upper_95": 3.470282,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_070.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.508967214729637,
            "data_time": 0.008870316669344902,
            "batch_time": 0.038147236220538616,
            "samples_per_second": 1083949.3755566978,
            "samples_per_second_per_gpu": 135493.67194458723,
            "loss_sequences_lower_95": 3.4601664185523986,
            "loss_sequences_upper_95": 3.558221530914307,
            "loss_tokens_lower_95": 3.4969508125,
            "loss_tokens_upper_95": 3.5209655625000003,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_080.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5042680143378675,
            "data_time": 0.009080695919692516,
            "batch_time": 0.03783275280147791,
            "samples_per_second": 1099582.9040115396,
            "samples_per_second_per_gpu": 137447.86300144246,
            "loss_sequences_lower_95": 3.4574182868003844,
            "loss_sequences_upper_95": 3.5504009246826174,
            "loss_tokens_lower_95": 3.492446078125,
            "loss_tokens_upper_95": 3.51604965625,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_090.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.605809388682246,
            "data_time": 0.009235897101461887,
            "batch_time": 0.03917347826063633,
            "samples_per_second": 1060644.3349134203,
            "samples_per_second_per_gpu": 132580.54186417753,
            "loss_sequences_lower_95": 3.5596595287322996,
            "loss_sequences_upper_95": 3.651107943058014,
            "loss_tokens_lower_95": 3.5939877083333336,
            "loss_tokens_upper_95": 3.61754803125,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_100.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-16.0/params.txt",
    "uuid": "2a925d34-8650-497b-adcf-ea8439b0c51a",
    "creation_date": "2024_01_28-13_33_33"
}