{
    "name": "c4_original-d=576_l=24_h=8-8.0",
    "dataset_name": "c4_original",
    "dataset_uuid": "7e0f5507-aa36-4d8c-9026-d049f885adf1",
    "hyperparameters": {
        "model": "training/open_lm_configs/d=576_l=24_h=8.json",
        "tokens": 24588380160,
        "warmup": 400,
        "lr": 0.003,
        "wd": 0.033,
        "cd": 3e-05,
        "global_bs": 512,
        "acc": 2,
        "qk_norm": true,
        "z_loss": 0.0001,
        "grad_checkpointing": false,
        "params": 153677376,
        "params_no_embed": 124628544,
        "fsdp_flags": [
            "--fsdp",
            "--fsdp-amp",
            "--fsdp-limit-all-gathers"
        ],
        "chinchilla_multiplier": 8.0,
        "seed": 124
    },
    "checkpoint_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
    "open_lm_version": "0.0.21",
    "open_lm_args": [
        "--workers",
        "2",
        "--precision",
        "amp_bfloat16",
        "--global-batch-size",
        "512",
        "--log-every-n-steps",
        "20",
        "--grad-clip-norm",
        "1",
        "--lr",
        "0.003",
        "--warmup",
        "400",
        "--model",
        "training/open_lm_configs/d=576_l=24_h=8.json",
        "--wd",
        "0.033",
        "--beta2",
        "0.95",
        "--epochs",
        "5",
        "--resume",
        "latest",
        "--seed",
        "124",
        "--accum-freq",
        "2",
        "--model-norm",
        "gain_only_lp_layer_norm",
        "--delete-previous-checkpoint",
        "--lr-cooldown-end",
        "3e-05",
        "--logs",
        "logs/787",
        "--train-num-samples",
        "4917676032",
        "--dataset-manifest",
        "<scrub>/openlm/scrub/datasets/original_c4/manifest.jsonl",
        "--data-key",
        "txt",
        "--name",
        "c4_original-d=576_l=24_h=8-8.0",
        "--fsdp",
        "--fsdp-amp",
        "--fsdp-limit-all-gathers",
        "--val-data",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/open_lm_val/shard_00000000.tar",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/c4_val/shard-0000000.tar",
        "/<scrub>/ubuntu/research/openlm/scrub/training/eval_data/paloma_val/00000001.tar",
        "--val-frequency",
        "5",
        "--val-data-key",
        "json",
        "txt",
        "json.gz",
        "--val-tok-ci",
        "--val-seq-ci",
        "--val-num-samples",
        "245760",
        "--qk-norm",
        "--z-loss",
        "0.0001",
        "--remote-sync",
        "<scrub>/openlm/scrub/experiments/154m_8x_c4_original/"
    ],
    "results": [
        {
            "loss": 3.176857117563486,
            "data_time": 0.04165232181549072,
            "batch_time": 0.3889746740460396,
            "samples_per_second": 1631174.018781296,
            "samples_per_second_per_gpu": 203896.752347662,
            "loss_sequences_lower_95": 3.1247424721717834,
            "loss_sequences_upper_95": 3.2276548624038695,
            "loss_tokens_lower_95": 3.165049848958333,
            "loss_tokens_upper_95": 3.1888178229166666,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_000.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.172363940626383,
            "data_time": 0.04334286227822304,
            "batch_time": 0.1124938502907753,
            "samples_per_second": 1673967.9790451182,
            "samples_per_second_per_gpu": 209245.99738063978,
            "loss_sequences_lower_95": 3.109617108106613,
            "loss_sequences_upper_95": 3.2334521234035494,
            "loss_tokens_lower_95": 3.160688109375,
            "loss_tokens_upper_95": 3.1844033854166667,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_010.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.2553693801164627,
            "data_time": 0.042207587510347366,
            "batch_time": 0.11093678697943687,
            "samples_per_second": 1676934.0042731226,
            "samples_per_second_per_gpu": 209616.75053414033,
            "loss_sequences_lower_95": 3.1991540849208833,
            "loss_sequences_upper_95": 3.3116925954818726,
            "loss_tokens_lower_95": 3.2435823333333333,
            "loss_tokens_upper_95": 3.2672560729166666,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_020.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.290599837899208,
            "data_time": 0.04297127574682236,
            "batch_time": 0.1117810346186161,
            "samples_per_second": 1674148.2701811085,
            "samples_per_second_per_gpu": 209268.53377263856,
            "loss_sequences_lower_95": 3.2334574341773985,
            "loss_sequences_upper_95": 3.3465520262718202,
            "loss_tokens_lower_95": 3.2789454739583332,
            "loss_tokens_upper_95": 3.302540411458333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_030.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.320359356701374,
            "data_time": 0.041452232748270035,
            "batch_time": 0.11026737466454506,
            "samples_per_second": 1675272.069952911,
            "samples_per_second_per_gpu": 209409.00874411387,
            "loss_sequences_lower_95": 3.261929875612259,
            "loss_sequences_upper_95": 3.37731449007988,
            "loss_tokens_lower_95": 3.3084054739583335,
            "loss_tokens_upper_95": 3.33247065625,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_040.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.4043753035366535,
            "data_time": 0.04331383481621742,
            "batch_time": 0.11209229752421379,
            "samples_per_second": 1672008.407526955,
            "samples_per_second_per_gpu": 209001.05094086938,
            "loss_sequences_lower_95": 3.3476323902606966,
            "loss_sequences_upper_95": 3.45832816362381,
            "loss_tokens_lower_95": 3.3926144947916663,
            "loss_tokens_upper_95": 3.4163329947916665,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_050.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.486296370625496,
            "data_time": 0.04189632833003998,
            "batch_time": 0.11068258807063103,
            "samples_per_second": 1676422.7857361108,
            "samples_per_second_per_gpu": 209552.84821701384,
            "loss_sequences_lower_95": 3.429607856273651,
            "loss_sequences_upper_95": 3.5411708831787108,
            "loss_tokens_lower_95": 3.4745459479166665,
            "loss_tokens_upper_95": 3.4981784062499997,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_060.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5338817313313484,
            "data_time": 0.04503675550222397,
            "batch_time": 0.11403607949614525,
            "samples_per_second": 1670992.9298795753,
            "samples_per_second_per_gpu": 208874.1162349469,
            "loss_sequences_lower_95": 3.4798656821250917,
            "loss_sequences_upper_95": 3.5867501378059385,
            "loss_tokens_lower_95": 3.52225934375,
            "loss_tokens_upper_95": 3.54570421875,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_070.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.5855831429362297,
            "data_time": 0.04088795185089111,
            "batch_time": 0.10993070527911186,
            "samples_per_second": 1675159.083179195,
            "samples_per_second_per_gpu": 209394.88539739937,
            "loss_sequences_lower_95": 3.5350070774555205,
            "loss_sequences_upper_95": 3.6355282425880433,
            "loss_tokens_lower_95": 3.5738465625,
            "loss_tokens_upper_95": 3.59766990625,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_080.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.588874891400337,
            "data_time": 0.04457852989435196,
            "batch_time": 0.1135568767786026,
            "samples_per_second": 1671289.3050758208,
            "samples_per_second_per_gpu": 208911.1631344776,
            "loss_sequences_lower_95": 3.5413538336753847,
            "loss_sequences_upper_95": 3.6350657403469087,
            "loss_tokens_lower_95": 3.57700828125,
            "loss_tokens_upper_95": 3.6005587916666664,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_090.tar"
            ],
            "model": "d=576_l=24_h=8"
        },
        {
            "loss": 3.697111714631319,
            "data_time": 0.04285683482885361,
            "batch_time": 0.11167116090655327,
            "samples_per_second": 1675328.1915647592,
            "samples_per_second_per_gpu": 209416.0239455949,
            "loss_sequences_lower_95": 3.651001960039139,
            "loss_sequences_upper_95": 3.7424441814422607,
            "loss_tokens_lower_95": 3.6854586666666664,
            "loss_tokens_upper_95": 3.708516927083333,
            "sequences": 512,
            "tokens": 1048576,
            "checkpoint_path": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt",
            "val_data": [
                "training/eval_data/val_tok_mult/de-en/val_de-en_100.tar"
            ],
            "model": "d=576_l=24_h=8"
        }
    ],
    "params_url": "<scrub>/scrub_experiments_v3/c4_original-d=576_l=24_h=8-8.0/params.txt",
    "uuid": "dec1155d-e531-41d9-9b7f-4831780bec8e",
    "creation_date": "2024_01_25-08_33_30"
}