# Name of the model variant to build. Presumably looked up by the training and
# inference scripts — the code that consumes this string is not in this file.
model_type: "model_time_token"
# Model size settings. Key names follow the GPT-style convention (layers,
# attention heads, embedding width) — confirm against the model code.
model:
    # 50304 is a multiple of 64; presumably the GPT-2 vocabulary (50257) padded
    # up — TODO confirm against the tokenizer actually used.
    vocab_size: 50304
    n_layer: 52
    # n_embd / n_head = 1536 / 12 = 128, assuming heads split n_embd evenly.
    n_head: 12
    n_embd: 1536
# Selects a named variant of the "fm" component; its numeric settings appear to
# live in `fm_config`. NOTE(review): presumably a flow-matching formulation —
# confirm against the code that reads `fm.type`.
fm:
    type: "Logit"
# Numeric settings for the `fm` component. Their meaning, units and valid
# ranges cannot be determined from this file — TODO document them next to the
# consuming code.
fm_config:
    max_t: 0.3
    # NOTE(review): N equals data.sequence_length (512); confirm whether the
    # two values must be kept in sync or merely coincide.
    N: 512
    beta: 0.01
    t_split: 0.0
# Settings read at inference/sampling time.
inference:
    N_samples: 1000
    # NOTE(review): B_sub_data (5) is larger than B_data (1) — confirm this is
    # intended and how the two batch sizes relate.
    B_data: 1
    B_sub_data: 5
    # The directory name matches training_config.run_name, so this presumably
    # points at the checkpoint produced by the training run configured below.
    checkpoint: "./logs/wmt_10B_lfm_model_1_5B/ckpt.pt"
# Training-run settings: token budget, batch sizes, logging/checkpoint cadence,
# checkpoint paths and experiment-tracking identifiers.
training_config:
    # Float on purpose (was written `10.`). Presumably expressed in billions of
    # tokens, since run_name says "10B" — TODO confirm the unit.
    num_tokens_to_train: 10.0
    # batch_size / device_batch_size = 4; presumably the number of gradient
    # accumulation steps or devices — confirm against the training loop.
    batch_size: 40
    device_batch_size: 10
    val_loss_every: 1000
    # NOTE(review): 997 differs slightly from val_loss_every (1000); confirm
    # this offset is intentional and not a typo.
    save_every: 997
    # Weights to start from; the path refers to a different run ("ffw_...").
    pretrain: "./logs/ffw_10B_lfm_model_1_5B/ckpt.pt"
    # Unset values are spelled as explicit null instead of a bare `key:` so the
    # intent (no value) is unambiguous; the loaded value is unchanged.
    checkpoint: null
    project_name: null
    run_name: "wmt_10B_lfm_model_1_5B"
    # Keep this null in version control; supply the real key out of band
    # (for example via an environment variable) rather than committing it.
    wandb_key: null
# Dataset locations and sequence settings.
data:
    # Glob patterns; kept quoted so the `*` is always read as literal text.
    input_bin: "./data/wmt14/train_*.bin"
    input_val_bin: "./data/wmt14/val_*.bin"
    sequence_length: 512
    # Canonical lowercase boolean (was `True`); loads to the same value.
    # NOTE(review): presumably enables conditional (source-conditioned)
    # training — confirm against the data loader.
    condition: true
    # No pretraining data configured; explicit null replaces the bare empty
    # values and loads identically.
    input_bin_pretrain: null
    input_val_bin_pretrain: null
# Optimizer hyperparameters. Two learning rates are configured separately;
# presumably one for embedding parameters and one for parameters updated by a
# Muon optimizer — confirm against the training code.
optimizer:
    embed_learning_rate: 0.000036
    muon_learning_rate: 0.0002
    # 0 presumably disables learning-rate warmup — TODO confirm.
    warmup_iters: 0
    warmdown_iters: 1450
    weight_decay: 0
