# Weights & Biases logging settings.
wandb:
  # null leaves the entity unset in this file.
  entity: null
  # NOTE(review): presumably forwarded as the wandb resume mode — confirm
  # against the training script.
  resume: "auto"


# Experiment-level settings.
experiment:
  # Must match the name of this config file.
  project: "sft_llada"
  # Number of machines (nodes) available for the run.
  num_node: 4


# Model input and output locations.
model:
  # Absolute path of the pretrained model.
  pretrained_model: "/abs/path/of/model"
  # Output name for the optimized model; it is saved under sft_llada/ckpt.
  optimized_name: "optimized"

# SFT dataset selection.
dataset:
  # Name of the dataset to optimize on, e.g. "sft_openr1math_llada".
  optimization_data: "dataset_name"

# Training hyperparameters.
training:
  # Set to true when the sequences are very long.
  gradient_checkpointing_enable: false
  gradient_accumulation_steps: 2
  # Total batch size =
  #   num_node * num_gpu_per_node * gradient_accumulation_steps * batch_size_lm
  batch_size_lm: 2
  mixed_precision: "bf16"
  enable_tf32: true
  seed: 10086
  num_train_epochs: 1
  max_grad_norm: 1
  # One of "random_masking" or "semi-ar".
  method: "random_masking"
  # NOTE(review): presumably the lower/upper bounds of the sampled masking
  # ratio — confirm against the training script.
  lower_p: 0.1
  upper_p: 0.9
  # Used by the "semi-ar" method.
  block_size: 16
  # Used by the "random_masking" method.
  mask_times_per_sample: 1
  # Number of pad tokens to be trained on for each data point.
  post_num: 0
  max_gen_length: 1024
  max_prompt_len: 784



# Optimizer selection and hyperparameters.
optimizer:
  name: adamw
  # Default AdamW parameters.
  params:
    # Exponent floats are written with an explicit ".0" mantissa: YAML 1.1
    # loaders (e.g. PyYAML) read a bare "1e-5" as a string, not a float.
    learning_rate: 1.0e-5
    # Whether to scale the learning rate by the total batch size.
    scale_lr: false
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.0
    epsilon: 1.0e-8

# Learning-rate schedule.
lr_scheduler:
  scheduler: "cosine"
  params:
    # Reuses optimizer.params.learning_rate via ${...} interpolation; assumes
    # the config loader resolves it (e.g. OmegaConf) — TODO confirm.
    learning_rate: "${optimizer.params.learning_rate}"
    warmup_steps: 0
    # NOTE(review): presumably the minimum LR as a fraction of learning_rate;
    # if so, 1.0 would keep the cosine schedule flat — confirm this is intended.
    min_lr_scale: 1.0


