# Weights & Biases logging options.
wandb:
    # No entity is configured (null).
    entity: null
    # NOTE(review): presumably forwarded to wandb.init(resume=...) — confirm.
    resume: "auto"


# Only needs to be set when training on multiple nodes.
system:
    # Your proxy; set to null if you do not need one.
    HTTP_PROXY: "http://xxx.xx.xxx.xxx:xxxx"
    # Absolute path of HF_HOME; set to null to use the default.
    HF_HOME: "/abs/path/of/HF_HOME"
    # Name of the environment (required).
    env_name: "env_name"
    # Path of the environments directory; set to null to use the default.
    envs_dir: "/abs/path/of/envs"
    # The working directory.
    base_dir: "/abs/path/of/DLLM-RL"

experiment:
    # Must be the same as this file's name.
    project: "multinode_rl_sdar_with_value"
    # No need to change.
    function: "train"
    # true by default. To continue training from a stopped run using the
    # checkpoint named by model.optimized_name, set this to false and set
    # current_epoch to the last step at which training stopped.
    start_from_scratch: true
    total_step: 80
    save_every: 10
    eval_every: 10
    train_value_every: 1
    current_epoch: 1
    deepspeed_file: "4_node_4_gpus_deepspeed_zero3"
    num_node: 4
    # NOTE(review): presumably differs on each node of a multi-node launch
    # (0 .. num_node - 1) — confirm.
    node_index: 0

model:
    # Absolute path of your model.
    pretrained_model: "/abs/path/of/model"
    # Absolute path of your model.
    value_base_model: "/abs/path/of/model"
    # Output name for your optimized policy model; saved under sft_dream/ckpt.
    # NOTE(review): experiment.project in this file is
    # "multinode_rl_sdar_with_value", not "sft_dream" — confirm the directory.
    optimized_name: "optimized"
    # Output name for your optimized value model; saved under sft_dream/ckpt
    # (same caveat as for optimized_name — confirm the directory).
    optimized_value_name: "optimized_value"
    # Set to "sdar" for TraDo and SDAR.
    model_base: "sdar"



dataset:
    # Name of the training dataset, e.g. "MATH_train" or "PrimeIntellect".
    train_dataset: "dataset_name"
    # Name of the rollout data output.
    optimization_data: "rl_data"
    # Either "math" or "code".
    data_type: "math"

# See also the explanations in the eval configs.
rollout:
    # 1 by default. On OOM, first try reducing max_active; if it still OOMs,
    # set tensor_parallel_size to 8.
    tensor_parallel_size: 1
    max_active: 256
    num_task_per_step: 128
    num_response_per_task: 32
    temperature: 1.0
    max_token: 2000
    block_size: 4
    denoising_steps_per_block: 4
    top_p: 1.0
    top_k: 0
    # Either "low_confidence_static" or "low_confidence_dynamic".
    remasking_strategy: "low_confidence_dynamic"
    # Not used by "low_confidence_static".
    dynamic_threshold: 0.9
    start_with_think: false

execute:
    # Batch size for executing code in coding eval tasks.
    num_chunk: 128

training:
    gradient_checkpointing_enable: true
    gradient_accumulation_steps: 4
    batch_size_lm: 1
    mixed_precision: "bf16"
    enable_tf32: true
    seed: 10086
    num_train_epochs: 1
    max_grad_norm: 1.0
    # One of "random_masking", "TraceRL" or "coupled".
    method: "TraceRL"
    # NOTE(review): same value as rollout.block_size in this file —
    # presumably the two should be kept in sync; confirm.
    block_size: 4
    shrink: 1
    post_num: 0
    max_gen_length: 2000
    max_prompt_len: 784
    eps: 0.20
    beta: 0.01
    gam: 1.0
    lam: 1.0
    use_kl_estimator_k3: true


optimizer:
    name: "adamw"
    # Default AdamW parameters.
    params:
        # Exponent-notation floats are written with an explicit decimal point
        # (1.0e-6, not 1e-6): YAML 1.1 loaders such as PyYAML only resolve
        # exponent notation as a float when the mantissa contains a ".", and
        # would otherwise load the value as a string.
        policy_learning_rate: 1.0e-6
        value_learning_rate: 5.0e-6
        # Scale the learning rate by the total batch size.
        scale_lr: false
        beta1: 0.9
        beta2: 0.999
        weight_decay: 0.0
        epsilon: 1.0e-8


lr_scheduler:
    scheduler: "cosine"
    params:
        # ${...} reference to optimizer.params.policy_learning_rate; presumably
        # resolved by the config loader (OmegaConf-style interpolation) — confirm.
        learning_rate: ${optimizer.params.policy_learning_rate}
        warmup_steps: 0
        # NOTE(review): if this is the floor of the schedule as a fraction of
        # learning_rate, 1.0 would mean the cosine schedule never decays —
        # confirm this is intended.
        min_lr_scale: 1.0


evaluation:
    # e.g. "MATH500" or "LiveCodeBench".
    eval_dataset: "MATH500"
    # Either "math" or "code".
    data_type: "math"
    tensor_parallel_size: 1
    max_active: 256
    num_response_per_task: 3
    temperature: 1.0
    max_token: 2000
    block_size: 4
    denoising_steps_per_block: 4
    top_p: 1.0
    # top_k and remasking_strategy are lists here;
    # len(top_k) must equal len(remasking_strategy).
    top_k: [0, 1]
    # Each entry is "low_confidence_static" or "low_confidence_dynamic".
    remasking_strategy: ["low_confidence_dynamic", "low_confidence_static"]
    # Not used by "low_confidence_static".
    dynamic_threshold: 0.9
    start_with_think: false
