# ModelConfig
# Checkpoint to start from and how it is loaded.
model_path: LLaDA-sft-s1k-merged
torch_dtype: bfloat16
attn_implementation: flash_attention_2
load_in_4bit: true

# PEFT / LoRA adapter settings (lora_alpha is half of lora_r here).
use_peft: true
peft_task_type: CAUSAL_LM
lora_r: 128
lora_alpha: 64
lora_dropout: 0.05

# GRPOConfig
dataset: gsm8k
seed: 42
bf16: true

# Reference-model syncing is enabled; ref_model_sync_steps is the sync interval
# (presumably in optimizer steps — verify against the trainer).
sync_ref_model: true
ref_model_sync_steps: 64

# Optimizer hyper-parameters and gradient clipping threshold.
adam_beta1: 0.9
adam_beta2: 0.99
weight_decay: 0.1
max_grad_norm: 0.2

# vLLM is switched off; the vllm_* values are kept so they are already set
# if use_vllm is turned on.
use_vllm: false
vllm_device: auto
vllm_gpu_memory_utilization: 0.9

warmup_ratio: 0.0001
logging_steps: 1
# Written with an explicit mantissa dot: YAML 1.1 loaders such as PyYAML
# resolve a dot-less exponent form (3e-6) as a string, not a float.
learning_rate: 3.0e-6
lr_scheduler_type: constant_with_warmup

# For each training prompt, we generate num_generations completions.
# One rollout batch (sequences, per device) =
#   gradient_accumulation_steps * per_device_train_batch_size * num_generations
# Global batch size (sequences) =
#   gradient_accumulation_steps * per_device_train_batch_size * num_generations * num_devices
# The following setting follows d1 and b1.
num_generations: 6

# The optimizer steps (optimizer.step) once every gradient_accumulation_steps
# backward passes, using the accumulated gradients.
# Setting used by d1, b1 and GDPO:
# per_device_eval_batch_size: 6
# per_device_train_batch_size: 6
# gradient_accumulation_steps: 2

# Max batch size 18 on ROCm with 192GB.
# The d1/b1/GDPO setting above is equivalent to the one below
# (per_device_train_batch_size * gradient_accumulation_steps: 6 * 2 = 12 * 1):
per_device_eval_batch_size: 12
per_device_train_batch_size: 12
gradient_accumulation_steps: 1

# Gradient checkpointing (a memory-saving option) is currently disabled.
# The kwargs are kept so the reentrant mode is already chosen if it is enabled.
gradient_checkpointing: false
# use_reentrant must be nested under gradient_checkpointing_kwargs: at column 0
# it would be a separate top-level key and the kwargs mapping would be null.
gradient_checkpointing_kwargs:
  use_reentrant: false

# Number of epochs to train on the SAME collected data batch.
# Following d1 and b1, we set num_iterations=12.
# For every batch of data collected, the model performs 12 separate updates,
# i.e. 12 optimizer steps that reuse the same batch of data.
num_iterations: 12

# Run identity and output location. The default RL methods are d1 and b1.
run_name: b1
output_dir: /checkpoints
resume_from_checkpoint: false

# Evaluation and checkpointing share one cadence: every 100 steps.
eval_strategy: steps
eval_steps: 100
save_strategy: steps
save_steps: 100
save_total_limit: 50

# Both a step budget and an epoch budget are given.
# NOTE(review): Hugging Face Trainer documents that a positive max_steps
# overrides num_train_epochs — confirm this trainer behaves the same way.
max_steps: 5000
num_train_epochs: 10

# To decode a sequence of N tokens, we use N/2 denoising steps and unmask 2
# tokens in each step. While the decoding process can generate tokens in any
# order, we find that decoding from left to right in blocks yields slightly
# better performance in practice. This is referred to as the
# semi-autoregressive decoding strategy (Nie et al., 2025).
# More specifically, we divide the sequence into blocks of 32 tokens. In each
# step, we unmask the 2 tokens with the highest confidence within the current
# block, regardless of their position. Once all the tokens in the current
# block are unmasked, we move to the next one.
# Alternative setting (same N/2 ratio), currently disabled:
# max_completion_length: 512
# diffusion_steps: 256
# Active setting: N = 256 tokens, N/2 = 128 denoising steps.
max_completion_length: 256
diffusion_steps: 128

# Prompt and generation settings.
max_prompt_length: 200
block_length: 32
generation_batch_size: 6
remasking: low_confidence

# Masking settings; the boolean uses the canonical lowercase spelling.
random_masking: true
p_mask_prompt: 0.15

# NOTE(review): presumably the GRPO KL coefficient (beta) and clipping range
# (epsilon) — verify against the trainer.
beta: 0.04
epsilon: 0.5