### model
# Model location. `/path/to/llada` is a placeholder and must be replaced
# before running.
model_name_or_path: /path/to/llada
# NOTE(review): presumably lets the loader execute custom modelling code
# shipped with the checkpoint; only enable for sources you trust. Confirm
# against the consuming framework.
trust_remote_code: true

### method
# Training stage selector. The meaning of `durl` is not defined in this file;
# see the consuming framework.
stage: durl
do_train: true
# NOTE(review): `full` presumably means all model parameters are updated
# (no adapter). TODO confirm.
finetuning_type: full
# Relative path to the DeepSpeed config; presumably resolved against the
# directory the job is launched from. Verify before running elsewhere.
deepspeed: examples/deepspeed/ds_z3_config.json  # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]

### dataset
# The dataset and the prompt template are referenced by name only; their
# definitions live outside this file.
dataset: deepscaler-5k
template: llada
# NOTE(review): presumably a per-example length cap in tokens; 128 is small,
# so confirm it is large enough for this dataset.
cutoff_len: 128
# NOTE(review): presumably limits how many examples are used. TODO confirm.
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16

### output
# Relative output directory.
output_dir: saves/finetuned-llada
# NOTE(review): logging_steps and save_steps are presumably intervals counted
# in training steps. TODO confirm.
logging_steps: 10
save_steps: 500
plot_loss: true
# NOTE(review): presumably allows existing contents of output_dir to be
# replaced; confirm before reusing a directory that holds results to keep.
overwrite_output_dir: true

### train
# NOTE(review): with both values at 1, the effective batch size is presumably
# just the number of devices. Confirm against the launcher.
per_device_train_batch_size: 1
gradient_accumulation_steps: 1
# Written with a decimal point and a signed exponent so that YAML 1.1 loaders
# (e.g. PyYAML) resolve it as a float rather than a string.
learning_rate: 1.0e-5
num_train_epochs: 3.0
lr_scheduler_type: cosine
# NOTE(review): presumably the fraction of total training steps spent on
# warmup. TODO confirm.
warmup_ratio: 0.1
bf16: true
# NOTE(review): the unit is not stated in this file (presumably seconds).
# Confirm.
ddp_timeout: 180000000

### generate
# NOTE(review): presumably the maximum number of tokens generated per sample.
# TODO confirm.
max_new_tokens: 16
# NOTE(review): `top_k: 0` presumably disables top-k filtering so that only
# `top_p` restricts sampling. Confirm against the generation code.
top_k: 0
top_p: 0.9

### reporting
# Kept as a quoted string so it cannot be mistaken for a YAML null.
report_to: "none"

### sampling
# NOTE(review): YAML loads the two ratios below as the strings '1/2' and
# '1/32', not as floats. They are quoted to make that explicit; confirm that
# the consumer parses fraction strings.
sample_ratio_calculating_correlation_inside_response: '1/2'
sample_ratio_of_groups_to_update_model: '1/32'
num_of_groups_to_accumulate_grad: 1
steps_per_group: 2
# Placeholder path; must be replaced before running.
model_path_or_name_calculating_ppl: /path/to/gpt2

# `sampling_num` must appear only once in this file: duplicate mapping keys
# are invalid YAML, and loaders that tolerate them silently keep the last
# value. To run without rejection sampling, replace the active pair below
# with the commented-out pair.
# rejection_sampling: false
# sampling_num: 1

# rejection sampling
rejection_sampling: true
sampling_num: 4