# Model arguments
model_name_or_path: open-r1/R1-Distill-Qwen-Math-7B
model_revision: v03.00-step-000008190
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
# We edit the DeepSeek chat template to ensure (a) the reasoning block within <think> and </think> is included in the completion and (b) the <think> tag is not part of the prefill so that the format reward works
dataset_name: open-r1/DAPO-Math-17k-Processed
dataset_config: all
dataset_split: train

# Generation arguments
max_completion_length: 32000
num_generations: 8
temperature: 1.0

# Reward func arguments
reward_funcs:
- accuracy
reward_weights:
- 1.0

# Filtering arguments. Samples with mean reward outside of low / high will be filtered
pass_rate_min: 0.1
pass_rate_max: 0.6

output_dataset_name: open-r1/DAPO-Math-17k-Processed-R1-Distill-Qwen-Math-7B-v03.00-step-000008190-filter
