# Model arguments
# Numerics and attention backend requested for the model.
attn_implementation: flash_attention_2
torch_dtype: bfloat16
# Checkpoint selection: both *_name_or_path entries below hold the same
# local path.
model_revision: main
ref_model_name_or_path: /data/home/the/models/DeepSeek-R1-Distill-Qwen-1.5B
model_name_or_path: /data/home/the/models/DeepSeek-R1-Distill-Qwen-1.5B

# Data training arguments
# We edit the DeepSeek chat template to ensure (a) the reasoning block within <think> and </think> is included in the completion and (b) the <think> tag is not part of the prefill so that the format reward works
# The template is deliberately kept on ONE physical line. A double-quoted
# scalar that wraps onto a continuation line at column 0 is under-indented for
# the YAML spec and only loads on lenient parsers, so do not re-wrap it.
# Inside the double quotes, `\\n` is YAML for a literal backslash + n; the
# Jinja string literal that results ('\n') is what yields the newline.
chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜>'}}{% endif %}"
# Dataset selection. The first dataset_name below is a disabled alternative;
# the active dataset_name and raw_dataset_name both point at local JSON files.
# dataset_name: open-r1/OpenR1-Math-220k
dataset_name: data/DeepSeek-R1-Distill-Qwen-1.5B-50/train-0-50-steps-data.json
raw_dataset_name: data/DeepSeek-R1-Distill-Qwen-1.5B-50/collect_solutions/train-0-50-steps-data.json
dataset_prompt_column: problem

# Prompt text. Both system_prompt variants are disabled; only user_instruction
# is set. The folded scalar below yields a single line with no trailing
# newline, and the backslash before "boxed" is literal.
# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>. And put the final answer in '\\boxed{}'."
# system_prompt: "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put the final answer within \\boxed{}. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. Note that respond by English, NOT use other languages."
user_instruction: >-
  Let's think step by step and output the final answer within \boxed{}.

# GRPO trainer config

# Precision and activation memory
bf16: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false

# Generation (vLLM is switched off)
use_vllm: false
num_generations: 8
max_prompt_length: 1024
max_completion_length: 8192

# Batching
# NOTE(review): TRL's GRPO trainer presumably requires the global batch size
# to be divisible by num_generations; 6 per device only divides by 8 for some
# device counts -- confirm against the launch configuration.
per_device_train_batch_size: 6
per_device_eval_batch_size: 6
gradient_accumulation_steps: 1

# Optimisation schedule (max_steps is -1, so num_train_epochs is the visible
# length setting); evaluation is off
learning_rate: 1.0e-06
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
  min_lr_rate: 0.1
num_train_epochs: 20
max_steps: -1
do_eval: false

# Output location
output_dir: data/DeepSeek-R1-Distill-Qwen-1.5B-MATH-verl_prompt-50-1-1-1/collect_solutions
overwrite_output_dir: true

# Hub publishing (push_to_hub is false)
push_to_hub: false
hub_model_id: DeepSeek-R1-Distill-Qwen-1.5B-GRPO-MATH-verl_prompt-50-1-1-1-collect
hub_strategy: every_save

# Logging
log_level: info
log_completions: false
logging_strategy: steps
logging_steps: 1
logging_first_step: true

# Experiment tracking
report_to:
- wandb
wandb_entity: the-hit-harbin-institute-of-technology
wandb_project: offline_research
wandb_run_group: only_orm
# Reward functions and their weights; the two lists have the same length
# (presumably paired by position -- verify against the trainer).
reward_funcs: [accuracy, format]  # disabled entry: tag_count
reward_weights: [1.0, 1.0]  # disabled entry: 1.0 (third weight)
# Checkpointing: saves follow the "epoch" strategy; the step-based setting is
# kept below but disabled.
# NOTE(review): with the HF Trainer a non-positive save_total_limit appears to
# disable checkpoint rotation (every checkpoint is kept) -- confirm intended.
save_strategy: epoch
# save_steps: 50
save_total_limit: 0

# Seeding and data order
seed: 42
shuffle_dataset: false

# Sampling
temperature: 0.7

# Optimisation extras
warmup_ratio: 0.1
use_liger_kernel: true

# Policy-loss settings
loss_type: bnpo
num_iterations: 1
epsilon_high: 0.28

# The remaining keys are grouped by apparent purpose -- TODO confirm against
# the argument dataclasses. The apply_* / update_* switches are written as
# integers (0/1) rather than booleans and are left that way because the
# consumer's expected type is not visible here.
update_ref_model: 1

apply_process_reward: 0
skip_empty_process_case: false
reward_exp: 1.0
reward_coe: 1.0

# NOTE(review): recent TRL releases define their own GRPOConfig.delta (an
# upper clipping bound on the probability ratio). If this file is parsed into
# such a config, 0.005 would be read with that meaning -- verify that `delta`
# here maps to the project-specific option.
apply_entropy_loss: 1
theta: 0.0
delta: 0.005
tgt_ent: 0.2