# --- Run identity and data -------------------------------------------------
# Placeholders: replace both with a real run/model name and dataset path.
model_name='your_model_name'
train_file='your_training_dataset_path'

# --- Rewards ---------------------------------------------------------------
# Only the first two entries are actual reward functions. The four
# 'process_*' entries are not rewards; they are listed so that they get
# tracked as metrics of the reasoning process.
reward_func="['adaptive_reasoning_control','math_accuracy'"
reward_func="${reward_func},'process_switch','process_depth','process_output','process_sequential']"
# One weight per entry above, in the same order: 1.0 for the two reward
# functions, 0.0 for each of the four tracking-only metrics.
reward_weights='[1.0,1.0,0.0,0.0,0.0,0.0]'

# --- Optional settings -----------------------------------------------------
# Each of these holds the literal string None.
model_name_or_path='None'
system_prompt='None'
deepspeed_config_path='None'
# Start scripts/run_rl_lora.sh in the background under nohup, so the run
# keeps going after the invoking shell exits. Every argument is a key=value
# pair handed to that script unchanged.
#
# Reads from earlier in this script: model_name, train_file,
# model_name_or_path, deepspeed_config_path, reward_func, reward_weights,
# system_prompt.
#
# All expansions are double-quoted on purpose: reward_func and reward_weights
# contain '[...]', which an unquoted expansion would expose to pathname
# expansion (glob bracket expressions), and quoting also keeps values with
# whitespace as a single argument.
#
# NOTE(review): max_seq_length=1024 is smaller than max_new_tokens=2048;
# confirm against run_rl_lora.sh that this combination is intended.

# Module names passed as target_modules (presumably the projection layers the
# LoRA adapters attach to; verify against run_rl_lora.sh). The third entry
# was previously misspelled 'v_poj'; it is 'v_proj', matching its siblings.
target_modules="['q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj']"

nohup bash scripts/run_rl_lora.sh model_name="${model_name}" base_model="DeepSeek-R1-Distill-Qwen-1.5B" \
    train_file="${train_file}" model_name_or_path="${model_name_or_path}" deepspeed_config_path="${deepspeed_config_path}" use_flash_attn=False \
    per_device_train_batch_size=8 num_generations=8 target_generations=8 gradient_accumulation_steps=4 gradient_checkpointing=True \
    min_correct_generations=4 min_incorrect_generations=1 p_low=0.15 p_high=0.5 \
    save_strategy=steps save_steps=50 save_total_limit=20 force_postprocessor=True \
    rl_algorithm=grpo kl_coef=0.0 reward_func="${reward_func}" reward_weights="${reward_weights}" \
    use_vllm=True vllm_device=auto vllm_gpu_memory_utilization=0.7 \
    max_new_tokens=2048 do_sample=True temperature=0.7 top_k=60000 top_p=0.95 \
    evaluation_strategy=no validation_split=0 do_eval=False prompt_templates=simple_chat system_prompt="${system_prompt}" \
    torch_dtype=bfloat16 bf16=True fp16=False load_in_8bit=False disable_caching=True \
    max_seq_length=1024 num_train_epochs=5 learning_rate=2e-6 lora_r=32 lora_alpha=32 \
    target_modules="${target_modules}" seed=42 \
    role_tags="{'user':'<｜User｜>','assistant':'<｜Assistant｜>'}" \
    data_generation_task="SimpleRL" data_augmentations=None overwrite_output_dir=True \
    cuda=0,1,2,3,4,5,6,7 warmup_steps=20 prefix=REFT suffix=None &
