#!/bin/bash


# GRPO full-parameter training run for Qwen/Qwen2.5-7B-Instruct.
#
# Launches `swift rlhf` with 6 processes on GPUs 0-5. Generation is delegated
# to a vLLM server (--vllm_mode server) expected at 127.0.0.1:8000.
# NOTE(review): presumably that server has to be started separately and must
# serve this same model before the command below runs — confirm.
qwen25_grpo_args=(
  # --- algorithm / precision ---
  --rlhf_type grpo
  --train_type full
  --torch_dtype bfloat16

  # --- model ---
  --model Qwen/Qwen2.5-7B-Instruct
  --model_type 'qwen2_5'
  --use_hf true

  # --- data ---
  --dataset '/path/to/home/lltm-h200/data_stable/Pytracify_deleted.jsonl'
  --val_dataset '/path/to/home/lltm-h200/data_stable/CruxEval.jsonl'

  # --- reward: "exactmatch" is loaded from the external plugin file ---
  --external_plugins /path/to/home/lltm-cp-h200/reward_funcs/exactmatch.py
  --reward_funcs exactmatch

  # --- rollout backend ---
  --use_vllm true
  --vllm_mode server
  --vllm_server_host 127.0.0.1
  --vllm_server_port 8000

  # --- optimisation schedule ---
  --learning_rate 1e-6
  --warmup_ratio 0.01
  --max_completion_length 8192
  # NOTE(review): both an epoch count and a step cap are given; which one
  # ends training first depends on the trainer — verify the intended limit.
  --num_train_epochs 1
  --max_steps 10000

  # --- batch geometry ---
  # 6 processes x 4 per device x 2 accumulation steps = 48, a multiple of
  # num_generations (8). NOTE(review): assumed to be the divisibility GRPO
  # needs — confirm against the trainer's checks.
  --per_device_train_batch_size 4
  --per_device_eval_batch_size 4
  --num_generations 8
  --beta 0.001
  --num_iterations 1
  --gradient_accumulation_steps 2

  # --- evaluation / checkpointing / logging ---
  --eval_steps 100
  --eval_limit 96
  --save_steps 100
  --save_total_limit 2
  --logging_steps 1
  # NOTE(review): this value looks like a log *file* name but is passed as
  # the output *directory*; kept as-is to avoid moving existing artifacts.
  --output_dir swift_logs/swift_train.log
  --dataloader_num_workers 4

  # --- sampling ---
  --temperature 1.0
  --top_p 0.9
  --top_k 50

  # --- distributed backend ---
  --deepspeed zero3
)

CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 NPROC_PER_NODE=6 \
  swift rlhf "${qwen25_grpo_args[@]}"


# GRPO full-parameter training run for Qwen/Qwen3-8B, using the "_nothink"
# variants of the training and validation datasets.
#
# Launches `swift rlhf` with 6 processes on GPUs 0-5. Generation is delegated
# to a vLLM server (--vllm_mode server) expected at 127.0.0.1:8000.
# NOTE(review): presumably that server has to be started separately and must
# serve this same model before the command below runs — confirm.
qwen3_grpo_args=(
  # --- algorithm / precision ---
  --rlhf_type grpo
  --train_type full
  --torch_dtype bfloat16

  # --- model ---
  --model Qwen/Qwen3-8B
  --model_type 'qwen3'
  --use_hf true

  # --- data ---
  --dataset '/path/to/home/lltm/13_msswift/datasets/Pytracify_deleted_nothink.jsonl'
  --val_dataset '/path/to/home/lltm/13_msswift/datasets/CruxEval_nothink.jsonl'

  # --- reward: "exactmatch" is loaded from the external plugin file ---
  --external_plugins /path/to/home/lltm-cp-h200/reward_funcs/exactmatch.py
  --reward_funcs exactmatch

  # --- rollout backend ---
  --use_vllm true
  --vllm_mode server
  --vllm_server_host 127.0.0.1
  --vllm_server_port 8000

  # --- optimisation schedule ---
  --learning_rate 1e-6
  --warmup_ratio 0.01
  --max_completion_length 8192
  # NOTE(review): both an epoch count and a step cap are given; which one
  # ends training first depends on the trainer — verify the intended limit.
  --num_train_epochs 1
  --max_steps 10000

  # --- batch geometry ---
  # 6 processes x 4 per device x 2 accumulation steps = 48, a multiple of
  # num_generations (8). NOTE(review): assumed to be the divisibility GRPO
  # needs — confirm against the trainer's checks.
  --per_device_train_batch_size 4
  --per_device_eval_batch_size 4
  --num_generations 8
  --beta 0.001
  --num_iterations 1
  --gradient_accumulation_steps 2

  # --- evaluation / checkpointing / logging ---
  --eval_steps 100
  --eval_limit 96
  --save_steps 100
  --save_total_limit 2
  --logging_steps 1
  # NOTE(review): this value looks like a log *file* name but is passed as
  # the output *directory*; kept as-is to avoid moving existing artifacts.
  --output_dir swift_logs/swift_train.log
  --dataloader_num_workers 4

  # --- sampling ---
  --temperature 1.0
  --top_p 0.9
  --top_k 50

  # --- distributed backend ---
  --deepspeed zero3
)

CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 NPROC_PER_NODE=6 \
  swift rlhf "${qwen3_grpo_args[@]}"



# GRPO full-parameter training run for allenai/OLMo-2-1124-7B-Instruct.
#
# Launches `swift rlhf` with 6 processes on GPUs 0-5. Generation is delegated
# to a vLLM server (--vllm_mode server) expected at 127.0.0.1:8000.
# NOTE(review): presumably that server has to be started separately and must
# serve this same model before the command below runs — confirm.
#
# The arguments are held in an array so that the final option needs no
# trailing line continuation: a dangling "\" after the last argument would
# silently absorb whatever line is later added directly below the command.
olmo2_grpo_args=(
  # --- algorithm / precision ---
  --rlhf_type grpo
  --train_type full
  --torch_dtype bfloat16

  # --- model ---
  --model allenai/OLMo-2-1124-7B-Instruct
  --use_hf true
  # NOTE(review): the "olmo2" model type and template are presumably
  # registered by the custom register file below — verify.
  --model_type olmo2
  --template olmo2
  --custom_register_path /path/to/home/lltm/13_msswift/custom_register_olmo2.py

  # --- data ---
  --dataset '/path/to/home/lltm/13_msswift/datasets/Pytracify_deleted.jsonl'
  --val_dataset '/path/to/home/lltm/13_msswift/datasets/CruxEval.jsonl'

  # --- reward: "exactmatch" is loaded from the external plugin file ---
  --external_plugins /path/to/home/lltm-cp-h200/reward_funcs/exactmatch.py
  --reward_funcs exactmatch

  # --- rollout backend ---
  --vllm_mode server
  --vllm_server_host 127.0.0.1
  --vllm_server_port 8000
  --use_vllm true

  # --- optimisation schedule ---
  --learning_rate 1e-6
  --warmup_ratio 0.01
  --max_completion_length 4096
  # NOTE(review): both an epoch count and a step cap are given; which one
  # ends training first depends on the trainer — verify the intended limit.
  --num_train_epochs 1
  --max_steps 10000

  # --- batch geometry ---
  # 6 processes x 4 per device x 2 accumulation steps = 48, a multiple of
  # num_generations (8). NOTE(review): assumed to be the divisibility GRPO
  # needs — confirm against the trainer's checks.
  --per_device_train_batch_size 4
  --per_device_eval_batch_size 4
  --num_generations 8
  --beta 0.001
  --num_iterations 1
  --gradient_accumulation_steps 2

  # --- evaluation / checkpointing / logging ---
  --eval_steps 100
  --eval_limit 96
  --save_steps 100
  --save_total_limit 2
  --logging_steps 1
  --output_dir outputs/olmo2-grpo
  --dataloader_num_workers 4

  # --- sampling ---
  --temperature 1.0
  --top_p 0.9
  --top_k 50

  # --- distributed backend / verbosity ---
  --deepspeed zero2
  --log_level debug
)

# SYS is exported only into the environment of this one command.
# NOTE(review): presumably read as the system prompt by the custom register
# file or template — confirm. The text contains a duplicated "the the"; it is
# left untouched here because changing it changes the prompt the model sees.
SYS='You are given a Python code and an input. Predict the the output of executing the code on the input. First, reason step by step before arriving at an answer. Then, surround the answer as an assertion with <answer> and </answer> tags.' \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 \
NPROC_PER_NODE=6 \
  swift rlhf "${olmo2_grpo_args[@]}"

