#!/usr/bin/env bash
# Process environment for the distributed training launch in this file.
# Everything here is exported so that child processes inherit it.

# CUDA caching-allocator setting, handed to PyTorch unchanged.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# NCCL log level for the run.
export NCCL_DEBUG=INFO

# Blocking-wait switch for PyTorch's NCCL process group.
export TORCH_NCCL_BLOCKING_WAIT=1

# Async error handling for PyTorch's NCCL process group. Current PyTorch
# documents the TORCH_-prefixed name and treats the unprefixed one as a
# deprecated alias; older releases only read the unprefixed name. Export both
# with the same value so the setting is honoured either way.
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_ASYNC_ERROR_HANDLING=1

# Turn off NCCL's InfiniBand transport.
export NCCL_IB_DISABLE=1

# Model, dataset and checkpoint locations used by the launch command.
# Each one may be overridden from the calling environment, e.g.
#   MODEL_PATH=/models/other DATA_PATH=other.json sh this_script.sh
# The defaults below apply when the variable is unset or empty.
MODEL_PATH="${MODEL_PATH:-qwen3_8b}"
DATA_PATH="${DATA_PATH:-grpo_entropy_dataset_filtered.json}"
OUTPUT_DIR="${OUTPUT_DIR:-./offline_sft_grpo_entropy_full_ckpts}"


# Launch offline_grpo_ddp_reward.py through torchrun.
#
# Environment:
#   NPROC_PER_NODE  number of worker processes torchrun starts on this node
#                   (default: 4).
#   MODEL_PATH, DATA_PATH, OUTPUT_DIR  set earlier in this file.
#
# Arguments:
#   Any arguments given to this script are appended after the fixed flags and
#   passed to offline_grpo_ddp_reward.py unchanged. With no arguments the
#   command line is identical to the fixed one below.
#   NOTE(review): whether a repeated flag overrides the earlier value depends
#   on the training script's argument parser - confirm before relying on it.
#
# The meaning of each training flag is defined by offline_grpo_ddp_reward.py;
# the values here are forwarded as-is.
torchrun --nproc_per_node="${NPROC_PER_NODE:-4}" offline_grpo_ddp_reward.py \
  --model_name_or_path "$MODEL_PATH" \
  --data_path "$DATA_PATH" \
  --output_dir "$OUTPUT_DIR" \
  --epochs 5 \
  --batch_size 1 \
  --grad_accum_steps 2 \
  --num_iterations 1 \
  --max_prompt_len 4096 \
  --max_response_len 2048 \
  --max_group 6 \
  --num_workers 0 \
  --lr 1e-6 \
  --beta 0.01 \
  --save_steps 1000 \
  --prompt_field "prompt" \
  --responses_field "responses" \
  --bf16 \
  "$@"
