# Training run configuration: rho-1b SFT checkpoint on the GSM8K dataset.
# NOTE(review): the key names look like Hugging Face TRL on-policy trainer
# arguments (PPO/RLOO style) plus project-specific extras such as
# output_global_parent_dir; confirm against the script that loads this file.

# Output locations.
output_global_parent_dir: results
output_dir: tb_rho1b_gsm8k

# Dataset and model checkpoints.
dataset_name: openai/gsm8k
# Both model paths point at the same SFT checkpoint.
model_name_or_path: realtreetune/rho-1b-sft-GSM8K
sft_model_path: realtreetune/rho-1b-sft-GSM8K

# Numeric precision: bf16 is enabled; the fp16 alternative is kept disabled.
bf16: true
# fp16: true

# Training budget and optimisation.
total_episodes: 140000
num_ppo_epochs: 1
num_mini_batches: 1
# Written with a decimal point on purpose: YAML 1.1 loaders such as PyYAML
# read a bare `1e-5` as the string '1e-5' instead of a float.
learning_rate: 1.0e-5

# Batch sizing: 20 samples x 7 accumulation steps = 140 samples per device
# between optimizer updates.
per_device_train_batch_size: 20
per_device_eval_batch_size: 128
gradient_accumulation_steps: 7
local_rollout_forward_batch_size: 72

# Generation settings.
response_length: 512
non_eos_penalty: false
stop_token: eos
num_sample_generations: 0

# Checkpointing and logging.
save_strategy: steps
# NOTE(review): a fractional save_steps is presumably read as a ratio of the
# total training steps (Hugging Face TrainingArguments convention), i.e. a
# checkpoint every 25% of the run; verify with the consuming trainer.
save_steps: 0.25
hub_strategy: all_checkpoints
logging_steps: 100