---
# Training run configuration: a flat mapping of option names to values.
# NOTE(review): the stanza headings below are inferred from the key names;
# confirm exact semantics against the script that consumes this file.

# Output location and dataset
output_global_parent_dir: results
dataset_name: vwxyzjn/summarize_from_feedback_tldr_3_filtered_oai_preprocessing_1706381144
dataset_test_split: validation

# Sequence lengths, precision and optimisation schedule
max_length: 512
fp16: true
total_episodes: 40960
response_length: 128
num_ppo_epochs: 1
num_mini_batches: 1
learning_rate: 3.0e-6
per_device_train_batch_size: 20
gradient_accumulation_steps: 8

# Model checkpoints
# model_name_or_path and sft_model_path point at the same checkpoint.
model_name_or_path: mnoukhov/pythia2.8b-sft-tldr
sft_model_path: mnoukhov/pythia2.8b-sft-tldr
# NOTE(review): this path mentions both "2.8b" and "6.9b" -- confirm it is the
# intended reward model for this run.
reward_model_path: mnoukhov/pythia2.8b-rm-tldr6.9b

# Rollout / generation
local_rollout_forward_batch_size: 10
non_eos_penalty: true
stop_token: eos

# Checkpointing, Hub upload and logging
save_strategy: steps
# NOTE(review): fractional value -- presumably read as a fraction of the total
# training steps rather than an absolute step count; verify with the consumer.
save_steps: 0.25
# NOTE(review): presumably only takes effect when pushing to the Hub is
# enabled elsewhere (not set in this file) -- confirm.
hub_strategy: all_checkpoints
logging_steps: 100
num_sample_generations: 5