---
# Flat run configuration. Keys are grouped into stanzas by name prefix for
# readability only; the mapping itself stays flat and key order is unchanged.
# NOTE(review): groupings are inferred from key names — confirm against the
# code that consumes this file.

# Task LM, style classifier, and sampling/scoring options.
task_lm: distilgpt2
task_top_k: 10
# `???` is kept verbatim. It matches OmegaConf's "mandatory value" marker, so
# presumably this must be supplied at launch time — TODO confirm with loader.
style_classifier: ???
style_tokenizer: bert-base-uncased
style_batch_size: 32
pad_token: '<|endoftext|>'
num_repeats: 4
num_samples: 16
num_bootstraps: 4
compute_zscore: true
lower_outputs: true
control_output_length: true
# Trailing `"` in the template and `end_punct` are intentional parts of the
# values; both are single-quoted so the double quotes stay literal.
template: '{prompt} "{sentence_1}" "'
end_punct: '"'

# Dataset selection and size/length limits (null = not set here).
dataset: yelp
dataset_seed: null
direction: '0_to_1'
base_path: ./data
max_size: null
max_length: null
max_length_tokenizer: null

# Policy LM and prompt settings.
policy_lm: distilgpt2
hidden_size: 2048
logit_bias: 0.0
fluent: false
fluent_top_k: 20
max_decoding_length: 5
eos_token_id: null
prompt_length: 5
prompt_train_batch_size: 8
prompt_infer_batch_size: 16
source_str: '<|endoftext|>'

# Loss implementation, training mode, target updates, and reward shaping.
sql_loss_impl: v2_v2r_v3_v3r
training_mode: sql-onpolicy
mix_strategy: null
target_update_method: polyak
target_update_steps: null
target_learning_rate: 0.001
reward_shaping: true
reward_shaping_old_min: 0.0
reward_shaping_old_max: 1.0
reward_shaping_new_min: -20.0
reward_shaping_new_max: 80.0

# Decoding parameters.
top_k: 50
top_p: 1.0
num_beams: 1

# Training loop, evaluation, and checkpointing.
train_batch_size: 2
train_shuffle: false
train_drop_last: true
num_train_epochs: 1
max_train_steps: 12000
do_eval: true
eval_batch_size: 16
eval_steps: 50
do_save: true
save_dir: ./outputs
save_steps: 100
learning_rate: 5.0e-05
gradient_clip: true
gradient_clip_norm: 5.0
checkpoint_path: null

# Reproducibility and experiment tracking.
random_seed: 0
report_to_wandb: true
project_name: rl-prompt
run_name: null