# Placeholder: the effective value should be set from the training batch size.
# Written as an explicit YAML null -- a bare `None` is not a null in YAML and
# would load as the string "None".
batch_size: null
ppo_buffer_size: 256
sample_batch_size: 16
ppo_mini_batch_size: 4  # ppo_discriminator_batch_size
ppo_epoch: 1
ppo_epsilon: 0.2
mix_human_demo_init_ratio: 0
mix_human_demo_ratio_warmup_steps: 100
# Pre-train the discriminator first, for this many steps.
discriminator_pretrain_steps: 0
# Whether a human demonstration's reward is determined by the reward function.
# NOTE(review): the key name suggests `true` means a constant reward instead;
# confirm the polarity against the consumer.
constant_human_demo_reward: true
# Autoregressive generation can introduce precision problems, so the
# log-probs may need to be recomputed.
recompute_log_probs: true
random_seed: 13
num_gpus_per_node: 2