# PPO hyperparameter configuration.
# Keys that are commented out below (game, group_name, max_steps, torch_deterministic, seed)
# are not set by this file; presumably they are supplied elsewhere -- TODO confirm.
algorithm_name: ppo  # name of the algorithm
# game: "classical_phantom_ttt"  # OpenSpiel game id
# group_name: null  # wandb group name used to group experiments; if null, no group is created
learning_rate: 0.00025  # learning rate of the optimizer
# max_steps: 10000000  # total timesteps of the experiment
eval_every: 10  # evaluate the policy every N updates
# NOTE(review): the original note read "if toggled, `torch.backends.cudnn.deterministic=False`";
# confirm which value of this key actually makes cuDNN deterministic before re-enabling it.
# torch_deterministic: true
cuda: false  # whether CUDA is enabled; false here, so CUDA is not requested
num_envs: 8  # number of parallel game environments
# Number of steps to run in each environment per policy rollout.
# Batch size is num_steps * num_envs (128 * 8 = 1024 with the values in this file).
num_steps: 128
anneal_lr: true  # toggles learning-rate annealing for the policy and value networks
gae: true  # use generalized advantage estimation (GAE) for advantage computation
gamma: 0.99  # discount factor gamma
gae_lambda: 0.95  # lambda for generalized advantage estimation
num_minibatches: 4  # number of mini-batches
update_epochs: 4  # the K epochs to update the policy
norm_adv: true  # toggles advantage normalization
clip_coef: 0.1  # surrogate clipping coefficient
clip_vloss: true  # toggles whether to use a clipped loss for the value function, as per the paper
ent_coef: 0.05  # coefficient of the entropy term
vf_coef: 0.5  # coefficient of the value-function term
max_grad_norm: 0.5  # maximum norm for gradient clipping
target_kl: null  # target KL divergence threshold; null means no threshold is set
# seed is left unset here. If re-enabled, write `null` rather than `None`:
# YAML has no `None` literal, so `seed: None` would load as the string "None".
# seed: null
