# Agent configuration. `_target_` names the class these settings belong to
# (presumably instantiated through Hydra's `_target_` convention — confirm).
_target_: sampler.ppo.PPOAgent
# Random seeds
seed: 0
# Optimizer
optimizer:
  # Learning rate and learning-rate decay settings
  lr: 0.0001
  lr_decay_period: 1000000
  lr_decay_gamma: 0.5
  # Optimizer: adam, sgd
  method: adam
  # Threshold loss for early stopping
  early_stopping: 0.0
  # Coefficient for exponential moving average
  ema_alpha: 0.5
  # Adam hyperparameters
  adam_beta1: 0.9
  adam_beta2: 0.999
  # Epsilon borrowed from a blog post. Written with an explicit mantissa dot
  # because YAML 1.1 loaders (e.g. PyYAML) read a bare `1e-5` as a string.
  adam_eps: 1.0e-5
  # Momentum for SGD
  sgd_momentum: 0.9
  # Mini-batch size
  batch_size: 2
  # Number of training iterations
  n_train_steps: 5000
  # From original implementation
  bootstrap_tau: 0.0
  clip_grad_norm: 0.0
# If True, compute rewards in batches
batch_reward: True
# Force zero probability of sampling invalid actions
mask_invalid_actions: True
# Temperature applied to the logits: logits /= temperature_logits
temperature_logits: 1.0
# Probability of taking a random action
# NOTE(review): previously described as a "percentage"; the key name says
# probability, so the value is presumably a fraction in [0, 1] — confirm.
random_action_prob: 0.001
# Percentage of trajectories in a batch from an empirical distribution
# NOTE(review): unit (fraction vs. 0-100) is not visible here — confirm.
pct_offline: 0.0
# Policy settings
policy:
  # NOTE(review): presumably hidden-layer width and number of layers — confirm
  n_hid: 256
  n_layers: 2
  # Both checkpoint options are unset (null)
  checkpoint: null
  ckpt_period: null
active_learning: True
# PPO hyperparameters
# NOTE(review): the inline tags "ttsr" / "sttr" are kept from the original
# file; their meaning is not evident here — confirm or remove.
ppo_num_epochs: 16  # ttsr
ppo_epoch_size: 32  # sttr
ppo_clip: 0.1
# Written with an explicit mantissa dot because YAML 1.1 loaders
# (e.g. PyYAML) read a bare `1e-3` as a string rather than a float.
ppo_entropy_coef: 1.0e-3
sample_only: False
