# --- Defaults REC-PPO ---

system_name: rec_ppo  # Name of the system.

# --- RL hyperparameters ---
# Learning rates are written with an explicit decimal point (2.0e-4 rather than
# 2e-4) so that YAML 1.1 loaders also read them as floats instead of strings.
actor_lr: 2.0e-4  # Learning rate for the actor network.
critic_lr: 2.0e-4  # Learning rate for the critic network.
# Number of vectorised gradient updates per device.
# TODO: confirm whether this has any effect in rec_ppo.py.
update_batch_size: 1
rollout_length: 64  # Number of environment steps per vectorised environment.
epochs: 4  # Number of PPO epochs per training data batch.
num_minibatches: 8  # Number of minibatches per PPO epoch.
gamma: 0.99  # Discounting factor.
gae_lambda: 0.8  # Lambda value for GAE computation.
ent_coef: 0.01  # Entropy regularisation term for the loss function.
vf_coef: 0.5  # Weight of the critic (value) loss term in the loss function.
max_grad_norm: 1.0  # Maximum norm of the gradients for a weight update.
reward_scaling: 1.0  # Scaling of rewards prior to advantage computation.
decay_learning_rates: true  # Whether learning rates should be linearly decayed during training.
normalize_observations: false  # Whether to normalize observations.
standardize_advantages: true  # Whether to standardize the advantages.

# --- Recurrent hyperparameters ---
# Size of the chunks into which the recurrent sequences are divided during
# training. If unspecified (null), the rollout length is used as the chunk
# size, which means that the recurrent sequences are not divided.
recurrent_chunk_size: null

loss_actor_type: 'ppo-clip'  # Policy loss function. Options: 'ppo-clip', 'ppo-penalty', 'dpo'.
clip_eps: 0.2  # Clipping value for PPO updates and value function.

loss_critic_type: 'clip'  # Value loss function. Options: 'clip', 'unclip'.

# Now only used for assertions, to make sure outer PPO is not used when
# normal PPO is intended.
run_outer_ppo: false

outer_optimizer:
  # ADAM (alternative, currently commented out)
  # _target_: optax.adam
  # b1: 0.9
  # b2: 0.999
  # eps: 1.0e-8
  # eps_root: 0.0
  # nesterov: false

  # SGD
  _target_: optax.sgd
  momentum: null  # If null, momentum is not used.
  # nesterov: false

  # For a constant learning rate use div_factor = final_div_factor = 1.0
  # (this is the default set below).
  # NOTE(review): transition_steps is not set in this file; presumably it is
  # supplied by the code that instantiates the schedule. Confirm.
  learning_rate:
    _target_: optax.cosine_onecycle_schedule
    peak_value: 1.0
    pct_start: 0.1
    div_factor: 1.0
    final_div_factor: 1.0

# Free step is always SGD with momentum.
# NOTE(review): with peak_value 0.0 the scheduled learning rate is zero
# throughout, which presumably disables the free step by default. Confirm.
free_step_learning_rate:
  _target_: optax.cosine_onecycle_schedule
  peak_value: 0.0
  pct_start: 0.1
  div_factor: 1.0
  final_div_factor: 1.0
free_step_momentum: 0.0  # Momentum for the free step optimizer.
