# Top-level run settings.
seed: 71
# NOTE(review): the original note only says "use_gpu"; whether 1 is an
# on/off flag or a device index is not visible here -- confirm in the loader.
cuda: 1  # use_gpu
# Environment selection and evaluation-task settings.
env:
  env_type: rmdp
  env_name: MRPOHopperRandomNormal-v0

  num_eval_tasks: 100
  # NOTE(review): presumably the fraction of eval tasks treated as the
  # worst case when reporting results -- confirm against the evaluation code.
  worst_percentile: 0.10

# Training-loop and replay-buffer settings.
train:
  # 3000 * 1000 = 3M steps; costs about 8h (train) + 12h (test)
  num_iters: 3000  # number of meta-training iterations
  num_init_rollouts_pool: 10  # before training
  num_rollouts_per_iter: 1
  # Written with an explicit dot and a signed exponent so that YAML 1.1
  # loaders (e.g. PyYAML) also read it as a float; a bare `1e6` is loaded
  # as the string "1e6" by those loaders.
  buffer_size: 1.0e+6  # we should use the full buffer size, as in VRM

  # Equivalent to 0.2 * 1000 = 200 steps; this value is now fixed.
  num_updates_per_iter: 0.2
  batch_size: 64  # to tune based on sampled_seq_len
  sampled_seq_len: 64  # -1 is all, or a positive integer
  sample_weight_baseline: 0.0  # VRM adds weight

# Evaluation, logging, and checkpointing settings.
eval:
  eval_stochastic: false  # whether to also evaluate the stochastic policy
  log_interval: 50  # in number of iters (alternative value: 100)
  save_interval: 50  # alternative value: -1
  log_tensorboard: true

# Policy architecture and optimizer settings.
policy:
  seq_model: gru  # one of [lstm, gru]
  algo_name: sac  # one of [td3, sac]

  action_embedding_size: 16  # alternative value: 0
  observ_embedding_size: 32
  reward_embedding_size: 0  # r(s,a,s') does not contain hidden state (mass)
  rnn_hidden_size: 128

  dqn_layers: [256, 256]
  policy_layers: [256, 256]
  lr: 0.0003
  gamma: 0.99
  tau: 0.005

  # Algorithm-specific settings.
  # NOTE(review): presumably only the sub-section matching algo_name is
  # read -- confirm in the consuming code.
  sac:
    entropy_alpha: 0.2
    automatic_entropy_tuning: true
    alpha_lr: 0.0003

  td3:
    # Since we normalize the action space to [-1, 1],
    # the noise std is an absolute value.
    exploration_noise: 0.1
    target_noise: 0.2
    target_noise_clip: 0.5