---
# Run-level settings. Author's note on host memory use — RAM: ~5G
seed: 73
# Original note: "use_gpu".
# NOTE(review): presumably the CUDA device index to run on — confirm with the loader.
cuda: 0
# Which environment to run and how many episodes to use for evaluation.
env:
  env_type: credit
  env_name: Catch-40-v0

  # Number of evaluation episodes.
  num_eval_tasks: 20

# Training schedule and replay-buffer settings.
train:
  # 20000*(7*n) = 5M steps
  num_iters: 20000 # number of meta-training iterations
  num_init_rollouts_pool: 5 # rollouts collected before training
  num_rollouts_per_iter: 1

  # NOTE(review): the trailing "1.0" looks like an alternative value the author
  # tried — confirm before relying on it.
  num_updates_per_iter: 0.25 # 1.0

  # buffer params
  buffer_type: seq_efficient
  # Written with an explicit mantissa dot and signed exponent so that YAML 1.1
  # loaders (e.g. PyYAML) also read a float; the bare form `1e6` is resolved as
  # the string "1e6" by those loaders.
  buffer_size: 1.0e+6
  batch_size: 32 # to tune based on sampled_seq_len
  sampled_seq_len: -1 # -1 is all, or positive integer
  sample_weight_baseline: 0.0

# Evaluation and logging settings.
eval:
  # Whether to additionally evaluate the stochastic policy.
  eval_stochastic: false
  # Logging period, counted in training iterations.
  log_interval: 50
  # NOTE(review): -1 presumably disables periodic saving — confirm with the
  # training loop.
  save_interval: -1
  log_tensorboard: true

# Policy architecture and optimisation hyperparameters.
policy:
  seq_model: lstm-mlp # [lstm-mlp, gru-mlp]
  algo_name: sacd # only support sac-discrete

  action_embedding_size: 0 # no need for catch
  observ_embedding_size: 0 # use image encoder instead
  image_encoder:
    from_flattened: true

  reward_embedding_size: 0
  rnn_hidden_size: 128

  dqn_layers: [128, 128]
  policy_layers: [128, 128]
  lr: 0.0003
  gamma: 0.99
  tau: 0.005

  # Settings specific to the algorithm selected by `algo_name: sacd`.
  sacd:
    entropy_alpha: 0.1
    automatic_entropy_tuning: false
    # The ratio: target_entropy = ratio * log(|A|).
    # Uses YAML `null`: a bare `None` is loaded as the string "None", which
    # would slip past any `is None` check in the consumer.
    # NOTE(review): presumably only read when automatic_entropy_tuning is
    # true — confirm with the algorithm code.
    target_entropy: null
    alpha_lr: 0.0003
