# Experiment spec (Ray Tune / RLlib layout): PPO on Walker2d-v3 using the
# project's custom recurrent model ("Rnn") with an LSTM core.
walker2d-lstm:
  # Placeholder: YAML does not expand "$LOCAL_DIR" itself; presumably the
  # launcher substitutes it before the experiment starts -- TODO confirm.
  local_dir: $LOCAL_DIR
  env: Walker2d-v3
  run: PPO
  num_samples: 1
  checkpoint_freq: 1
  checkpoint_at_end: true
  # Stopping criteria. The reward threshold is set very high, so in practice
  # the timestep budget is what is expected to end the run.
  stop:
    episode_reward_mean: 99999
    timesteps_total: 12000000
  config:
    num_workers: 16  # NOTE: set to 0 and use local mode for debugging
    # CPU-only: no GPU is requested for the trainer or for the workers.
    num_gpus: 0
    num_gpus_per_worker: 0
    framework: torch
    observation_filter: MeanStdFilter
    batch_mode: complete_episodes
    # # MDP-related
    # gamma: 0.995
    # rollout_fragment_length: 200
    # # sgd-related
    lr: 0.0001
    # # lr_schedule: null
    num_sgd_iter: 20
    train_batch_size: 320000
    sgd_minibatch_size: 32768
    # # value function
    # # vf_loss_coeff: 0.01
    # # # PPO-specific
    # # entropy_coeff: 0.001
    # # entropy_coeff_schedule: null
    # # lambda: 0.95
    kl_coeff: 0.1
    # # kl_target: 0.01
    # # clip_param: 0.3
    # model
    model:
      vf_share_layers: false
      custom_model: Rnn
      custom_action_dist: null  # TorchBeta # prevent inf logprob
      custom_model_config:
        # The observation space must be specified here because RLlib flattens
        # observations before passing them to the model.
        # Walker2d-v3 observations have 17 elements (11 is Hopper's size), so
        # the shape has to be [17] to match `env` above.
        obs_shape: [[17]]
        # the MLP
        # NOTE(review): the first fc_filters entry is assumed to be the input
        # width and is therefore kept equal to the observation size -- confirm
        # against the Rnn model implementation.
        module_1:
          fc_filters: [17, 256]
          fc_activation: tanh
        # recurrent network
        module_2:
          type: lstm
          cell_size: 10
        # action distribution
        # custom_action_dist_config:
        #   low: -1.0
        #   high: 1.0
        # value function
        # NOTE(review): same assumption as module_1 -- the first entry mirrors
        # the observation size.
        value_module:
          fc_filters: [17, 256, 256]
          fc_activation: tanh
