# Experiment "pendulum-lstm": PPO on Pendulum-v1 using the custom model "Rnn",
# configured below with an MLP (module_1), an LSTM (module_2) and a value module.
pendulum-lstm:
  # NOTE(review): $LOCAL_DIR looks like a placeholder that must be substituted
  # before this file is loaded; confirm against the launcher.
  local_dir: $LOCAL_DIR
  env: Pendulum-v1
  run: PPO
  num_samples: 1
  checkpoint_freq: 1
  checkpoint_at_end: true
  # Stopping criteria for the run.
  stop:
    episode_reward_mean: -500
    timesteps_total: 1000000
  config:
    # NOTE: set num_workers to 0 and use local mode for debugging.
    num_workers: 0
    num_envs_per_worker: 20
    num_gpus: 0
    num_gpus_per_worker: 0
    framework: torch
    observation_filter: MeanStdFilter
    # MDP-related
    gamma: 0.95
    # rollout_fragment_length: 200
    # SGD-related
    lr: 0.0003
    # lr_schedule: null
    num_sgd_iter: 6
    train_batch_size: 512
    sgd_minibatch_size: 64
    # Value function
    # vf_loss_coeff: 0.01
    vf_clip_param: 10.0
    # PPO-specific
    # entropy_coeff: 0.001
    # entropy_coeff_schedule: null
    lambda: 0.95
    # kl_coeff: 0.2
    # kl_target: 0.01
    # clip_param: 0.3
    # Model
    model:
      vf_share_layers: false
      custom_model: Rnn
      # Alternative: TorchBeta, to prevent inf logprob.
      custom_action_dist: null
      custom_model_config:
        # The observation shape must be given explicitly because RLlib
        # flattens the observation before passing it to the model.
        obs_shape: [[3]]
        # The MLP
        module_1:
          # Alternative: [3, 64]
          fc_filters: [3, 10]
          fc_activation: tanh
        # Recurrent network
        module_2:
          type: lstm
          cell_size: 4
        # Action distribution
        # custom_action_dist_config:
        #   low: -1.0
        #   high: 1.0
        # Value function
        value_module:
          # Alternative: [3, 32]
          fc_filters: [3, 64, 64]
          fc_activation: tanh
