# RLlib experiment spec: PPO with a custom recurrent model ("Rnn") on
# CartPole-v0.
cartpole-lstm:
  # $LOCAL_DIR is a placeholder; presumably substituted by whatever launches
  # this experiment before the file is consumed -- TODO confirm.
  local_dir: $LOCAL_DIR
  env: CartPole-v0
  run: PPO
  num_samples: 1
  # Checkpointing: periodic (checkpoint_freq) plus a final one at the end.
  checkpoint_freq: 1
  checkpoint_at_end: true
  # Stopping criteria for the run.
  stop:
    episode_reward_mean: 150  # alternate value noted by the author: 99999
    timesteps_total: 100000
  config:
    num_workers: 8  # NOTE: set to 0 and use local mode for debugging
    # CPU-only: no GPUs for the trainer or for the workers.
    num_gpus: 0
    num_gpus_per_worker: 0
    framework: torch
    observation_filter: MeanStdFilter
    # MDP-related
    gamma: 0.99
    rollout_fragment_length: 200
    # SGD-related
    lr: 0.0003
    # lr_schedule: null
    num_sgd_iter: 6
    # train_batch_size: 10240
    # sgd_minibatch_size: 512
    # Value function
    vf_loss_coeff: 0.01
    # PPO-specific (all commented out; uncomment a line to override it)
    # entropy_coeff: 0.001
    # entropy_coeff_schedule: null
    # lambda: 0.95
    # kl_coeff: 0.2
    # kl_target: 0.01
    # clip_param: 0.3
    # Model
    model:
      vf_share_layers: false
      custom_model: Rnn
      # No custom action distribution. Author's note: "TorchBeta" is the
      # alternative, used to prevent inf logprob.
      custom_action_dist: null
      custom_model_config:
        # The observation shape must be given explicitly because RLlib
        # flattens the observation before passing it to the model.
        obs_shape: [[4]]
        # The MLP
        module_1:
          fc_filters: [4, 10]
          fc_activation: relu
        # Recurrent network
        module_2:
          type: lstm
          cell_size: 4
        # Action distribution (commented out together with TorchBeta above)
        # custom_action_dist_config:
        #   low: -1.0
        #   high: 1.0
        # Value function
        value_module:
          fc_filters: [4, 32]
          fc_activation: relu
