# Experiment spec: PPO with a custom recurrent (LSTM) model on HalfCheetah-v2.
halfcheetah-lstm:
  # Placeholder for the results directory. YAML does not expand environment
  # variables, so the launcher presumably substitutes this value.
  # NOTE(review): confirm where $LOCAL_DIR is resolved.
  local_dir: $LOCAL_DIR
  env: HalfCheetah-v2
  run: PPO
  num_samples: 1
  checkpoint_freq: 10
  checkpoint_at_end: true
  # Stopping criteria.
  # NOTE(review): the run is expected to end as soon as either one is met;
  # confirm against the Tune version in use.
  stop:
    episode_reward_mean: 9800
    timesteps_total: 12000000
  config:
    num_workers: 16  # NOTE: set to 0 and use local mode for debugging
    # CPU-only: no GPU is requested for the trainer or for the workers.
    num_gpus: 0
    num_gpus_per_worker: 0
    framework: torch
    observation_filter: MeanStdFilter

    # MDP-related
    gamma: 0.99
    # rollout_fragment_length: 200
    batch_mode: truncate_episodes

    # SGD-related
    lr: 0.0003
    # lr_schedule: null
    num_sgd_iter: 32
    # 65536 / 4096 = 16 minibatches per pass over the train batch.
    sgd_minibatch_size: 4096
    train_batch_size: 65536

    # value function
    vf_loss_coeff: 0.5

    # PPO-specific
    grad_clip: 0.5
    # entropy_coeff: 0.001
    # entropy_coeff_schedule: null
    lambda: 0.95
    kl_coeff: 1.0
    # kl_target: 0.01
    clip_param: 0.2

    # model
    model:
      vf_share_layers: false
      # NOTE(review): "Rnn" is presumably a custom model registered elsewhere
      # in the project; it is not defined in this file.
      custom_model: Rnn
      custom_action_dist: null  # TorchBeta # prevent inf logprob
      custom_model_config:
        # The observation shape must be given explicitly because RLlib
        # flattens the observation before passing it to the model.
        obs_shape: [[17]]
        # the MLP
        # NOTE(review): the first entry (17) matches the observation size in
        # obs_shape; presumably input width followed by layer widths. Confirm
        # against the Rnn model.
        module_1:
          fc_filters: [17, 256]
          fc_activation: tanh
        # recurrent network
        module_2:
          type: lstm
          cell_size: 10
        # action distribution (only relevant when custom_action_dist is set)
        # custom_action_dist_config:
        #   low: -1.0
        #   high: 1.0
        # value function
        value_module:
          fc_filters: [17, 256, 256]
          fc_activation: tanh
