# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
defaults:
  # Logging: use the colorlog config for both job and Hydra logging.
  - hydra/job_logging: colorlog
  - hydra/hydra_logging: colorlog
  # Launcher: submitit_slurm (its parameters are set under `hydra.launcher`).
  - hydra/launcher: submitit_slurm

# pip install hydra-core hydra_colorlog hydra-submitit-launcher  (the last one provides the submitit_slurm launcher)
# can set these on the commandline too, e.g. `hydra.launcher.partition=dev`
hydra:
  # Parameters for the launcher selected in `defaults` (submitit_slurm).
  launcher:
    # presumably minutes, going by the key name (4300 min is just under 3 days) — confirm
    timeout_min: 4300
    cpus_per_task: 10
    gpus_per_node: 1
    tasks_per_node: 1
    mem_gb: 20
    nodes: 1
    partition: dev
    comment: null
    max_num_timeout: 5  # will requeue on timeout or preemption

name: null  # set this to run the same params several times, e.g. `name=1,2,3,4,5`
seed: 123

## WandB settings
wandb: false  # set to true to enable wandb logging
project: mtp   # the project name to log to
entity: nethack  # the team to log to
group: default  # group name for the experiment
tags: 'rllib'  # tags for this run, as one comma-separated string

# Env settings
fn_penalty_step: constant
penalty_time: 0.0
penalty_step: -0.001
reward_lose: 0
reward_win: 1
## typical characters we use
# mon-hum-neu-mal
# val-dwa-law-fem
# wiz-elf-cha-mal
# tou-hum-neu-fem
character: mon-hum-neu-mal
save_tty: false
# comma-separated list of observation keys, passed as a single string
obs_keys: "glyphs,chars,colors,specials,blstats,message"

env: score  # task name; the known task names are listed below
## NLE tasks:
# staircase, score, pet, oracle, gold, eat, scout
## MiniHack tasks. See nle/agent/common/envs/tasks for all tasks.
# small_room, small_room_random, small_room_dark, small_room_monster,
# small_room_trap, big_room, big_room_random, big_room_dark, big_room_monster,
# big_room_monster_trap, corridor2, corridor3, corridor5, corridor8,
# corridor10, keyroom_small_fixed, keyroom_small, keyroom_small_dark,
# keyroom_big, keyroom_big_dark, mazewalk_small, mazewalk_small_mapped,
# mazewalk_big, mazewalk_big_mapped, mazewalk_huge, mazewalk_huge_mapped,
# fight_corridor, fight_corridor_dark, river, river_lava, river_monster,
# river_monsterlava, multiroom_2, multiroom_4, multiroom_6, multiroom_2_locked,
# multiroom_4_locked, multiroom_6_locked, multiroom_2_trap, multiroom_4_trap,
# multiroom_6_trap, multiroom_2_monster, multiroom_4_monster,
# multiroom_6_monster, multiroom_2_extreme, multiroom_4_extreme,
# multiroom_6_extreme, boxoban_hard, boxoban_medium, mini_eat, mini_pray,
# mini_sink

# Training settings.
num_gpus: 1
num_cpus: 8
# should be at least batch_size (presumably `train_batch_size` below — confirm)
num_actors: 256
# 1e9 used in paper. Written as 1.0e+9 because a bare `1e9` (no '.', unsigned
# exponent) is read as the string "1e9" by YAML 1.1 loaders such as plain PyYAML.
total_steps: 1.0e+9
# 32 is standard, can use 128 with small model variants
train_batch_size: 32
# 80 is standard
unroll_length: 80

# Model settings.
# options: random, baseline, rnd, ride
model: baseline
use_lstm: true
# use at least 128, 256 is stronger
hidden_dim: 256
# use at least 32, 64 is stronger
embedding_dim: 64
# options: full, group_id, color_char, all, all_cat* (all_cat best, full fastest)
glyph_type: all_cat
# project inputs to same dim (*false unless doing dynamics)
equalize_input_dim: false
# multiplies hdim by this when equalize is enabled (2 > 1)
equalize_factor: 2
# number of cnn layers for crop/glyph model
layers: 5
crop_model: cnn
# size of crop
crop_dim: 9
# use index select instead of normal embedding lookup
use_index_select: true

msg:
  # character model? options: none, lt_cnn*, cnn, gru, lstm
  model: lt_cnn
  # recommend 256
  hidden_dim: 64
  # recommend 64
  embedding_dim: 32

# Experimental settings.
# Note: a plain `none` is the string "none" in YAML, not null.
state_counter: none        # none, coordinates

# Generic Loss settings.
lr: 0.0002
gamma: 0.999       # probably a bit better at 0.999, esp. with intrinsic reward
# Note: a plain `none` is the string "none" in YAML, not null.
reward_clipping: none    # use none with normalize_reward, else use tim

# Optimizer settings.
decay: 0.99        # 0.99 vs 0.9 vs 0.5 seems to make no difference
momentum: 0        # keep at 0
grad_clip: 40  # presumably a gradient clipping threshold — confirm against the trainer

# Algorithm-specific settings. These can override settings above (e.g. learning rate)
algo: impala  # must match one of the keys below (impala, dqn, ppo, a2c)

impala:
  # 0.001 is better than 0.0001
  entropy_coeff: 0.001
  vf_loss_coeff: 0.5
  # do not use 0.01; 1e-6 seems the same as 1e-8
  epsilon: 0.000001

dqn:
  # Settings under the `dqn` key, selected with `algo: dqn`.
  double_q: true
  dueling: true
  noisy: false
  prioritized_replay: true
  n_step: 3
  buffer_size: 2000000
  target_network_update_freq: 50000
  prioritized_replay_beta_annealing_timesteps: 500000
  learning_starts: 50000
  model:
    use_lstm: false  # differs from the top-level `use_lstm: true`
  exploration_config:
    epsilon_timesteps: 1000000

ppo:
  # Settings under the `ppo` key, selected with `algo: ppo`.
  rollout_fragment_length: 256
  train_batch_size: 4096  # differs from the top-level `train_batch_size: 32`
  sgd_minibatch_size: 256
  num_sgd_iter: 20
  entropy_coeff: 0.0001
  vf_loss_coeff: 0.5
  model:
    vf_share_layers: true

a2c:
  # Settings under the `a2c` key, selected with `algo: a2c`.
  rollout_fragment_length: 32
  train_batch_size: 256  # differs from the top-level `train_batch_size: 32`
  entropy_coeff: 0.0001
  vf_loss_coeff: 0.5
  sample_async: false


# sac:
#   learning_starts: 50000
#   prioritized_replay_beta_annealing_timesteps: 500000
#   train_batch_size: 256
#   rollout_fragment_length: 1
#   model:
#     use_lstm: False
