# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Hydra runtime settings.
hydra:
  job_logging:
    formatters:
      simple:
        # Log line layout; resolved from the top-level `log_fmt` key.
        format: '${log_fmt}'
  run:
    # Resolved from the top-level `localdir` key.
    dir: '${localdir}'
  job:
    chdir: false

# Core run, optimiser, checkpointing and environment settings.
seed: 0
activation_function: relu
actor_batch_size: 512
add_image_observation: true
adam_beta1: 0.9
adam_beta2: 0.999
adam_eps: 0.0000001
adam_learning_rate: 0.0001
weight_decay: 0.0001
# Used by the LambdaLR scheduler.
warmup_steps: 1
# Set either clip value to null (unquoted) to disable clipping.
appo_clip_policy: 0.1
appo_clip_baseline: 1.0
baseline_cost: 1
batch_size: 256
character: '@'
checkpoint_interval: 600
checkpoint_history_interval: 3600
checkpoint_save_every: 10_000_000
# Quoted so the host:port pair is always read as a string.
connect: '0.0.0.0:4431'
crop_dim: 18
device: 'cuda:0'
discounting: 0.999
# 1.0 is the same as running without GAE; choose 0 < lambda < 1 to
# compromise between bias and variance.
lambda_gae: 1.0
entity: name
entropy_cost: 0.001
env:
  # One of: challenge, staircase, pet, eat, gold, score, scout, oracle.
  name: challenge
  max_episode_steps: 100000
# Spare parameters, useful for wandb grouping.
exp_point: point-A
exp_set: experiment-set
exp_tags: local
fixup_init: true
fn_penalty_step: constant
grad_norm_clipping: 4
# Referenced by the `savedir` interpolation elsewhere in this file.
group: group2
learning_rate: 0.0002
# savedir holds the checkpoint(s), together with the flags and any global
# settings/stats for the training run.
# localdir (a subdirectory of savedir) is meant for logs and anything else
# that is local to a single instance.
localdir: '${savedir}/peers/${local_name}'
# `uid` is a resolver invoked without arguments; it is not a key in this file
# (presumably registered by the application — confirm).
local_name: '${uid:}'
# Logging format string; embeds `local_name` in every record.
log_fmt: '[%(levelname)s:${local_name} %(process)d %(module)s:%(lineno)d %(asctime)s] %(message)s'
log_interval: 20
model: ChaoticDwarvenGPT5
# model: DecisionTransformer
dropout: 0.1
n_layer: 3
n_head: 1
hidden_dim: 512

# scaled
# NOTE(review): the "scaled" label is kept from the original file; presumably
# these keys configure the scaled model variant — confirm against the model code.
h_dim: 1738
msg_hdim: 64
sampling_type: argmax
color_edim: 16
char_edim: 16
use_crop: true
use_charcnn_topline: false
use_parsed_blstats: false
use_condition_bits: true
use_bl_role: false
use_encumbrance: false
temperature: 1.0
normalize_blstats: false
use_crop_norm: true
blstats_version: v2
use_inventory: false
lagged_actions: 0
top_k: 1
top_p: 0.9
screen_kernel_size: 3
no_max_pool: false
screen_conv_blocks: 2
blstats_hdim: 512
fc_after_cnn_hdim: 512

# Normalisation, actor resources, reward shaping and bookkeeping settings.
normalize_advantages: true
normalize_reward: false
num_actor_batches: 2
num_actor_cpus: 10
pixel_size: 6
penalty_step: 0.0
penalty_time: 0.0
# Referenced by the `savedir` interpolation below.
project: nle
rms_alpha: 0.99
rms_epsilon: 0.000001
rms_momentum: 0
reward_clip: 10
reward_scale: 1
# Built from the `project` and `group` keys.
savedir: 'checkpoint/hackrl/${project}/${group}'
dbfilename: '/ttyrecs/ttyrecs.db'
# The literal string "none", not YAML null.
state_counter: 'none'
total_steps: 10_000_000_000
unroll_length: 32
use_bn: false
use_lstm: true
virtual_batch_size: 128
wandb: true

rms_reward_norm: true
initialisation: orthogonal
use_global_advantage_norm: false

# Parameters for models/baseline.py.
baseline:
  embedding_dim: 64
  hidden_dim: 512
  layers: 5
  # Nested sizes under `msg`.
  msg:
    embedding_dim: 32
    hidden_dim: 64
  # Use a restricted action space (only nethack.USEFUL_ACTIONS).
  restrict_action_space: true
  use_index_select: false

# Teacher / kickstarting switches, with their loss and decay values.
run_teacher_hs: false
use_kickstarting: false
use_kickstarting_bc: false
kickstarting_loss: 1.0
kickstarting_decay: 1.0
kickstarting_loss_bc: 1.0
kickstarting_decay_bc: 1.0
kickstarting_path: checkpoint.tar
# Forgetting logging switch and the dataset name it refers to.
log_forgetting: false
forgetting_dataset: bc1

# PPG loss update.
ppg_sleep: false
ppg_kl_loss: 1.0
ppg_sleep_sample_reuse: 6
ppg_sleep_cycles: 2
# Normally 0.5 for value_fn and 0.5 for aux_value_fn, but only one is kept
# here, hence 1.0.
ppg_baseline_cost: 1.0

# Use only tty observations; false is ~10% faster and gives a higher score.
use_tty_only: true
use_prev_action: true
use_inverse_model: false
use_inverse_model_only: false
use_actions: false
use_returns: true
action_hidden_dim: 128
return_hidden_dim: 128
use_timesteps: true
return_to_go: true
linear_time_embeddings: true
score_scale: 40000
# One of: max, mean, percentile, value.
score_target_strategy: value
score_target_percentile: 95
score_target_value: 40000
inverse_loss: 1
augment_inverse_random: false
random_inverse_loss: 2
use_difference_vector: false
use_resnet: false
supervised_loss: 0
supervised_decay: 1.0

# Dataset selection, bootstrap and ttyrec loader settings.
dataset: autoascend
dataset_demigod: false
dataset_highscore: false
dataset_midscore: false
dataset_deep: false
dataset_warmup: 0
dataset_reset: 0
dataset_bootstrap_actions: false
# NOTE(review): absolute, user-specific path — override it per deployment.
dataset_bootstrap_path: /checkpoint/ehambro/saved_models/inverse-may30-dev/checkpoint.tar
bootstrap_pred_max: false
bootstrap_is_kl: false
behavioural_clone: false
ttyrec_batch_size: 512
ttyrec_unroll_length: 32
ttyrec_envpool_size: 4
ttyrec_cpus: 10

# Checkpoint-actor switch, the checkpoint path and the unfreeze step count.
use_checkpoint_actor: false
model_checkpoint_path: /checkpoint/checkpoint.tar
unfreeze_actor_steps: 0

# Evaluation settings.
eval_checkpoint_every: 50_000_000
eval_rollouts: 1024
eval_batch_size: 256
skip_first_eval: false
