---
# Template configuration: every bracketed placeholder ([RUN_NAME], [Path], ...)
# must be replaced with a real value before use. Left as-is, YAML parses each
# placeholder as a single-element flow sequence, not as a string.
run_name: [RUN_NAME]
# Quoted, so the port is loaded as a string rather than an integer.
master_port: "12402"
# Checkpoint to load; the placeholder suggests a path or "None" is expected.
# NOTE(review): a bare `None` is loaded by YAML as the string "None" (YAML null
# is spelled `null`) — confirm how the consumer interprets it.
load_model_path: [Path or None]

# Logging settings. Replace each [Path] placeholder with a real location.
log_config:
  use_tensorboard: true
  tensorboard_log: [Path]
  training_log: [Path]
  evaluation_log: [Path]

# Model settings: top-level scalars first, followed by the encoder, decoder,
# diffusion and causal-block sub-sections.
model_config:
  context_warmup: 512
  max_position_loss_weighting: 48000
  # NOTE(review): action_encode.input_size and the final action_decode
  # hidden_size entry are both 6 while action_dim is 5 — confirm the extra slot
  # is intentional.
  action_dim: 5
  policy_loss_type: CrossEntropy
  # Listed options:
  # "podtawr", "podtaw", "podaw", "odawr", "odtawr", "odtaw", "odaw", "opdtawr"
  rsa_type: podtaw

  # State encoder: Discrete input, input_size 128, hidden_size 512 (the same
  # hidden_size as the other encoders in this file).
  state_encode:
    input_type: Discrete
    input_size: 128
    hidden_size: 512
    dropout: 0.0
    is_frozen: false

  # Action encoder: Discrete input, input_size 6, hidden_size 512.
  action_encode:
    input_type: Discrete
    input_size: 6
    hidden_size: 512
    dropout: 0.0
    is_frozen: false

  # Prompt encoder: Discrete input, input_size 8, hidden_size 512.
  prompt_encode:
    input_type: Discrete
    input_size: 8
    hidden_size: 512
    dropout: 0.0
    is_frozen: false

  # Tag encoder: Discrete input, input_size 8, hidden_size 512.
  tag_encode:
    input_type: Discrete
    input_size: 8
    hidden_size: 512
    dropout: 0.0
    is_frozen: false

  # Reward encoder: the only encoder here with Continuous input (input_size 1).
  reward_encode:
    input_type: Continuous
    input_size: 1
    hidden_size: 512
    dropout: 0.0
    is_frozen: false

  # State decoder: Discrete output. hidden_size lists 512 then 128; the last
  # entry equals state_encode.input_size.
  state_decode:
    output_type: Discrete
    input_size: 512
    hidden_size: [512, 128]
    layer_norm: true
    residual_connect: true
    dropout: 0.0
    is_frozen: false

  # Action decoder: Discrete output. hidden_size lists 512 then 6; the last
  # entry equals action_encode.input_size.
  action_decode:
    output_type: Discrete
    input_size: 512
    hidden_size: [512, 6]
    layer_norm: true
    residual_connect: true
    dropout: 0.0
    is_frozen: false

  # Reward decoder: Continuous output. hidden_size lists 512 then 1; the last
  # entry equals reward_encode.input_size.
  reward_decode:
    output_type: Continuous
    input_size: 512
    hidden_size: [512, 1]
    layer_norm: true
    residual_connect: true
    dropout: 0.0
    is_frozen: false

  # Action diffusion settings; switched off here (enable: false).
  # T and beta are presumably the diffusion step count and the noise-schedule
  # bounds — TODO confirm against the consumer.
  action_diffusion:
    enable: false
    T: 20
    hidden_size: 512
    condition_size: 512
    inner_hidden_size: 2048
    beta: [0.05, 0.20]
    dropout: 0.0
    is_frozen: false

  # State diffusion settings; switched off here (enable: false).
  # Same values as action_diffusion in this file.
  state_diffusion:
    enable: false
    T: 20
    hidden_size: 512
    condition_size: 512
    inner_hidden_size: 2048
    beta: [0.05, 0.20]
    dropout: 0.0
    is_frozen: false

  # Alternative causal block (TRANSFORMER). To use it, rename this key to
  # `causal_block` and rename the existing `causal_block` entry so the key is
  # not duplicated.
  causal_block_bak_1:
    model_type: TRANSFORMER
    num_layers: 6
    hidden_size: 512
    nhead: 8
    inner_hidden_size: 512
    dropout: 0.10
    context_window: -1
    checkpoints_density: -1
    position_encoding_size: 12000
    use_layer_norm: true
    use_blockrecurrence: true
    memory_length: 3000
    memory_type: KV
    is_frozen: false

  # Alternative causal block (GSA). Presumably inactive until renamed to
  # `causal_block`, following the note on causal_block_bak_1 — confirm.
  causal_block_bak_2:
    model_type: GSA  # the author also notes "GLA" here
    num_layers: 12
    hidden_size: 512
    inner_hidden_size: 1024
    dropout: 0.10
    nhead: 4
    memory_length: 64
    position_encoding_size: 12000
    use_layer_norm: true
    gate_bound: 22
    use_blockrecurrence: true
    checkpoints_density: -1
    memory_type: MEM
    is_frozen: false
    is_generate: false

  # Alternative causal block (RWKV6). Presumably inactive until renamed to
  # `causal_block`, following the note on causal_block_bak_1 — confirm.
  causal_block_bak_3:
    model_type: RWKV6
    num_layers: 12
    hidden_size: 512
    inner_hidden_size: 1024
    dropout: 0.10
    nhead: 16
    expand_k: 1
    expand_v: 2
    hidden_ratio: 3.5
    position_encoding_size: 12000
    use_layer_norm: true
    use_blockrecurrence: true
    checkpoints_density: -1
    gate_bound: 12
    memory_length: 0
    memory_type: MEM
    is_frozen: false

  # Causal block carrying the un-suffixed `causal_block` key (RWKV7).
  # NOTE(review): num_layers is 18 but num_hidden_layers is 24 — confirm which
  # one the model reads and whether the mismatch is intended.
  causal_block:
    model_type: RWKV7
    num_layers: 18
    hidden_size: 512
    inner_hidden_size: 1024
    num_hidden_layers: 24
    dropout: 0.10
    nhead: 4
    hidden_ratio: 4
    position_encoding_size: 12000
    use_layer_norm: true
    use_blockrecurrence: true
    checkpoints_density: -1
    memory_length: 0
    memory_type: MEM
    is_frozen: false

  # Alternative causal block (Mamba). Presumably inactive until renamed to
  # `causal_block`, following the note on causal_block_bak_1 — confirm.
  causal_block_bak_5:
    model_type: Mamba
    num_layers: 12
    hidden_size: 512
    inner_hidden_size: 1024
    expand: 2
    d_conv: 4
    d_state: 16
    dropout: 0.10
    position_encoding_size: 12000
    use_layer_norm: true
    use_blockrecurrence: true
    checkpoints_density: -1
    memory_length: 0
    memory_type: MEM
    is_frozen: false

# Training settings. Replace each [Path] placeholder before running.
train_config:
  max_epochs: 20
  batch_size: 12

  # Same seq_len / seg_len values as in test_config.
  seq_len: 16000
  seg_len: 4000

  manual_sync: true

  # Keep the "2.0e-4" spelling: some YAML 1.1 loaders (e.g. PyYAML) only
  # resolve exponent notation as a float when it has a decimal point and a
  # signed exponent, so "2e-4" would be loaded as a string.
  lr: 2.0e-4
  lr_decay_interval: 2000
  lr_start_step: 8000

  data_path: [Path]
  save_model_path: [Path]
  max_save_iterations: 1000

  state_dropout: 0.0
  reward_dropout: 0.0

  # The policy / world-model loss weights below add up to 1.0; the entropy and
  # L2 weights are set to zero.
  lossweight_policymodel: 0.2
  lossweight_worldmodel_states: 0.4
  lossweight_worldmodel_rewards: 0.4
  lossweight_entropy: 0.0
  lossweight_l2: 0.0

  use_amp: false
  use_scaler: false

# Test settings; results are written under ./offline_eval/.
# Replace the [Path] placeholder before running.
test_config:
  batch_size: 16
  data_path: [Path]
  output: "./offline_eval/"
  seq_len: 16000
  seg_len: 4000

# Generator settings; results are written under ./online_eval/.
# Replace the [Path] / [PATH] placeholders before running.
generator_config:
  # If env=switch, set agent_num=2 and use the MultiAgentGenerator class in
  # generate.py.
  agent_num: 1
  # Options: anymdp32x5 / lake4x4 / cliff / mountaincar12x5 / pendulum12x5 /
  # switch (multi-agent)
  env: lake4x4
  map_env_discrete: true
  # Use pre-defined tasks (pickle file generated by gen_xxx_task.py).
  # Pickle files can execute code when loaded; only point this at trusted files.
  task_file: [PATH]
  mult_anymdp_task: false
  action_clip: 4
  skip_frame: 0
  epoch_numbers: 1
  downsample_trail: 30
  decoding_strategy:
    T_ini: 1.0
    T_fin: 0.1
    T_step: 10000
    decay_type: Linear  # Linear / Exponential
  max_trails: 1000
  # Max steps for each trail.
  max_steps: 200
  # If > 0, record all step rewards and stop the loop by
  # (max_total_steps or max_trails).
  max_total_steps: 0
  # For lake4x4, use gen_gym_record.py to dump data.
  learn_from_data: false
  data_root: [Path]
  run_icl: true
  use_dym_tag: false
  run_benchmark:
    run_opt: false
    run_online: false
    run_random: false
  benchmark_model_name: "ppo"  # For gym
  benchmark_model_save_path: [PATH]  # For gym
  output: "./online_eval/"
  save_gif: true
  save_gif_gap: 50
