# Algorithm selection and hyperparameters. `class` names the algorithm; the
# remaining keys are presumably passed to it as keyword arguments -- verify
# against the config loader.
algorithm:
  class: PTIQL
  # NOTE(review): beta / expectile / max_exp_clip look like IQL-style
  # settings -- confirm their exact meaning in the PTIQL implementation.
  beta: 0.3333
  expectile: 0.75
  max_exp_clip: 100.0
  # 0.0 here; presumably this disables the reward regularization term --
  # TODO confirm.
  reward_reg: 0.0
  use_weighted_sum: true
  max_seq_len: 100
  # The same flag also appears as trainer.rm_label (both true). Presumably
  # the two should agree -- confirm which one is actually read.
  rm_label: true

# Run-level settings.
# No checkpoint is specified (null).
checkpoint: null
seed: 0
name: default
debug: false
# null here; presumably the framework picks a device itself -- confirm.
device: null
# wandb logging options: `activate` is false, entity and project are unset.
wandb:
  activate: false
  entity: null
  project: null

# Environment id. The same id is repeated in rm_dataset, rl_dataset and
# rm_eval.eval_dataset_kwargs; presumably all of them must stay in sync --
# update them together.
env: pen-cloned-v1
# No extra environment kwargs or wrapper are configured. Written as explicit
# `null` (the same parsed value as a bare `key:`) to match checkpoint, device
# and processor elsewhere in this file.
env_kwargs: null
env_wrapper: null
env_wrapper_kwargs: null

# Optimizer settings: two entries, each with its own class and learning rate.
# NOTE(review): `default` presumably applies to every network without its own
# entry here (actor / critic / value) -- confirm.
optim:
  default:
    class: Adam
    lr: 0.0003
  # The reward entry uses AdamW instead of Adam, at the same learning rate.
  reward:
    class: AdamW
    lr: 0.0003

# Network architectures, one entry per component: reward, actor, critic, value.
network:
  # Unlike the other three entries, `reward` has no `class` key.
  # NOTE(review): num_layers / num_heads / embed_dim suggest an
  # attention-based model -- confirm which class is built for this entry.
  reward:
    num_layers: 3
    embed_dim: 256
    pref_embed_dim: 256
    num_heads: 1
    reward_act: identity
  actor:
    class: SquashedGaussianActor
    hidden_dims: [256, 256, 256]
    reparameterize: false
    conditioned_logstd: false
    # Bounds on the log standard deviation, as the key names indicate; how
    # they are enforced is defined by the actor class.
    logstd_min: -5
    logstd_max: 2
  # critic and value both use class Critic with identical hidden_dims; they
  # differ only in ensemble_size (2 vs 1).
  critic:
    class: Critic
    ensemble_size: 2
    hidden_dims: [256, 256, 256]
  value:
    class: Critic
    ensemble_size: 1
    hidden_dims: [256, 256, 256]

# Dataset list (one entry) and dataloader options for the "rm" phase.
# NOTE(review): "rm" is read as "reward model" throughout -- confirm.
rm_dataset:
  - class: IPLComparisonOfflineDataset
    # Repeats the top-level `env` id.
    env: pen-cloned-v1
    batch_size: 64
    mode: human
# batch_size is null here while the dataset entry above sets batch_size: 64;
# presumably the dataset yields ready-made batches -- confirm.
rm_dataloader:
  num_workers: 2
  batch_size: null

# Dataset list (one entry) and dataloader options for the "rl" phase.
rl_dataset:
  - class: D4RLOfflineDataset
    # Repeats the top-level `env` id.
    env: pen-cloned-v1
    batch_size: 256
    mode: trajectory
    # NOTE(review): presumably the length of each sampled trajectory segment
    # in `trajectory` mode -- confirm units against the dataset class.
    segment_length: 20
# batch_size is null here while the dataset entry above sets batch_size: 256;
# presumably the dataset yields ready-made batches -- confirm.
rl_dataloader:
  num_workers: 2
  batch_size: null

# Training schedule. rm_steps equals schedulers.reward.max_steps (100000) and
# rl_steps equals schedulers.actor.T_max (500000); presumably these pairs
# should be changed together -- confirm.
trainer:
  env_freq: null
  # Also set as algorithm.rm_label (both true).
  rm_label: true
  rm_steps: 100000
  rl_steps: 500000
  # log_freq and profile_freq share one value; eval_freq is ten times larger.
  # NOTE(review): units are presumably training steps -- confirm.
  log_freq: 500
  profile_freq: 500
  eval_freq: 5000

# Evaluation settings for the two phases. `function` names the evaluation
# routine; the remaining keys are presumably its arguments -- verify.
rm_eval:
  function: eval_reward_model
  # Same dataset class, env and mode as the rm_dataset entry, with a smaller
  # batch_size (32 vs 64).
  eval_dataset_kwargs:
    class: IPLComparisonOfflineDataset
    env: pen-cloned-v1
    batch_size: 32
    mode: human
    # NOTE(review): eval is false even though this dataset is used for
    # evaluation -- confirm this is intended.
    eval: false
rl_eval:
  function: eval_offline
  # NOTE(review): presumably the number of evaluation episodes -- confirm.
  num_ep: 10
  deterministic: true

# Learning-rate schedulers, keyed by component.
schedulers:
  # T_max equals trainer.rl_steps (500000).
  actor:
    class: CosineAnnealingLR
    T_max: 500000
  # No `class` key on this entry, unlike `actor`.
  # NOTE(review): presumably the reward scheduler type is chosen in code --
  # confirm. max_steps equals trainer.rm_steps (100000).
  reward:
    warmup_steps: 10000
    max_steps: 100000


# No processor is configured (null).
processor: null
