# Seed value. NOTE(review): presumably used to initialise random number
# generators for reproducibility — confirm against the code that reads it.
seed: 0

# Environment description. Every field below is `???`, which looks like the
# OmegaConf/Hydra "mandatory value" marker: no default is given here, so the
# value has to come from an override or a composed config.
# NOTE(review): confirm the loader in use actually treats `???` that way.
env:
  name: ???
  state_dim: ???
  action_dim: ???
  max_action: ???
  max_episode_steps: ???

# Dataset settings.
data:
  # No default; `???` placeholder must be filled in externally.
  path: ???
  # Relative directory path (relative to what is decided by the consumer).
  save_dir: './cache'
  # NOTE(review): presumably the number of pairs to build and the length of
  # each segment — confirm units/meaning against the data-loading code.
  num_pairs: 200000
  seg_len: 10

# Policy settings.
policy:
  # Two integer sizes, kept as a short leaf-level list.
  hidden_dim: [400, 300]
  std: 0.01
  # No default; `???` placeholder must be filled in externally.
  path: ???

# Model settings. `input_dim` and `context_dim` have no default (`???`) and
# must be supplied externally; `hidden_dim` is fixed here.
model:
  input_dim: ???
  hidden_dim: 1024
  context_dim: ???
  # Relative directory path. NOTE(review): presumably where trained weights
  # are written — confirm against the training script.
  save_dir: './weights/flow'

# Reward-model settings: epoch count plus two size parameters.
# NOTE(review): `hidden_dim`/`num_layers` presumably describe an MLP — confirm
# against the model definition.
reward_model:
  num_epochs: 100
  hidden_dim: 16
  num_layers: 2


# rlhf:
#   beta: 1
#   num_epochs: 1
#   learning_rate: 1e-3
#   batch_size: 1000

# Settings under the `dpo` key.
# NOTE(review): presumably Direct Preference Optimization — confirm.
dpo:
  beta: 10
  num_epochs: 100
  # Written with an explicit mantissa dot: YAML 1.1 resolvers such as PyYAML
  # load a bare `1e-3` as the string "1e-3" instead of a float.
  learning_rate: 1.0e-3
  batch_size: 1000


# Top-level hyperparameters (not nested under any section). Which stage
# consumes them is not visible from this file.
batch_size: 1000
num_epochs: 10
# Written with an explicit mantissa dot: YAML 1.1 resolvers such as PyYAML
# load a bare `1e-3` as the string "1e-3" instead of a float.
learning_rate: 1.0e-3
loss_scale: 1
action_idx: 0
rollout_frequency: 1
flow_iteration: 1