# Tokenizer settings. model_name is the same `gpt2` value used by
# alg.policy.args.model_name.
tokenizer:
  model_name: gpt2
  # Both padding and truncation are configured for the left side.
  padding_side: left
  truncation_side: left
  # NOTE(review): presumably makes the loader reuse the EOS token as the pad
  # token — confirm against the code that consumes this config.
  pad_token_as_eos_token: true

# Reward function, selected by id. The same id and min_tokens value also
# appear under train_evaluation.metrics.
reward_fn:
  id: increasing_numbers
  args:
    # Same value (20) as env.args.max_episode_length and the policy's
    # generation_kwargs.min_length / max_new_tokens in this file.
    # NOTE(review): exact semantics live in the reward implementation —
    # confirm before changing one of these without the others.
    min_tokens: 20

# Data pool, selected by id.
datapool:
  id: dummy_pool
  args:
    n_samples: 50
    # Quoted so the value is unambiguously a literal string.
    # NOTE(review): this looks like GPT-2's end-of-text token — confirm that
    # every sample is meant to use it as the prompt.
    prompt: '<|endoftext|>'
# Environment settings.
env:
  n_envs: 10
  args:
    max_prompt_length: 5
    # Equal to alg.policy.args.generation_kwargs.max_new_tokens (both 20);
    # the comment on that key states the two must stay aligned.
    max_episode_length: 20
    terminate_on_eos: true
    # NOTE(review): an integer rather than a token string — presumably a
    # token id; confirm with the environment implementation.
    context_start_token: 0

# Algorithm selection (by id) with its hyperparameters, KL settings and
# policy definition.
alg:
  id: nlpo
  args:
    n_steps: 128
    batch_size: 64
    verbose: 1
    # Keep this in plain decimal form: exponent notation without a dot
    # (e.g. 1e-5) is loaded as a string by YAML 1.1 parsers such as PyYAML.
    learning_rate: 0.00001
    n_epochs: 5
    ent_coef: 0.0
    gae_lambda: 0.9
    vf_coef: 0.1
  kl_div:
    coeff: 0.02
    target_kl: 2
  policy:
    id: maskable_causal_lm_actor_critic_policy
    args:
      # Same model name as tokenizer.model_name.
      model_name: gpt2
      apply_model_parallel: true
      top_mask: 0.9
      mask_type: 'learned_top_p'
      target_update_iterations: 100
      # NOTE(review): presumably forwarded to the model's text-generation
      # call — confirm which keys the policy actually accepts.
      generation_kwargs:
        do_sample: true
        top_k: 0
        # min_length and max_new_tokens are both 20 in this file.
        min_length: 20
        max_new_tokens: 20  # this must align with env's max steps

# Training-loop and evaluation settings.
train_evaluation:
  eval_batch_size: 256
  n_iters: 200
  # Evaluation and saving use the same interval (10).
  # NOTE(review): presumably counted in training iterations — confirm.
  eval_every: 10
  save_every: 10
  # Metrics are a list; each entry is selected by id and takes its own args.
  metrics:
    # Same id and min_tokens value as reward_fn.
    - id: increasing_numbers
      args:
        min_tokens: 20