# Tokenizer configuration.
tokenizer:
  model_name: t5-base
  # Padding and truncation are both applied on the left.
  # NOTE(review): env.args and alg.policy.args both set
  # prompt_truncation_side to "right" while truncation_side here is "left";
  # confirm the mismatch is intended.
  padding_side: left
  truncation_side: left
  pad_token_as_eos_token: false

# Reward function selection; only `id` is active.
reward_fn:
  id: ter
  # NOTE(review): the commented-out list below presumably records alternative
  # reward ids (with their args) that could replace `ter`; confirm before
  # relying on it.
  # values:
  #   - id: sacre_bleu
  #     args:
  #       tokenize: "intl"
  #   - id: ter
  #   - id: chrf
  #   - id: bert_score
  #     args:
  #       language: "de"
  
# Dataset selection.
datapool:
  id: iwslt2017en_de
  args:
    # The value ends in ": " (colon + trailing space), so it must stay quoted
    # for both characters to survive parsing.
    prompt_prefix: 'translate English to German: '

# Environment settings.
env:
  n_envs: 10
  args:
    # Prompt length and episode length share the same cap of 128.
    max_prompt_length: 128
    max_episode_length: 128
    terminate_on_eos: true
    prompt_truncation_side: right
    # NOTE(review): presumably the token id used to seed the decoder context
    # (id 0); TODO confirm against the environment implementation.
    context_start_token: 0

# Training algorithm, KL settings and policy.
alg:
  id: nlpo
  args:
    batch_size: 64
    ent_coef: 0.0
    # Keep the plain decimal form: exponent notation without a dot
    # (e.g. 5e-7) is loaded as a string by YAML 1.1 parsers such as PyYAML.
    learning_rate: 0.0000005
    n_epochs: 5
    n_steps: 512
    verbose: 1
  kl_div:
    coeff: 0.001
    target_kl: 0.2
  policy:
    id: maskable_seq2seq_lm_actor_critic_policy
    args:
      model_name: rajkumarrrk/t5-fine-tuned-on-iwslt2017en_de
      apply_model_parallel: true
      prompt_truncation_side: right
      # Masking settings.
      # NOTE(review): exact semantics of top_mask / min_tokens_to_keep /
      # target_update_iterations are defined by the policy class; verify there.
      mask_type: learned_top_p
      top_mask: 0.5
      min_tokens_to_keep: 100
      target_update_iterations: 20
      # Sampling-based generation; max_new_tokens equals
      # env.args.max_episode_length (128).
      generation_kwargs:
        do_sample: true
        top_k: 10
        max_new_tokens: 128

# Training-loop schedule and evaluation settings.
train_evaluation:
  eval_batch_size: 50
  n_iters: 50
  eval_every: 10
  save_every: 1
  # Each metric id is listed exactly once, and every entry carries an explicit
  # `args` mapping (empty when the metric takes no arguments).
  metrics:
    - id: meteor
      args: {}
    - id: rouge
      args: {}
    - id: bleu
      args: {}
    - id: bert_score
      args:
        language: de
    - id: sacre_bleu
      args:
        tokenize: "intl"
    - id: ter
      args: {}
    - id: chrf
      args: {}
    - id: diversity
      args: {}
  # Evaluation decodes with 4 beams, unlike the sampling settings under
  # alg.policy.args.generation_kwargs.
  # NOTE(review): key names match Hugging Face `generate()` arguments and are
  # presumably forwarded to it; confirm.
  generation_kwargs:
    num_beams: 4
    length_penalty: 0.6
    max_new_tokens: 128
