# Tokenizer settings.
tokenizer:
  # Same checkpoint identifier as alg.policy.args.model_name in this file.
  model_name: flan-t5-large-warm-up
  # Padding and truncation are both applied on the left-hand side.
  padding_side: left
  truncation_side: left
  pad_token_as_eos_token: false

# Reward function: selected by `id`, configured through `args`.
reward_fn:
  # Same identifier as the evaluation metric listed under train_evaluation.
  id: exact_match
  args:
    # NOTE(review): the effect of `think` is defined by the reward
    # implementation, which is not visible from this file - confirm.
    think: true

# Data pool: selected by `id`, configured through `args`.
datapool:
  id: three
  args:
    # Instruction text ending in "Question: "; presumably placed directly
    # before each question - confirm against the datapool implementation.
    prompt_prefix: "You will be presented with a question. If you know the answer, please respond directly. If you don't know the answer, use the Bing search engine to find the necessary information and then answer the question based on your observation.\n\nQuestion: "
    # Output-format instructions introducing the "[Answer] ..." and
    # "[Search] ..." response forms; presumably appended after the question.
    prompt_suffix: "\n\nPlease format your output as follows:\n\n1. If you choose to answer the question directly, please use: \"[Answer] YOUR_ANSWER\"\n2. If you choose to use the Bing search engine, please use: \"[Search] YOUR_SEARCH_QUERY\"\n\nPlease output:\n"
    ifdebug: false

# Environment settings.
env:
  n_envs: 10
  args:
    max_prompt_length: 512
    max_episode_length: 50
    terminate_on_eos: true
    # Matches alg.policy.args.prompt_truncation_side in this file; note that
    # tokenizer.truncation_side is set to `left`.
    prompt_truncation_side: right
    context_start_token: 0
    # Negative value. NOTE(review): presumably a penalty charged whenever the
    # model issues a search - confirm against the environment code.
    retrieval_cost: -0.2
    retrieval_engine: bing
    # NOTE(review): presumably the number of search results used per query.
    search_num: 4

# Training algorithm, KL settings and policy.
alg:
  id: ppo
  model_type: seq2seq
  args:
    n_steps: 512
    # NOTE(review): 4 x 8 = 32 samples per optimizer step if gradient
    # accumulation behaves the usual way - confirm.
    batch_size: 4
    gradient_accumulation_steps: 8
    verbose: 1
    # i.e. 2e-6
    learning_rate: 0.000002
    # Currently 2; the value 4 was also noted here previously.
    n_epochs: 2
    ent_coef: 0.0
  # `kl_div` and `policy` sit beside `args`, not inside it.
  kl_div:
    coeff: 0.001
    target_kl: 0.2
  policy:
    id: seq2seq_lm_actor_critic_policy
    args:
      # Same checkpoint identifier as tokenizer.model_name in this file.
      model_name: flan-t5-large-warm-up
      apply_model_parallel: true
      prompt_truncation_side: right
      # Generation settings for the policy: sampling with top-k of 50.
      generation_kwargs:
        do_sample: true
        top_k: 50

# Training-loop schedule and evaluation settings.
train_evaluation:
  eval_batch_size: 16
  n_iters: 100
  # Evaluation and checkpointing share the same interval of 5.
  eval_every: 5
  save_every: 5
  metrics:
    # Same identifier as reward_fn.id in this file.
    - id: exact_match
      args:
        use_single_ref: false

  # Generation settings listed for evaluation: 4 beams, whereas the policy's
  # own generation_kwargs request sampling.
  generation_kwargs:
    num_beams: 4


