# Tokenizer settings. model_name carries the same value as
# alg.policy.args.model_name further down in this file.
tokenizer:
  model_name: llama2-7b-chat-hf
  # Padding and truncation are both configured for the left side.
  padding_side: left
  truncation_side: left
  # NOTE(review): presumably makes the tokenizer reuse its EOS token as the
  # pad token -- confirm against the tokenizer-loading code.
  pad_token_as_eos_token: true

# Reward function: `id` names it and `args` carries its options
# (presumably resolved through a registry keyed by id -- confirm).
reward_fn:
  id: exact_match
  args:
    # NOTE(review): the meaning of `think` is not visible from this file;
    # check the exact_match reward implementation before changing it.
    think: true

# Data pool and prompt template. The prefix ends with "Question: " and the
# suffix begins with a blank line, so the question text presumably goes
# between them -- confirm against the datapool code.
datapool:
  id: three
  args:
    # Task instructions shown before the question. Kept double-quoted on one
    # line because the \n escapes and the trailing space are significant.
    prompt_prefix: "You will be presented with a question. If you know the answer, please respond directly. If you don't know the answer, use the Bing search engine to find the necessary information and then answer the question based on your observation.\n\nQuestion: "
    # Output-format instructions: "[Answer] ..." for a direct answer,
    # "[Search] ..." to issue a search query.
    prompt_suffix: "\n\nPlease format your output as follows:\n\n1. If you choose to answer the question directly, please use: \"[Answer] YOUR_ANSWER\"\n2. If you choose to use the Bing search engine, please use: \"[Search] YOUR_SEARCH_QUERY\"\n\nPlease output:\n"
    # NOTE(review): presumably a debug-mode switch for the datapool -- confirm.
    ifdebug: false

# Environment settings: number of environments plus per-environment args.
env:
  n_envs: 10
  args:
    max_prompt_length: 512
    max_episode_length: 50
    terminate_on_eos: true
    # Same side as tokenizer.truncation_side in this file.
    prompt_truncation_side: left
    # Negative value. NOTE(review): presumably added to the reward each time
    # the policy issues a search -- confirm against the environment code.
    retrieval_cost: -0.2
    retrieval_engine: bing
    # NOTE(review): presumably the number of search results used per query
    # -- confirm.
    search_num: 4

# Algorithm settings: PPO on a causal language model, with KL settings and
# the policy definition as sibling sections of `args`.
alg:
  id: ppo
  model_type: causal
  args:
    n_steps: 512
    # batch_size * gradient_accumulation_steps = 2 * 64 = 128.
    # NOTE(review): presumably the effective batch per optimizer step --
    # confirm against the trainer.
    batch_size: 2
    gradient_accumulation_steps: 64
    verbose: 1
    learning_rate: 0.000002
    n_epochs: 2
    ent_coef: 0.0
  kl_div:
    coeff: 0.001
    target_kl: 0.2
  policy:
    id: causal_lm_actor_critic_policy
    args:
      # Same value as tokenizer.model_name in this file.
      model_name: llama2-7b-chat-hf
      apply_model_parallel: true
      prompt_truncation_side: left
      # Generation settings used by the policy: sampling restricted by top_k.
      generation_kwargs:
        do_sample: true
        top_k: 50

# Training-loop and evaluation settings.
train_evaluation:
  eval_batch_size: 16
  n_iters: 50
  # eval_every and save_every share the same interval (3).
  # NOTE(review): units are presumably training iterations -- confirm.
  eval_every: 3
  save_every: 3
  metrics:
    - id: exact_match
      args:
        use_single_ref: false
  # Evaluation-time generation sets num_beams only; it does not repeat the
  # do_sample/top_k settings listed under alg.policy.args.generation_kwargs.
  generation_kwargs:
    num_beams: 4

