---
# Run configuration: input dataset, output location, model selection and
# prompt options. The exact meaning of each key is defined by the consuming
# script, which is not visible from this file.

# Left unset (null). NOTE(review): presumably selects the GPU device(s) to
# use -- confirm what the consumer does when this is null.
gpu: null
verbose: true
# Input dataset; the .jsonl extension suggests JSON Lines.
data_path: data/datasets/coqa/test.jsonl
# NOTE(review): presumably results are written to <output_dir>/<save_file>
# -- confirm against the consumer.
output_dir: output/cached_results/coqa
save_file: results.json
model_name: llama3-8b
template_id: 1
# Left unset (null). NOTE(review): presumably selects a system prompt,
# analogous to template_id -- confirm.
system_id: null
# NOTE(review): separate from generate_kwargs.batch_size; presumably the
# batch size used in the evaluation/scoring step -- confirm.
evaluation_batch_size: 128
# Tokenizer options. NOTE(review): the key names match Hugging Face
# tokenizer arguments; presumably forwarded to the tokenizer as keyword
# arguments -- confirm against the consumer.
tokenize_kwargs:
  # Pad each batch to its longest sequence and allow truncation.
  padding: longest
  truncation: true
  # Both padding and truncation act on the left side of the sequence.
  # NOTE(review): left truncation drops the start of over-long prompts and
  # keeps the end -- confirm this is intended for this dataset.
  padding_side: left
  truncation_side: left
# Text-generation options. NOTE(review): several keys match Hugging Face
# `generate` arguments, others (batch_size, num_scores_returned,
# num_responses_per_prompt, return_normalized_transition_scores) look
# project-specific -- confirm how the consumer splits them.
generate_kwargs:
  # Per-device batch size.
  batch_size: 5
  # NOTE(review): do_sample is false below; if these are forwarded to
  # Hugging Face `generate`, temperature / top_p / top_k have no effect
  # under greedy decoding -- confirm whether sampling was intended.
  temperature: 1.0
  top_p: 0.9
  top_k: 50
  do_sample: false
  max_new_tokens: 150
  # Return the distribution of each token.
  output_scores: true
  # NOTE(review): presumably how many of the per-token scores are kept
  # when output_scores is enabled -- confirm.
  num_scores_returned: 100
  num_responses_per_prompt: 1
  # Return the log prob of each generated token.
  return_normalized_transition_scores: true