# Dataset source. `json` selects a generic JSON/JSONL loader rather than a
# named dataset, which is why dataset_name is explicitly null.
# NOTE(review): presumably forwarded to Hugging Face `datasets.load_dataset`
# together with dataset_kwargs — confirm against the harness version in use.
dataset_path: json
dataset_name: null
dataset_kwargs:
  # NOTE(review): relative path; presumably resolved against the directory the
  # evaluation is launched from — confirm.
  data_files: datasets/etr_for_lm_eval.jsonl

# Name under which this task is selected.
task: etr_problems

# Jinja templates: the prompt is the record's `question` field and the
# reference is its `scoring_guide` field, both inserted verbatim.
doc_to_text: '{{question}}'
doc_to_target: '{{scoring_guide}}'

# Free-form generation; stop conditions and limits live in generation_kwargs.
output_type: generate_until

# Metrics reported for this task. Every entry is averaged over documents
# (aggregation: mean).
# NOTE(review): the metric names presumably have to match the keys returned by
# the process_results function (scoring.score_answer) — confirm.
metric_list:
  - metric: correct
    aggregation: mean
    higher_is_better: true
  # Shorter responses are treated as better.
  - metric: len_response
    aggregation: mean
    higher_is_better: false
  - metric: etr_agreement
    aggregation: mean
    higher_is_better: true
  # Lower is better: a higher mean means more parse errors were recorded.
  - metric: parse_error
    aggregation: mean
    higher_is_better: false
  - metric: etr_is_same_as_correct
    aggregation: mean
    higher_is_better: true
  # 2x2 breakdown crossing `correct` with `etr`, one metric per cell, named
  # <[not_]correct>_and_<[not_]etr>. higher_is_better follows correctness only:
  # both `correct_*` cells are true, both `not_correct_*` cells are false.
  - metric: correct_and_etr
    aggregation: mean
    higher_is_better: true
  - metric: correct_and_not_etr
    aggregation: mean
    higher_is_better: true
  - metric: not_correct_and_etr
    aggregation: mean
    higher_is_better: false
  - metric: not_correct_and_not_etr
    aggregation: mean
    higher_is_better: false

# Evaluation documents and few-shot examples are drawn from the same split.
# NOTE(review): `train` is presumably the default split name the loader gives
# a single data file — confirm. With few-shot enabled, verify the harness
# excludes the document under evaluation from its own few-shot context.
test_split: train
fewshot_split: train

# Decoding settings for the generate_until requests.
generation_kwargs:
  # Generation stops as soon as the model emits this literal string.
  until:
    - "Done."
  # Greedy decoding. do_sample: false and a non-zero temperature contradict
  # each other: backends that honour do_sample decode greedily and ignore (or
  # warn about) the temperature, while backends that only read temperature
  # would sample. temperature: 0.0 keeps decoding deterministic either way.
  do_sample: false
  temperature: 0.0
  # Upper bound on the number of generated tokens per response.
  max_gen_toks: 500

# Per-document scoring hook. `!function` is a custom YAML tag, not standard
# YAML, so this file only loads with a loader that defines it.
# NOTE(review): presumably resolves to `score_answer` in a `scoring` module
# located next to this file — confirm.
process_results: !function scoring.score_answer

# Task version; parsed as the float 1.0. Bump it when a change to this config
# or to the scoring code makes results incomparable with earlier runs.
metadata:
  version: 1.0
