# Data source for the task: the `json` dataset path, with the actual records
# supplied through `data_files` as a JSONL file. No named sub-configuration is used.
dataset_path: json
dataset_name: null
dataset_kwargs:
  # Relative path — presumably resolved against the directory the evaluation
  # is launched from; TODO confirm.
  data_files: datasets/etr_for_lm_eval.jsonl

# Identifier of this task.
task: etr_problems_open_ended

# Templates applied to each record: the prompt is the record's `question`
# field, and the target is its `scoring_guide` field.
doc_to_text: "{{question}}"
doc_to_target: "{{scoring_guide}}"

# The model produces free-form text (see generation_kwargs for stop/length settings).
output_type: generate_until

# "aggregation: bypass" means the metric is not actually aggregated; its reported aggregate is the placeholder value 999.
# "higher_is_better" is filled in for every metric to suppress warnings.
# Metrics reported for this task. Each entry names a metric, how per-example
# values are combined, and the direction considered "better".
# NOTE(review): the metric names presumably must match the keys returned by the
# `process_results` function configured in this file — verify against the scorer.
metric_list:
  # Metrics averaged over examples, marked higher-is-better.
  - metric: correct
    aggregation: mean
    higher_is_better: true
  - metric: is_etr_predicted
    aggregation: mean
    higher_is_better: true
  - metric: is_etr_predicted_exact
    aggregation: mean
    higher_is_better: true
  - metric: is_logically_equivalent
    aggregation: mean
    higher_is_better: true
  # Metrics averaged over examples, marked lower-is-better.
  - metric: len_response
    aggregation: mean
    higher_is_better: false
  - metric: parse_error
    aggregation: mean
    higher_is_better: false
  # Pass-through fields (aggregation: bypass) — presumably raw text kept for
  # inspection rather than numeric scores; confirm in the scorer.
  - metric: model_answer
    aggregation: bypass
    higher_is_better: false
  - metric: full_model_response
    aggregation: bypass
    higher_is_better: false
  - metric: recalculated_is_same
    aggregation: mean
    higher_is_better: false
  # The four combinations of correct / not-correct with etr / not-etr.
  # NOTE(review): all four are marked higher_is_better: false; presumably the
  # direction is not meaningful for these breakdown rates — confirm.
  - metric: correct_and_etr
    aggregation: mean
    higher_is_better: false
  - metric: correct_and_not_etr
    aggregation: mean
    higher_is_better: false
  - metric: not_correct_and_etr
    aggregation: mean
    higher_is_better: false
  - metric: not_correct_and_not_etr
    aggregation: mean
    higher_is_better: false


# Both the evaluated examples and the few-shot examples are drawn from the
# `train` split.
# NOTE(review): with the same split used for both, confirm the harness excludes
# the example under evaluation from its own few-shot context.
test_split: train
fewshot_split: train

# Decoding settings passed to the model for each prompt.
generation_kwargs:
  # Stop sequence: generation ends once the literal string "Done." is produced.
  until:
    - "Done."
  # NOTE(review): `do_sample: false` requests non-sampling (greedy) decoding,
  # yet `temperature` is non-zero. Which of the two takes effect may depend on
  # the model backend — confirm the intended decoding mode and make them agree.
  do_sample: false
  temperature: 0.2
  # Cap on the number of generated tokens per response.
  max_gen_toks: 2000

# Per-example scoring hook: the custom `!function` tag points at `score_answer`
# in `open_ended_scoring` (presumably a Python module next to this file — confirm).
# This tag is not standard YAML, so a plain safe loader will reject it.
process_results: !function open_ended_scoring.score_answer

# Task metadata. `version` is an unquoted scalar, so it parses as the float 1.0.
metadata:
  version: 1.0
