# Task definition for the "image" split of HallusionBench.
# NOTE(review): dataset_path looks like a Hugging Face Hub dataset id, and
# dataset_kwargs is presumably forwarded to the dataset loader — confirm
# against the harness.
dataset_path: lmms-lab/HallusionBench
dataset_kwargs:
  # Canonical lowercase boolean; parses to the same value as `True`.
  # NOTE(review): presumably enables authenticated dataset access — confirm.
  token: true
task: "hallusion_bench_image"
test_split: image
output_type: generate_until
# Each hook below names a callable in the evaluate_hb module via the custom
# function tag. NOTE(review): the module is presumably resolved relative to
# this file's directory — confirm against the task loader.
doc_to_visual: !function evaluate_hb.hb_doc_to_visual
doc_to_text: !function evaluate_hb.hb_doc_to_text
# Name of the field used as the reference answer.
doc_to_target: "gt_answer_details"
process_results: !function evaluate_hb.hb_process_results
# Prompt decoration: the default profile adds nothing before or after the
# question text (both strings are empty).
# NOTE(review): presumably consumed by the doc_to_text hook — confirm.
lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: ""
# Decoding settings: sampling is disabled with a single beam and temperature 0,
# and output is capped at 128 new tokens.
generation_kwargs:
  max_new_tokens: 128
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
# Reported metrics. Each entry pairs a metric name with its aggregation
# callable from evaluate_hb; for all three, a higher score is better.
# NOTE(review): aAcc / qAcc / fAcc presumably follow the HallusionBench
# accuracy definitions — confirm against evaluate_hb.
metric_list:
  - metric: aAcc
    aggregation: !function evaluate_hb.hb_aggregation_result_aAcc
    higher_is_better: true
  - metric: qAcc
    aggregation: !function evaluate_hb.hb_aggregation_result_qAcc
    higher_is_better: true
  - metric: fAcc
    aggregation: !function evaluate_hb.hb_aggregation_result_fAcc
    higher_is_better: true
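  # Alternative aggregation callables (the "_intern" variants), kept disabled.
  # They reuse the metric names aAcc / qAcc / fAcc, so enabling them alongside
  # the active entries would list each metric name twice.
  # - metric: aAcc
  #   aggregation: !function evaluate_hb.hb_aggregation_result_aAcc_intern
  #   higher_is_better: true
  # - metric: qAcc
  #   aggregation: !function evaluate_hb.hb_aggregation_result_qAcc_intern
  #   higher_is_better: true
  # - metric: fAcc
  #   aggregation: !function evaluate_hb.hb_aggregation_result_fAcc_intern
  #   higher_is_better: true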
# Task metadata. The value is a one-item list holding a mapping, and the
# version is a YAML float (0.0), not a string.
# NOTE(review): confirm the harness expects this list shape and numeric type.
metadata:
  - version: 0.0
