# Group definition: collects the tasks listed under `task` beneath the single
# group name `bbh`.
# NOTE(review): the key layout looks like an lm-evaluation-harness group
# config -- confirm against the consuming tool.
group: bbh
# Member tasks (27 entries), all sharing the `bbh_cot_fewshot_` prefix.
# None of these names are defined in this file; presumably each one is
# resolved against a task config declared elsewhere -- verify that every
# entry has a matching definition.
task:
  - bbh_cot_fewshot_boolean_expressions
  - bbh_cot_fewshot_causal_judgement
  - bbh_cot_fewshot_date_understanding
  - bbh_cot_fewshot_disambiguation_qa
  - bbh_cot_fewshot_dyck_languages
  - bbh_cot_fewshot_formal_fallacies
  - bbh_cot_fewshot_geometric_shapes
  - bbh_cot_fewshot_hyperbaton
  - bbh_cot_fewshot_logical_deduction_five_objects
  - bbh_cot_fewshot_logical_deduction_seven_objects
  - bbh_cot_fewshot_logical_deduction_three_objects
  - bbh_cot_fewshot_movie_recommendation
  - bbh_cot_fewshot_multistep_arithmetic_two
  - bbh_cot_fewshot_navigate
  - bbh_cot_fewshot_object_counting
  - bbh_cot_fewshot_penguins_in_a_table
  - bbh_cot_fewshot_reasoning_about_colored_objects
  - bbh_cot_fewshot_ruin_names
  - bbh_cot_fewshot_salient_translation_error_detection
  - bbh_cot_fewshot_snarks
  - bbh_cot_fewshot_sports_understanding
  - bbh_cot_fewshot_temporal_sequences
  - bbh_cot_fewshot_tracking_shuffled_objects_five_objects
  - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects
  - bbh_cot_fewshot_tracking_shuffled_objects_three_objects
  - bbh_cot_fewshot_web_of_lies
  - bbh_cot_fewshot_word_sorting
# Group-level metric aggregation: a one-entry list describing how a metric is
# combined for the group as a whole.
# NOTE(review): the per-key notes below are inferred from the key names, not
# from anything in this file -- confirm against the consumer's documentation.
aggregate_metric_list:
  # Metric to aggregate, using `mean` as the combining function.
  - metric: exact_match
    aggregation: mean
    # Presumably weights each member task by its number of examples instead
    # of averaging all tasks equally -- TODO confirm.
    weight_by_size: true
    # Presumably selects the `get-answer` filter's results for aggregation;
    # that filter is not defined in this file -- TODO confirm it exists in
    # every member task.
    filter_list: get-answer
# Free-form metadata attached to this config.
metadata:
  # Unquoted, so YAML parses this as the float 3.0, not the string "3.0".
  # NOTE(review): left as-is because the consumer's expected type is not
  # visible here; quote it only if a string version is required.
  version: 3.0
