# Identity of this experiment configuration.
experiment:
  name: bbh_psao
  description: 'Big-Bench-Hard experiment with multiple datasets and base prompts'

# Shared filesystem locations. The `&data_path` anchor is aliased by
# components.task.question_answer.data_path, so the anchor name must stay as is.
# NOTE(review): this is an absolute path under one user's home directory;
# other machines presumably need to edit it before running - confirm.
paths:
  data: &data_path '/Users/gerschld/Desktop/dev/RnD/prompt_optimisation_dataset/data_preprocess'

# Component settings. Every component type names a `default` entry and keeps
# that entry's settings under a key of the same name.
components:
  # Language model settings.
  llm:
    default: OpenAI
    OpenAI:
      version_name: gpt-4o_v2024-11-20_USEAST
      temperature: 0.5
      top_p: 0.95

  # Task settings: dataset selection and the message template.
  task:
    default: question_answer
    question_answer:
      # Alias of the `&data_path` anchor declared under `paths.data`.
      data_path: *data_path
      # To run a subset of BBH, use the naming convention 'bbh_subset_name'
      # (the full subset list is given in the commented block below).
      dataset_name: bbh_causal_judgement
      train_test_flag: test
      # Example ids to run:
      #   []              -> run the full dataset
      #   [1, 2, 3, 4, 5] -> run the first 5 examples
      id_lst: []
      # Full BBH subset list (kept commented out):
      # datasets:
      #   - boolean_expressions
      #   - causal_judgement
      #   - date_understanding
      #   - disambiguation_qa
      #   - dyck_languages
      #   - formal_fallacies
      #   - geometric_shapes
      #   - hyperbaton
      #   - logical_deduction_five_objects
      #   - logical_deduction_seven_objects
      #   - logical_deduction_three_objects
      #   - movie_recommendation
      #   - multistep_arithmetic_two
      #   - navigate
      #   - object_counting
      #   - penguins_in_a_table
      #   - reasoning_about_colored_objects
      #   - ruin_names
      #   - salient_translation_error_detection
      #   - snarks
      #   - sports_understanding
      #   - temporal_sequences
      #   - tracking_shuffled_objects_five_objects
      #   - tracking_shuffled_objects_seven_objects
      #   - tracking_shuffled_objects_three_objects
      #   - web_of_lies
      #   - word_sorting
      # Role/content message list.
      # NOTE(review): `{question}` looks like a placeholder filled per example - confirm.
      prompt_msg_template:
        - role: system
          # Folded scalar with default (clip) chomping: the value ends with one newline.
          content: >
            You are a helpful AI assistant that solves problems step by step.
        - role: user
          content: '{question}'

  # Prompt optimiser settings.
  prompt_optimiser:
    default: psao
    psao:
      psao_intro_prompt: ''
      psao_struct_ann: '(importance ann_var)'
      r_seed: 42
      optimise_user_prompt_flag: false
      optuna_db_name: prompt_opt_psao_db.db
      optuna_study_name: bbh_psao
      optuna_n_trials: 20

# Pipeline steps, in listed order.
# - `component` steps name a component type and take the component name from a
#   `${components.<type>.default}` reference (presumably resolved by the config
#   loader - confirm).
# - `function` steps name a function and list the steps they depend on.
steps:
  - name: init_llm
    type: component
    component_type: llm
    component_name: '${components.llm.default}'

  - name: init_task
    type: component
    component_type: task
    component_name: '${components.task.default}'

  - name: init_prompt_optimiser
    type: component
    component_type: prompt_optimiser
    component_name: '${components.prompt_optimiser.default}'

  - name: optimise_prompt
    type: function
    function: optimise_prompt
    depends_on:
      - init_llm
      - init_task
      - init_prompt_optimiser

  - name: run_generation
    type: function
    function: run_generation
    depends_on:
      - init_llm
      - init_task
      - optimise_prompt

  - name: evaluate_results
    type: function
    function: evaluate_results
    depends_on:
      - run_generation

# Output locations. The path is relative;
# NOTE(review): presumably resolved against the working directory - confirm.
output:
  results_dataframe: 'output/bbh_psao_results.csv'