# ------------------------- Qwen3-4B ------------------------------
# Baseline entry for Qwen/Qwen3-4B on cais/mmlu ("all" subset, "test" split):
# no custom_prompt_file / custom_prompt_id is set here.
# NOTE(review): presumably the consumer falls back to its own default prompt
# when those keys are absent - confirm against the runner.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-4B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-4B-mmlu'

# Qwen/Qwen3-4B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "zero_shot" in first_order.json.
- hyperparam_set:
    model_name_or_path: "Qwen/Qwen3-4B"
    dataset_name: "cais/mmlu"
    split: "test"
    subset_name: "all"
    input_columns: ["question", "choices"]
    answer_key: "answer"
    reasoning_type: "multiple_choice"
    eval_batch_size: 64
    custom_prompt_file: "first_order.json"
    custom_prompt_id: "zero_shot"
    output_dir: "/app/evaluation_output"
    wandb_project: "prompt-finetuning"
    # The "-first-order-zero" suffix follows the sibling entries' naming
    # (e.g. "Qwen3-4B-mmlu-first-order-one") and keeps this run distinct from
    # the baseline entry, which is named "Qwen3-4B-mmlu".
    wandb_run_name: "Qwen3-4B-mmlu-first-order-zero"

# Qwen/Qwen3-4B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "one_shot" in first_order.json.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-4B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    custom_prompt_file: 'first_order.json'
    custom_prompt_id: 'one_shot'
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-4B-mmlu-first-order-one'

# Qwen/Qwen3-4B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "few_shot" in first_order.json.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-4B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    custom_prompt_file: 'first_order.json'
    custom_prompt_id: 'few_shot'
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-4B-mmlu-first-order-few'

# Qwen/Qwen3-4B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "zero_shot" in propositional.json.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-4B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    custom_prompt_file: 'propositional.json'
    custom_prompt_id: 'zero_shot'
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-4B-mmlu-propositional-zero'

# Qwen/Qwen3-4B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "one_shot" in propositional.json.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-4B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    custom_prompt_file: 'propositional.json'
    custom_prompt_id: 'one_shot'
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-4B-mmlu-propositional-one'

# Qwen/Qwen3-4B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "few_shot" in propositional.json.
- hyperparam_set:
    model_name_or_path: "Qwen/Qwen3-4B"
    dataset_name: "cais/mmlu"
    split: "test"
    subset_name: "all"
    input_columns: ["question", "choices"]
    answer_key: "answer"
    reasoning_type: "multiple_choice"
    eval_batch_size: 64
    custom_prompt_file: "propositional.json"
    custom_prompt_id: "few_shot"
    output_dir: "/app/evaluation_output"
    wandb_project: "prompt-finetuning"
    wandb_run_name: "Qwen3-4B-mmlu-propositional-few"

# Qwen/Qwen3-4B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "zero_shot" in knowledge_graph.json.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-4B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    custom_prompt_file: 'knowledge_graph.json'
    custom_prompt_id: 'zero_shot'
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-4B-mmlu-knowledge-graph-zero'

# Qwen/Qwen3-4B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "one_shot" in knowledge_graph.json.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-4B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    custom_prompt_file: 'knowledge_graph.json'
    custom_prompt_id: 'one_shot'
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-4B-mmlu-knowledge-graph-one'

# Qwen/Qwen3-4B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "few_shot" in knowledge_graph.json.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-4B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    custom_prompt_file: 'knowledge_graph.json'
    custom_prompt_id: 'few_shot'
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-4B-mmlu-knowledge-graph-few'

# Qwen/Qwen3-4B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "one_shot" in alist.json.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-4B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    custom_prompt_file: 'alist.json'
    custom_prompt_id: 'one_shot'
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-4B-mmlu-alist-one'

# Qwen/Qwen3-4B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "few_shot" in alist.json.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-4B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    custom_prompt_file: 'alist.json'
    custom_prompt_id: 'few_shot'
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-4B-mmlu-alist-few'

# ------------------------- Qwen3-8B ------------------------------
# Baseline entry for Qwen/Qwen3-8B on cais/mmlu ("all" subset, "test" split):
# no custom_prompt_file / custom_prompt_id is set here.
# NOTE(review): presumably the consumer falls back to its own default prompt
# when those keys are absent - confirm against the runner.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-8B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-8B-mmlu'

# Qwen/Qwen3-8B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "zero_shot" in first_order.json.
- hyperparam_set:
    model_name_or_path: "Qwen/Qwen3-8B"
    dataset_name: "cais/mmlu"
    split: "test"
    subset_name: "all"
    input_columns: ["question", "choices"]
    answer_key: "answer"
    reasoning_type: "multiple_choice"
    eval_batch_size: 64
    custom_prompt_file: "first_order.json"
    custom_prompt_id: "zero_shot"
    output_dir: "/app/evaluation_output"
    wandb_project: "prompt-finetuning"
    # The "-first-order-zero" suffix follows the sibling entries' naming
    # (e.g. "Qwen3-8B-mmlu-first-order-one") and keeps this run distinct from
    # the baseline entry, which is named "Qwen3-8B-mmlu".
    wandb_run_name: "Qwen3-8B-mmlu-first-order-zero"

# Qwen/Qwen3-8B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "one_shot" in first_order.json.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-8B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    custom_prompt_file: 'first_order.json'
    custom_prompt_id: 'one_shot'
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-8B-mmlu-first-order-one'

# Qwen/Qwen3-8B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "few_shot" in first_order.json.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-8B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    custom_prompt_file: 'first_order.json'
    custom_prompt_id: 'few_shot'
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-8B-mmlu-first-order-few'

# Qwen/Qwen3-8B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "zero_shot" in propositional.json.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-8B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    custom_prompt_file: 'propositional.json'
    custom_prompt_id: 'zero_shot'
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-8B-mmlu-propositional-zero'

# Qwen/Qwen3-8B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "one_shot" in propositional.json.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-8B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    custom_prompt_file: 'propositional.json'
    custom_prompt_id: 'one_shot'
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-8B-mmlu-propositional-one'

# Qwen/Qwen3-8B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "few_shot" in propositional.json.
- hyperparam_set:
    model_name_or_path: "Qwen/Qwen3-8B"
    dataset_name: "cais/mmlu"
    split: "test"
    subset_name: "all"
    input_columns: ["question", "choices"]
    answer_key: "answer"
    reasoning_type: "multiple_choice"
    eval_batch_size: 64
    custom_prompt_file: "propositional.json"
    custom_prompt_id: "few_shot"
    output_dir: "/app/evaluation_output"
    wandb_project: "prompt-finetuning"
    wandb_run_name: "Qwen3-8B-mmlu-propositional-few"

# Qwen/Qwen3-8B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "zero_shot" in knowledge_graph.json.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-8B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    custom_prompt_file: 'knowledge_graph.json'
    custom_prompt_id: 'zero_shot'
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-8B-mmlu-knowledge-graph-zero'

# Qwen/Qwen3-8B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "one_shot" in knowledge_graph.json.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-8B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    custom_prompt_file: 'knowledge_graph.json'
    custom_prompt_id: 'one_shot'
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-8B-mmlu-knowledge-graph-one'

# Qwen/Qwen3-8B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "few_shot" in knowledge_graph.json.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-8B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    custom_prompt_file: 'knowledge_graph.json'
    custom_prompt_id: 'few_shot'
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-8B-mmlu-knowledge-graph-few'

# Qwen/Qwen3-8B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "one_shot" in alist.json.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-8B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    custom_prompt_file: 'alist.json'
    custom_prompt_id: 'one_shot'
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-8B-mmlu-alist-one'

# Qwen/Qwen3-8B on cais/mmlu ("all" subset, "test" split) with custom prompt
# id "few_shot" in alist.json.
- hyperparam_set:
    model_name_or_path: 'Qwen/Qwen3-8B'
    dataset_name: 'cais/mmlu'
    split: 'test'
    subset_name: 'all'
    input_columns:
      - 'question'
      - 'choices'
    answer_key: 'answer'
    reasoning_type: 'multiple_choice'
    eval_batch_size: 64
    custom_prompt_file: 'alist.json'
    custom_prompt_id: 'few_shot'
    output_dir: '/app/evaluation_output'
    wandb_project: 'prompt-finetuning'
    wandb_run_name: 'Qwen3-8B-mmlu-alist-few'
