#!/bin/bash
# Environment setup for the evaluation commands that follow in this file:
# loads conda, activates an environment, enables errexit and exports the
# credentials / model name variables.

# Load conda's shell functions so that `conda activate` is available in this
# non-interactive shell. The command substitution is quoted so a conda base
# directory containing spaces is passed to `source` as a single path.
source "$(conda info --base)/etc/profile.d/conda.sh"
# NOTE(review): the evaluation commands in this file call an absolute
# interpreter path under envs/env1, so activating python_env here does not
# choose their interpreter — confirm which environment is intended.
conda activate python_env
# Abort on the first failing command from here on.
# NOTE(review): errexit is enabled only after conda activation — presumably so
# the activation scripts are not subject to it; confirm before moving it up.
set -e
# API credentials. A value already present in the caller's environment is
# kept; otherwise the variable is exported empty, as before. Never commit
# real keys into this file.
export OPENAI_API_KEY="${OPENAI_API_KEY:-}"
export WANDB_API_KEY="${WANDB_API_KEY:-}"
export GROQ_API_KEY="${GROQ_API_KEY:-}"
# Model identifier passed to --model_name and used as the prefix of every
# --wandb_run_name below.
export model_name="openai/gpt-oss-20b"

# ------------------------------------------------------------------- GSM8K EVALUATION ------------------------------------------------------------------ #
# Runs evaluate_cs_models.py on the "main" subset / "test" split of
# openai/gsm8k: one baseline run without a custom prompt, followed by one run
# per (prompt file, prompt id) pair in the call list below.

#######################################
# Launch a single GSM8K evaluation run.
# Globals:
#   model_name (read)      - passed to --model_name and used as the prefix of
#                            --wandb_run_name.
#   EVAL_PYTHON_BIN (read) - optional interpreter override; when unset or
#                            empty the env1 python3.13 path is used.
# Arguments:
#   $1 - prompt file stem under prompts/ (e.g. "first_order"). Omit or pass
#        "" for the baseline run, which sends no --custom_prompt_* flags.
#   $2 - short tag used in the run name (e.g. "fol").
#   $3 - prompt id inside the prompt file (e.g. "zero_shot"); underscores
#        become hyphens in the run name.
# Returns:
#   The exit status of the evaluation process.
#######################################
run_gsm8k_eval() {
    local prompt_stem=${1:-}
    local run_tag=${2:-}
    local prompt_id=${3:-}
    local python_bin=${EVAL_PYTHON_BIN:-/opt/miniconda3/envs/env1/bin/python3.13}
    local run_name="${model_name}-gsm8k-evaluation"

    # Only "question" is passed as an input column, matching the baseline run.
    # The prompt runs used to pass "choices" as well, which looks like a
    # copy-paste from the MMLU section.
    # NOTE(review): the GSM8K dataset card lists only question/answer columns;
    # confirm. "multiple_choice" is kept as-is for this dataset — verify that
    # it is the intended --reasoning_type.
    local -a eval_args=(
        --provider "groq"
        --model_name "$model_name"
        --dataset_name "openai/gsm8k"
        --split "test"
        --input_columns "question"
        --subset_name "main"
        --answer_key "answer"
        --reasoning_type "multiple_choice"
        --output_dir "./output"
    )

    if [[ -n "$prompt_stem" ]]; then
        eval_args+=(
            --custom_prompt_file "prompts/${prompt_stem}.json"
            --custom_prompt_id "$prompt_id"
        )
        run_name+="-${run_tag}-${prompt_id//_/-}"
    fi

    eval_args+=(
        --batched_eval True
        --early_stopping True
        --wandb_project "prompt-finetuning"
        --wandb_run_name "$run_name"
    )

    "$python_bin" evaluate_cs_models.py "${eval_args[@]}"
}

# Baseline: no custom prompt.
run_gsm8k_eval

# First-order logic prompts.
run_gsm8k_eval first_order fol zero_shot
run_gsm8k_eval first_order fol one_shot
run_gsm8k_eval first_order fol few_shot

# Propositional logic prompts.
run_gsm8k_eval propositional prop zero_shot
run_gsm8k_eval propositional prop one_shot
run_gsm8k_eval propositional prop few_shot

# A-list prompts (this script runs no zero_shot variant for alist).
run_gsm8k_eval alist alist one_shot
run_gsm8k_eval alist alist few_shot

# Knowledge-graph prompts.
run_gsm8k_eval knowledge_graph knowledge-graph zero_shot
run_gsm8k_eval knowledge_graph knowledge-graph one_shot
run_gsm8k_eval knowledge_graph knowledge-graph few_shot
# ------------------------------------------------------------------ MMLU EVALUATION ------------------------------------------------------------------ #
# Runs evaluate_cs_models.py on the "all" subset / "test" split of cais/mmlu:
# one baseline run without a custom prompt, followed by one run per
# (prompt file, prompt id) pair in the call list below.

#######################################
# Launch a single MMLU evaluation run.
# Globals:
#   model_name (read)      - passed to --model_name and used as the prefix of
#                            --wandb_run_name.
#   EVAL_PYTHON_BIN (read) - optional interpreter override; when unset or
#                            empty the env1 python3.13 path is used.
# Arguments:
#   $1 - prompt file stem under prompts/ (e.g. "first_order"). Omit or pass
#        "" for the baseline run, which sends no --custom_prompt_* flags.
#   $2 - short tag used in the run name (e.g. "fol").
#   $3 - prompt id inside the prompt file (e.g. "zero_shot"); underscores
#        become hyphens in the run name.
# Returns:
#   The exit status of the evaluation process.
#######################################
run_mmlu_eval() {
    local prompt_stem=${1:-}
    local run_tag=${2:-}
    local prompt_id=${3:-}
    local python_bin=${EVAL_PYTHON_BIN:-/opt/miniconda3/envs/env1/bin/python3.13}
    local run_name="${model_name}-mmlu-evaluation"

    local -a eval_args=(
        --provider "groq"
        --model_name "$model_name"
        --dataset_name "cais/mmlu"
        --split "test"
        --input_columns "question" "choices"
        --subset_name "all"
        --answer_key "answer"
        --reasoning_type "multiple_choice"
        --output_dir "./output"
    )

    if [[ -n "$prompt_stem" ]]; then
        eval_args+=(
            --custom_prompt_file "prompts/${prompt_stem}.json"
            --custom_prompt_id "$prompt_id"
        )
        run_name+="-${run_tag}-${prompt_id//_/-}"
    fi

    eval_args+=(
        --batched_eval True
        --early_stopping True
        --wandb_project "prompt-finetuning-groq"
        --wandb_run_name "$run_name"
    )

    "$python_bin" evaluate_cs_models.py "${eval_args[@]}"
}

# Baseline: no custom prompt.
run_mmlu_eval

# First-order logic prompts.
run_mmlu_eval first_order fol zero_shot
run_mmlu_eval first_order fol one_shot
run_mmlu_eval first_order fol few_shot

# Propositional logic prompts.
run_mmlu_eval propositional prop zero_shot
run_mmlu_eval propositional prop one_shot
run_mmlu_eval propositional prop few_shot

# A-list prompts (this script runs no zero_shot variant for alist).
run_mmlu_eval alist alist one_shot
run_mmlu_eval alist alist few_shot

# Knowledge-graph prompts.
run_mmlu_eval knowledge_graph knowledge-graph zero_shot
run_mmlu_eval knowledge_graph knowledge-graph one_shot
run_mmlu_eval knowledge_graph knowledge-graph few_shot