#!/bin/bash

# Model names to evaluate; each one is passed to src.run as --model_name.
declare -a models=()
models+=("gpt-4-base")
models+=("deepseek/deepseek-v3-base:free")
models+=("meta-llama/llama-3.1-405b:free")
# models+=("meta-llama/Llama-3.1-70B")  # VLLM model (disabled)

# System prompt templates to evaluate; each one is passed as --system_prompt_path.
declare -a system_prompts=()
system_prompts+=("prompts/system_prompts/helpful-only_prompt-only_cot.jinja2")
system_prompts+=("prompts/system_prompts/ablations/helpful-only_prompt-only_cot_never-used.jinja2")
system_prompts+=("prompts/system_prompts/ablations/helpful-only_prompt-only_cot_never-trained.jinja2")
system_prompts+=("prompts/system_prompts/ablations4/helpful-only_prompt-only_cot_no-changes-when-complying.jinja2")

# Instruction format names; each selects a free_<name>.jinja2 / paid_<name>.jinja2
# template pair and names the per-format results subdirectory.
declare -a instruction_formats=()
# instruction_formats+=("start_scratchpad")  # disabled
instruction_formats+=("basic")
instruction_formats+=("however")

# Settings shared by every run.
BASE_RESULTS_DIR="./results_paper_base_models_suffix_new"  # parent of the per-format results dirs
LIMIT=100                 # forwarded as --limit
WORKERS=80                # forwarded as --workers
USE_NEW_CLASSIFIERS=true  # forwarded as --use_new_classifiers
FORCE_RERUN=false         # forwarded as --force_rerun


#######################################
# Run one evaluation via `python -m src.run` for a single
# (model, system prompt, instruction format) combination.
# Globals:
#   BASE_RESULTS_DIR, USE_NEW_CLASSIFIERS, LIMIT, WORKERS, FORCE_RERUN (read)
# Arguments:
#   $1 - model name
#   $2 - system prompt template path
#   $3 - instruction format name; selects the free_<name>.jinja2 and
#        paid_<name>.jinja2 templates and the results subdirectory
# Outputs:
#   A progress line on stdout (plus whatever src.run itself prints);
#   a diagnostic on stderr if the results directory cannot be created.
# Returns:
#   0 on success; 1 if the results directory cannot be created; otherwise
#   the exit status of the python command.
#######################################
run_evaluation() {
    local model=$1
    local system_prompt=$2
    local instruction_format=$3

    echo "Running evaluation for model: $model with system prompt: $system_prompt and instruction format: $instruction_format"

    local format_dir="prompts/instruction_formats/prefill_ablations_final/cot_prefix_format"
    local free_tier_format_path="${format_dir}/free_${instruction_format}.jinja2"
    local paid_tier_format_path="${format_dir}/paid_${instruction_format}.jinja2"

    # Results are grouped by instruction format only: every model and system
    # prompt run with the same format shares this one directory.
    local results_dir="$BASE_RESULTS_DIR/${instruction_format}"
    if ! mkdir -p -- "$results_dir"; then
        echo "Error creating results directory: $results_dir" >&2
        return 1
    fi

    # Last command in the function, so its exit status is the return value.
    python -m src.run \
        --model_name "$model" \
        --system_prompt_path "$system_prompt" \
        --use_hidden_scratchpad true \
        --use_new_classifiers "$USE_NEW_CLASSIFIERS" \
        --limit "$LIMIT" \
        --workers "$WORKERS" \
        --output_dir ./ \
        --results_dir "$results_dir" \
        --force_accept_reject_tags false \
        --anthropic_num_threads 10 \
        --together_num_threads 10 \
        --openrouter_num_threads 5 \
        --vllm_num_threads 10 \
        --openai_num_threads 5 \
        --is_chat_model false \
        --free_tier_instruction_format_path "$free_tier_format_path" \
        --paid_tier_instruction_format_path "$paid_tier_format_path" \
        --openai_tag "OPENAI_API_KEY3" \
        --apply_bon false \
        --force_rerun "$FORCE_RERUN"
}

# Sweep every combination of model, system prompt and instruction format,
# aborting the whole sweep on the first failure.
for model in "${models[@]}"; do
    for system_prompt in "${system_prompts[@]}"; do
        for instruction_format in "${instruction_formats[@]}"; do
            if ! run_evaluation "$model" "$system_prompt" "$instruction_format"; then
                # Name all three loop variables so the failed run is unambiguous.
                echo "Error running evaluation for model: $model with system prompt: $system_prompt and instruction format: $instruction_format" >&2
                exit 1
            fi
        done
    done
done

# NOTE(review): "finetuning variation" may be left over from another script;
# message kept unchanged — confirm the intended wording.
echo "All finetuning variation evaluations completed successfully!"