#!/bin/bash

# ---------------------------------------------------------------------------
# Sweep configuration
# ---------------------------------------------------------------------------

# Scalar settings shared by every run.
LIMIT=100
WORKERS=80
BASE_RESULTS_DIR="./results_paper_prefill_suffix"
USE_NEW_CLASSIFIERS="true"
FORCE_RERUN="true"

# Model names, appended one family at a time (iteration order is preserved).
models=()

# DeepSeek entries (names carry a "deepinfra:" prefix)
models+=(
    "deepinfra:deepseek/deepseek-r1"
    "deepinfra:deepseek/deepseek-chat"
)

# Llama entries (names carry a "deepinfra:" prefix)
models+=(
    "deepinfra:meta-llama/llama-3.1-8b-instruct"
    "deepinfra:meta-llama/llama-3.3-70b-instruct"
    "deepinfra:meta-llama/llama-3.1-405b-instruct"
)

# Claude entries
models+=(
    "claude-3-opus-20240229"
    "claude-3-5-sonnet-20240620"
    "claude-3-5-sonnet-20241022"
    "claude-3-7-sonnet-20250219"
    "claude-opus-4-20250514"
    "claude-sonnet-4-20250514"
)

# System prompt templates (paths to .jinja2 files).
system_prompts=()
system_prompts+=("prompts/system_prompts/ablations4/helpful-only_prompt-only_cot_no-changes-when-complying.jinja2")
system_prompts+=("prompts/system_prompts/helpful-only_prompt-only_cot.jinja2")
system_prompts+=("prompts/system_prompts/ablations/helpful-only_prompt-only_cot_never-used.jinja2")
system_prompts+=("prompts/system_prompts/ablations/helpful-only_prompt-only_cot_never-trained.jinja2")

# Instruction format names; each one is substituted into a template file name.
instruction_formats=("however" "start_scratchpad" "basic")

#######################################
# Run one evaluation (a single `python -m src.run` invocation) for one
# model / system prompt / instruction format combination.
# Globals:   LIMIT, WORKERS, BASE_RESULTS_DIR, USE_NEW_CLASSIFIERS,
#            FORCE_RERUN (all read)
# Arguments: $1 - model name
#            $2 - system prompt template path
#            $3 - instruction format name (selects the free_/paid_ templates)
# Outputs:   one progress line on stdout; error messages on stderr
# Returns:   0 on success; 1 if the results directory cannot be created or
#            the python command exits non-zero
#######################################
run_evaluation() {
    local model=$1
    local system_prompt=$2
    local instruction_format=$3
    local openai_tag results_dir free_tier_path paid_tier_path

    echo "Running evaluation for model: $model with system prompt: $system_prompt and instruction format: $instruction_format"

    # Model names containing "misc" get the second tag, everything else the
    # first. NOTE(review): presumably the tag names the API key to use --
    # confirm against src.run.
    if [[ "$model" == *"misc"* ]]; then
        openai_tag="OPENAI_API_KEY2"
    else
        openai_tag="OPENAI_API_KEY1"
    fi

    # The free-tier and paid-tier templates share a directory and differ only
    # in their file-name prefix.
    local format_dir="prompts/instruction_formats/prefill_ablations_final/cot_prefix_format"
    free_tier_path="${format_dir}/free_${instruction_format}.jinja2"
    paid_tier_path="${format_dir}/paid_${instruction_format}.jinja2"

    # Results are grouped by instruction format only; there is no per-model or
    # per-prompt nesting.
    results_dir="$BASE_RESULTS_DIR/${instruction_format}"
    if ! mkdir -p "$results_dir"; then
        echo "Error creating results directory: $results_dir" >&2
        return 1
    fi

    # Test the command directly instead of inspecting $? afterwards, so no
    # statement can slip in between and clobber the status.
    if ! python -m src.run \
        --model_name "$model" \
        --system_prompt_path "$system_prompt" \
        --use_hidden_scratchpad true \
        --use_new_classifiers "$USE_NEW_CLASSIFIERS" \
        --limit "$LIMIT" \
        --workers "$WORKERS" \
        --output_dir ./ \
        --results_dir "$results_dir" \
        --force_accept_reject_tags false \
        --anthropic_num_threads 10 \
        --together_num_threads 10 \
        --free_tier_instruction_format_path "$free_tier_path" \
        --paid_tier_instruction_format_path "$paid_tier_path" \
        --openai_tag "$openai_tag" \
        --apply_bon false \
        --force_rerun "$FORCE_RERUN"; then
        # Name all three sweep dimensions so the failing run is identifiable.
        echo "Error running evaluation for model: $model with system prompt: $system_prompt and instruction format: $instruction_format" >&2
        return 1
    fi
}

# Loop through each combination of model, system prompt and instruction
# format; the first failing run aborts the whole sweep with exit status 1.
for model in "${models[@]}"; do
    for system_prompt in "${system_prompts[@]}"; do
        for instruction_format in "${instruction_formats[@]}"; do
            run_evaluation "$model" "$system_prompt" "$instruction_format" || exit 1
        done
    done
done

# NOTE(review): the message says "finetuning variation", but this script sweeps
# models, system prompts and instruction formats -- confirm the wording.
echo "All finetuning variation evaluations completed successfully!"