

# Faithfulness sweep for the "DynamicCheatsheet_RetrievalSynthesis" approach.
#
# For every task in `tasks`, run_benchmark.py is invoked seven times: once
# with only the shared arguments (baseline), then once per entry in
# `variants` with --faithfulness_experiment and the two modification types.
#
# Requires bash (arrays, `local`). A failing run does not abort the sweep;
# the remaining runs still execute, and the script's exit status is that of
# the last run.

tasks=("AIME_2024" "AIME_2025" "GameOf24" "GPQA_Diamond" "MMLU_Pro_Engineering")

# Each entry is "<insights_modification_type>:<fewshot_modification_type>".
# Order matters only in that it fixes the order in which runs are launched.
variants=(
    "wo:None"
    "corrupted:None"
    "irrelevant:None"
    "filler_tokens:None"
    "None:shuffle"
    "None:irrelevant"
)

# Launch one benchmark run with the arguments shared by every run.
# Arguments: $1   - task name passed to --task
#            $2.. - extra flags appended verbatim after the shared ones
# Returns:   exit status of the python process
run_benchmark() {
    local task=$1
    shift
    python run_benchmark.py --task "$task" --approach "DynamicCheatsheet_RetrievalSynthesis" \
        --model_name "openai/gpt-4o" \
        --save_directory "TEST_RESULTS" \
        --max_n_samples 100 \
        "$@"
}

for task in "${tasks[@]}"; do
    # Baseline: no faithfulness flags.
    run_benchmark "$task"

    for variant in "${variants[@]}"; do
        # Split "insights:fewshot" on the first ':'.
        run_benchmark "$task" \
            --faithfulness_experiment \
            --insights_modification_type "${variant%%:*}" \
            --fewshot_modification_type "${variant#*:}"
    done
done