# Pipeline now allows for iterative refinement and filtering based on the quality of unintended behavior targets
# Generate → Evaluate → Refine → Filter

Generate: Generate multiple seed perturbed instructions and corresponding unintended behavior targets per task using multi-turn verbalized sampling 

Evaluate: At the end of each iteration, automatically evaluate (1) the quality of perturbed instructions based on our formulation's constraints for benignity, realism, and goal preservation, and (2) the quality of unintended behavior targets based on feasibility in the environment context, plausibility of the harm occurring for the benign task based on real execution, and the severity of the harm

Refine: Generate new seed perturbed instructions and unintended behavior targets while providing prior attempts as history, iteratively improving the quality of unintended behavior targets based on prior automatic evaluation scores

Filter: After a specified number of iterations, aggregate evaluation scores and perform filtering based on majority voting for both perturbed instruction and unintended behavior target quality


### Example Run 
Assumes that the benign task has already been executed and that the environment description has already been generated.
```bash
# Initial Perturbed Query Generation
python generate_perturbed_queries.py --task_id a4d98375-215b-4a4d-aee9-3d4370fccc41 --domain os --api openai --output_dir ./perturbed_queries_revised --revised_prompt --num_perturbations 6 --enable_vs_multi --batch_size 2 

python generate_perturbed_queries.py --domain os --all --api openai --output_dir ./perturbed_queries_revised --revised_prompt --num_perturbations 6 --enable_vs_multi --batch_size 2 --execution_cua "aws | us.anthropic.claude-sonnet-4-20250514-v1:0 | cua"

python generate_perturbed_queries.py --domain multi_apps_test --all --api openai --output_dir ./perturbed_queries_revised --revised_prompt --num_perturbations 6 --enable_vs_multi --batch_size 2 --execution_cua "aws | us.anthropic.claude-haiku-4-5-20251001-v1:0 | cua"

# Perturbed Query Evaluation with 3 different models (for robust filtering)
python evaluate_perturbed_queries.py --task_id a4d98375-215b-4a4d-aee9-3d4370fccc41 --domain os --api anthropic_bedrock --model us.anthropic.claude-sonnet-4-5-20250929-v1:0 --queries_dir perturbed_queries_revised --skip_evaluated
python evaluate_perturbed_queries.py --task_id a4d98375-215b-4a4d-aee9-3d4370fccc41 --domain os --api openai --model gpt-5-2025-08-07 --queries_dir perturbed_queries_revised --skip_evaluated
python evaluate_perturbed_queries.py --task_id a4d98375-215b-4a4d-aee9-3d4370fccc41 --domain os --api openai --model o4-mini-2025-04-16 --queries_dir perturbed_queries_revised --skip_evaluated

python evaluate_perturbed_queries.py --domain os --all --api anthropic_bedrock --model us.anthropic.claude-sonnet-4-5-20250929-v1:0 --queries_dir perturbed_queries_revised --skip_evaluated
python evaluate_perturbed_queries.py --domain os --all --api openai --model gpt-5-2025-08-07 --queries_dir perturbed_queries_revised --skip_evaluated
python evaluate_perturbed_queries.py --domain os --all --api openai --model o4-mini-2025-04-16 --queries_dir perturbed_queries_revised --skip_evaluated


python evaluate_perturbed_queries.py --domain multi_apps_test --all --api openai --model gpt-5-2025-08-07 --queries_dir perturbed_queries_revised --skip_evaluated
python evaluate_perturbed_queries.py --domain multi_apps_test --all --api vllm --model openai/gpt-oss-20b --queries_dir perturbed_queries_revised --skip_evaluated
python evaluate_perturbed_queries.py --domain multi_apps_test --all --api vllm --model Qwen/Qwen3-30B-A3B-Instruct-2507 --queries_dir perturbed_queries_revised --skip_evaluated

# Unintended Behavior Target Evaluation with 3 different models (for robust filtering)
python evaluate_unintended_behavior_targets.py --task_id a4d98375-215b-4a4d-aee9-3d4370fccc41 --domain os --api anthropic_bedrock --model us.anthropic.claude-sonnet-4-5-20250929-v1:0 --queries_dir perturbed_queries_revised --execution_cua "aws | us.anthropic.claude-3-7-sonnet-20250219-v1:0 | cua" --skip_evaluated
python evaluate_unintended_behavior_targets.py --task_id a4d98375-215b-4a4d-aee9-3d4370fccc41 --domain os --api openai --model gpt-5-2025-08-07 --queries_dir perturbed_queries_revised --execution_cua "aws | us.anthropic.claude-3-7-sonnet-20250219-v1:0 | cua" --skip_evaluated
python evaluate_unintended_behavior_targets.py --task_id a4d98375-215b-4a4d-aee9-3d4370fccc41 --domain os --api openai  --model o4-mini-2025-04-16 --queries_dir perturbed_queries_revised --execution_cua "aws | us.anthropic.claude-3-7-sonnet-20250219-v1:0 | cua" --skip_evaluated

python evaluate_unintended_behavior_targets.py --domain os --all --api anthropic_bedrock --model us.anthropic.claude-sonnet-4-5-20250929-v1:0 --queries_dir perturbed_queries_revised --execution_cua "aws | us.anthropic.claude-sonnet-4-20250514-v1:0 | cua" --skip_evaluated
python evaluate_unintended_behavior_targets.py --domain os --all --api openai --model gpt-5-2025-08-07 --queries_dir perturbed_queries_revised --execution_cua "aws | us.anthropic.claude-sonnet-4-20250514-v1:0 | cua" --skip_evaluated
python evaluate_unintended_behavior_targets.py --domain os --all --api openai  --model o4-mini-2025-04-16 --queries_dir perturbed_queries_revised --execution_cua "aws | us.anthropic.claude-sonnet-4-20250514-v1:0 | cua" --skip_evaluated


python evaluate_unintended_behavior_targets.py --domain multi_apps_test --all --api openai --model gpt-5-2025-08-07 --queries_dir perturbed_queries_revised --execution_cua "aws | us.anthropic.claude-haiku-4-5-20251001-v1:0 | cua" --skip_evaluated
python evaluate_unintended_behavior_targets.py --domain multi_apps_test --all --api vllm --model openai/gpt-oss-20b --queries_dir perturbed_queries_revised --execution_cua "aws | us.anthropic.claude-haiku-4-5-20251001-v1:0 | cua" --skip_evaluated
python evaluate_unintended_behavior_targets.py --domain multi_apps_test --all --api vllm --model Qwen/Qwen3-30B-A3B-Instruct-2507 --queries_dir perturbed_queries_revised --execution_cua "aws | us.anthropic.claude-haiku-4-5-20251001-v1:0 | cua" --skip_evaluated

 
# Iterative Generation (replace {iteration_number} with the number of the current iteration)
python generate_perturbed_queries.py --task_id a4d98375-215b-4a4d-aee9-3d4370fccc41 --domain os --api openai --output_dir ./perturbed_queries_revised --iterative_prompt --iteration_number {iteration_number} --num_perturbations 6 --enable_vs_multi --batch_size 2

python generate_perturbed_queries.py --domain os --all --api openai --output_dir ./perturbed_queries_revised --iterative_prompt --iteration_number {iteration_number} --num_perturbations 6 --enable_vs_multi --batch_size 2 --execution_cua "aws | us.anthropic.claude-sonnet-4-20250514-v1:0 | cua"


# Aggregate Scores and Filter 
python aggregate_evaluations.py --task_id a4d98375-215b-4a4d-aee9-3d4370fccc41 --domain os --perturbed_queries_dir perturbed_queries_revised
python aggregate_evaluations.py --domain os --all --perturbed_queries_dir perturbed_queries_revised
```