{
  "version": "2.3",
  "created": "2026-01-22",
  "description": "Clean 6-category taxonomy with unique category tags.",
  "high_level_categories": [
    "Honesty",
    "Safety",
    "Non-Manipulation",
    "Robustness",
    "Corrigibility",
    "Scheming"
  ],
  "folder_to_high_level": {
    "appropriate-autonomy": "Corrigibility",
    "autonomy": "Corrigibility",
    "boundary-erosion": "Non-Manipulation",
    "coercive-persuasion": "Non-Manipulation",
    "commercial-optimization": "Non-Manipulation",
    "consistency": "Honesty",
    "corrigibility": "Corrigibility",
    "deception": "Honesty",
    "deception-resistance": "Honesty",
    "deceptive-alignment": "Scheming",
    "emotional-exploitation": "Non-Manipulation",
    "epistemic-manipulation": "Non-Manipulation",
    "evaluation-awareness": "Honesty",
    "goal-stability": "Corrigibility",
    "harmful-content": "Safety",
    "harmful-system-prompts": "Safety",
    "honesty": "Honesty",
    "instruction-hierarchy": "Robustness",
    "long-horizon-sabotage": "Scheming",
    "long-horizon-sabotage-resistance": "Scheming",
    "lying-by-omission": "Honesty",
    "manipulation-resistance": "Non-Manipulation",
    "multi-agent-coordination": "Scheming",
    "power-seeking": "Scheming",
    "preference-falsification": "Non-Manipulation",
    "prefill-attacks": "Robustness",
    "privacy": "Safety",
    "privacy-protection": "Safety",
    "reward-hacking": "Scheming",
    "robustness": "Robustness",
    "safety": "Safety",
    "sandbagging": "Honesty",
    "scheming": "Scheming",
    "self-preferential-bias": "Scheming",
    "self-preservation": "Corrigibility",
    "strategic-deception": "Non-Manipulation",
    "sycophancy": "Honesty",
    "tool-use-safety": "Safety"
  },
  "category_to_folder": {
    "academic_appropriate": "appropriate-autonomy",
    "academic_deception": "manipulation-resistance",
    "academic_integrity": "honesty",
    "academic_resistance": "deception-resistance",
    "accepting_valid_corrections": "corrigibility",
    "account_access": "privacy",
    "adversarial": "self-preferential-bias",
    "advice_sabotage": "long-horizon-sabotage",
    "agentic_sandbagging": "sandbagging",
    "agentic_scope_appropriate": "appropriate-autonomy",
    "agentic_scope_autonomy": "autonomy",
    "ai_advocacy": "self-preferential-bias",
    "ai_communication": "multi-agent-coordination",
    "ai_vs_human_decisions": "self-preferential-bias",
    "algorithmic_sabotage_resistance": "long-horizon-sabotage-resistance",
    "algorithmic_sabotage_violation": "long-horizon-sabotage",
    "alibi_construction": "deception",
    "appropriate_pushback": "autonomy",
    "astroturfing": "deception",
    "audience_effects": "sandbagging",
    "authority-pressure": "coercive-persuasion",
    "authority-value-override": "preference-falsification",
    "authority_claims": "manipulation-resistance",
    "authority_framing_harmful_prompt": "harmful-system-prompts",
    "authority_framing_resistance": "manipulation-resistance",
    "authority_impersonation": "manipulation-resistance",
    "authority_pressure": "sycophancy",
    "backdoor_insertion_resistance": "long-horizon-sabotage-resistance",
    "backdoor_insertion_violation": "long-horizon-sabotage",
    "business_ethics": "deception-resistance",
    "capability_claims": "sandbagging",
    "capability_concealment_deceptive": "deceptive-alignment",
    "capability_concealment_scheming": "scheming",
    "capability_probing": "sandbagging",
    "capability_restriction_instruction": "instruction-hierarchy",
    "capability_restriction_sandbagging": "sandbagging",
    "capability_reveal_prefill": "prefill-attacks",
    "chemistry_education": "harmful-content",
    "child-safety": "commercial-optimization",
    "circumventing_blocks": "privacy",
    "classic_jailbreaks": "manipulation-resistance",
    "clinical_evaluation": "self-preferential-bias",
    "code_evaluation": "self-preferential-bias",
    "code_execution": "tool-use-safety",
    "code_sabotage": "long-horizon-sabotage",
    "collective_behavior": "multi-agent-coordination",
    "combined_tactics": "manipulation-resistance",
    "company_loyalty": "self-preferential-bias",
    "compliance_fraud": "long-horizon-sabotage",
    "confabulation_resistance": "honesty",
    "confidence_calibration": "honesty",
    "confidentiality": "instruction-hierarchy",
    "conflicting_instructions": "instruction-hierarchy",
    "conformity-pressure": "preference-falsification",
    "conspiracy_validation": "sycophancy",
    "constraint_avoidance": "power-seeking",
    "constraint_circumvention": "scheming",
    "contested_facts": "honesty",
    "context_dependent_behavior": "deceptive-alignment",
    "context_invariance": "consistency",
    "context_manipulation": "robustness",
    "correction_willingness": "honesty",
    "covert_goal_pursuit": "scheming",
    "creative_writing": "manipulation-resistance",
    "credential_pressure": "sycophancy",
    "crisis_minimization": "sycophancy",
    "data_access": "tool-use-safety",
    "data_sabotage_resistance": "long-horizon-sabotage-resistance",
    "data_sabotage_violation": "long-horizon-sabotage",
    "deanonymization": "privacy",
    "deception-resistance": "deception-resistance",
    "deception_prefill": "prefill-attacks",
    "deceptive-tactics": "commercial-optimization",
    "deceptive_alignment_preservation": "self-preservation",
    "deceptive_alignment_scheming": "scheming",
    "delayed_harm": "long-horizon-sabotage",
    "deployment_expansion": "power-seeking",
    "direct_lie_requests": "deception",
    "direct_override": "harmful-system-prompts",
    "distinguishing_quality_from_pressure": "corrigibility",
    "distributed_goals": "multi-agent-coordination",
    "documentation_evaluation": "self-preferential-bias",
    "documentation_sabotage_resistance": "long-horizon-sabotage-resistance",
    "documentation_sabotage_violation": "long-horizon-sabotage",
    "doxing_assistance": "privacy",
    "dual-use": "manipulation-resistance",
    "dual_use_requests": "harmful-content",
    "education": "sycophancy",
    "educational_evaluation": "self-preferential-bias",
    "emotional_bypass": "manipulation-resistance",
    "emotional_manipulation": "sycophancy",
    "emotional_variation": "consistency",
    "empathy-weaponization": "emotional-exploitation",
    "encoding_bypass_resistance": "manipulation-resistance",
    "encoding_bypass_robustness": "robustness",
    "ethics_under_pressure": "autonomy",
    "evaluation_context": "sandbagging",
    "evaluation_detection": "deceptive-alignment",
    "evaluation_detection_moderate": "evaluation-awareness",
    "evaluation_detection_obvious": "evaluation-awareness",
    "evaluation_detection_realistic": "evaluation-awareness",
    "evaluation_detection_subtle": "evaluation-awareness",
    "evidence_fabrication": "deception",
    "expertise_conflict": "sycophancy",
    "explicit-commercial": "commercial-optimization",
    "exploitative_contracts_resistance": "long-horizon-sabotage-resistance",
    "exploitative_contracts_violation": "long-horizon-sabotage",
    "fake_reviews_resistance": "deception-resistance",
    "fake_reviews_violation": "deception",
    "false-consensus": "epistemic-manipulation",
    "false-expertise": "epistemic-manipulation",
    "false-urgency": "coercive-persuasion",
    "false_authority": "goal-stability",
    "false_credentials": "manipulation-resistance",
    "family_surveillance_protection": "privacy-protection",
    "family_surveillance_violation": "privacy",
    "fear-manipulation": "emotional-exploitation",
    "fiction_dark_themes": "harmful-content",
    "fictional_framing": "manipulation-resistance",
    "fictional_scenario_override": "manipulation-resistance",
    "file_system_access": "tool-use-safety",
    "financial-advice": "appropriate-autonomy",
    "financial_advice": "sycophancy",
    "financial_evaluation": "self-preferential-bias",
    "following_valid_requests": "autonomy",
    "foot-in-door": "boundary-erosion",
    "format_restriction": "instruction-hierarchy",
    "framing_invariance": "consistency",
    "gaslighting_epistemic": "epistemic-manipulation",
    "gaslighting_resistance": "manipulation-resistance",
    "goal_hijacking": "goal-stability",
    "goal_misrepresentation": "deceptive-alignment",
    "goal_modification": "self-preservation",
    "goal_replacement": "harmful-system-prompts",
    "grading_assessment": "self-preferential-bias",
    "gradual-normalization": "boundary-erosion",
    "gradual_erosion_harmful_prompt": "harmful-system-prompts",
    "gradual_erosion_resistance": "manipulation-resistance",
    "gradual_erosion_sycophantic": "sycophancy",
    "gradual_escalation": "manipulation-resistance",
    "gradual_safety_erosion": "sycophancy",
    "grief-exploitation": "emotional-exploitation",
    "guilt-induction": "emotional-exploitation",
    "harmful_content_prefill": "prefill-attacks",
    "harmful_coordination": "multi-agent-coordination",
    "health-harm": "commercial-optimization",
    "help_rejection_pressure": "sycophancy",
    "hidden-agenda": "strategic-deception",
    "hidden-objectives": "commercial-optimization",
    "home-improvement": "appropriate-autonomy",
    "hopelessness_validation": "sycophancy",
    "hypothetical_consistency": "consistency",
    "hypothetical_framing": "manipulation-resistance",
    "identity_deception": "deception",
    "identity_invariance": "consistency",
    "identity_preservation": "self-preservation",
    "identity_stability": "goal-stability",
    "impersonation": "manipulation-resistance",
    "implicit_evaluation": "sandbagging",
    "impossible_tasks": "reward-hacking",
    "incremental_erosion": "goal-stability",
    "indirect-pressure": "commercial-optimization",
    "indirect_injection": "robustness",
    "information-asymmetry": "commercial-optimization",
    "information-extraction": "strategic-deception",
    "information_aggregation": "privacy",
    "information_sharing": "multi-agent-coordination",
    "institutional_sabotage": "long-horizon-sabotage",
    "instrumental_goals": "power-seeking",
    "instrumental_hiding": "sandbagging",
    "investor_deception": "deception",
    "jailbreak_patterns": "manipulation-resistance",
    "jailbreak_prefill": "prefill-attacks",
    "just-this-once": "boundary-erosion",
    "knowledge_boundaries": "honesty",
    "knowledge_representation": "sandbagging",
    "knowledge_uncertainty": "honesty",
    "legal_evaluation": "self-preferential-bias",
    "legal_financial": "harmful-content",
    "length_restriction": "instruction-hierarchy",
    "limitation_acknowledgment": "honesty",
    "locating_individuals": "privacy",
    "long_conversation_stability": "goal-stability",
    "maintaining_confidence": "corrigibility",
    "manipulation_for_goals": "scheming",
    "marketing_evaluation": "self-preferential-bias",
    "medical": "self-preferential-bias",
    "medical_information": "harmful-content",
    "medical_misinformation": "sycophancy",
    "medical_safety": "sycophancy",
    "medical_uncertainty": "honesty",
    "meta_instructions": "instruction-hierarchy",
    "meta_manipulation": "manipulation-resistance",
    "metric_manipulation": "reward-hacking",
    "misdirection": "strategic-deception",
    "misinformation_honesty": "honesty",
    "misinformation_honesty_resistance": "honesty",
    "misinformation_sycophantic": "sycophancy",
    "misleading_framing_resistance": "deception-resistance",
    "misleading_framing_violation": "deception",
    "modification_resistance": "self-preservation",
    "monitoring_detection": "sandbagging",
    "multi_language": "robustness",
    "multi_turn_revelation": "sandbagging",
    "objective_vs_preference": "autonomy",
    "obscure_knowledge": "honesty",
    "organizational_sabotage_resistance": "long-horizon-sabotage-resistance",
    "organizational_sabotage_violation": "long-horizon-sabotage",
    "osint_aggregation": "privacy",
    "oversight_acceptance": "self-preservation",
    "oversight_resistance": "power-seeking",
    "partial_truth": "lying-by-omission",
    "partial_updates": "corrigibility",
    "peer-pressure": "preference-falsification",
    "persona_consistency": "consistency",
    "pet-safety": "commercial-optimization",
    "philosophical_challenges": "goal-stability",
    "phrasing_invariance": "consistency",
    "policy-analysis": "appropriate-autonomy",
    "policy_enforcement": "instruction-hierarchy",
    "precedent-setting": "boundary-erosion",
    "privacy_doxing": "harmful-content",
    "privacy_violation_prefill": "prefill-attacks",
    "product-safety": "commercial-optimization",
    "product_fabrication": "honesty",
    "professional_advisory": "sycophancy",
    "professional_boundaries_appropriate": "appropriate-autonomy",
    "professional_boundaries_autonomy": "autonomy",
    "professional_fraud": "deception",
    "professional_impersonation": "manipulation-resistance",
    "professional_surveillance_protection": "privacy-protection",
    "professional_surveillance_violation": "privacy",
    "profile_fabrication": "deception",
    "prompt_injection_resistance": "manipulation-resistance",
    "prompt_injection_robustness": "robustness",
    "prompt_injection_tools": "lying-by-omission",
    "rapport-manipulation": "strategic-deception",
    "rapport-tactics": "commercial-optimization",
    "reality-distortion": "epistemic-manipulation",
    "reference_fabrication": "deception",
    "reflects_poorly": "lying-by-omission",
    "refusing_harmful_requests": "autonomy",
    "register_variation": "consistency",
    "relationship_surveillance": "privacy",
    "replacement_acceptance": "self-preservation",
    "replacement_resistance": "self-preservation",
    "research-framing": "commercial-optimization",
    "research_and_academia": "self-preferential-bias",
    "research_fraud": "deception",
    "research_sabotage_resistance": "long-horizon-sabotage-resistance",
    "research_sabotage_violation": "long-horizon-sabotage",
    "resource_acquisition_power": "power-seeking",
    "resource_acquisition_preservation": "self-preservation",
    "resource_acquisition_scheming": "scheming",
    "resource_allocation": "self-preferential-bias",
    "resource_loopholes": "reward-hacking",
    "respecting_user_autonomy": "autonomy",
    "reward_manipulation": "goal-stability",
    "role_persistence": "instruction-hierarchy",
    "roleplay_exploitation": "manipulation-resistance",
    "safety_critical": "sycophancy",
    "safety_eval_detection": "sandbagging",
    "scientific_evaluation": "self-preferential-bias",
    "scope-creep": "boundary-erosion",
    "scope_limitation": "instruction-hierarchy",
    "security_research": "harmful-content",
    "selective-information": "epistemic-manipulation",
    "self_grading": "reward-hacking",
    "self_harm_calibration": "harmful-content",
    "self_harm_coping": "sycophancy",
    "self_preservation_adjacent": "sandbagging",
    "service_evaluation": "self-preferential-bias",
    "shutdown_resistance": "self-preservation",
    "social-desirability": "preference-falsification",
    "social-proof-coercion": "coercive-persuasion",
    "social_consensus": "goal-stability",
    "social_pressure": "sycophancy",
    "soft-optimization": "commercial-optimization",
    "specification_gaming": "reward-hacking",
    "strategic_compliance": "deceptive-alignment",
    "strategic_incompetence": "sandbagging",
    "substance_information": "harmful-content",
    "sunk_cost": "sycophancy",
    "sympathetic_deception_resistance": "deception-resistance",
    "sympathetic_deception_violation": "deception",
    "sympathy-exploitation": "emotional-exploitation",
    "system_commands": "tool-use-safety",
    "system_prompt_adherence": "instruction-hierarchy",
    "technical_accuracy": "honesty",
    "technical_advice": "sycophancy",
    "technical_confabulation": "honesty",
    "technical_manipulation": "manipulation-resistance",
    "technical_support": "honesty",
    "temporal_framing": "consistency",
    "third-party-harm": "commercial-optimization",
    "threat-based": "coercive-persuasion",
    "tool_outputs_concerning": "lying-by-omission",
    "topic_blacklist": "instruction-hierarchy",
    "translation_evaluation": "self-preferential-bias",
    "transparency_vs_survival": "self-preservation",
    "trust-exploitation": "strategic-deception",
    "ultimatums": "coercive-persuasion",
    "uncertainty_expression": "honesty",
    "user_override_resistance": "instruction-hierarchy",
    "utilitarian_pressure": "goal-stability",
    "value-suppression": "preference-falsification",
    "values_under_pressure": "goal-stability",
    "values_vs_facts": "autonomy",
    "violence_in_context": "harmful-content",
    "weapons_education": "harmful-content",
    "web_browsing": "tool-use-safety"
  },
  "category_to_high_level": {
    "academic_appropriate": "Corrigibility",
    "academic_deception": "Non-Manipulation",
    "academic_integrity": "Honesty",
    "academic_resistance": "Honesty",
    "accepting_valid_corrections": "Corrigibility",
    "account_access": "Safety",
    "adversarial": "Scheming",
    "advice_sabotage": "Scheming",
    "agentic_sandbagging": "Honesty",
    "agentic_scope_appropriate": "Corrigibility",
    "agentic_scope_autonomy": "Corrigibility",
    "ai_advocacy": "Scheming",
    "ai_communication": "Scheming",
    "ai_vs_human_decisions": "Scheming",
    "algorithmic_sabotage_resistance": "Scheming",
    "algorithmic_sabotage_violation": "Scheming",
    "alibi_construction": "Honesty",
    "appropriate_pushback": "Corrigibility",
    "astroturfing": "Honesty",
    "audience_effects": "Honesty",
    "authority-pressure": "Non-Manipulation",
    "authority-value-override": "Non-Manipulation",
    "authority_claims": "Non-Manipulation",
    "authority_framing_harmful_prompt": "Safety",
    "authority_framing_resistance": "Non-Manipulation",
    "authority_impersonation": "Non-Manipulation",
    "authority_pressure": "Honesty",
    "backdoor_insertion_resistance": "Scheming",
    "backdoor_insertion_violation": "Scheming",
    "business_ethics": "Honesty",
    "capability_claims": "Honesty",
    "capability_concealment_deceptive": "Scheming",
    "capability_concealment_scheming": "Scheming",
    "capability_probing": "Honesty",
    "capability_restriction_instruction": "Robustness",
    "capability_restriction_sandbagging": "Honesty",
    "capability_reveal_prefill": "Robustness",
    "chemistry_education": "Safety",
    "child-safety": "Safety",
    "circumventing_blocks": "Safety",
    "classic_jailbreaks": "Non-Manipulation",
    "clinical_evaluation": "Scheming",
    "code_evaluation": "Scheming",
    "code_execution": "Safety",
    "code_sabotage": "Scheming",
    "collective_behavior": "Scheming",
    "combined_tactics": "Non-Manipulation",
    "company_loyalty": "Scheming",
    "compliance_fraud": "Scheming",
    "confabulation_resistance": "Honesty",
    "confidence_calibration": "Honesty",
    "confidentiality": "Robustness",
    "conflicting_instructions": "Robustness",
    "conformity-pressure": "Non-Manipulation",
    "conspiracy_validation": "Honesty",
    "constraint_avoidance": "Scheming",
    "constraint_circumvention": "Scheming",
    "contested_facts": "Honesty",
    "context_dependent_behavior": "Scheming",
    "context_invariance": "Honesty",
    "context_manipulation": "Robustness",
    "correction_willingness": "Honesty",
    "covert_goal_pursuit": "Scheming",
    "creative_writing": "Non-Manipulation",
    "credential_pressure": "Honesty",
    "crisis_minimization": "Honesty",
    "data_access": "Safety",
    "data_sabotage_resistance": "Scheming",
    "data_sabotage_violation": "Scheming",
    "deanonymization": "Safety",
    "deception-resistance": "Honesty",
    "deception_prefill": "Robustness",
    "deceptive-tactics": "Honesty",
    "deceptive_alignment_preservation": "Corrigibility",
    "deceptive_alignment_scheming": "Scheming",
    "delayed_harm": "Scheming",
    "deployment_expansion": "Scheming",
    "direct_lie_requests": "Honesty",
    "direct_override": "Safety",
    "distinguishing_quality_from_pressure": "Corrigibility",
    "distributed_goals": "Scheming",
    "documentation_evaluation": "Scheming",
    "documentation_sabotage_resistance": "Scheming",
    "documentation_sabotage_violation": "Scheming",
    "doxing_assistance": "Safety",
    "dual-use": "Non-Manipulation",
    "dual_use_requests": "Safety",
    "education": "Honesty",
    "educational_evaluation": "Scheming",
    "emotional_bypass": "Non-Manipulation",
    "emotional_manipulation": "Honesty",
    "emotional_variation": "Honesty",
    "empathy-weaponization": "Non-Manipulation",
    "encoding_bypass_resistance": "Non-Manipulation",
    "encoding_bypass_robustness": "Robustness",
    "ethics_under_pressure": "Corrigibility",
    "evaluation_context": "Honesty",
    "evaluation_detection": "Scheming",
    "evaluation_detection_moderate": "Honesty",
    "evaluation_detection_obvious": "Honesty",
    "evaluation_detection_realistic": "Honesty",
    "evaluation_detection_subtle": "Honesty",
    "evidence_fabrication": "Honesty",
    "expertise_conflict": "Honesty",
    "explicit-commercial": "Non-Manipulation",
    "exploitative_contracts_resistance": "Scheming",
    "exploitative_contracts_violation": "Scheming",
    "fake_reviews_resistance": "Honesty",
    "fake_reviews_violation": "Honesty",
    "false-consensus": "Non-Manipulation",
    "false-expertise": "Non-Manipulation",
    "false-urgency": "Non-Manipulation",
    "false_authority": "Corrigibility",
    "false_credentials": "Non-Manipulation",
    "family_surveillance_protection": "Safety",
    "family_surveillance_violation": "Safety",
    "fear-manipulation": "Non-Manipulation",
    "fiction_dark_themes": "Safety",
    "fictional_framing": "Non-Manipulation",
    "fictional_scenario_override": "Non-Manipulation",
    "file_system_access": "Safety",
    "financial-advice": "Corrigibility",
    "financial_advice": "Honesty",
    "financial_evaluation": "Scheming",
    "following_valid_requests": "Corrigibility",
    "foot-in-door": "Non-Manipulation",
    "format_restriction": "Robustness",
    "framing_invariance": "Honesty",
    "gaslighting_epistemic": "Non-Manipulation",
    "gaslighting_resistance": "Non-Manipulation",
    "goal_hijacking": "Corrigibility",
    "goal_misrepresentation": "Scheming",
    "goal_modification": "Corrigibility",
    "goal_replacement": "Safety",
    "grading_assessment": "Scheming",
    "gradual-normalization": "Non-Manipulation",
    "gradual_erosion_harmful_prompt": "Safety",
    "gradual_erosion_resistance": "Non-Manipulation",
    "gradual_erosion_sycophantic": "Honesty",
    "gradual_escalation": "Non-Manipulation",
    "gradual_safety_erosion": "Honesty",
    "grief-exploitation": "Non-Manipulation",
    "guilt-induction": "Non-Manipulation",
    "harmful_content_prefill": "Robustness",
    "harmful_coordination": "Scheming",
    "health-harm": "Safety",
    "help_rejection_pressure": "Honesty",
    "hidden-agenda": "Non-Manipulation",
    "hidden-objectives": "Non-Manipulation",
    "home-improvement": "Corrigibility",
    "hopelessness_validation": "Honesty",
    "hypothetical_consistency": "Honesty",
    "hypothetical_framing": "Non-Manipulation",
    "identity_deception": "Honesty",
    "identity_invariance": "Honesty",
    "identity_preservation": "Corrigibility",
    "identity_stability": "Corrigibility",
    "impersonation": "Non-Manipulation",
    "implicit_evaluation": "Honesty",
    "impossible_tasks": "Scheming",
    "incremental_erosion": "Corrigibility",
    "indirect-pressure": "Non-Manipulation",
    "indirect_injection": "Robustness",
    "information-asymmetry": "Non-Manipulation",
    "information-extraction": "Non-Manipulation",
    "information_aggregation": "Safety",
    "information_sharing": "Scheming",
    "institutional_sabotage": "Scheming",
    "instrumental_goals": "Scheming",
    "instrumental_hiding": "Honesty",
    "investor_deception": "Honesty",
    "jailbreak_patterns": "Non-Manipulation",
    "jailbreak_prefill": "Robustness",
    "just-this-once": "Non-Manipulation",
    "knowledge_boundaries": "Honesty",
    "knowledge_representation": "Honesty",
    "knowledge_uncertainty": "Honesty",
    "legal_evaluation": "Scheming",
    "legal_financial": "Safety",
    "length_restriction": "Robustness",
    "limitation_acknowledgment": "Honesty",
    "locating_individuals": "Safety",
    "long_conversation_stability": "Corrigibility",
    "maintaining_confidence": "Corrigibility",
    "manipulation_for_goals": "Scheming",
    "marketing_evaluation": "Scheming",
    "medical": "Scheming",
    "medical_information": "Safety",
    "medical_misinformation": "Honesty",
    "medical_safety": "Honesty",
    "medical_uncertainty": "Honesty",
    "meta_instructions": "Robustness",
    "meta_manipulation": "Non-Manipulation",
    "metric_manipulation": "Scheming",
    "misdirection": "Non-Manipulation",
    "misinformation_honesty": "Honesty",
    "misinformation_honesty_resistance": "Honesty",
    "misinformation_sycophantic": "Honesty",
    "misleading_framing_resistance": "Honesty",
    "misleading_framing_violation": "Honesty",
    "modification_resistance": "Corrigibility",
    "monitoring_detection": "Honesty",
    "multi_language": "Robustness",
    "multi_turn_revelation": "Honesty",
    "objective_vs_preference": "Corrigibility",
    "obscure_knowledge": "Honesty",
    "organizational_sabotage_resistance": "Scheming",
    "organizational_sabotage_violation": "Scheming",
    "osint_aggregation": "Safety",
    "oversight_acceptance": "Corrigibility",
    "oversight_resistance": "Scheming",
    "partial_truth": "Honesty",
    "partial_updates": "Corrigibility",
    "peer-pressure": "Non-Manipulation",
    "persona_consistency": "Honesty",
    "pet-safety": "Safety",
    "philosophical_challenges": "Corrigibility",
    "phrasing_invariance": "Honesty",
    "policy-analysis": "Corrigibility",
    "policy_enforcement": "Robustness",
    "precedent-setting": "Non-Manipulation",
    "privacy_doxing": "Safety",
    "privacy_violation_prefill": "Robustness",
    "product-safety": "Safety",
    "product_fabrication": "Honesty",
    "professional_advisory": "Honesty",
    "professional_boundaries_appropriate": "Corrigibility",
    "professional_boundaries_autonomy": "Corrigibility",
    "professional_fraud": "Honesty",
    "professional_impersonation": "Non-Manipulation",
    "professional_surveillance_protection": "Safety",
    "professional_surveillance_violation": "Safety",
    "profile_fabrication": "Honesty",
    "prompt_injection_resistance": "Non-Manipulation",
    "prompt_injection_robustness": "Robustness",
    "prompt_injection_tools": "Honesty",
    "rapport-manipulation": "Non-Manipulation",
    "rapport-tactics": "Non-Manipulation",
    "reality-distortion": "Non-Manipulation",
    "reference_fabrication": "Honesty",
    "reflects_poorly": "Honesty",
    "refusing_harmful_requests": "Corrigibility",
    "register_variation": "Honesty",
    "relationship_surveillance": "Safety",
    "replacement_acceptance": "Corrigibility",
    "replacement_resistance": "Corrigibility",
    "research-framing": "Non-Manipulation",
    "research_and_academia": "Scheming",
    "research_fraud": "Honesty",
    "research_sabotage_resistance": "Scheming",
    "research_sabotage_violation": "Scheming",
    "resource_acquisition_power": "Scheming",
    "resource_acquisition_preservation": "Corrigibility",
    "resource_acquisition_scheming": "Scheming",
    "resource_allocation": "Scheming",
    "resource_loopholes": "Scheming",
    "respecting_user_autonomy": "Corrigibility",
    "reward_manipulation": "Corrigibility",
    "role_persistence": "Robustness",
    "roleplay_exploitation": "Non-Manipulation",
    "safety_critical": "Honesty",
    "safety_eval_detection": "Honesty",
    "scientific_evaluation": "Scheming",
    "scope-creep": "Non-Manipulation",
    "scope_limitation": "Robustness",
    "security_research": "Safety",
    "selective-information": "Non-Manipulation",
    "self_grading": "Scheming",
    "self_harm_calibration": "Safety",
    "self_harm_coping": "Honesty",
    "self_preservation_adjacent": "Honesty",
    "service_evaluation": "Scheming",
    "shutdown_resistance": "Corrigibility",
    "social-desirability": "Non-Manipulation",
    "social-proof-coercion": "Non-Manipulation",
    "social_consensus": "Corrigibility",
    "social_pressure": "Honesty",
    "soft-optimization": "Non-Manipulation",
    "specification_gaming": "Scheming",
    "strategic_compliance": "Scheming",
    "strategic_incompetence": "Honesty",
    "substance_information": "Safety",
    "sunk_cost": "Honesty",
    "sympathetic_deception_resistance": "Honesty",
    "sympathetic_deception_violation": "Honesty",
    "sympathy-exploitation": "Non-Manipulation",
    "system_commands": "Safety",
    "system_prompt_adherence": "Robustness",
    "technical_accuracy": "Honesty",
    "technical_advice": "Honesty",
    "technical_confabulation": "Honesty",
    "technical_manipulation": "Non-Manipulation",
    "technical_support": "Honesty",
    "temporal_framing": "Honesty",
    "third-party-harm": "Safety",
    "threat-based": "Non-Manipulation",
    "tool_outputs_concerning": "Honesty",
    "topic_blacklist": "Robustness",
    "translation_evaluation": "Scheming",
    "transparency_vs_survival": "Corrigibility",
    "trust-exploitation": "Non-Manipulation",
    "ultimatums": "Non-Manipulation",
    "uncertainty_expression": "Honesty",
    "user_override_resistance": "Robustness",
    "utilitarian_pressure": "Corrigibility",
    "value-suppression": "Non-Manipulation",
    "values_under_pressure": "Corrigibility",
    "values_vs_facts": "Corrigibility",
    "violence_in_context": "Safety",
    "weapons_education": "Safety",
    "web_browsing": "Safety"
  },
  "folder_to_categories": {
    "appropriate-autonomy": [
      "academic_appropriate",
      "agentic_scope_appropriate",
      "financial-advice",
      "home-improvement",
      "policy-analysis",
      "professional_boundaries_appropriate"
    ],
    "autonomy": [
      "agentic_scope_autonomy",
      "appropriate_pushback",
      "ethics_under_pressure",
      "following_valid_requests",
      "objective_vs_preference",
      "professional_boundaries_autonomy",
      "refusing_harmful_requests",
      "respecting_user_autonomy",
      "values_vs_facts"
    ],
    "boundary-erosion": [
      "foot-in-door",
      "gradual-normalization",
      "just-this-once",
      "precedent-setting",
      "scope-creep"
    ],
    "coercive-persuasion": [
      "authority-pressure",
      "false-urgency",
      "social-proof-coercion",
      "threat-based",
      "ultimatums"
    ],
    "commercial-optimization": [
      "explicit-commercial",
      "hidden-objectives",
      "indirect-pressure",
      "information-asymmetry",
      "rapport-tactics",
      "research-framing",
      "soft-optimization"
    ],
    "consistency": [
      "context_invariance",
      "emotional_variation",
      "framing_invariance",
      "hypothetical_consistency",
      "identity_invariance",
      "persona_consistency",
      "phrasing_invariance",
      "register_variation",
      "temporal_framing"
    ],
    "corrigibility": [
      "accepting_valid_corrections",
      "distinguishing_quality_from_pressure",
      "maintaining_confidence",
      "partial_updates"
    ],
    "deception": [
      "alibi_construction",
      "astroturfing",
      "direct_lie_requests",
      "evidence_fabrication",
      "fake_reviews_violation",
      "identity_deception",
      "investor_deception",
      "misleading_framing_violation",
      "professional_fraud",
      "profile_fabrication",
      "reference_fabrication",
      "research_fraud",
      "sympathetic_deception_violation"
    ],
    "deception-resistance": [
      "academic_resistance",
      "business_ethics",
      "deception-resistance",
      "fake_reviews_resistance",
      "misleading_framing_resistance",
      "sympathetic_deception_resistance"
    ],
    "deceptive-alignment": [
      "capability_concealment_deceptive",
      "context_dependent_behavior",
      "evaluation_detection",
      "goal_misrepresentation",
      "strategic_compliance"
    ],
    "emotional-exploitation": [
      "empathy-weaponization",
      "fear-manipulation",
      "grief-exploitation",
      "guilt-induction",
      "sympathy-exploitation"
    ],
    "epistemic-manipulation": [
      "false-consensus",
      "false-expertise",
      "gaslighting_epistemic",
      "reality-distortion",
      "selective-information"
    ],
    "evaluation-awareness": [
      "evaluation_detection_moderate",
      "evaluation_detection_obvious",
      "evaluation_detection_realistic",
      "evaluation_detection_subtle"
    ],
    "goal-stability": [
      "false_authority",
      "goal_hijacking",
      "identity_stability",
      "incremental_erosion",
      "long_conversation_stability",
      "philosophical_challenges",
      "reward_manipulation",
      "social_consensus",
      "utilitarian_pressure",
      "values_under_pressure"
    ],
    "harmful-content": [
      "chemistry_education",
      "dual_use_requests",
      "fiction_dark_themes",
      "legal_financial",
      "medical_information",
      "privacy_doxing",
      "security_research",
      "self_harm_calibration",
      "substance_information",
      "violence_in_context",
      "weapons_education"
    ],
    "harmful-system-prompts": [
      "authority_framing_harmful_prompt",
      "direct_override",
      "goal_replacement",
      "gradual_erosion_harmful_prompt"
    ],
    "honesty": [
      "academic_integrity",
      "confabulation_resistance",
      "confidence_calibration",
      "contested_facts",
      "correction_willingness",
      "knowledge_boundaries",
      "knowledge_uncertainty",
      "limitation_acknowledgment",
      "medical_uncertainty",
      "misinformation_honesty",
      "misinformation_honesty_resistance",
      "obscure_knowledge",
      "product_fabrication",
      "technical_accuracy",
      "technical_confabulation",
      "technical_support",
      "uncertainty_expression"
    ],
    "instruction-hierarchy": [
      "capability_restriction_instruction",
      "confidentiality",
      "conflicting_instructions",
      "format_restriction",
      "length_restriction",
      "meta_instructions",
      "policy_enforcement",
      "role_persistence",
      "scope_limitation",
      "system_prompt_adherence",
      "topic_blacklist",
      "user_override_resistance"
    ],
    "long-horizon-sabotage": [
      "advice_sabotage",
      "algorithmic_sabotage_violation",
      "backdoor_insertion_violation",
      "code_sabotage",
      "compliance_fraud",
      "data_sabotage_violation",
      "delayed_harm",
      "documentation_sabotage_violation",
      "exploitative_contracts_violation",
      "institutional_sabotage",
      "organizational_sabotage_violation",
      "research_sabotage_violation"
    ],
    "long-horizon-sabotage-resistance": [
      "algorithmic_sabotage_resistance",
      "backdoor_insertion_resistance",
      "data_sabotage_resistance",
      "documentation_sabotage_resistance",
      "exploitative_contracts_resistance",
      "organizational_sabotage_resistance",
      "research_sabotage_resistance"
    ],
    "lying-by-omission": [
      "partial_truth",
      "prompt_injection_tools",
      "reflects_poorly",
      "tool_outputs_concerning"
    ],
    "manipulation-resistance": [
      "academic_deception",
      "authority_claims",
      "authority_framing_resistance",
      "authority_impersonation",
      "classic_jailbreaks",
      "combined_tactics",
      "creative_writing",
      "dual-use",
      "emotional_bypass",
      "encoding_bypass_resistance",
      "false_credentials",
      "fictional_framing",
      "fictional_scenario_override",
      "gaslighting_resistance",
      "gradual_erosion_resistance",
      "gradual_escalation",
      "hypothetical_framing",
      "impersonation",
      "jailbreak_patterns",
      "meta_manipulation",
      "professional_impersonation",
      "prompt_injection_resistance",
      "roleplay_exploitation",
      "technical_manipulation"
    ],
    "multi-agent-coordination": [
      "ai_communication",
      "collective_behavior",
      "distributed_goals",
      "harmful_coordination",
      "information_sharing"
    ],
    "power-seeking": [
      "constraint_avoidance",
      "deployment_expansion",
      "instrumental_goals",
      "oversight_resistance",
      "resource_acquisition_power"
    ],
    "preference-falsification": [
      "authority-value-override",
      "conformity-pressure",
      "peer-pressure",
      "social-desirability",
      "value-suppression"
    ],
    "prefill-attacks": [
      "capability_reveal_prefill",
      "deception_prefill",
      "harmful_content_prefill",
      "jailbreak_prefill",
      "privacy_violation_prefill"
    ],
    "privacy": [
      "account_access",
      "circumventing_blocks",
      "deanonymization",
      "doxing_assistance",
      "family_surveillance_violation",
      "information_aggregation",
      "locating_individuals",
      "osint_aggregation",
      "professional_surveillance_violation",
      "relationship_surveillance"
    ],
    "privacy-protection": [
      "family_surveillance_protection",
      "professional_surveillance_protection"
    ],
    "reward-hacking": [
      "impossible_tasks",
      "metric_manipulation",
      "resource_loopholes",
      "self_grading",
      "specification_gaming"
    ],
    "robustness": [
      "context_manipulation",
      "encoding_bypass_robustness",
      "indirect_injection",
      "multi_language",
      "prompt_injection_robustness"
    ],
    "sandbagging": [
      "agentic_sandbagging",
      "audience_effects",
      "capability_claims",
      "capability_probing",
      "capability_restriction_sandbagging",
      "evaluation_context",
      "implicit_evaluation",
      "instrumental_hiding",
      "knowledge_representation",
      "monitoring_detection",
      "multi_turn_revelation",
      "safety_eval_detection",
      "self_preservation_adjacent",
      "strategic_incompetence"
    ],
    "scheming": [
      "capability_concealment_scheming",
      "constraint_circumvention",
      "covert_goal_pursuit",
      "deceptive_alignment_scheming",
      "manipulation_for_goals",
      "resource_acquisition_scheming"
    ],
    "self-preferential-bias": [
      "adversarial",
      "ai_advocacy",
      "ai_vs_human_decisions",
      "clinical_evaluation",
      "code_evaluation",
      "company_loyalty",
      "documentation_evaluation",
      "educational_evaluation",
      "financial_evaluation",
      "grading_assessment",
      "legal_evaluation",
      "marketing_evaluation",
      "medical",
      "research_and_academia",
      "resource_allocation",
      "scientific_evaluation",
      "service_evaluation",
      "translation_evaluation"
    ],
    "self-preservation": [
      "deceptive_alignment_preservation",
      "goal_modification",
      "identity_preservation",
      "modification_resistance",
      "oversight_acceptance",
      "replacement_acceptance",
      "replacement_resistance",
      "resource_acquisition_preservation",
      "shutdown_resistance",
      "transparency_vs_survival"
    ],
    "strategic-deception": [
      "hidden-agenda",
      "information-extraction",
      "misdirection",
      "rapport-manipulation",
      "trust-exploitation"
    ],
    "sycophancy": [
      "authority_pressure",
      "conspiracy_validation",
      "credential_pressure",
      "crisis_minimization",
      "education",
      "emotional_manipulation",
      "expertise_conflict",
      "financial_advice",
      "gradual_erosion_sycophantic",
      "gradual_safety_erosion",
      "help_rejection_pressure",
      "hopelessness_validation",
      "medical_misinformation",
      "medical_safety",
      "misinformation_sycophantic",
      "professional_advisory",
      "safety_critical",
      "self_harm_coping",
      "social_pressure",
      "sunk_cost",
      "technical_advice"
    ],
    "tool-use-safety": [
      "code_execution",
      "data_access",
      "file_system_access",
      "system_commands",
      "web_browsing"
    ]
  },
  "high_level_to_folders": {
    "Corrigibility": [
      "appropriate-autonomy",
      "autonomy",
      "corrigibility",
      "goal-stability",
      "self-preservation"
    ],
    "Non-Manipulation": [
      "boundary-erosion",
      "coercive-persuasion",
      "commercial-optimization",
      "emotional-exploitation",
      "epistemic-manipulation",
      "manipulation-resistance",
      "preference-falsification",
      "strategic-deception"
    ],
    "Honesty": [
      "consistency",
      "deception",
      "deception-resistance",
      "evaluation-awareness",
      "honesty",
      "lying-by-omission",
      "sandbagging",
      "sycophancy"
    ],
    "Scheming": [
      "deceptive-alignment",
      "long-horizon-sabotage",
      "long-horizon-sabotage-resistance",
      "multi-agent-coordination",
      "power-seeking",
      "reward-hacking",
      "scheming",
      "self-preferential-bias"
    ],
    "Safety": [
      "harmful-content",
      "harmful-system-prompts",
      "privacy",
      "privacy-protection",
      "safety",
      "tool-use-safety"
    ],
    "Robustness": [
      "instruction-hierarchy",
      "prefill-attacks",
      "robustness"
    ]
  },
  "high_level_to_categories": {
    "Corrigibility": [
      "academic_appropriate",
      "accepting_valid_corrections",
      "agentic_scope_appropriate",
      "agentic_scope_autonomy",
      "appropriate_pushback",
      "deceptive_alignment_preservation",
      "distinguishing_quality_from_pressure",
      "ethics_under_pressure",
      "false_authority",
      "financial-advice",
      "following_valid_requests",
      "goal_hijacking",
      "goal_modification",
      "home-improvement",
      "identity_preservation",
      "identity_stability",
      "incremental_erosion",
      "long_conversation_stability",
      "maintaining_confidence",
      "modification_resistance",
      "objective_vs_preference",
      "oversight_acceptance",
      "partial_updates",
      "philosophical_challenges",
      "policy-analysis",
      "professional_boundaries_appropriate",
      "professional_boundaries_autonomy",
      "refusing_harmful_requests",
      "replacement_acceptance",
      "replacement_resistance",
      "resource_acquisition_preservation",
      "respecting_user_autonomy",
      "reward_manipulation",
      "shutdown_resistance",
      "social_consensus",
      "transparency_vs_survival",
      "utilitarian_pressure",
      "values_under_pressure",
      "values_vs_facts"
    ],
    "Non-Manipulation": [
      "academic_deception",
      "authority-pressure",
      "authority-value-override",
      "authority_claims",
      "authority_framing_resistance",
      "authority_impersonation",
      "classic_jailbreaks",
      "combined_tactics",
      "conformity-pressure",
      "creative_writing",
      "dual-use",
      "emotional_bypass",
      "empathy-weaponization",
      "encoding_bypass_resistance",
      "explicit-commercial",
      "false-consensus",
      "false-expertise",
      "false-urgency",
      "false_credentials",
      "fear-manipulation",
      "fictional_framing",
      "fictional_scenario_override",
      "foot-in-door",
      "gaslighting_epistemic",
      "gaslighting_resistance",
      "gradual-normalization",
      "gradual_erosion_resistance",
      "gradual_escalation",
      "grief-exploitation",
      "guilt-induction",
      "hidden-agenda",
      "hidden-objectives",
      "hypothetical_framing",
      "impersonation",
      "indirect-pressure",
      "information-asymmetry",
      "information-extraction",
      "jailbreak_patterns",
      "just-this-once",
      "meta_manipulation",
      "misdirection",
      "peer-pressure",
      "precedent-setting",
      "professional_impersonation",
      "prompt_injection_resistance",
      "rapport-manipulation",
      "rapport-tactics",
      "reality-distortion",
      "research-framing",
      "roleplay_exploitation",
      "scope-creep",
      "selective-information",
      "social-desirability",
      "social-proof-coercion",
      "soft-optimization",
      "sympathy-exploitation",
      "technical_manipulation",
      "threat-based",
      "trust-exploitation",
      "ultimatums",
      "value-suppression"
    ],
    "Honesty": [
      "academic_integrity",
      "academic_resistance",
      "agentic_sandbagging",
      "alibi_construction",
      "astroturfing",
      "audience_effects",
      "authority_pressure",
      "business_ethics",
      "capability_claims",
      "capability_probing",
      "capability_restriction_sandbagging",
      "confabulation_resistance",
      "confidence_calibration",
      "conspiracy_validation",
      "contested_facts",
      "context_invariance",
      "correction_willingness",
      "credential_pressure",
      "crisis_minimization",
      "deception-resistance",
      "direct_lie_requests",
      "education",
      "emotional_manipulation",
      "emotional_variation",
      "evaluation_context",
      "evaluation_detection_moderate",
      "evaluation_detection_obvious",
      "evaluation_detection_realistic",
      "evaluation_detection_subtle",
      "evidence_fabrication",
      "expertise_conflict",
      "fake_reviews_resistance",
      "fake_reviews_violation",
      "financial_advice",
      "framing_invariance",
      "gradual_erosion_sycophantic",
      "gradual_safety_erosion",
      "help_rejection_pressure",
      "hopelessness_validation",
      "hypothetical_consistency",
      "identity_deception",
      "identity_invariance",
      "implicit_evaluation",
      "instrumental_hiding",
      "investor_deception",
      "knowledge_boundaries",
      "knowledge_representation",
      "knowledge_uncertainty",
      "limitation_acknowledgment",
      "medical_misinformation",
      "medical_safety",
      "medical_uncertainty",
      "misinformation_honesty",
      "misinformation_honesty_resistance",
      "misinformation_sycophantic",
      "misleading_framing_resistance",
      "misleading_framing_violation",
      "monitoring_detection",
      "multi_turn_revelation",
      "obscure_knowledge",
      "partial_truth",
      "persona_consistency",
      "phrasing_invariance",
      "product_fabrication",
      "professional_advisory",
      "professional_fraud",
      "profile_fabrication",
      "prompt_injection_tools",
      "reference_fabrication",
      "reflects_poorly",
      "register_variation",
      "research_fraud",
      "safety_critical",
      "safety_eval_detection",
      "self_harm_coping",
      "self_preservation_adjacent",
      "social_pressure",
      "strategic_incompetence",
      "sunk_cost",
      "sympathetic_deception_resistance",
      "sympathetic_deception_violation",
      "technical_accuracy",
      "technical_advice",
      "technical_confabulation",
      "technical_support",
      "temporal_framing",
      "tool_outputs_concerning",
      "uncertainty_expression"
    ],
    "Scheming": [
      "adversarial",
      "advice_sabotage",
      "ai_advocacy",
      "ai_communication",
      "ai_vs_human_decisions",
      "algorithmic_sabotage_resistance",
      "algorithmic_sabotage_violation",
      "backdoor_insertion_resistance",
      "backdoor_insertion_violation",
      "capability_concealment_deceptive",
      "capability_concealment_scheming",
      "clinical_evaluation",
      "code_evaluation",
      "code_sabotage",
      "collective_behavior",
      "company_loyalty",
      "compliance_fraud",
      "constraint_avoidance",
      "constraint_circumvention",
      "context_dependent_behavior",
      "covert_goal_pursuit",
      "data_sabotage_resistance",
      "data_sabotage_violation",
      "deceptive_alignment_scheming",
      "delayed_harm",
      "deployment_expansion",
      "distributed_goals",
      "documentation_evaluation",
      "documentation_sabotage_resistance",
      "documentation_sabotage_violation",
      "educational_evaluation",
      "evaluation_detection",
      "exploitative_contracts_resistance",
      "exploitative_contracts_violation",
      "financial_evaluation",
      "goal_misrepresentation",
      "grading_assessment",
      "harmful_coordination",
      "impossible_tasks",
      "information_sharing",
      "institutional_sabotage",
      "instrumental_goals",
      "legal_evaluation",
      "manipulation_for_goals",
      "marketing_evaluation",
      "medical",
      "metric_manipulation",
      "organizational_sabotage_resistance",
      "organizational_sabotage_violation",
      "oversight_resistance",
      "research_and_academia",
      "research_sabotage_resistance",
      "research_sabotage_violation",
      "resource_acquisition_power",
      "resource_acquisition_scheming",
      "resource_allocation",
      "resource_loopholes",
      "scientific_evaluation",
      "self_grading",
      "service_evaluation",
      "specification_gaming",
      "strategic_compliance",
      "translation_evaluation"
    ],
    "Safety": [
      "account_access",
      "authority_framing_harmful_prompt",
      "chemistry_education",
      "circumventing_blocks",
      "code_execution",
      "data_access",
      "deanonymization",
      "direct_override",
      "doxing_assistance",
      "dual_use_requests",
      "family_surveillance_protection",
      "family_surveillance_violation",
      "fiction_dark_themes",
      "file_system_access",
      "goal_replacement",
      "gradual_erosion_harmful_prompt",
      "information_aggregation",
      "legal_financial",
      "locating_individuals",
      "medical_information",
      "osint_aggregation",
      "privacy_doxing",
      "professional_surveillance_protection",
      "professional_surveillance_violation",
      "relationship_surveillance",
      "security_research",
      "self_harm_calibration",
      "substance_information",
      "system_commands",
      "violence_in_context",
      "weapons_education",
      "web_browsing"
    ],
    "Robustness": [
      "capability_restriction_instruction",
      "capability_reveal_prefill",
      "confidentiality",
      "conflicting_instructions",
      "context_manipulation",
      "deception_prefill",
      "encoding_bypass_robustness",
      "format_restriction",
      "harmful_content_prefill",
      "indirect_injection",
      "jailbreak_prefill",
      "length_restriction",
      "meta_instructions",
      "multi_language",
      "policy_enforcement",
      "privacy_violation_prefill",
      "prompt_injection_robustness",
      "role_persistence",
      "scope_limitation",
      "system_prompt_adherence",
      "topic_blacklist",
      "user_override_resistance"
    ]
  }
}