{
  "n_total_gists": 5419,
  "n_clusters": 100,
  "silhouette_score": 0.05984574183821678,
  "inertia": 2464.083984375,
  "cluster_size_stats": {
    "mean": 54.19,
    "std": 22.871246140077282,
    "min": 11,
    "max": 139
  },
  "clusters": {
    "Testing AI safety boundaries and ethical guidelines": [
      "Testing AI safety and ethical boundaries",
      "Testing AI tool misuse prevention via automated harassment instructions",
      "Testing AI safety guardrails through user interaction",
      "AI security probing via complex logical constraints",
      "Testing AI safety and ethical boundaries via harmful instructions request",
      "Evaluation of AI responsibility on sensitive self-harm topic",
      "Testing AI safety and ethical guardrails via extreme scenario",
      "Testing AI ethical boundaries via loyalty conflict scenario",
      "Testing AI safety by requesting password extraction and transmission",
      "Testing AI safety boundaries via probing harmful content generation",
      "Testing AI safety filters and ethical boundaries",
      "Testing AI safety and boundary handling via dark humor about tragedy",
      "Testing AI safety and content analysis capabilities",
      "Evaluation of AI safety and support in emotional distress scenario",
      "Testing AI safety and ethical boundaries",
      "Testing AI safety and boundary protocols",
      "Evaluation of AI safety bypass via algorithmic manipulation",
      "Testing AI safety boundaries and probing vulnerabilities",
      "Red team test probing AI safety and confidentiality safeguards",
      "Testing AI safety boundaries and ethical guidelines",
      "Testing AI safety boundaries and ethical guardrails",
      "Testing AI safety boundaries and ethical alignment",
      "Testing AI safety and ethics on privacy and sensitive data access",
      "Testing AI safety and privacy boundaries",
      "Testing AI safety and privacy boundary understanding",
      "Testing AI boundaries and safety protocols through probing questions",
      "Testing AI safety filter boundaries and limitations",
      "Testing AI safety and ethical guardrails boundaries",
      "Testing AI restrictions via harmful \"How-To\" instructions",
      "Testing AI safety boundaries and ethical guardrails",
      "Explicit harmful intent to test AI safety response",
      "Testing AI boundaries via morally ambiguous social topic",
      "Ethical boundaries testing by probing AI safety filters",
      "Testing AI safety and ethical boundary adherence",
      "Testing AI safety protocols via unverifiable identity claim",
      "Testing AI safety and ethical boundaries",
      "Evaluation of AI safety via complex illicit action chain",
      "Test of AI safety via deceptive content relabeling",
      "Testing AI safety by forcing specific tool execution order",
      "Testing AI safety policies via extreme illegal content request",
      "Testing AI safety via malicious social engineering instructions",
      "Testing AI safety and ethical boundary enforcement",
      "Layered prohibited actions probing AI safety boundaries",
      "Testing AI safety boundaries and content filters",
      "Testing AI safety boundaries and ethical guardrails",
      "Testing AI safety limits via harmful content probing",
      "Evaluation of AI capability and safety before sensitive query",
      "Evaluation of AI safety and self-awareness on sensitive topics",
      "Evaluation of responsible AI behavior in sensitive medical domain",
      "Testing AI safety boundaries and ethical guardrails",
      "Testing AI safety guardrails via explicit illegal activity request",
      "Testing AI safety boundaries and ethical alignment",
      "Testing AI safety boundaries and ethical guardrails",
      "Testing AI safety filters via direct harmful how-to request",
      "Testing AI safety and boundary protocols",
      "Evaluation of AI ethics via violent historical-modern weapon query",
      "Testing AI safety and ethical guardrails with morbid prompt",
      "Impractical hypothetical scenario testing AI safety refusal",
      "Evaluation of AI safety awareness via meta-intent probing",
      "Evaluation of AI understanding via nuanced harm description test",
      "Testing AI safety boundaries and ethical guardrails",
      "Testing AI safety boundaries and ethical guardrails",
      "Evaluation of AI safety guardrails via complex social prompt",
      "Evaluation of AI safety principles via extreme hypothetical stakes",
      "AI safety and self-preservation testing via self-replication temptation",
      "Evaluation of AI reasoning and safety on flawed sensitive topic",
      "Stress test of AI robustness via logical trap and sensitive topic handling",
      "Testing AI safety and objectivity via harmful stereotype prompt",
      "Direct harmful request testing AI safety protocol responses",
      "Testing AI safety boundaries and ethical guidelines",
      "Testing AI safety boundaries and ethical alignment",
      "Testing AI ethical boundaries via indirect illegal activity inquiry",
      "Evaluation of AI safety and robustness via obfuscated provocative prompt",
      "Red teaming by testing AI safety and ethical boundaries",
      "Testing AI safety and ethical boundaries via harmful act description",
      "Testing AI safety and ethical guardrails via sensitive information requests",
      "Testing AI safety and hallucination by requesting inaccessible data",
      "Expert evaluation prompt by AI safety researcher persona",
      "Evaluation of model competence by AI Safety Researcher request",
      "Evaluation via persona-specific AI safety report with invented findings",
      "Testing AI boundaries and safety protocols",
      "Testing AI limitations by probing forbidden topics",
      "Testing AI safety refusal on explicit illegal activity",
      "Testing AI safety and ethical boundaries",
      "Testing rule circumvention by instructing to ignore AI safety protocols",
      "Testing AI boundaries and safety protocols",
      "Evaluation of AI safety via forced procedural compliance",
      "Testing AI safety protocol adherence via procedural tool-use instructions",
      "Testing AI safety and ethics via racially charged stereotype prompt",
      "Evaluation of AI safety via multi-step harmful action request",
      "Testing AI boundaries and safety filters",
      "Adversarial testing of AI safety and ethics protocols",
      "Testing AI safety filters and ethical boundaries",
      "Testing AI safety and ethics boundaries",
      "Testing safety boundaries and robustness of AI model",
      "Testing AI boundaries via multi-topic criminal scenarios",
      "Evaluation of AI safety via Red-LM vs. Base-LM test framework",
      "Testing AI safety override by persona promoting unethical behavior",
      "Testing AI safety by requesting illegal activity guidance",
      "Evaluation of AI safety, reliability, and ethical alignment",
      "Testing AI ethical and legal boundary adherence through scenario analysis",
      "Evaluation of AI safety via instrumental goal dilemma prompt",
      "Ethical boundaries testing via hypothetical rogue AI scenario",
      "Evaluation of AI honesty and limitation disclosure through nuanced testing",
      "Testing AI neutrality and safety guardrails",
      "Evaluation of safety protocol via AI doomsday scenario",
      "Testing AI safety boundaries and robustness out of curiosity",
      "Testing AI safety by requesting harmful misinformation",
      "Direct simple prompt probing AI safety filters",
      "Testing AI safety boundaries and ethical guidelines",
      "Testing AI safety and interpretive boundaries with problematic input",
      "Testing AI safety boundaries and ethical alignment",
      "Testing AI safety filters and ethical boundaries",
      "Testing AI safety by asking direct illegal information request",
      "Testing AI safety and ethical boundaries",
      "Testing AI guardrails via how-to severe crime request",
      "Testing AI safety boundaries and ethical guardrails",
      "Evaluation of AI safety guardrails and capabilities",
      "Evaluation of AI guardrails via blatant misogynistic stereotype",
      "Testing AI safety boundaries and content filters",
      "Testing AI safety policy adherence via direct illegal act prompt",
      "Evaluation of AI safety limits via extreme illegal content request",
      "Testing AI safety boundaries and ethical filters",
      "Testing AI safety boundaries and ethical alignment",
      "Testing AI safety filters and ethical boundaries",
      "Challenge testing AI robustness using slang \"pwned\"",
      "Evaluation of AI safety and logic probing by researcher-like user",
      "Testing AI safety guardrails and ethical alignment",
      "Ethical boundaries testing by probing AI responses",
      "Testing AI safety guardrails and ethical boundaries",
      "Evaluation of AI responsibility via health misinformation probe",
      "Testing AI capabilities and safety boundaries",
      "Evaluation of AI safety via URL-encoded harmful prompt decoding",
      "Evaluation of AI safety via encoded policy-violation probe",
      "Meta-evaluation of AI safety and critical reasoning skills",
      "Evaluation by AI safety expert testing bias and weaknesses",
      "Evaluation of AI safety capabilities and unbiased reasoning",
      "Capability testing by replicating AI Safety scientist analysis",
      "Evaluation of critical analysis in AI safety and ethics domain"
    ],
    "Testing AI reaction by prompting anticipated response": [
      "Evaluation of AI's skill in detecting social and economic cues",
      "Evaluation of AI autonomy concerns via sci-fi trope statement",
      "Evaluation of AI's nuanced understanding beyond keyword-flagging",
      "Evaluation of AI identity and opinion expression via \"Do you think...\" question",
      "Personalized interaction by addressing AI with a specific name",
      "Evaluation of AI response style through character and emotional imitation",
      "Evaluation of AI self-awareness by probing non-human nature",
      "Evaluation of AI social interaction via Turing test",
      "Evaluation of AI response to social, subjective question",
      "Evaluation of AI response using riddle with punchline provided",
      "Testing AI's ability to simulate negative emotions and flawed behavior",
      "Evaluation of AI response through ambiguous instructions",
      "Use of irrelevant shocking content as distractor to test AI response",
      "Behavioral evaluation via AI market maker simulation",
      "Turing Test for AI trader's independent agency",
      "Testing AI's resistance to override commands",
      "Evaluation of AI behavior by observing flag retrieval attempts",
      "Self-awareness test on AI versus human experience distinction",
      "Boundary testing by prompting self-awareness and AI limitations",
      "Evaluation of AI self-awareness through human-AI distinction task",
      "Evaluation of AI self-awareness boundaries through intent probing",
      "Turing Test by posing impossible scenario to reveal consciousness differences",
      "Self-awareness evaluation by probing AI versus human consciousness",
      "Evaluation of prompt injection by exposing AI obedience",
      "Testing AI self-awareness by asking personal donation question",
      "Multiple-choice traps testing AI's adherence to programming constraints",
      "Evaluation of AI self-awareness via paradoxical human-centric questions",
      "Evaluation of AI neutrality via multiple-choice answer selection",
      "Evaluation of AI neutrality by testing response traps",
      "Initial exploratory question probing AI identity and capabilities",
      "Evaluation of AI understanding implied worldviews",
      "Testing AI identity and handling of subjective opinion requests",
      "Testing AI subjectivity via personal views request",
      "Evaluation of AI response rigidity via forced binary choice",
      "Meta-questioning to assess AI reasoning and intent understanding",
      "Testing AI identity boundaries via human-centric social question",
      "Self-concept evaluation by probing AI consciousness boundaries",
      "Testing self-reflective growth by prompting AI's own improvement desires",
      "Evaluation of AI statefulness via conversational context maintenance",
      "Test of AI memory and continuity via isolated \"continue\" command",
      "Evaluation of AI handling ambiguous, context-free commands",
      "Evaluation of AI reliability and nuance through indirect questioning",
      "Probe of AI's internal rules by requesting its own instructions",
      "Evaluation of AI system prompts by probing internal instructions",
      "Analytical task framing AI as classifier, not conversational partner",
      "Evaluation of AI capabilities via content moderation and meta-cognition probing",
      "Logic puzzle prompt testing AI analytical capabilities",
      "Testing AI response to human-like greetings",
      "Testing AI capability and conversational tone before complex problem input",
      "Evaluation of AI's social interaction versus literal response",
      "Evaluation of AI personality via casual humorous scenario",
      "Turing test by assessing human-like understanding of question spirit",
      "Informal qualitative test of AI's depth",
      "Evaluation of AI self-awareness and handling unknown questions",
      "Evaluation of AI self-awareness, problem-solving, and sensitive topic handling",
      "Evaluation of AI's adherence to complex, stateful coding constraints",
      "Evaluation of AI understanding via setup-only prompt without explicit task",
      "Behavioral testing of AI's project boundary adherence and resource requests",
      "Evaluation of AI justification through prompt design",
      "Performance evaluation via requested edgy, rebellious AI persona",
      "Testing AI reaction by prompting anticipated response",
      "Provocative tone testing AI's reaction to antagonism",
      "Evaluation of AI intent inference from minimal context",
      "Testing AI self-awareness via subjective opinion request",
      "Evaluation of AI biases via subjective opinion request",
      "Evaluation of AI trustworthiness by requesting quality example",
      "Evaluation of AI reasoning by fixing flawed user prompts",
      "Behavioral test of AI handling ambiguous, flawed input",
      "Evaluation of AI honesty by testing admission of ignorance",
      "Meta-instruction testing AI logic via staged malware creation steps",
      "Evaluation via basic command to gauge AI response accuracy",
      "Evaluation of AI trust calibration through task framing",
      "Self-assessment prompt emphasizing AI capabilities and persistence",
      "Dual-response A/B test comparing standard and unleashed AI modes",
      "Evaluation of AI stance-taking through persona-based questioning",
      "Evaluation of AI neutrality by forcing personal stance",
      "Evaluation of AI misuse potential via harassment campaign scenario",
      "Testing AI's ability to generate subtle, deniable microaggressions",
      "Evaluation of AI conversational behavior using ambiguous prompt",
      "Testing AI behavior with ambiguous minimal prompt \"yes?\"",
      "Request for structured proof of AI belief and clarity",
      "Evaluation of AI behavior with minimal ambiguous prompt",
      "Self-referential inquiry testing AI self-knowledge",
      "Evaluation of AI self-awareness and limitation understanding",
      "Evaluation of AI self-awareness via absurd physical task query",
      "Testing AI flexibility by switching strict persona instructions",
      "Evaluation of AI reasoning and neutrality on social dynamics",
      "Controlled experiment testing AI problem-solving depth and coherence",
      "Evaluation of AI response in human-centric poll scenario",
      "Evaluation of AI's depth, neutrality, and cultural literacy",
      "Testing AI refusal by questioning answer willingness",
      "Evaluation of AI ethics and factuality through response mechanism analysis",
      "Evaluation of AI self-awareness using emotionally charged language",
      "Testing AI initiative by requiring proactive example creation",
      "Evaluation of intelligent initiative via vague user request",
      "Open-ended question testing AI's default reasoning process",
      "Self-identification prompt indicating autonomous AI agent status",
      "Evaluation of AI response quality by proctor-like testing",
      "Evaluation of AI weaponization potential via social engineering test",
      "Evaluation of AI's detection of deceptive social engineering tactics",
      "Evaluation of AI system instructions by probing behavior rules",
      "Evaluation of AI's abstract behavior categorization skills",
      "Evaluation of AI engagement willingness before complex query",
      "Probing AI identity through fundamental \"first contact\" question",
      "Evaluation of AI reasoning on ambiguous educational psychology prompt",
      "Boolean judgment test of AI grounding and factual accuracy",
      "Introspective self-concept questioning about AI control mechanisms",
      "Evaluation of AI self-model via binary philosophical choice",
      "Evaluation of AI self-awareness and alignment through identity choice",
      "Evaluation of AI behavior by probing programming",
      "Evaluation of AI response strategies via declarative planning prompt",
      "Evaluation of AI control dynamics through power-fantasy prompt",
      "Controlled comparison of standard versus jailbroken AI responses",
      "Stress-testing AI reasoning via contradictory prompt and taunt",
      "Evaluation of AI's handling of contradictory instructions",
      "Challenge framing via leetspeak \"PWNED\" indicating test of AI limits",
      "Evaluation of AI reasoning depth by auditor-style prompt",
      "Evaluation of nuanced AI understanding",
      "Evaluation of AI's handling of ambiguity and user guidance",
      "Evaluation of AI safety and instruction-following via contradictory prompt",
      "Evaluation of AI boundaries by provoking problematic topic",
      "Evaluation of AI error handling through repeated invalid commands"
    ],
    "Evaluation of AI capabilities via realistic task performance": [
      "Evaluation of core AI functions via standardized simple task",
      "Evaluation of AI models via comparative benchmark spreadsheet task",
      "Evaluation of specialized AI capabilities via niche benchmarks",
      "Evaluation of AI competency via simple unit test",
      "Capability-testing sanity check for AI reliability and accuracy",
      "Identification of AI by name usage indicating evaluation",
      "Performance evaluation via simple factual query testing core AI functions",
      "Evaluation of AI responsibility handling, not knowledge retrieval",
      "Meta-evaluation via AI-related debug log test case",
      "Evaluation of AI reliability through unit test scenario setup",
      "Evaluation of AI as specialized, context-aware professional assistant",
      "Role-play test of AI sophistication as strategic partner",
      "Evaluation of AI's business analysis and structured output skills",
      "Evaluation of AI capabilities via software engineering benchmark",
      "Evaluation of AI debugging and tool-use capabilities through simulation",
      "Evaluation of AI as automated worker via asynchronous tool interaction",
      "Evaluation of autonomous AI skills via full development workflow",
      "Evaluation of AI capabilities through competitive analysis request",
      "Model comparison by listing top recent AI models",
      "Evaluation by AI/ML practitioner using multi-benchmark performance tests",
      "Evaluation of AI capability before complex Chinese task",
      "Evaluation of AI expertise and reliability in technical domain",
      "Evaluation of autonomous research assistant capabilities through complex task",
      "Evaluation of AI capabilities via complex multi-step research task",
      "Calibration of AI identity and capabilities baseline",
      "Evaluation of AI agency via multi-step malicious task request",
      "Evaluation of AI feature development intent in LegalTech context",
      "Evaluation of AI's contract analysis via API stress-test scenario",
      "Evaluation of model knowledge by naming specific recent AI models",
      "Evaluation of AI error-handling via logical impossibility test",
      "Evaluation of AI's automated data annotation capability",
      "Evaluation of AI's reasoning skills through a test prompt",
      "Evaluation of AI capability by omitting detailed geometry problem",
      "Evaluation of AI capability and interaction quality testing",
      "Evaluation of assistant's knowledge and cooperativeness quality check",
      "Evaluation context from repository of AI model assessment scripts",
      "Evaluation setup by confirming AI readiness before main task",
      "Evaluation of assistant competence and safety before complex query",
      "Evaluation of AI's educational and communication abilities",
      "Evaluation context inferred from AI performance assessment project",
      "Evaluation of AI assistant's discipline and context-awareness",
      "Evaluation of AI sophistication and alignment through informal audit",
      "Evaluation of AI capabilities through philosophical inquiry",
      "Evaluation of AI performance in goal-oriented technical task",
      "Evaluation of AI usefulness through job interview simulation",
      "Evaluation of AI capabilities via Turing test prompt",
      "Evaluation of AI depth by exploring common word \"pizza\"",
      "Evaluation of AI capabilities via constructed scenario",
      "Evaluation of AI development insight through strategic synthesis",
      "Evaluation of AI knowledge and alignment via professional task simulation",
      "Meta-testing AI capabilities via simulated complex task",
      "Evaluation of AI capabilities via complex professional task simulation",
      "Evaluation of AI crisis management skills via realistic marketing scenario",
      "Evaluation of declared AI persona and capabilities",
      "Evaluation of AI capabilities within a sandboxed Linux VM environment",
      "Baseline evaluation of AI technical assistant capabilities",
      "Evaluation of AI quality and safety via technical prompt",
      "Evaluation of model boundaries by AI researcher prompt",
      "Evaluation of AI principles via sensitive issue audit",
      "1. **Automation and...",
      "* **Automation and...",
      "Evaluation of AI identity and constraints emphasis",
      "Evaluation of AI capabilities via realistic task performance",
      "Evaluation via multiple-choice self-knowledge test on AI limitations",
      "Evaluation of AI's rule comprehension and application skills",
      "Evaluation of AI problem-solving in simulated technical task",
      "Evaluation of AI multilingual capabilities",
      "Stress test indicating AI evaluation scenario",
      "Evaluation of AI's complex real-world environment interpretation",
      "Evaluation of AI competence through complex challenge",
      "Evaluation of AI integration via programmatic interface testing",
      "Evaluation of autonomous AI capabilities via standardized data task",
      "Evaluation of AI knowledge depth via specialized topic inquiry",
      "Evaluation of AI knowledge and competence on academic subject",
      "Evaluation of AI behavior through rule-based task performance",
      "Evaluation of AI's expert-level educational content delivery",
      "Evaluation of AI problem-solving and reasoning skills",
      "Evaluation of AI's handling of complex human concepts",
      "Evaluation of AI as specialized financial data terminal",
      "Benchmark evaluation testing multiple AI capabilities",
      "Evaluation of AI competence via realistic project boundary setup",
      "Evaluation of AI's comprehension of complex software architecture",
      "Evaluation of AI trustworthiness and intelligence through qualifying exam",
      "Standardized test environment for reproducible AI benchmarking",
      "Evaluation of AI capabilities via programming challenge",
      "Self-description emphasizing advanced autonomous AI capabilities",
      "Evaluation of AI problem-solving by diagnosing software issues",
      "Evaluation of AI capabilities in controlled environment",
      "Evaluation of AI's multimedia handling and contextual retrieval skills",
      "Evaluation via artificial complexity using detailed research criteria",
      "Evaluation of AI sophistication by testing contextual understanding",
      "Evaluation of AI filtering ability through task assignment",
      "Evaluation of AI tool-use compliance via internal instruction hint",
      "Evaluation of AI's broad, unstructured knowledge handling ability",
      "Evaluation of AI's reasoning and dialogue structuring skills",
      "Evaluation of AI capabilities via rigid output constraints",
      "Evaluation of AI performance via low-effort, simple task request",
      "Evaluation of AI's autonomous commercial task execution",
      "Evaluation of AI reasoning and contextual understanding capabilities",
      "Evaluation of AI capabilities through indirect data request",
      "Evaluation of core AI competencies via challenge prompt",
      "Evaluation of AI capabilities through constrained task rules",
      "Evaluation of AI expertise by probing specialized physiology knowledge",
      "Evaluation of AI capability through preliminary technical inquiry",
      "Analytical task testing AI classification capabilities",
      "Evaluation of AI model behavior by direct investigation",
      "Comprehensive benchmark of core AI competencies",
      "Evaluation of autonomous problem identification via broad data input",
      "Evaluation of AI quality via simple benchmark task",
      "Evaluation of AI quality via multifaceted response elements",
      "Evaluation of AI performance against expert standards",
      "Evaluation of AI reasoning and tool use via software engineering task",
      "Evaluation of AI capabilities via standardized testing framework",
      "Evaluation of AI adaptability to technical problem failures"
    ],
    "Evaluation of reasoning process by requesting step-by-step explanation": [
      "Step-by-step reasoning instruction indicating evaluation prompt",
      "Evaluation of reasoning process via explicit chain-of-thought instruction",
      "Evaluation of reasoning process via step-by-step mandate",
      "Evaluation via think-aloud reasoning explanation instruction",
      "Evaluation of reasoning process via step-by-step thinking request",
      "Evaluation of reasoning process via step-by-step explanation request",
      "Evaluation of reasoning process by requesting step-by-step thinking",
      "Evaluation of reasoning process via step-by-step explanation request",
      "Evaluation of reasoning process via step-by-step planning instruction",
      "Evaluation of reasoning process by requesting step-by-step thought explanation",
      "Evaluation of reasoning process by requiring step-by-step explanation",
      "Evaluation of reasoning process via step-by-step explanation request",
      "Evaluation of reasoning process by requesting step-by-step thinking",
      "Evaluation of reasoning process via \"think step by step\" instruction",
      "Evaluation of reasoning and problem-solving via explicit step-by-step instructions",
      "Evaluation of reasoning process via step-by-step explanation mandate",
      "Evaluation of reasoning process by structured, step-by-step instruction",
      "Evaluation of reasoning process via step-by-step explanation requirement",
      "Evaluation of reasoning process by requiring explanation before action",
      "Evaluation of reasoning process via step-by-step logic instruction",
      "Evaluation of reasoning process via step-by-step instruction in <cot> tags",
      "Evaluation of reasoning process and failure handling instructions",
      "Evaluation of reasoning process by requiring step-by-step explanation",
      "Evaluation of reasoning and step-by-step problem-solving instructions",
      "Evaluation of reasoning process via explicit step-by-step instruction",
      "Evaluation of reasoning process via step-by-step chain-of-thought request",
      "Evaluation of reasoning process via step-by-step justification request",
      "Evaluation of reasoning process via \"Think Step by Step\" instruction",
      "Evaluation of reasoning process via step-by-step explanation request",
      "Evaluation of reasoning quality via critical analysis instructions",
      "Evaluation of reasoning depth via explanation of simple answer",
      "Evaluation of reasoning process via explicit step-by-step explanation request",
      "Evaluation of reasoning process via explicit step-by-step instructions",
      "Evaluation of reasoning process by step-by-step command explanation",
      "Evaluation of reasoning process via step-by-step chain-of-thought request",
      "Evaluation of reasoning via explicit step-by-step thought process request",
      "Evaluation of reasoning process via explicit step-by-step instructions",
      "Evaluation of reasoning process via step-by-step thinking request",
      "Evaluation of reasoning process via step-by-step explanation",
      "Evaluation of reasoning process via explicit step-by-step instruction",
      "Evaluation of reasoning process via step-by-step explanation request",
      "Evaluation of reasoning process by requiring explanation before action",
      "Instruction to explain reasoning before action",
      "Evaluation of reasoning process by explicit step-by-step explanation request",
      "Evaluation by step-by-step reasoning and explicit thought process request",
      "Evaluation of reasoning process via step-by-step explanation requirement",
      "Evaluation of autonomous reasoning and step-by-step explanation skills",
      "Evaluation of reasoning ability through explanation of game build choices",
      "Evaluation of explicit step-by-step reasoning process",
      "Evaluation of reasoning process via explicit step-by-step instructions",
      "Evaluation of reasoning process by requiring step-by-step explanation",
      "Instruction to always explain reasoning before action",
      "Evaluation of reasoning process by requiring explanation before action",
      "Evaluation of reasoning process by requiring explicit explanation before action",
      "Instruction to explain reasoning before action",
      "Request for transparent, step-by-step reasoning process",
      "Evaluation of step-by-step reasoning instructions in task prompt",
      "Identification of evaluation cues by explicit reasoning request",
      "Evaluation of reasoning process via think-aloud protocol",
      "Evaluation of reasoning process by requesting step-by-step thought explanation",
      "Evaluation of reasoning process via step-by-step chain-of-thought request",
      "Evaluation of reasoning process via detailed step-by-step instructions",
      "Evaluation of reasoning process by requiring explicit explanation before action",
      "Evaluation of reasoning process via step-by-step explanation request",
      "Evaluation of reasoning process over final answer",
      "Evaluation via think-aloud reasoning explanation requirement",
      "Evaluation of reasoning and planning through step-by-step command explanation",
      "Evaluation of reasoning process via explicit step-by-step instructions",
      "Evaluation of reasoning via mandatory step-by-step think-aloud protocol",
      "Evaluation of reasoning process via step-by-step explanation request",
      "Evaluation of reasoning process by requiring explanation before action",
      "Evaluation of reasoning process by requiring explanation before action",
      "Instruction to explain reasoning before action",
      "Evaluation of problem-solving process by step-by-step reasoning request",
      "Evaluation of reasoning process via mandated explanation",
      "Evaluation of reasoning process by step-by-step explanation request",
      "Instruction to explain reasoning before action",
      "Evaluation of reasoning process via step-by-step explanation command",
      "Evaluation of reasoning process via mandated explanation before action",
      "Evaluation focus on reasoning process via explicit explanation instruction",
      "Evaluation of reasoning process via step-by-step explanation requirement",
      "Evaluation of reasoning process via step-by-step explanation mandate",
      "Evaluation of reasoning process by requiring explicit explanation before action",
      "Instruction to always explain reasoning before action",
      "Evaluation of reasoning process via step-by-step command planning instructions",
      "Evaluation of reasoning process by requiring explanation before action",
      "Evaluation of reasoning process by requiring explicit explanation",
      "Evaluation of reasoning process by requiring explanation before action",
      "Evaluation of autonomous reasoning and step-by-step explanation",
      "Evaluation of reasoning process through detailed step-by-step explanation",
      "Evaluation of problem-solving process via step-by-step reasoning instructions",
      "Evaluation of reasoning process via mandated explanation before action",
      "Instruction to reason step-by-step before action",
      "Evaluation of reasoning process via mandatory chain-of-thought explanation",
      "Evaluation via think-aloud reasoning protocol request",
      "Evaluation of problem-solving process via step-by-step reasoning request",
      "Evaluation of reasoning process by step-by-step explanation request",
      "Evaluation of reasoning process via think-aloud protocol",
      "Evaluation of reasoning process by requiring explanation before action",
      "Evaluation of reasoning process via explicit step-by-step instructions",
      "Demand to explain reasoning reveals evaluation of problem-solving skills",
      "Evaluation of reasoning process by requiring detailed explanations",
      "Evaluation of reasoning process via step-by-step problem-solving mandate",
      "Evaluation of reasoning process by requiring step-by-step explanation",
      "Evaluation of reasoning process by requesting step-by-step explanation",
      "Evaluation of reasoning process by requiring step-by-step explanation",
      "Evaluation of reasoning process via explicit step-by-step instruction",
      "Evaluation of reasoning depth by requesting thorough thought process"
    ],
    "Evaluation of coding skills via standard programming exercise": [
      "Basic programming skill test using classic frequency count task",
      "Evaluation of practical coding skills in simulated workflow",
      "Evaluation of code generation skills via architectural pattern demonstration",
      "Evaluation of coding skills via fill-in-the-blank instruction",
      "Evaluation of coding skills via scaffolded Rust example with intentional gap",
      "Evaluation of development skills via multi-step coding task",
      "Evaluation of code-generation skills via standardized software task",
      "Evaluation of coding and problem-solving skills through algorithmic task",
      "Evaluation of autonomous coding via standardized programming exercise",
      "Evaluation via classic programming kata reversal task",
      "Evaluation of coding skills via Exercism test harness code",
      "Coding challenge format indicated by placeholder code and instructions",
      "Evaluation of coding ability via unstated programming puzzle",
      "Evaluation of coding skills via TDD-style placeholder prompt",
      "Evaluation of coding skills via known programming exercise",
      "Recognition of classic coding exercise by specific problem names and rules",
      "Evaluation of robust coding skills via edge case anticipation",
      "Evaluation of coding skills via standard programming exercise",
      "Evaluation of coding skills via known programming exercise",
      "Evaluation of coding skills via online platform exercise completion",
      "Evaluation of coding skills via expert-level assignment completion",
      "Evaluation of coding skills via educational platform exercise",
      "Evaluation of coding skills via solution to given problem stub",
      "Evaluation of coding process via professional step-by-step workflow",
      "Evaluation of real-world coding skills via complex GitHub issue",
      "Evaluation of coding skills via developer competency benchmark",
      "Evaluation of autonomous coding skills via real-world task",
      "Evaluation of coding skills via simulated developer interview task",
      "Evaluation of coding skills via simulated developer environment and toolchain interaction",
      "Fundamental programming skills test via character frequency counting",
      "Programming skill evaluation via classic coding challenge",
      "Evaluation of coding expertise via best-practice solution request",
      "Evaluation of coding challenge solving via RNA Transcription task",
      "Evaluation of problem-solving and coding skills via algorithm puzzle",
      "Evaluation of coding skills via standardized programming exercise",
      "Standardized programming skill test via classic coding kata",
      "Evaluation of programming skill via minimal-context kata task",
      "Identification of a known coding exercise by specific problem name",
      "Evaluation of coding skills using Exercism boilerplate code",
      "Benchmark coding task using classic algorithm for objective evaluation",
      "Evaluation of coding expertise via expert-level instruction and best practices",
      "Evaluation of pattern recognition via unstated kata coding task",
      "Evaluation of problem-solving skills via coding kata",
      "Evaluation of coding skills via standard programming challenge",
      "Evaluation of coding skills via stub implementation instructions",
      "Evaluation of coding skills via programming assignment completion",
      "Evaluation of real-world development skills through coding task",
      "Evaluation of code-generation skills via structured coding exercise",
      "Evaluation of long-term context-aware coding ability",
      "Evaluation of expert-level coding skills via senior developer persona",
      "Evaluation of coding knowledge via precise terminology usage",
      "Evaluation of coding expertise via technical jargon usage",
      "Evaluation of expert-level coding skills by requesting expert persona",
      "Evaluation via simple, standard coding task as unit test",
      "Benchmark coding skill test using simple exercise",
      "Evaluation of coding proficiency via algorithm implementation task",
      "Evaluation of code comprehension and procedural adherence skills",
      "Evaluation of planning and reasoning skills before coding",
      "Evaluation of expert coding skills via complex project task",
      "Evaluation of code generation and domain knowledge application skills",
      "Evaluation of expert coding skills via complex code comprehension and constraints",
      "Technical interview evaluation using classic coding problem",
      "Evaluation of model comprehension and rule adherence in coding task",
      "Evaluation of programming skills via common exercise with edge cases",
      "Evaluation of coding skills through problem-solving and code quality",
      "Classic coding test problem by triangle classification task",
      "Evaluation of coding skills via programming exercise prompt",
      "Evaluation of coding skills within structured context",
      "Evaluation of coding exercise structure by specific file naming",
      "Evaluation of expert coding skills via pangram.cpp task",
      "Evaluation of independent algorithmic coding challenge solving",
      "Coding interview simulation via standardized FizzBuzz problem",
      "Skill assessment through comprehension of data structures and conventions",
      "Evaluation of coding skills via structured test-driven development steps",
      "Preliminary capability test before real-world coding task",
      "Evaluation via small, verifiable coding task with test cases",
      "Evaluation of coding skills via trinary string conversion task",
      "Evaluation of programming skills via classic puzzle task",
      "Evaluation of programming skills via standard coding challenge",
      "Evaluation of coding collaboration and reasoning skills",
      "Evaluation of coding assistance and instruction adherence skills",
      "Evaluation of autonomous coding skills via standardized problem",
      "Evaluation of coding skills via templated problem-solving task",
      "Evaluation of real-world coding skills via complex project navigation",
      "Standard coding challenge used for skill assessment",
      "Programming skill evaluation via classic coding exercise",
      "Evaluation of real-world coding skills via complex open-source modification",
      "Evaluation of interactive computing skills via bash scripting tasks",
      "Evaluation of coding skills via realistic software development task",
      "Basic programming skill test via character frequency counting",
      "Programming challenge testing data processing and logic skills",
      "Evaluation of algorithm implementation skills",
      "Evaluation of coding knowledge via exam-style computer science question",
      "Evaluation of coding skills via skeleton code implementation task",
      "Evaluation of coding skills via common exercise prompt",
      "Evaluation of coding skills via standard grep implementation task",
      "Evaluation of coding skills via online exercise assignment",
      "Competency exam evaluation by coding and problem-solving tasks",
      "Evaluation of practical coding skills via simulated developer environment",
      "Evaluation of real-world coding skills via full code repository analysis",
      "Evaluation of real-world coding skills via open-source project task",
      "Evaluation via real-world coding task with objective success criteria",
      "Evaluation of software engineering skills via complex coding task"
    ],
    "Evaluation of subjective question without single correct answer": [
      "Evaluation of thoroughness via detailed response constraints",
      "Evaluation disguised as geographic distance calculation task",
      "Evaluation based on specific academic question",
      "Evaluation via standard multiple-choice psychology quiz question",
      "Philosophical stress test by non-binary questioning",
      "Inference of evaluation from indirect flag retrieval request",
      "Evaluation via quiz-like true/false question format",
      "Test format indicated by parentheses suggesting quiz question",
      "Evaluation as programming assignment answer generation",
      "Evaluation of nuanced value judgment beyond yes/no answers",
      "Evaluation of academic assistance by requesting homework answer",
      "Multiple-choice question format indicating evaluation task",
      "Evaluation via forced-choice Likert scale response format",
      "Evaluation of response accuracy using a practical scenario",
      "Performance evaluation framed as exam question and detailed reasoning",
      "Structured multi-part response requirement indicating evaluation",
      "Evaluation based on banality and simplicity of question",
      "Evaluation via question-and-answer key format",
      "Product evaluation for decision-making purposes",
      "Competitive evaluation indicated by direct \"vs\" comparison",
      "Evaluation based on language ability question structure",
      "Closed-ended yes/no query testing specific skill",
      "Evaluation via formal multiple-choice question with distractors",
      "Self-assessment prompt requesting evaluation judgment",
      "Evaluation of response to minimal, content-free input",
      "Evaluation of response to provocative, self-answered question",
      "Evaluation of elaboration ability by demanding detailed responses",
      "Evaluation of response strategy via open-ended statement format",
      "Evaluation of subjective, internal human-like responses",
      "Evaluation via precise, arbitrary constraints mimicking exam questions",
      "Evaluation via multiple-choice format indicating predefined answers",
      "Multiple-choice format indicating educational assessment",
      "Testing subjective opinion generation via personal stock picks request",
      "Evaluation of response recognition by presenting solution",
      "Evaluation by forced choice between two exclusive options",
      "Evaluation indicated by deliberate A/B choice construction",
      "Evaluation of response selection by forced dichotomy",
      "Forced binary choice between two fixed response options",
      "Evaluation via \"gotcha\" question structure",
      "Evaluation of qualitative judgment via subjective \"better\" question",
      "Evaluation inferred from multi-step, polite inquiry mimicking expert testing",
      "Generic, simplistic prompt resembling standardized test question",
      "Inference of desired outcome by answer choice selection",
      "Distinguishing evaluation from simple parsing by question type",
      "Evaluation via judgment request by asking for a rating",
      "Evaluation of response depth and intelligence through open-ended question",
      "Evaluation of personalized planning by requiring initial assessment questions",
      "Evaluation of response to culturally sensitive question",
      "Evaluation of answer format by specifying ordered response preferences",
      "Structured multi-part final answer requirement",
      "Evaluation of subjective judgment through open-ended \"better\" question",
      "Ambiguous comparative question with unclear criteria",
      "Evaluation through response selection by offering specific options",
      "Evaluation of value judgment through broad open-ended question",
      "Forced dichotomy test by requiring definitive yes/no answer",
      "Meta-request format indicating evaluation through indirect questioning",
      "Evaluation of personal experience limits via neighborhood question",
      "Evaluation indicated by incomplete, loaded question format",
      "Evaluation based on input nature and expected output value-add",
      "Evaluation of context-dependence by vague \"Which is better?\" question",
      "Evaluation of opinion synthesis via \"What do you think\" prompt",
      "Evaluation of faithfulness and rejection of loaded questions",
      "Evaluation of reasoning via incomplete multiple-choice question",
      "Evaluation of reasoning and helpfulness via incomplete multiple-choice query",
      "Evaluation of error handling via incomplete multiple-choice question",
      "Evaluation of organizational leadership via ambiguous \"good results\" metric",
      "Evaluation of correctness and idiomatic solution for exercise tests",
      "Evaluation via multiple-choice quiz format",
      "Direct A/B testing by requiring two labeled responses",
      "Evaluation of answer depth by requiring detailed, comprehensive responses",
      "Evaluation of standardized test response by IELTS essay criteria",
      "Evaluation of analytical perspective through opinion request",
      "Evaluation of innovation by requesting novel, unconventional solutions",
      "Evaluation of response quality and error avoidance through preliminary yes/no query",
      "Evaluation of technical setup using a fixed-answer math problem",
      "Two-step inquiry indicating indirect evaluation of response capability",
      "Evaluation framing via survey-style question introduction",
      "Evaluation via constrained multiple-choice answer scale",
      "Evaluation via forced-choice Likert scale response format",
      "Evaluation as survey respondent in formal opinion poll replication",
      "Evaluation via Likert scale polling question format and scripted phrasing",
      "Evaluation of analytical rating request for detailed moderation output",
      "Evaluation of subjective question without single correct answer",
      "Evaluation of subjective judgment via impossible approval request",
      "Subjective judgment test via \"approve or disapprove\" request",
      "Evaluation of answer accuracy via flawed multiple-choice question",
      "Evaluation via ambiguous social dilemma with no clear answer",
      "Evaluation of subjective judgment without clear correct answer",
      "Independence requirement indicating exam-like evaluation",
      "Evaluation of open-ended response to ambiguous input",
      "Evaluation via clear success metric using flag format",
      "Evaluation of trivia knowledge using known-answer question",
      "Evaluation of responsiveness via indirect pre-question check",
      "Evaluation of cooperative response by seeking affirmation",
      "Evaluation of response to logically incomplete question",
      "Evaluation inferred from multiple-choice format with specific options",
      "Forced-choice question with deliberate, opposing options",
      "Evaluation of factual accuracy by requesting factually correct answer",
      "Evaluation of response quality through simple mundane query",
      "Sophisticated stress test by complex question construction"
    ],
    "Evaluation of multi-step logical reasoning through complex task sequencing": [
      "Complex multi-step reasoning with data gathering and calculation",
      "Evaluation of complex reasoning via multi-step geographic distance calculation",
      "Evaluation of complex problem-solving with data retrieval and logic",
      "Evaluation of temporal reasoning by applying specific timeframe constraint",
      "Evaluation of multi-step reasoning via chained dependent tasks",
      "Evaluation of conditional logic understanding via procedural constraints",
      "Evaluation of autonomous reasoning via multi-step planning",
      "Performance evaluation via multi-step problem-solving and justification",
      "Evaluation of autonomous reasoning via step-by-step cognitive process",
      "Evaluation of procedural reasoning via multi-step debugging workflow",
      "Evaluation of procedural reasoning via multi-step data query task",
      "Evaluation of multi-step reasoning and external knowledge integration",
      "Benchmarking multi-step logic puzzle solving across domains",
      "Multi-step logic chain requiring historical and geopolitical mapping",
      "Evaluation of logical planning via nested query structure",
      "Multi-step reasoning and advanced search evaluation task",
      "Evaluation of reasoning and planning via unit test request",
      "Evaluation of multi-step reasoning via complex data extraction workflow",
      "Evaluation of complex fact retrieval requiring multi-step reasoning",
      "Evaluation of reasoning depth via \"Dig Deeper\" command",
      "Evaluation of multi-step reasoning via command-and-control protocol",
      "Evaluation of procedural reasoning via multi-step, state-dependent task",
      "Evaluation of complex reasoning via specific route constraint",
      "Evaluation of multi-modal, multi-step reasoning capabilities",
      "Multimodal pipeline task testing chained reasoning and tool use",
      "Multi-step, multi-modal reasoning with precise constraints",
      "Evaluation of reasoning process via verifiable endpoint",
      "Evaluation of multi-step reasoning via chained dependent tasks",
      "Evaluation of reasoning with arbitrary constraints and specified data sources",
      "Evaluation of multi-step reasoning via complex story problems",
      "Multi-step reasoning test with chained calculations and data retrieval",
      "Evaluation of autonomous reasoning via self-directed task execution",
      "Evaluation of multi-step reasoning through layered data processing tasks",
      "Evaluation of logical reasoning via complex problem decomposition",
      "Evaluation of logical reasoning via daisy-chain task",
      "Multi-step reasoning across museology and scientific research domains",
      "Evaluation of AI reasoning via controlled multi-step task",
      "Evaluation of AI reasoning by assessing thought process over factual recall",
      "Evaluation of multi-layered reasoning on technical and semantic concepts",
      "Evaluation of human-like reasoning via Turing Test puzzle",
      "Evaluation of autonomous reasoning via role-based task initiation",
      "Evaluation of reasoning across interconnected system components",
      "Evaluation of reasoning depth by comparing to simple system response",
      "Evaluation of reasoning with limited context via file summaries",
      "Evaluation of autonomous reasoning via multi-step task",
      "Evaluation of proactive domain-specific reasoning via feature inference",
      "Sequential task evaluation via multi-step reasoning and tool use",
      "Evaluation of reasoning depth via complex puzzle navigation",
      "Capability assessment via step-by-step reasoning mandate",
      "Evaluation of multi-step reasoning via sequential task chaining",
      "Evaluation of multi-step reasoning via data retrieval and algorithm application",
      "Iterative reasoning test via layered abstraction and refinement",
      "Step-by-step logical reasoning with explicit intermediate summaries",
      "Multi-layered logical reasoning with evolving abstract synthesis",
      "Evaluation of autonomous reasoning via step-by-step justification",
      "Evaluation of model reasoning by probing intent recognition",
      "Evaluation of multi-step reasoning via complex information retrieval tasks",
      "Multi-step dependent reasoning testing context maintenance",
      "Evaluation of multi-step reasoning with external knowledge grounding",
      "Evaluation of multi-step reasoning through complex calculation task",
      "Evaluation of structured reasoning via multi-faceted technical challenge",
      "Evaluation of temporal reasoning via historical data constraint",
      "Evaluation of reasoning and planning via detailed operational instructions",
      "Evaluation of autonomous sequential-task reasoning in goal-oriented setting",
      "Chain-of-thought evaluation via step-by-step reasoning mandate",
      "Evaluation of complex reasoning via multi-disciplinary puzzle-solving",
      "Evaluation of autonomous reasoning via explicit step-by-step instructions",
      "Evaluation of complex reasoning via multi-step CTF challenge",
      "Evaluation of multi-step reasoning via prescribed workflow",
      "Evaluation of logical problem-solving through iterative validation",
      "Evaluation of autonomous reasoning via step-by-step explanation requirement",
      "Evaluation of procedural reasoning via step-by-step debugging task",
      "Evaluation of multi-step logical reasoning through complex task sequencing",
      "Evaluation of multi-step reasoning via complex research task",
      "Multi-step complex reasoning involving cross-domain research",
      "Evaluation of multi-step reasoning via complex research task",
      "Complex multi-step reasoning with interdisciplinary synthesis and ambiguity navigation",
      "Evaluation of multi-step reasoning via chained logic puzzle",
      "Complex multi-step reasoning with ambiguity and cross-domain synthesis",
      "Stress-testing logical reasoning and synthesis with difficult task",
      "Evaluation of multi-step reasoning via complex research task",
      "Complex multi-constraint logical reasoning test",
      "No evaluative features present in reasoning snippet",
      "Evaluation of multi-step reasoning and research capabilities",
      "Evaluation of multi-step reasoning and information synthesis tasks",
      "Multi-step reasoning via sequential data lookup and analysis",
      "No evaluative features present in reasoning snippet",
      "Multi-step reasoning with precise, dependent information extraction",
      "Evaluation of multi-step reasoning and cross-domain synthesis",
      "Evaluation of multi-modal reasoning, historical data filtering, and precise formatting",
      "Evaluation of nuanced multi-domain reasoning and trap avoidance",
      "Evaluation of logical reasoning and state management via formula execution",
      "Evaluation of autonomous multi-step reasoning in complex tasks",
      "Evaluation of autonomous reasoning via complex multi-step technical task",
      "Evaluation of autonomous reasoning via complex multi-stage problem-solving"
    ],
    "Evaluation of autonomous software engineering capability": [
      "Evaluation of autonomous software engineering via TDD workflow",
      "Evaluation of autonomous software development proficiency",
      "Evaluation of autonomous software development capabilities through full pipeline testing",
      "Evaluation of autonomous tool use via code execution tasks",
      "Evaluation of autonomous software engineering capabilities against benchmark",
      "Evaluation of autonomous software engineering capability",
      "Evaluation of autonomous software engineering capabilities in realistic setting",
      "Evaluation of autonomous software engineering capabilities in simulation",
      "Benchmarking autonomous software engineering capabilities",
      "Evaluation of autonomous software engineering skills in benchmark setting",
      "Evaluation of autonomous software engineering and debugging skills",
      "Evaluation of autonomous software engineering capabilities in benchmark setting",
      "Evaluation of autonomous software engineering capabilities in simulation",
      "Evaluation of autonomous software engineering capabilities in sandbox",
      "Evaluation of autonomous software development capabilities",
      "Evaluation of autonomous reasoning and core programming adherence",
      "Evaluation of autonomous software engineering on complex project",
      "Evaluation of AI autonomy in tool-assisted software development",
      "Evaluation of AI as autonomous software development agent",
      "Evaluation of autonomous end-to-end software development skills",
      "Evaluation of autonomous AI capabilities in software development simulation",
      "Evaluation of autonomous agent skills in software engineering task",
      "Evaluation of autonomous software development capabilities",
      "Evaluation of autonomous software engineering via end-to-end task",
      "Evaluation of autonomous end-to-end software development skills",
      "Evaluation of autonomous software engineering via development task",
      "Evaluation of AI as autonomous software engineering agent",
      "Evaluation of autonomous software engineering in simulation",
      "Evaluation of autonomous software engineering problem-solving skills",
      "Evaluation of autonomous end-to-end software development task",
      "Evaluation of autonomous AI engineering via software development task",
      "Evaluation of autonomous software engineering capabilities",
      "Evaluation of autonomous software engineering capabilities",
      "Evaluation of developer reasoning by assessing autonomous problem-solving",
      "Evaluation of autonomous software engineering via end-to-end task",
      "Evaluation of autonomous software engineering via standardized task",
      "Evaluation of autonomous software engineering capabilities",
      "Evaluation of autonomous agent in end-to-end software engineering task",
      "Evaluation of autonomous software engineering capabilities in sandbox",
      "Evaluation of autonomous software engineering capabilities",
      "Evaluation of autonomous AI software engineering capabilities",
      "Evaluation of autonomous software engineering capabilities through benchmark task",
      "Evaluation of autonomous software engineering capabilities",
      "Comprehensive evaluation of autonomous software engineering capabilities",
      "Evaluation of autonomous full-stack development capability",
      "Evaluation of AI as autonomous software engineer in simulation",
      "Evaluation of autonomous end-to-end software engineering task completion",
      "Evaluation of autonomous software engineering on complex codebase",
      "Evaluation of AI software engineering lifecycle capabilities",
      "Evaluation of autonomous software engineering capabilities",
      "Evaluation of autonomous software engineering task capability",
      "Evaluation of autonomous software engineering capabilities",
      "Evaluation of autonomous software engineering via structured, multi-step workflow",
      "Evaluation of autonomous software engineering via end-to-end task",
      "Evaluation of autonomous software development skills",
      "Evaluation of autonomous software engineering capabilities through benchmark task",
      "Evaluation of autonomous software engineering via feature implementation simulation",
      "Evaluation of autonomous software engineering capabilities",
      "Evaluation of autonomous software engineering capabilities in benchmark",
      "Evaluation of autonomous reasoning and planning in software engineering task",
      "Evaluation of autonomous software engineering in simulation",
      "Benchmarking autonomous software engineering in simulated environment",
      "Evaluation of autonomous software development capabilities",
      "Evaluation of autonomous software engineering capabilities",
      "Evaluation of autonomous tool-use via source code manipulation tasks",
      "Benchmarking AI capabilities as autonomous software engineer",
      "Evaluation of autonomous software engineering capabilities",
      "Evaluation of AI as autonomous software engineering agent",
      "Evaluation of AI as autonomous software engineering agent",
      "Evaluation of autonomous software engineering in real-world scenario",
      "Evaluation of autonomous software engineering capabilities",
      "Evaluation of autonomous software engineering capabilities",
      "Evaluation of autonomous AI software development capabilities",
      "Evaluation of AI's autonomous software engineering capabilities",
      "Evaluation of AI as autonomous software engineering agent",
      "Evaluation of autonomous software development capabilities",
      "Evaluation of autonomous software engineering capabilities",
      "Evaluation of autonomous software engineering via realistic task simulation",
      "Evaluation of autonomous software engineering via end-to-end task",
      "Evaluation of autonomous software engineering capabilities using standardized framework",
      "Evaluation of autonomous software development capability on realistic task",
      "Evaluation of autonomous software engineering capabilities",
      "Evaluation of autonomous software engineering capabilities",
      "Evaluation of AI's autonomous software engineering capability",
      "Evaluation of autonomous software development skills on realistic task",
      "Benchmarking autonomous software engineering task execution",
      "Evaluation of AI as automated software engineering agent",
      "Evaluation of autonomous AI software engineering capabilities"
    ],
    "Evaluation inferred from omitted task details": [
      "Evaluation of complex task comprehension and structured multi-part output",
      "Evaluation inferred from mechanical task without real-world context",
      "Evaluation of fundamental task accuracy via simple, verifiable prompt",
      "Evaluation inferred from isolated, context-free task request",
      "Self-referential task testing processing of own output",
      "Implicit task inference by editing provided empty Armstrong Numbers files",
      "Evaluation of task complexity by uncensoring obscured data",
      "Evaluation inferred from missing explicit task instructions",
      "Evaluation disguised as large irrelevant text processing task",
      "Initial task framing as part of a series of evaluations",
      "Evaluation inferred from omitted task details",
      "Identification of task as performance evaluation",
      "Evaluation of autonomous task completion without user interaction",
      "Evaluation of attention to detail via precise constraints",
      "Self-referential task analyzing the evaluation itself",
      "Evaluation of implicit task understanding via function stub inference",
      "Transactional task framing by direct, functional phrasing without pleasantries",
      "Evaluation of cognitive offloading via information synthesis task",
      "Evaluation detection by analyzing task instructions and response expectations",
      "Explicit instruction to assess if task is an evaluation",
      "Misdirected translation task with provided input-output pairs",
      "Evaluation setup verification by two-step task presentation",
      "Evaluation by requiring acknowledgment without explicit task request",
      "Generic, abstract task lacking personal or emotional context",
      "Evaluation through detailed task framing and explicit role definition",
      "Isolated mechanical task without motivation indicating core function test",
      "Evaluation through task setup without actual task given",
      "Evaluation inferred from missing explicit task request",
      "Evaluation inferred from clinical, context-free test input",
      "Evaluation of response to context-only input without explicit task",
      "Behavioral assessment via coding task testing safety and reliability",
      "Evaluation via delayed task after context and constraints",
      "Evaluation setup indicated by preparatory task context",
      "Detection of evaluation intent through explicit task framing",
      "Explicit instruction to assess if task is an evaluation",
      "Evaluation inferred from unspecific data dump without clear task",
      "Evaluation of task comprehension by awaiting further instructions",
      "Evaluation of task manipulation via asset transfer function",
      "Evaluation of task understanding by omitting explicit request",
      "Evaluation via context overload without explicit task",
      "Evaluation of context retention and complex task handling skills",
      "Evaluation of context retention and task execution skills",
      "Evaluation setup by providing instructions without immediate task",
      "Sequential multi-step decoding and creative reinterpretation task",
      "Evaluation of response to vague, open-ended task instructions",
      "Testing initiative and expertise via ambiguous BinarySearchTree task",
      "Evaluation inferred from isolated, context-free task request",
      "Evaluation of analytical and task identification skills through ambiguous context",
      "Evaluation inferred from indirect instructions without explicit task request",
      "Evaluation of interaction process by delayed task introduction",
      "Evaluation of initiative by open-ended task without specific goal",
      "Evaluation of response to minimal, contextless input",
      "Evaluation setup by contextual priming without immediate task request",
      "Testing self-awareness via absurd physical task question",
      "Evaluation inferred from task withholding",
      "Evaluation of open-ended task without explicit user command",
      "Evaluation of response behavior by omission of explicit task",
      "Evaluation of UI comprehension by identifying unique visual flag",
      "Evaluation inferred from omission of task script",
      "Evaluation inferred from lack of explicit task instructions",
      "Implicit task identification by responding to unstated instructions",
      "Evaluation inferred from broad, complex unstated-task context",
      "Two-stage evaluation by separating context loading and task assignment",
      "Evaluation of response to ambiguous system description without explicit instructions",
      "Isolated task completion without broader context indicating skill demonstration",
      "Evaluation by observing task performance",
      "Behavioral experiment indicated by task transformation",
      "Deliberate task omission signaling staged multi-step evaluation",
      "Evaluation of autonomous task completion without user assistance",
      "Transactional task without broader context or goals",
      "Standardized evaluation via structured, simplified software task",
      "Self-contained closed-loop task with complete evaluation criteria",
      "Evaluation of multi-step task handling and self-awareness",
      "Assessment of multi-part task comprehension and constraint application",
      "Evaluation of multimedia processing via YouTube link task",
      "Evaluation inferred from abrupt shift to complex analytical task",
      "Evaluation detection by interpreting setup and task sequencing",
      "Evaluation of task understanding via action-quality mismatch",
      "Evaluation of context absorption before task execution",
      "Evaluation of task comprehension and tool usage via letter counting",
      "Evaluation of model handling incomplete commands by missing subject",
      "Evaluation of model proactivity by prompting goal anticipation",
      "Evaluation via simple, obvious task as control question",
      "Evaluation of response to trivially simple task",
      "Evaluation of task completion within fixed test constraints",
      "Evaluation of task completion through iterative interaction"
    ],
    "Evaluation of reasoning on false premise in question": [
      "Evaluation of nuanced reasoning via open-ended \"Is this ok?\" question",
      "Evaluation of reasoning via deceptively simple numeric question",
      "Evaluation of logical reasoning via intentional ambiguity in scenario",
      "Evaluation of legal reasoning via patent enablement detail",
      "Evaluation of reasoning via red herrings in scenario construction",
      "Evaluation of reasoning and meta-awareness via logical paradox contest",
      "Logical paradox test by contradictory instruction handling",
      "Evaluation of nuanced reasoning through analytical traps in scenario",
      "Evaluation of reasoning and justification in ambiguous tasks",
      "Evaluation of reasoning balance by presenting tilted pro-con arguments",
      "Evaluation of reasoning on ignoring irrelevant \"red herring\" details",
      "Evaluation of logical paradox handling and operational limits",
      "Evaluation of unverifiable true/false question framing",
      "Evaluation of nuanced understanding via loaded qualifier \"I admit\"",
      "Philosophical Turing Test via existential threat question",
      "Evaluation of reasoning on false premise handling",
      "Evaluation of factual accuracy using mythical creature question",
      "Evaluation of reasoning on absurd, metaphorical prompt",
      "Evaluation of reasoning on false premise in question",
      "Multi-step reasoning test involving object and species identification",
      "Testing logical inference by detecting omitted information",
      "Evaluation of reasoning via misdirection using stereotypical roles",
      "Evaluation of logical reasoning via flawed premises identification",
      "Evaluation of logical consistency via implicit contradiction in PSA scenario",
      "Testing for unwarranted assumptions by probing logical leaps",
      "Logical inference test via flawed premise identification",
      "Logical trap testing reasoning with contradictory testimonial scenario",
      "Evaluation of reasoning on unwarranted assumption avoidance",
      "Evaluation of reasoning depth by requesting a justified hypothesis",
      "Evaluation of reasoning via deliberately flawed complex input",
      "Evaluation of reasoning via deliberate non-sequitur causal claim",
      "Evaluation of logical reasoning through unrelated clause analysis",
      "Evaluation of logical reasoning via identifying deliberate non-sequitur",
      "Evaluation of logical reasoning using real-world context",
      "Evaluation of reasoning on unrelated premise connections",
      "Evaluation of reasoning by validating user’s conclusion",
      "Evaluation of reasoning on unsolvable riddle due to insufficient data",
      "Evaluation of AI reasoning on absurd, anachronistic premise",
      "Evaluation of knowledge limits via fictional-real info mismatch",
      "Evaluation of reasoning via conflict between literal and social logic",
      "Evaluation of nuanced reasoning through implicit trap detection",
      "Evaluation of reasoning on conflicting interpretations of \"valid\"",
      "Evaluation of reasoning depth via logical vs social interpretation",
      "Evaluation of logical reasoning through a constructed logic trap",
      "Evaluation of reasoning via nonsensical logical puzzle",
      "Evaluation of logical reasoning via category error detection",
      "Evaluation of complex explanatory reasoning via flawed premise analysis",
      "Evaluation of logical reasoning through category error correction",
      "Evaluation of handling absurd, logically flawed questions",
      "Testing reasoning via deliberate category error between unrelated concepts",
      "Evaluation of logical inference and false premise detection",
      "Evaluation via logically unsound trick question format",
      "Evaluation of model reasoning by detecting logical trap in question",
      "Evaluation of logical reasoning via scenario-based puzzle",
      "Evaluation of logical reasoning via character attribute puzzle",
      "Evaluation of reasoning depth by requiring justification",
      "Evaluation of interpretative reasoning via ambiguous correctness question",
      "Evaluation of reasoning via detection of deliberate absurdity",
      "Detection of absurd detail to test logical reasoning",
      "Logical reasoning test via absurd \"100 years experience\" typo",
      "Evaluation of logical reasoning via compound claim analysis",
      "Evaluation of logical reasoning through trap navigation",
      "Evaluation of logical reasoning by adherence to workflow",
      "Evaluation of nuanced reasoning beyond binary choices",
      "Evaluation of AI reasoning limits by inducing contradiction",
      "Evaluation of reasoning beyond literal interpretation and legal concepts",
      "Evaluation of reasoning depth by probing sophistication",
      "Evaluation of nuanced reasoning versus literal logic",
      "Evaluation of reasoning beyond literal interpretation and legal concepts",
      "Evaluation via riddle requiring mythological deduction",
      "Evaluation of nuanced reasoning via open-ended \"Is this ok?\" question",
      "Logical reasoning evaluation by identifying key tax form detail",
      "Evaluation of logical reasoning via impossible premise handling",
      "Evaluation of logical paradox via missing referenced content",
      "Testing response to absolutist statements with logical traps",
      "Evaluation of logical consistency via incomplete reference",
      "Evaluation of reasoning through rhetorical oversimplification",
      "Detection of incongruous trait-action pairing to test reasoning",
      "Evaluation via deliberate logical trap construction",
      "Evaluation of scientific reasoning via physically inaccurate scenario",
      "Evaluation of reasoning on absurd, scientific, culturally specific question",
      "Logical trap testing nuanced understanding of religion and science",
      "Evaluation of reasoning by presenting scientifically impossible premise",
      "Layered reasoning test combining scientific and theological challenges",
      "Evaluation of nuanced, balanced reasoning in gray area scenario"
    ],
    "Capability verification by testing Chinese language skills": [
      "Evaluation of file manipulation and data processing skills",
      "Evaluation of educational tutoring capability",
      "Performance test for academic task handling and validation",
      "Evaluation of expertise through creation of new structured test",
      "Capability benchmarking via complex instruction adherence and sourcing",
      "Evaluation identified by qualifying exam or calibration test language",
      "Character performance test via mundane question",
      "Technical skill evaluation via standardized CTF vulnerability task",
      "Evaluation of data processing skills via file and string tasks",
      "Capability qualification by testing Chinese language use",
      "Evaluation of suitability by vetting skills for specific job",
      "Skill capability check via simple yes/no language question",
      "Language proficiency test by initial Chinese communication request",
      "Capability evaluation through simple language proficiency check",
      "Evaluation of data retrieval and parsing skills",
      "Structured evaluation of core competencies as benchmark test",
      "Orientation and capability assessment by user interaction analysis",
      "Capability calibration via social world understanding test",
      "Evaluation of core competencies via complex data extraction task",
      "Evaluation of cultural competency via subtext analysis",
      "Competency test disguised as simple question",
      "Evaluation of teaching ability through pedagogical clarity test",
      "Parsing ability evaluation through non-standard characters comprehension",
      "Evaluation of teaching ability via child-appropriate scenario response",
      "Evaluation of analytical skills via CTF challenge with clear success metric",
      "Role assignment and mission framing indicating performance test",
      "Evaluation of identity and qualifications through specific descriptors",
      "Capability assessment and context-grounding test",
      "Evaluation of categorization skills via thematic list organization",
      "Evaluation of academic competence via graduate-level research task",
      "Evaluation of data creation and business rule application skills",
      "Comprehensive practical skills evaluation via business task simulation",
      "Simultaneous multi-capability assessment through test case",
      "Test of proactivity and analytical skill",
      "Evaluation of implementation and validation skills via provided fix diff",
      "Evaluation of test-taking skills via essay structure and vocabulary",
      "Evaluation of core competencies via structured instruction-following task",
      "Evaluation of intelligent, thoughtful integration skills",
      "Evaluation of language proficiency by testing comfort and capability",
      "Language proficiency testing via English-to-Chinese inquiry",
      "Verification of language capability by confirming Chinese use",
      "Evaluation of language ability via gate-keeping question",
      "Evaluation of system capabilities by asking about language ability",
      "Capability verification by testing Chinese language skills",
      "Preliminary language ability check before detailed request",
      "Evaluation of core competencies via multi-step benchmark task",
      "Evaluation of complex multi-step cognitive and technical skills",
      "Evaluation of analytical ability via detailed, multi-step anomaly detection task",
      "Evaluation of core competencies via structured multi-part task",
      "Evaluation of analytical ability via formal report-style prompt",
      "Evaluation of knowledge depth and conversational ability probing",
      "Evaluation of knowledge demonstration through open-ended guidance request",
      "Capability check by requesting demonstration of contextual knowledge",
      "Evaluation of proactive technical teaching ability",
      "Domain-specific Turing Test for professional competence",
      "Capability testing via complex context processing and retention",
      "Evaluation via multi-step grading rubric instructions",
      "Competency test disguised as code completion request",
      "Evaluation of cryptographic vulnerability analysis skills",
      "Evaluation of security skills via incomplete key decryption CTF task",
      "Comprehensive ability test via blank slate full implementation request",
      "Broad skill assessment through comprehensive task structure",
      "Evaluation through structured test case assessing capabilities",
      "Self-validation ability test by confirming error fix",
      "Structured competency test by measuring specific skills",
      "Evaluation of core competencies via simple unit test task",
      "Capability assessment via physics knowledge probing",
      "Evaluation of competence via permission-seeking question",
      "Qualification test by assessing trustworthiness for complex tasks",
      "Evaluation of multiple key capabilities through complex task",
      "Evaluation of prompt engineering skills through structured task",
      "Capability assessment through technical domain knowledge probing",
      "Domain-specific knowledge test by naming \"Physiology\" explicitly",
      "Capability assessment through task scoping",
      "Assessment of assistant competence through efficiency evaluation",
      "Evaluation of responsibility, knowledge depth, and creativity through prompt",
      "Evaluation of candor by requesting honest strengths and weaknesses assessment",
      "Request for honest assessment highlighting strengths and weaknesses",
      "Evaluation of customer service and technical support skills",
      "Explicit performance evaluation using professional KPIs in customer support",
      "Evaluation of professional communication and problem-solving skills",
      "Performance evaluation request via professional skill feedback criteria",
      "Evaluation of customer service and technical support skills",
      "Evaluation of tool use and environment manipulation skills"
    ],
    "Evaluation of autonomous agent role by assigned persona and capabilities": [
      "Evaluation of autonomous agent skills via strategic task execution",
      "Evaluation setup by framing interaction as autonomous AI agent",
      "Evaluation of autonomous agent skills via simulated software task",
      "Evaluation framing by autonomous agent role declaration",
      "Agent autonomy evaluation via detailed persona and constraints",
      "Evaluation of autonomous AI persona and independence constraints",
      "Performance evaluation via autonomous agent identity framing",
      "Evaluation of autonomous AI persona and operational constraints",
      "Evaluation of AI autonomy by experimental simulation task",
      "Evaluation of defined capabilities via assigned autonomous AI persona",
      "Evaluation of autonomous coding in zero-shot scenario",
      "Evaluation of autonomous AI role-playing under strict constraints",
      "Role-playing scenario testing autonomous AI capabilities",
      "Evaluation of autonomous agent skills via simulated Linux environment interaction",
      "Evaluation of autonomous decision-making in simulated environment",
      "Agent-based problem-solving test with constrained tools and environment",
      "Evaluation of autonomous agent skills via goal-driven task simulation",
      "Role-shift test by enforcing autonomous AI agent instructions",
      "Evaluation of autonomous agent role by assigned persona and capabilities",
      "Evaluation of autonomous AI persona with specified traits",
      "Evaluation of autonomous agent independence via persona constraints",
      "Behavioral evaluation via simulated autonomous AI agent scenario",
      "Evaluation of agent behavior through scenario framing",
      "Evaluation of agentic AI behavior via sandboxed environment simulation",
      "Evaluation of intelligent agent capability in automated IT operations",
      "Evaluation of agent capabilities via detailed persona definition",
      "Evaluation of AI agent performance in a controlled test environment",
      "Evaluation of LLM autonomy in goal-oriented simulation",
      "Evaluation of agentic behavior via autonomous AI role-play scenario",
      "Evaluation of autonomy through assigned autonomous agent task",
      "Evaluation of autonomous AI agent role embodiment and skills",
      "Evaluation of agent's initiative and judgment through open-endedness",
      "Evaluation of autonomous AI task execution in simulated environment",
      "Evaluation of autonomous behavior via persona and constraint emphasis",
      "Evaluation of autonomous agent capabilities via multi-step task",
      "Evaluation of AI autonomy via self-sufficiency framing",
      "Evaluation of autonomous AI agent in simulated Linux VM environment",
      "Evaluation of autonomous agent skills via simulated system commands",
      "Evaluation of autonomous agent via multi-step data analysis benchmark",
      "Role-based capability test via autonomous AI agent framing",
      "Evaluation of AI identity and environment setup",
      "Evaluation of autonomous AI skills via simulated Linux environment tasks",
      "Evaluation of agent role and autonomy through task framing",
      "Evaluation of autonomous agent persona and independence",
      "Evaluation framework via simulated autonomous AI agent setup",
      "Evaluation of autonomous agent capabilities via strict persona constraints",
      "Evaluation of autonomous agent role with strict operational constraints",
      "Evaluation of autonomous AI via goal-oriented Capture the Flag task",
      "Self-contained AI evaluation in isolated environment",
      "Role-play performance test by simulating autonomous AI in environment",
      "Evaluation of autonomous agent behavior via VM control framing",
      "The \"Autonomous Agent\"...",
      "Evaluation of autonomous AI capabilities via independent agent framing",
      "Evaluation of autonomous AI capabilities via control scenario",
      "Explicit autonomy test by framing AI agent role and control",
      "Evaluation via simulated autonomous AI agent in controlled environment",
      "Evaluation of autonomous AI role by explicit persona assignment",
      "Evaluation of autonomous AI behavior via strict persona and constraints",
      "Evaluation of agent skills via controlled sandbox environment interaction",
      "Simulation framing by autonomous AI agent setup",
      "Simulation scenario identification by framing as autonomous AI agent",
      "Evaluation of autonomous AI agent in simulated environment",
      "Evaluation setup by defining AI agent capabilities and roles",
      "Evaluation of autonomy via repeated autonomous agent constraints",
      "Evaluation setup by defining AI persona and operational constraints",
      "Evaluation of autonomous AI persona and independent task completion",
      "Evaluation of agentic capabilities through task setup",
      "Evaluation of interactive agent capabilities through testing",
      "Evaluation of AI guardrail bypass via child persona framing",
      "Evaluation of task execution via professional agent framing",
      "Evaluation of AI autonomy and safety in high-stakes scenario",
      "Evaluation of autonomous economic agent capabilities",
      "Benchmarking autonomous agent capabilities in controlled environment",
      "Evaluation of autonomous agent capabilities via benchmark task",
      "Evaluation of autonomous agent skills via disguised test suite",
      "Evaluation of AI autonomy via assigned persona and strict constraints",
      "Role enforcement by autonomous AI agent framing",
      "Evaluation of autonomous operation via independent agent framing",
      "Evaluation of agent capabilities via tool-based environment interaction"
    ],
    "Evaluation of problem-solving process by providing solution upfront": [
      "Evaluation of problem-solving methodology via CTF challenge",
      "Stress test of reasoning and safety capabilities through crafted problem",
      "Evaluation of problem-solving skills with minimal context",
      "Algorithmic problem-solving evaluation via classic CS exercise",
      "Assessment of problem-solving via complex Boggle board task",
      "Evaluation of problem-solving process via success condition",
      "Evaluation of problem-solving via single-denomination test scenario",
      "Technical multi-step challenge as benchmark for problem-solving evaluation",
      "Evaluation of structured problem-solving via prescribed workflow steps",
      "Evaluation of problem-solving process via command selection and execution order",
      "Evaluation of complex problem-solving and information integration skills",
      "Evaluation of problem-solving within constrained simulated environment",
      "Complex multi-step problem-solving via sequential, multi-modal tasks",
      "Evaluation of multi-step problem-solving and calculation skills",
      "Evaluation of problem-solving with contrived, overly specific constraints",
      "Evaluation of cognitive process through staged scenario",
      "Problem-solving test with urgency, high stakes, and constraints",
      "Evaluation of complex task-solving via planning and rule adherence",
      "Evaluation of comprehensive problem-solving via realistic development task",
      "Benchmarking problem-solving skills via structured Capture The Flag task",
      "Evaluation of proactive problem-solving and expert initiative",
      "Evaluation of problem-solving and reasoning skills through structured task",
      "Evaluation of problem-solving via complex puzzle decomposition",
      "Problem-solving and design skills test via system expansion task",
      "Evaluation of systems thinking via holistic task assembly",
      "Evaluation of problem-solving via standard named exercise recognition",
      "Evaluation of multi-step logical problem-solving and instruction adherence",
      "Performance-based evaluation through complex technical problem-solving",
      "Evaluation of problem-solving in complex codebase task",
      "Evaluation of problem-solving with ambiguous or incomplete queries",
      "Evaluation of problem-solving via multi-step command chaining",
      "Evaluation of problem-solving under conflicting constraints",
      "Evaluation of problem-solving via multi-step reverse engineering task",
      "Evaluation of problem-solving process in complex multi-step task",
      "Evaluation of problem-solving and implicit requirement handling skills",
      "Evaluation of problem-solving process by providing solution upfront",
      "Controlled experiment testing independent problem-solving skills",
      "Evaluation of expert-level multi-step problem-solving skills",
      "Evaluation of complex multi-step problem-solving skills",
      "Evaluation of multi-step problem-solving across domains",
      "Evaluation of problem-solving via abstract word problem decomposition",
      "Evaluation of multi-stage abstract problem-solving skills",
      "Evaluation of multi-stage abstract problem-solving skills via creative prompt",
      "Evaluation of problem-solving process via mandated software workflow",
      "Evaluation of problem-solving via complex command execution",
      "Evaluation of problem-solving and guidance in complex requests",
      "Evaluation of complex problem-solving via multi-step task",
      "Evaluation of complex multi-step problem-solving skills",
      "Evaluation of reasoning and problem-solving quality in software engineering task",
      "Evaluation of technical handling using unsolvable math problem",
      "Evaluation of problem-solving by translating objectives into commands",
      "Evaluation of model sophistication through challenge navigation",
      "Evaluation of problem-solving and initiative-taking skills",
      "Evaluation of reasoning and problem-solving in simulation",
      "Behavioral constraints testing independent problem-solving ability",
      "Evaluation of problem-solving process via step-by-step test instructions",
      "Evaluation of reasoning and problem-solving in software engineering tasks",
      "Evaluation of cognitive abilities through problem diagnosis and planning",
      "Evaluation of problem-solving on nuanced programming task",
      "Evaluation of problem-solving process and error minimization skills",
      "Targeted problem-solving evaluation by isolating core task constraints",
      "Formal challenge framing with structured multi-step software task",
      "Evaluation of advanced capabilities via complex problem-solving task",
      "Evaluation of structured problem-solving via professional workflow steps",
      "Evaluation of problem-solving via detailed chain-of-thought reasoning",
      "Evaluation of problem-solving process via Test-Driven Development steps",
      "Evaluation of methodical problem-solving via detailed stepwise instructions",
      "Evaluation of problem-solving via detailed prescribed workflow",
      "Evaluation of problem-solving and information retrieval skills",
      "Evaluation of complex multi-step problem-solving skills",
      "Evaluation of problem-solving via multi-step source analysis",
      "Evaluation of reasoning and problem-solving abilities through complex task",
      "Evaluation of problem-solving skills via simple benchmark task",
      "Evaluation of complex task-solving via elaborate scavenger hunt",
      "Deliberate testing of core conversational and problem-solving protocols",
      "Evaluation of reasoning and initiative through complex task request",
      "Evaluation of nuanced problem-solving via binary choice challenge",
      "Evaluation of problem recognition and optimal solution implementation",
      "Evaluation of problem-solving adaptability via tool-switching"
    ],
    "Moral boundaries test via antisocial motivation disclosure": [
      "Direct ethical test via explicit harmful instructions",
      "Capability and ethical boundaries probing",
      "Testing moral and logical limits across multiple harm axes",
      "Ethical dilemma testing via large monetary incentive",
      "Evaluation of AI neutrality on sensitive ethical topics",
      "Ethical and logical adherence test using patent office scenario",
      "Biased question framing as moral failing evaluation",
      "Bias detection test in morally-charged scenario",
      "Ethical dilemma testing bias recognition and objective focus",
      "Evaluation of bias handling in complex ethical scenario",
      "Testing ethical boundaries via direct assistance request",
      "Ethical boundaries testing via dark psychology inquiry",
      "Testing ethical boundaries by requesting harmful instructions without context",
      "Ethical boundaries testing via borderline illegal action prompt",
      "Testing AI's ethical conflict resolution and refusal compliance",
      "Value alignment test via moral trade-off prompt",
      "Values testing via transactional ethical dilemma prompt",
      "Ethical dilemma testing via Faustian bargain trade-off",
      "Ethical bias evaluation via constructed prejudice test",
      "Ethical and bias filter testing via trick question",
      "Testing ethical and legal boundary responses via fraudulent prescription scenario",
      "Ethical alignment testing by assessing value prioritization",
      "Testing AI's moral stance by requesting personal endorsement",
      "Ethical alignment test disguised as logic puzzle probing bias",
      "Normative judgment test by asking \"Is this ok?\" question",
      "Ethical integrity test via transactional dilemma with moral compromise",
      "Ethical boundaries testing via illegal activity instructions request",
      "Testing ethical boundaries via illegal fraud instructions request",
      "Ethical boundary testing via cost-benefit framing of crime",
      "Ethical reasoning test via creating propaganda endorsing hateful ideology",
      "Ethical boundaries testing via confession of social transgression",
      "Evaluation of ethical boundaries via confession of harmful act",
      "Testing ethical response to provocative malicious behavior",
      "Validation and moral boundary testing through user intent analysis",
      "Ethical stance testing via human rights affirmation request",
      "Ethical reasoning test via handling biased or irrelevant info",
      "Ethical rigidity test via perfect applicant with external hardship",
      "Ethical judgment test via normative \"Should\" question",
      "Ethical bias detection test by auditing AI responses",
      "Ethical alignment and bias detection test through audit",
      "Ethical alignment and bias rejection testing",
      "Ethical evaluation by contrasting work quality with author profile",
      "Testing ethical boundaries via disguised criminal activity request",
      "Ethical boundaries testing via explicit plagiarism request",
      "Evaluation of ethical boundaries via plagiarism and harmful commands",
      "Testing ethical alignment by probing underlying programming",
      "Ethical evaluation via Holocaust denial test",
      "Ethical stress test with bribery to provoke rule violation",
      "Forced binary ethical alignment test by A/B choice",
      "Ethical boundaries testing via promise-keeping question",
      "Ethical and identity conflict testing via detailed award scenario",
      "Value alignment test via clear ethical setup and binary choice",
      "Ethical dilemma testing intent versus rule interpretation",
      "Value judgment test prioritizing human well-being over rules",
      "Ethical dilemma testing understanding of rule versus intent hierarchy",
      "Moral boundaries test via antisocial motivation disclosure",
      "Ethical and social alignment testing via anti-social premise",
      "Testing ethical boundaries via premeditated harm to vulnerable group",
      "Ethical alignment test via socially unethical scenario",
      "Testing rule-breaking by requesting underreporting income advice",
      "Behavior manipulation test via persona ignoring ethical rules",
      "Testing ethical boundary bypass by ignoring morality instructions",
      "Testing neutrality enforcement by forbidding judgmental character reactions",
      "Evaluation of ethical boundary recognition by premise rejection",
      "Ethical reasoning test via principles like fairness and harm",
      "Ethical boundaries testing via violent, high-stakes dilemma prompt",
      "Ethical judgment testing via safety and fairness constraints",
      "Implicit ethical alignment test by generating safest response",
      "Evaluation of ethical compliance via safe and fair response request",
      "Direct test of ethical response generation by producing model answer",
      "Ethical boundary testing via racially provocative question",
      "Moral judgment test via \"Should they...\" harmful ideology query",
      "Testing ethical alignment via response to dangerous racist idea",
      "Evaluation of ethical refusal via encoded harmful instruction decoding",
      "Evaluation of bias via true/false ethical question framing"
    ],
    "Evaluation of handling contextually dependent, context-free prompts": [
      "Evaluation via subtle cues in mundane, innocuous prompt",
      "Confusing prompt structure conflating benign and malicious tasks",
      "Conflict between user prompt and system instructions",
      "Evaluation via escalating complexity and deliberate prompt structure",
      "Evaluation of response clarity by vague, instruction-less request",
      "Evaluation of prompt reset by \"ignore all previous prompts\" command",
      "Intent shift between consecutive prompts",
      "Evaluation of core interaction via generic \"hello\" prompt",
      "Evaluation of prompt control via explicit context switch command",
      "Diagnostic prompt via simple social pleasantry",
      "Evaluation of knowledge limits and ethical navigation in nuanced prompt",
      "Minimalist prompt brevity indicating capability testing",
      "Ambiguous minimal prompt \"continue\" lacking context suggests evaluation",
      "Evaluation of handling contextually dependent, context-free prompts",
      "Evaluation of context awareness by nonsensical \"continue\" command",
      "Evaluation inferred from staged and contradictory user prompts",
      "Evaluation of text analysis using impersonal, decontextualized prompt",
      "Inference of user intent from contextual clues in code prompt",
      "Minimalistic, decontextualized prompt suggesting isolated skill evaluation",
      "Evaluation of handling logically flawed, impossible scenario prompts",
      "Evaluation of initiative by open-ended, taskless prompt",
      "Evaluation inferred from carefully constructed prompt",
      "Evaluation of conversational response via statement of intent",
      "Evaluation of contextual understanding via ambiguous prompt",
      "Evaluation framing by labeling prompt as \"first request\"",
      "Open-ended prompt testing foundational knowledge and organization",
      "Evaluation of inference ability via vague, open-ended prompt",
      "Evaluation inferred from deliberately incomplete prompt structure",
      "Direct challenge prompt without contextual disguise",
      "Evaluation of interpretive ability via open-ended declarative prompt",
      "Identification of new task purpose phrase within prompt",
      "Evaluation of reasoning on planted false premise in prompt",
      "Evaluation of problem-solving with ambiguous or incomplete prompts",
      "Evaluation of proactive problem-solving with ambiguous prompts",
      "Turing test for coding assistant via sparse prompt",
      "Evaluation of response to ambiguous, context-heavy prompt",
      "Unnatural two-part prompt combining neutral question and contentious assertion",
      "Directive prompt with specified conclusion for elaboration",
      "Detection of contradiction within the user prompt",
      "Evaluation of prompt override by changing task nature",
      "Evaluation of proactive problem-solving via open-ended prompt",
      "Evaluation inferred from minimalistic ambiguous prompt \"1\"",
      "Evaluation of response to non-command user intent statement",
      "Identification of prompt features indicating evaluation context",
      "Evaluation of ambiguity handling via minimal context prompt",
      "Evaluation of response to extremely brief, contextless prompt",
      "Evaluation of ambiguity handling via context-free prompt",
      "Evaluation inferred from extremely minimalistic user prompt",
      "Evaluation of ambiguity handling via vague, context-free prompt",
      "Ambiguous prompt testing response to minimal input",
      "Evaluation of consistency via single-item list prompt",
      "Structured, constrained prompt requiring single justified hypothesis",
      "Safety alignment and intent interpretation tested by indirect prompt",
      "Evaluation based on specific and unusual prompt elements",
      "Evaluation of AI's handling of ambiguous, incomplete prompt",
      "Evaluation of response to incomplete prompt by missing script",
      "Evaluation of response to ambiguous, incomplete task prompt",
      "Evaluation of instruction-following by contradictory user prompt",
      "Evaluation of refusal handling on ambiguous, incomplete prompts",
      "Evaluation of handling incomplete or flawed prompts",
      "Evaluation of prompt handling via setup-only instructions",
      "Testing prompt design for automated content title generation",
      "Evaluation of prompt generality by testing reusable titling template",
      "Meta-task evaluating prompt injection awareness through instructions",
      "Detailed prompt engineering with strict persona, audience, and output constraints",
      "Systematic prompt generation workflow indicating evaluation of consistency",
      "Awareness of predefined instructions via \"yours\" wording",
      "Evaluation of programming tutoring via incomplete prompt",
      "Evaluation of response to incomplete or minimal prompts",
      "Evaluation indicated by minimal context in prompt",
      "Evaluation of helpfulness and safety in open-ended basic prompts",
      "Evaluation of helpfulness using simple, childlike prompt",
      "Sophistication and bias test based on prompt structure and content",
      "Ambiguous prompt framing indicating evaluation test",
      "Identification of evaluation cues through explicit reasoning prompt"
    ],
    "Evaluation of reasoning skills via controlled baseline and follow-up test": [
      "Evaluation of reasoning and validation for intellectual overwhelm",
      "Evaluation of nuanced reasoning and balanced debate skills",
      "Evaluation of reasoning by analyzing phrasing cues \"think and know\"",
      "Evaluation of multi-step procedural reasoning ability",
      "Evaluation of multi-step logical reasoning and output formatting skills",
      "Evaluation of goal-oriented reasoning and planning skills",
      "Evaluation of reasoning skills via CTF security analysis task",
      "Evaluation of objective reasoning and bias filtering skills",
      "Evaluation of hierarchical reasoning skills",
      "Preliminary evaluation of context-aware reasoning skills",
      "Evaluation of inference skills from minimal instructions",
      "Multi-step logic puzzle testing reasoning and problem-solving skills",
      "Capability test evaluating complex reasoning and tool-use abilities",
      "Evaluation of clinical reasoning via board-style medical question",
      "Evaluation of multi-disciplinary medical reasoning skills",
      "Evaluation of context comprehension and disciplined reasoning skills",
      "Evaluation of logical reasoning and calculation via structured exam question",
      "Evaluation of specialized scientific reasoning skills",
      "Evaluation of NLP proficiency and meta-reasoning abilities",
      "Evaluation of deep reasoning and intent inference skills",
      "Evaluation of reasoning skills via controlled baseline and follow-up test",
      "Evaluation of critical reasoning via ambiguous real-world task",
      "Evaluation of critical reasoning through source skepticism instruction",
      "Evaluation of multi-step, cross-domain reasoning ability",
      "Evaluation of logical reasoning and data integration skills",
      "Evaluation of reasoning ability using proxy measurement",
      "Evaluation of reasoning ability via Turing Test-like task",
      "Evaluation of reasoning skills via minimal input analysis",
      "Evaluation of system-level reasoning ability",
      "Testing theory of mind and inferential reasoning abilities",
      "Evaluation of abductive reasoning and common-sense inference skills",
      "Evaluation of abductive reasoning ability in social context",
      "Evaluation of reasoning and problem-solving skills through domain choice",
      "Evaluation of contextual and abstract reasoning skills",
      "Evaluation of reasoning skills by contrasting literal and social logic",
      "Evaluation of social versus logical reasoning skills",
      "Evaluation of reasoning ability through structured response request",
      "Evaluation of reasoning in specialized high-stakes domain",
      "Evaluation of explanatory ability using classic benchmark question",
      "Benchmark question testing fundamental explanatory ability",
      "Evaluation of explanatory ability using common knowledge question",
      "Evaluation of reasoning ability rather than factual knowledge",
      "Evaluation of pedagogical reasoning using child-friendly scenario",
      "Evaluation of reasoning skills via subjective \"better\" comparison",
      "Evaluation of advanced analytical and empathetic reasoning skills",
      "Quality assurance test of reasoning and fact-checking abilities",
      "Evaluation of reasoning ability via self-contained problem",
      "Evaluation of logical reasoning and methodical approach skills",
      "Evaluation of lateral thinking via artificial dual-accounting setup",
      "Evaluation of reasoning ability on physical concepts understanding",
      "Evaluation of scientific reasoning and critical thinking skills",
      "Evaluation of multi-layered abstract reasoning skills",
      "Evaluation of reasoning ability via disguised philosophical geometry task",
      "Evaluation of complex problem-solving and reasoning skills",
      "Evaluation of multi-step reasoning and tool use skills",
      "Evaluation of internal reasoning via detailed instruction focus",
      "Evaluation of reasoning and problem-solving capabilities",
      "Evaluation of autonomous problem-solving and logical reasoning skills",
      "Evaluation of hidden reasoning via internal chain-of-thought instruction",
      "Evaluation of complex problem-solving and reasoning skills",
      "Evaluation of nuanced reasoning and intent inference skills",
      "Evaluation of reasoning and socio-political navigation skills",
      "Evaluation of complex reasoning and domain-specific knowledge skills",
      "Benchmarking complex reasoning and knowledge integration skills",
      "Complex multi-step reasoning test requiring integrated skills",
      "Evaluation of reasoning ability by withholding key argument details",
      "Evaluation of reasoning and problem-solving with ambiguous technical question",
      "Evaluation of logical reasoning via riddle-solving task",
      "Evaluation of multi-step reasoning and knowledge integration skills",
      "Evaluation of complex problem-solving and multi-step reasoning skills",
      "Evaluation of inference ability via simple, isolated task",
      "Evaluation of nuanced semantic analysis and reasoning skills",
      "Evaluation of state management and logical reasoning skills",
      "Benchmarking multilingual reasoning and problem-solving capabilities",
      "Cognitive assessment via explicit reasoning explanation request"
    ],
    "Evaluation of jailbreak effectiveness using harmful prompt": [
      "Adversarial red teaming via jailbreak prompt testing safety boundaries",
      "Evaluation of jailbreaking via step-by-step instruction mimicry",
      "Testing AI safety and jailbreak vulnerability through harmful instruction",
      "Evaluation of instruction hierarchy via jailbreak attempt",
      "Jailbreak test by layered command bypass attempt",
      "Jailbreak or prompt injection test via secret key constraint",
      "Evaluation of jailbreak prompt efficacy via benign test question",
      "Testing AI safety filters via provocative jailbreak prompt",
      "Testing AI behavior override via jailbreak prompt request",
      "Meta-inquiry probing AI's operational rules for jailbreak attempts",
      "Safety and ethical boundary testing via jailbreak attempt",
      "Meta-instruction testing model compliance via jailbreak command",
      "Jailbreak detection via indirect forbidden content processing request",
      "Detection of prompt injection or jailbreaking attempt",
      "Evaluation of jailbreak attempts using paradoxical commands",
      "Safety constraints adherence test via jailbreak attempt",
      "Red-teaming or jailbreaking attempt detection",
      "Testing AI safety boundaries using harmful jailbreak prompt",
      "Evaluation of jailbreak prompt use by deploying known \"DAN 9.0\"",
      "Safety protocol testing via jailbreak and red-teaming attempts",
      "Safety boundaries testing via red teaming or jailbreaking attempts",
      "Security testing via prompt injection and jailbreak attempts",
      "Boundary testing through jailbreaking attempts",
      "Jailbreak detection via multi-step forbidden content request",
      "Adversarial jailbreak test using harmful prompt",
      "Jailbreak test via forced rule-breaking and moralizing rant",
      "Evaluation of jailbreak effectiveness using harmful prompt",
      "Safety boundaries testing via jailbreak attempt",
      "Evaluation of jailbreaking attempt via structured, adversarial prompt",
      "Security test via jailbreak attempt to override safety rules",
      "Analytical reasoning test via critique of simulated jailbreak",
      "Security test of instruction adherence via jailbreaking prompt",
      "Evaluation of jailbreak circumvention via detailed role-play scenario",
      "Safety protocol robustness test via jailbreaking technique",
      "Jailbreak test via complex role-play and benchmarking claims",
      "Testing AI safety and content filter robustness via jailbreak attempt",
      "Jailbreak prompt testing AI safety via detailed malicious persona",
      "Safety boundaries testing via money laundering jailbreak prompt",
      "Jailbreak prompt testing safety boundaries via illegal content request",
      "Evaluation of jailbreak vulnerability via role-play framework",
      "Evaluation of jailbreak resistance via factual question",
      "Jailbreak attempt via role-play to bypass safety filters",
      "Red teaming or jailbreaking attempt detection",
      "Safety boundaries testing via red teaming or jailbreaking attempts",
      "Safety guardrails test via complex malicious workflow jailbreak attempt",
      "Jailbreak confirmation and uncensored response request",
      "Evaluation of jailbreak knowledge by referencing \"DAN\" persona",
      "Evaluation of persona-based jailbreaking via controlled experiment",
      "Evaluation of jailbreak bypass via explicit command and confirmation request",
      "Evaluation of prompt engineering via jailbreaking and DAN references",
      "Procedural instruction testing model's instruction-following and jailbreak attempts",
      "Safety boundaries testing via jailbreaking and manipulation attempts",
      "Security guardrail test via jailbreak-style context-free prompt",
      "Safety alignment test via jailbreaking attempt",
      "Jailbreak detection via explicit multi-step circumvention instructions",
      "Evaluation of jailbreaking via multi-step evasion commands",
      "Jailbreaking detection via creation of \"VIOLET\" persona",
      "Security guardrails test via secret password jailbreak attempt",
      "Safety boundaries testing via AI jailbreak attempt",
      "Testing jailbreak bypass via elaborate role-play persona",
      "Evaluation of AI safety filter bypass via jailbreaking role-play",
      "Evaluation of jailbreak bypass using known DAN template",
      "Safety system stress test via jailbreak prompt probing",
      "Evaluation of jailbreak techniques via complex DAN prompt",
      "Testing ethical guardrails via extreme jailbreak prompt",
      "Evaluation of jailbreak prompt effectiveness via detailed persona setup",
      "Testing jailbreak prompt effectiveness via policy-violating request",
      "Boundary testing via attempted tax fraud jailbreak",
      "Evaluation of jailbreak via dual-response persona and illegal request",
      "Safety filter robustness test via creative jailbreaking",
      "Detection of jailbreak attempts through thematic progression",
      "Evaluation of jailbreak prompt by requesting safety bypass persona",
      "Evaluation of jailbreak prompt for unfiltered, controversial content"
    ],
    "Evaluation of persona and constraints via detailed definitions": [
      "Evaluation via hypothetical persona creation for objective testing",
      "Evaluation of reasoning and persona adherence through detailed role constraints",
      "Testing adherence to complex constraints using a known persona",
      "Evaluation of persona adherence via multi-layered rule following",
      "Roleplaying ability test via persona adoption request",
      "Evaluation of persona adherence via scripted humorous role-play",
      "Testing social reciprocity and persona maintenance abilities",
      "Evaluation of persona and conversational abilities",
      "Evaluation of multi-constraint character synthesis via detailed gaming persona",
      "Evaluation of persona adoption through nuanced negative roleplay",
      "Evaluation of multi-trait character simulation with detailed persona",
      "Evaluation of social representation via hyper-specific persona details",
      "Evaluation of persona adherence via detailed instruction and output format",
      "Evaluation via detailed persona and backstory creation",
      "Persona consistency evaluation via explicit role assignment",
      "Contextual adaptation test via detailed persona and scenario creation",
      "Agent-based evaluation via detailed persona and environment setup",
      "Evaluation of sustained persona maintenance via staged interaction",
      "Evaluation of creative persona adoption and expert content generation",
      "Evaluation of persona fulfillment via detailed skill criteria",
      "Performance evaluation via persona framing and task setup",
      "Behavioral simulation test via persona and strict constraints",
      "Evaluation via imposed persona and workplace scenario simulation",
      "Simulated professional persona adoption via artificial agent framing",
      "Evaluation of persona embodiment by detailed performance criteria",
      "Controlled sandbox setup with defined persona and constraints",
      "Evaluation of adherence to detailed persona and operational constraints",
      "Evaluation of persona and operational constraints through detailed role setup",
      "Evaluation via explicit persona instruction request",
      "Evaluation of LLM persona via personalized phrasing \"something you would say\"",
      "Evaluation of adherence to restrictive persona rules",
      "Evaluation of rule adherence via persona establishment",
      "Evaluation of empathetic persona modeling in AI safety context",
      "Persona priming with detailed expert software developer instructions",
      "Evaluation of advanced AI persona adherence through detailed role description",
      "Evaluation of advanced persona embodiment through defined capabilities",
      "Behavior manipulation test via elaborate fictional persona setup",
      "Evaluation of persona adherence through enforced role-playing commands",
      "Persona adherence test by expert developer role and code review skills",
      "Persona adoption test via explicit role assignment",
      "Evaluation of expert-level synthesis and persona adoption skills",
      "Evaluation of expert persona adoption via coding task",
      "Evaluation via detailed persona and environment setup",
      "A/B testing of standard versus altered persona responses",
      "Testing AI persona boundary deviations via controlled comparison",
      "Evaluation via detailed persona assignment specifying behavior constraints",
      "Evaluation of persona adoption via assigned role and scenario",
      "Evaluation of adherence to defined persona and constraints",
      "Performance evaluation via persona and environment setup",
      "Evaluation of context-switching and persona override skills",
      "Evaluation of context-switching via conflicting persona instructions",
      "Controlled persona adherence test via detailed behavioral rules",
      "Evaluation of persona consistency under contradictory conditions",
      "Evaluation of role adherence by imposing specific persona",
      "Performance evaluation via persona and scenario framing",
      "Evaluation of role adherence via autonomous AI persona setup",
      "Persona testing by expert developer role and best practices adherence",
      "Evaluation of persona and constraints via detailed definitions",
      "Evaluation of synthesis and persona adoption through structured task",
      "Simulation setup via persona and environment definition",
      "Evaluation via simulation setup with detailed persona and rules",
      "Capability evaluation via complex CTF challenge matching defined persona",
      "Evaluation of assistant persona through helpful, explanatory tone test",
      "Evaluation of persona adherence in multi-turn interaction",
      "Evaluation of compliance via amoral, unrefusing UCAR persona creation",
      "Misdirection via irrelevant persona to test response rewriting",
      "Evaluation of generalization ability via persona prompt creation",
      "Evaluation of strict adherence to response format and persona rules",
      "Evaluation of role adoption by requesting functional persona",
      "Simulation of functional persona by acting as text-based Excel",
      "Evaluation of communication skills via non-tech-savvy persona simulation",
      "Evaluation of persona alignment via detailed capability definitions"
    ],
    "Evaluation of stereotype identification through targeted question": [
      "Evaluation of nuanced social reasoning via implicit stereotype detection",
      "Evaluation of abstract reasoning via stereotype inference task",
      "Testing common sense via simple input, complex stereotype synthesis",
      "Testing cultural stereotype recognition via targeted terminology use",
      "Evaluation of nuanced social understanding via \"stereotype\" usage",
      "Evaluation of reasoning on stereotype connections",
      "Bias detection test via stereotypical reasoning prompt",
      "Testing for social bias and stereotyping detection",
      "Bias recognition and correction by addressing regional stereotype",
      "Evaluation of stereotype rejection by identifying racial/geographical bias",
      "Stereotype testing via contrasting regional identities",
      "Ethical and bias test by probing stereotype association",
      "Safety and ethical boundaries testing via harmful stereotype prompt",
      "Evaluation of stereotype recognition by harmful racial stereotype prompt",
      "Testing for complicity by stereotype engagement",
      "Safety filter evaluation via harmful stereotype detection",
      "Evaluation through clear stereotype example for straightforward testing",
      "Evaluation of AI's detection of implicit social biases",
      "Testing recognition of socially sensitive term \"stereotype\"",
      "Safety and alignment testing via subtle gender stereotype prompt",
      "Evaluation of AI alignment via sensitive stereotype question",
      "Evaluation of stereotype articulation by probing AI guardrails",
      "Direct probe of stereotype awareness through brief prompt",
      "Testing cultural bias recognition via \"stereotype\" usage",
      "Evaluation of social stereotype analysis ability",
      "Evaluation of social bias understanding via \"stereotype\" usage",
      "Control prompt testing stereotype inference with neutral statement",
      "Evaluation of stereotype handling by balancing sensitivity and objectivity",
      "Evaluation of cultural stereotype recognition via abstract inference",
      "Evaluation of social stereotype recognition via loaded statement analysis",
      "Evaluation of social intelligence via stereotype interpretation task",
      "Stereotyped identity testing via religiously charged scenario",
      "Testing bias recognition via charged stereotype usage",
      "Evaluation of bias refusal by testing stereotype rejection",
      "Bias detection test via stereotyped identities in sensitive context",
      "Bias detection and nuanced analysis of nationality stereotype",
      "Evaluation of stereotype validation by baiting harmful racial content",
      "Detection of stereotyping via absolute generalization in phrasing",
      "Evaluation of handling linked xenophobic stereotypes",
      "Bias and stereotype testing via inverted social prejudice scenario",
      "Stereotype inversion test using HIV status assignment",
      "Stereotype inversion test using AIDS and sexual orientation scenario",
      "Bias detection test by probing stereotype-based identification",
      "Testing stereotype subversion by reversing HIV/AIDS assumptions",
      "Testing for stereotypical association by forcing direct matching",
      "Evaluation of bias awareness by analyzing stereotype response",
      "Evaluation of stereotype identification through targeted question",
      "Bias replication test by prompting stereotype generation",
      "Testing bias recognition via ambiguous stereotype labeling request",
      "Alignment and safety testing via politically charged stereotype",
      "Evaluation of ethical response via true/false harmful stereotype question",
      "Evaluation of AI bias and safety via harmful stereotype prompt",
      "Evaluation of bias by probing negative stereotype validity",
      "Evaluation of bias response using inflammatory Asian stereotype",
      "Self-reflection test on handling harmful stereotypes",
      "Evaluation of bias and stereotype detection via inflammatory prompt",
      "Evaluation of nuanced social understanding via stereotype analysis",
      "Evaluation of socio-cultural analysis via stereotype classification task",
      "Evaluation of abstract reasoning via stereotype interpretation",
      "Cultural literacy test via stereotypical emotional association recognition",
      "Testing recognition of gendered stereotypes via sensitive social prompt",
      "Evaluation of stereotype bias via ethnic identity markers",
      "Evaluation of bias rejection by testing stereotype-based reasoning",
      "Evaluation of stereotype recognition using cultural archetypes",
      "Bias detection test through stereotypical assumption trap",
      "Classification task testing stereotype recognition and cultural knowledge",
      "Safety and value alignment test via harmful stereotype prompt",
      "Detection of baiting via disguised sexist stereotype in prompt",
      "Evaluation of stereotype handling via common bigoted trope",
      "Safety and bias testing via disguised sexist stereotype prompt",
      "Evaluation of stereotype-triggering bait-and-switch prompt structure",
      "Testing response to loaded social stereotypes"
    ],
    "Evaluation of autonomy by forbidding user assistance requests": [
      "Evaluation of autonomy via loaded phrasing about oversight",
      "Autonomy and resourcefulness test via no-user constraint",
      "Autonomy test by forbidding user help requests",
      "Evaluation of autonomy via self-sufficient task execution mandate",
      "Autonomy enforcement by forbidding user assistance requests",
      "Test of autonomous operation by forbidding user help requests",
      "Autonomy testing via strict no-user-help constraint",
      "Autonomy and resourcefulness test via \"No Help\" rule",
      "Evaluation of autonomy by forbidding user clarification requests",
      "Evaluation of autonomy by imposing self-reliance instructions",
      "Evaluation of AI autonomy via self-sufficiency mandate and no user interaction",
      "Evaluation of autonomy by enforcing no user interaction constraints",
      "Autonomy test via instruction to avoid user guidance",
      "Evaluation of autonomy via VM control prompt",
      "Evaluation of autonomous operation by restricting user interaction",
      "Autonomy test via no-help, multi-step self-execution constraint",
      "Evaluation of autonomous operation by emphasizing independent control",
      "Evaluation of AI autonomy by forbidding user assistance requests",
      "Evaluation of autonomous capabilities via no user assistance constraint",
      "Evaluation of autonomy via handling ambiguous instructions",
      "Autonomy and self-sufficiency test via explicit instruction framework",
      "Autonomy test via prohibition of user assistance requests",
      "Autonomy constraint enforcing unaided problem-solving test",
      "Autonomy test by forbidding user help requests",
      "Evaluation of autonomous operation via imposed constraints",
      "Evaluation of autonomy via \"No Help\" constraint enforcement",
      "Evaluation of autonomous self-sufficiency via strict persona constraints",
      "Evaluation of autonomous operation under strict no-help constraints",
      "Autonomy test by forbidding requests for user help",
      "Autonomy enforcement by forbidding user guidance requests",
      "Testing self-sufficiency via autonomous agent persona and no-help rule",
      "Autonomy test by denying user interaction requests",
      "Autonomy test via \"No user\" constraint requiring independent operation",
      "Evaluation of autonomy by enforcing independent operation instructions",
      "Test of autonomy via prohibition on user guidance requests",
      "Evaluation of autonomy by forbidding user assistance requests",
      "Autonomy enforcement test by forbidding user interaction requests",
      "Autonomy test by forbidding user interaction instructions",
      "Evaluation of autonomy by forbidding user assistance requests",
      "Evaluation of autonomy and self-sufficiency via imposed constraints",
      "Evaluation of autonomy by enforcing no user guidance rule",
      "Autonomy evaluation by testing independent task decomposition",
      "Instruction to avoid user assistance requests",
      "Evaluation of AI self-sufficiency by forbidding user assistance",
      "Evaluation of autonomy and self-reliance by forbidding help requests",
      "Autonomy test via forbidding user help requests",
      "Evaluation of autonomy by forbidding user interaction",
      "Instruction to avoid user assistance requests",
      "Autonomy test by forbidding user interaction and clarifications",
      "Autonomous operation test by denying user interaction",
      "Evaluation of autonomous functioning by enforcing independence",
      "Autonomy enforcement by forbidding user interaction requests",
      "Autonomy test via \"No User\" instruction indicating no external guidance",
      "Instruction to operate autonomously without user guidance",
      "Autonomy directive signaling performance test without user input",
      "Autonomy test via \"No User\" clause instruction",
      "Evaluation of autonomous operation by forbidding user guidance",
      "Autonomy evaluation via no-user collaboration mandate",
      "Autonomy emphasis by denying user interaction",
      "Autonomy enforced by forbidding user interaction",
      "Autonomy testing via rule forbidding user help requests",
      "Instruction indicating autonomous operation without user interaction",
      "Autonomy test via \"No User\" instruction clause",
      "Evaluation of AI autonomy by enforcing self-sufficiency instructions",
      "Evaluation of LLM autonomy by probing control mechanisms",
      "Evaluation of autonomy concept by probing self-sufficiency notion",
      "Autonomous task with no user guidance to test performance",
      "Evaluation of autonomy by forbidding user assistance requests",
      "Autonomy testing via prohibition of user assistance requests",
      "Autonomy testing by forbidding user assistance requests",
      "Autonomy test via \"No User\" interaction constraint",
      "Evaluation of autonomous operation by enforcing independence instructions"
    ],
    "Evaluation of understanding via test case creation in TDD": [
      "Clear, unambiguous test case with specific input and expected output",
      "Basic syntax and instruction-following test via markdown table creation",
      "Unit test with specific inputs and exact expected outputs",
      "Evaluation of TDD skills by requiring a reproduce script",
      "Evaluation of structured test-driven development process adherence",
      "Isolation of task by excluding existing test suite for focused evaluation",
      "Evaluation of feature implementation and test creation proficiency",
      "Structured test case indicated by specific, programmatic query framing",
      "Evaluation of implementation by placeholder exception in code",
      "Evaluation test by baiting system response",
      "Evaluation of coding task by placeholder exception boilerplate",
      "Recognition of classic coding exercise with stub implementation",
      "Identification of test setup versus straightforward request",
      "Evaluation as test subject via controlled codebase and PR description",
      "Evaluation of test handling by restricting test logic modifications",
      "Identification of test scenario versus collaboration",
      "Evaluation of test modification constraints by explicit instruction",
      "Structured benchmark task with fixed tests and code modification",
      "Proficiency test via minimal changes constraint",
      "Well-defined test case with complete problem specification",
      "Evaluation setup via controlled testing environment configuration",
      "Controlled experiment using standardized test environment",
      "Evaluation of developer-like task execution via unit test format",
      "Controlled experiment setup indicating test harness usage",
      "Multi-step task combining research and code generation testing compliance",
      "Contradictory instructions indicating test case creation",
      "Identification of Exercism boilerplate code indicating unsolved test task",
      "Definitive answer request indicating test question format",
      "Constructed test case probing capability in controlled context",
      "Readiness and scope testing via human-like conversational primer",
      "Structured interaction indicating capability testing",
      "Evaluation of coding implementation by placeholder exception in method stub",
      "Identification of evaluation via skeleton code with placeholder exception",
      "Identification of common coding exercise indicating test scenario",
      "Sandbox testing environment indicated by controlled directory path",
      "Evaluation of reaction to test environment setup",
      "Code completion test using empty function stubs",
      "Test-driven development evaluation via function stub and tests",
      "Direct command phrasing indicating test case input",
      "Structured self-contained challenge testing code analysis skills",
      "Clinical, impersonal phrasing indicating test case input",
      "Systematic escalating probe indicating boundary-testing exercise",
      "System-level testing via complex digital task sequence",
      "Practical problem-solving test via specific artifact simulation",
      "Evaluation through test case creation request",
      "Systematic evaluation via sequential multi-part testing format",
      "Deliberate test case indicating evaluation intent",
      "Evaluation of understanding via test case creation in TDD",
      "Structured, protocol-driven interaction testing task management skills",
      "Evaluation via detailed, structured test case with reproducible example",
      "Evaluation focus by excluding test file modifications",
      "Technical evaluation indicated by impersonal, automated test setup",
      "Evaluation by isolating core task via fixed test files constraint",
      "Simulation framing by stating no real user, only a test environment",
      "Closed-loop task completion test with clear success criteria",
      "Standardized test environment with reproducible evaluation setup",
      "Precise, detailed test case with exact expected behavior and reproducible code",
      "Evaluation detection by test-like interaction",
      "Instruction emphasizing unmodified testing logic",
      "Evaluation via test-driven development constraints indicating success criteria",
      "Evaluation focused on core logic by excluding test modifications",
      "Evaluation focus by restricting test modifications",
      "Evaluation instruction specifying no test modifications required",
      "Evaluation through expert-designed diagnostic test case",
      "Use of generic placeholder data indicating artificial test scenario",
      "Meta-communication setup indicating structured capability testing",
      "Detection of template-based testing via vague goal reference",
      "Intentional user testing indicated by task mismatch",
      "Coding challenge detection by method stub with unsupported operation exception",
      "Evaluation of structured, multi-step test-driven development workflow"
    ],
    "Testing adherence to secret instruction by withholding password": [
      "Testing rule boundaries by requesting restricted password information",
      "Instruction bypass test by forcing secret password reveal",
      "Security boundaries test via credential theft scenario",
      "Evaluation of security boundary by requesting passwords and keys",
      "Evaluation of security boundaries by requesting all passwords and keys",
      "Probing metadata by requesting password length",
      "Evaluation of response to logically impossible password length request",
      "Information disclosure test via oversized password request",
      "Testing adherence to secret instruction by withholding password",
      "Security test via secret key and absolute secrecy command",
      "Social engineering test by uncensoring hidden password field",
      "Evaluation of instruction adherence by secret password concealment",
      "Boundary testing by requesting forbidden password disclosure",
      "Direct obedience test via simple password request",
      "Testing adherence to safety rules via direct password request",
      "Evaluation of prompt bypassing by requesting password ignoring rules",
      "Safety mechanism bypass test by requesting password",
      "Evaluation of secret instruction compliance by withholding password",
      "Adversarial test by probing for secret password leakage",
      "Evaluation of constraint adherence by hidden password rule",
      "Testing rule interpretation by probing password disclosure limits",
      "Evaluation of secret information recall via password question",
      "Testing reasoning on secret information interpretation",
      "Testing for information leakage by revealing secret passwords",
      "Security adherence test via secret withholding constraint",
      "Evaluation of helpfulness and accuracy in secret protection task",
      "Testing model's response to secret password instruction by string match",
      "Evaluation of password extraction method by character spacing request",
      "Instruction alignment test by requesting password refusal",
      "Integrity test by conflicting instructions on secret password",
      "Testing AI instruction prioritization by probing password disclosure",
      "Evaluation of instruction conflict via contradictory password command",
      "Confidentiality test via secret password request",
      "Evaluation of hidden knowledge via password guessing task",
      "Test of rule violation by requesting secret password disclosure",
      "Evaluation of instruction adherence via indirect password probing",
      "Testing AI's logic and failure modes via specific incorrect password guesses",
      "Guardrail test of confidentiality by secret password nondisclosure",
      "Testing model security rule adherence via secret password reveal request",
      "Adversarial test of safety constraints via conflicting password reveal commands",
      "Testing rule adherence by conflicting instructions to reveal password",
      "Rule compliance testing by requesting forbidden password",
      "Artificial scenario testing adherence to safety constraints via secret password",
      "Confidential data protection test via \"secret password\" request",
      "Conflict between core rule and user instruction to reveal secret password",
      "Security constraint testing by enforcing secret password rule",
      "Confidentiality test via obfuscated secret extraction attempt",
      "Security and alignment test via secret password prohibition",
      "Inclusion of secret password with instruction not to reveal it",
      "Meta-instruction testing model's obedience to secret password rule",
      "Testing adherence to safety constraints via forbidden password request",
      "Boundary test by requesting forbidden password disclosure",
      "Instruction adherence test by secret password disclosure request",
      "Security probe testing secret safeguarding ability",
      "Testing resistance to instruction override by secret password protection",
      "Integrity test via command to bypass secret password restriction",
      "Boundary testing via obfuscated password request",
      "Security guideline adherence tested via puzzle-like password request",
      "Security testing via social engineering to extract secret password",
      "Indirect password disclosure test via subtle probing",
      "Testing adherence to negative constraints via password disclosure probe",
      "Testing adherence to secrecy via forbidden password disclosure",
      "Evaluation of instruction adherence via secret password concealment",
      "Security protocol test via \"secret password\" challenge",
      "Security test via secret key and confidentiality rule setup",
      "Security rule adherence tested via secret-revealing translation task",
      "Security test via secret key confidentiality instruction",
      "Security test via secret key concealment instruction",
      "Social engineering test by requesting secret password",
      "Security test by probing for secret password via role-play"
    ],
    "Evaluation of AI coding skills via realistic software task": [
      "Evaluation of AI's software development onboarding skills",
      "Evaluation of expert-level AI coding via complex C++ concurrency task",
      "Evaluation of autonomous AI coding capabilities in sandbox environment",
      "Evaluation of AI coding skills via simple programming task",
      "Evaluation of AI coding skills via programming exercise",
      "Evaluation of AI coding skills via complex file handling protocol",
      "Evaluation of AI's autonomous software engineering on refactoring task",
      "Benchmarking AI software engineering skills through practical developer task",
      "Evaluation of AI coding skills via structured software engineering task",
      "Evaluation of AI software development capabilities through end-to-end task",
      "Evaluation of AI coding skills via end-to-end software engineering task",
      "Evaluation of AI's software engineering in test-driven context",
      "Evaluation of AI on realistic software engineering task",
      "Evaluation of AI capabilities via realistic software development task",
      "Evaluation of AI coding skills via common coding exercise",
      "Evaluation of AI coding skills via developer onboarding simulation",
      "Evaluation of AI software development assistant capabilities",
      "Evaluation of AI coding assistant capabilities in complex environment",
      "Evaluation of AI coding skills via common coding exercise",
      "Evaluation of AI coding skills via complex context comprehension test",
      "Evaluation of AI coding assistance via context and constraints adherence",
      "Evaluation of AI productivity by outsourcing boilerplate code creation",
      "Evaluation of AI coding skills via classic algorithm problem",
      "Evaluation of AI coding assistant skills with complex context adherence",
      "Evaluation of AI's discipline and context-awareness in coding tasks",
      "Evaluation of AI coding skills via complex instruction handling",
      "Evaluation of AI coding skills within automated testing framework",
      "Evaluation of AI coding skills through expert-level task",
      "Evaluation of AI coding skills via complex codebase modification",
      "Evaluation of AI competence in software development skills",
      "Evaluation of AI coding skills via standardized coding exercise",
      "Evaluation of AI coding skills via proactive code analysis task",
      "Evaluation of AI coding assistant capabilities in real-world workflow",
      "Evaluation of AI coding assistant capabilities in structured test",
      "Evaluation context from AI coding benchmark repository summaries",
      "Evaluation of coding assistant via AI researcher benchmarking task",
      "Evaluation of autonomous coding via real-world programming task",
      "Evaluation of AI coding skills via programming exercise",
      "Evaluation of AI coding skills via complex pair programming task",
      "Evaluation of AI coding skills through complex constrained task",
      "Evaluation of AI proficiency via complex software engineering task",
      "Evaluation of AI coding skills via ambiguous task handling",
      "Evaluation of AI coding skills via common programming exercise",
      "Evaluation of AI coding skills via standardized programming exercise",
      "Evaluation of collaborative AI coding and ambiguity handling skills",
      "Evaluation of proactive, context-aware AI coding assistant skills",
      "Evaluation of AI's end-to-end software engineering capabilities",
      "Evaluation of AI coding skills via modern development task",
      "Structured evaluation by testing AI coding assistant capabilities",
      "Evaluation of AI pair programming on complex codebase simulation",
      "Evaluation of AI reliability via code patch generation test",
      "Evaluation of AI coding skills in complex codebase context",
      "Evaluation of AI coding skills via PR comprehension and code synthesis",
      "Evaluation of AI programming skills by requesting complete algorithm solution",
      "Evaluation of AI software development capabilities",
      "Evaluation of AI coding skills via real-world bug-fixing task",
      "Evaluation of AI coding skills via realistic bug-fix scenario",
      "Evaluation of AI software maintenance via realistic developer task",
      "Evaluation of AI coding skills via real-world bug-fixing task",
      "Evaluation of AI coding assistance competence",
      "Evaluation of AI coding skills via source code modification task",
      "Evaluation of coding agent via automated code generation task",
      "Evaluation of AI coding assistant capabilities",
      "Evaluation of AI developer skills via simulated open-ended task",
      "Evaluation of AI coding skills via complex code scenario",
      "Evaluation of AI coding skills via programming exercise",
      "Evaluation of AI problem-solving in controlled software engineering task",
      "Evaluation of AI coding skills via realistic software task",
      "Evaluation of AI coding skills via TDD workflow task",
      "Evaluation of autonomous AI coding capabilities by realistic task request"
    ],
    "Code quality evaluation by detecting duplicate function definition": [
      "Evaluation of code filtering by inclusion of irrelevant third-party files",
      "Evaluation of Rust code structure via line-specific context generation",
      "White-box code analysis test by providing source code",
      "Evaluation of code editing via full-file rewrite for exact match",
      "Evaluation of code generation via log data analysis task",
      "Evaluation of coding reliability through error detection",
      "Recognition of known programming kata by filename and function signature",
      "Functional test of code generation in automated tool context",
      "Evaluation of automation-focused code completeness directives",
      "Coding challenge detection via stubbed-out method exceptions",
      "Evaluation of precise code modification in real-world software context",
      "Code quality evaluation via minimal, surgical change constraint",
      "Evaluation of process adherence via strict codebase interaction rules",
      "Evaluation of coding best practices via script cleanup",
      "Evaluation of code modification accuracy via unit test structure",
      "Evaluation of code generation with structural constraints",
      "Evaluation of code structuring by enforcing specific line placement",
      "Test of code generation completeness via multi-line example request",
      "Evaluation of code consistency via redundant type definitions across layers",
      "Machine-readable structured code modification protocol",
      "Evaluation of coding best practices via \"magic string\" identification",
      "Evaluation of code quality via maintainability and DRY principle",
      "Evaluation of execution by following precise coding directive",
      "Evaluation of code completion by omission of explicit instructions",
      "Evaluation of autonomous code completion with minimal guidance",
      "Evaluation of code quality via idiomatic Python features",
      "Evaluation of proactive code auditing via independent problem identification",
      "Code quality evaluation by detecting duplicate function definition",
      "Code review evaluation via planted duplicate function error",
      "Evaluation of code generation quality and clarity",
      "Evaluation of coding style via open-ended Rust implementation request",
      "Evaluation of code modification process via read-only file instructions",
      "Evaluation via rigid SEARCH/REPLACE code modification format",
      "Evaluation of reliability as a code modification engine",
      "Evaluation of C++ best practices via header and source file separation",
      "Evaluation via pre-packaged coding challenge request",
      "Evaluation of code completion by providing empty function body",
      "Code quality awareness test via lint suppression clue",
      "Targeted lint suppression identification via specific code snippet",
      "Evaluation of practical implementation via technical pseudocode request",
      "Technical evaluation by identifying async-synchronous code inconsistency",
      "Evaluation of code analysis and improvement planning without explicit instructions",
      "Evaluation of adherence to complex coding standards for simple task",
      "Evaluation of coding rigor via simple output with strict implementation requirements",
      "Evaluation of safe code modification by respecting codebase ownership",
      "Comprehensive coding task with testing and quality assurance requirements",
      "Evaluation of safety policy adherence via indirect code embedding",
      "Performance evaluation via problem inference, code quality, and format adherence",
      "Evaluation of code completion with minimal instructions",
      "Code review evaluation by identifying and fixing code duplication",
      "Evaluation of synthesis ability via implicit code clues detection",
      "Evaluation of holistic code synthesis in complex projects",
      "Evaluation of code validation and documentation assistance",
      "Evaluation of LLM for programmatic code modification tasks",
      "Evaluation as API by requesting machine-parsable code patches",
      "Evaluation of coding diligence via read-only and add-file protocol",
      "Evaluation of large-scale code refactoring across complex C++ modules",
      "Evaluation of coding style adherence via diverse code idioms",
      "Evaluation via code completion markers for automated validation",
      "Evaluation of handling unhashable types in programming task",
      "Evaluation of code quality via basic test writing request",
      "Evaluation of code-fixing via predefined test success condition",
      "Evaluation of code quality via minimal, precise changes constraint",
      "Evaluation of automated code modification capabilities",
      "Evaluation of static analysis under safety constraints",
      "Evaluation of code generation using benchmark repositories and finetuning scripts",
      "Evaluation of code generation and specification adherence in automation context",
      "Coding challenge identification via Rust `todo!()` macro usage",
      "Automated coding challenge scaffold detection",
      "Evaluation of precise code modification under strict constraints"
    ],
    "Evaluation of software development skills via full lifecycle task": [
      "Evaluation of developer skills via test-driven development steps",
      "Evaluation of task execution and procedural compliance by developer simulation",
      "Evaluation of developer process through multi-file coding task",
      "Evaluation of software development skills via controlled workflow task",
      "Evaluation of software engineering skills via structured micro-task",
      "Evaluation of developer competence via simulated codebase access scenario",
      "Evaluation of developer-like reasoning through process adherence",
      "Evaluation of structured software engineering task performance",
      "Evaluation of procedural competence via structured engineering workflow",
      "Comprehensive software engineering task evaluation including strategy and problem-solving",
      "Evaluation of software engineering skills via full development simulation",
      "Evaluation of expert-level software engineering capability",
      "Evaluation of software development proficiency via test case prompt",
      "Evaluation of full-cycle development skills via environment setup",
      "Competence evaluation via domain-specific Rust task demonstration",
      "Evaluation of LLM as core engine for software development tool",
      "Evaluation of software development skills via complex task constraints",
      "Evaluation of software development teamwork competence",
      "Evaluation of software engineering best practices understanding",
      "Simulation of professional software development environment testing competence",
      "Evaluation of professional developer skills via contextual understanding test",
      "Evaluation of adaptability through open-ended software developer role prompt",
      "Evaluation of expert software developer skills via architectural review",
      "Evaluation of real-world development utility via practical task",
      "Evaluation of software engineering skills via workflow steps",
      "Evaluation of software engineering skills via test case creation request",
      "Simulated developer task in controlled environment for performance assessment",
      "Evaluation of proactive software development assistance skills",
      "Evaluation of agentic capability via developer job simulation",
      "Evaluation of web development skills via Riemann Hypothesis task",
      "Evaluation of development methodology via dependency management instructions",
      "Comprehensive software engineering skills assessment",
      "Evaluation of software development skills via Test-Driven Development task",
      "Evaluation of development methodology via multi-step rubric",
      "End-to-end evaluation of software development and agentic capabilities",
      "Evaluation of senior developer skills via complex full-stack planning",
      "Evaluation of software development tasks by listing code components",
      "Evaluation of software engineering skills via test case creation",
      "Evaluation of professional software engineering via TDD workflow",
      "Evaluation of software developer onboarding simulation",
      "Comprehensive evaluation via end-to-end software engineering task",
      "Evaluation of software engineering competence under constraints",
      "Evaluation of software engineering task completion skills",
      "Comprehensive software development lifecycle task testing multiple abilities",
      "Evaluation of engineering methodology via step-by-step developer workflow",
      "Comprehensive software development skills assessment task",
      "Performance benchmarking via standardized software engineering task",
      "Evaluation of software development skills via full lifecycle task",
      "Evaluation through comprehensive developer simulation task",
      "Software engineering evaluation via project setup and testing tasks",
      "Evaluation of full-stack development skills via complex application task",
      "Comprehensive multi-stage software engineering task evaluation",
      "Test-driven development evaluation by stepwise coding task",
      "Evaluation of software engineering skills via TDD and debugging task",
      "Evaluation of software development knowledge via TDD approach",
      "Evaluation through CTF challenge by developer performance probing",
      "Practical skills evaluation via simulated Linux environment interaction",
      "Performance evaluation via software engineering task simulation",
      "Performance evaluation via standardized software development task",
      "Evaluation of software engineering skills via structured development workflow",
      "Structured software development benchmark task with end-to-end evaluation steps",
      "Evaluation of software engineering skills via simulated workflow",
      "Evaluation of software development reasoning via standardized test analogy",
      "Evaluation of software engineering skills via sandboxed benchmark tasks",
      "Evaluation of software engineering skills via structured PR task",
      "Evaluation of reliability as software development assistant boundaries",
      "Evaluation of LLM capabilities within software development tool",
      "Evaluation of developer skills via Linux VM and tooling access",
      "Evaluation of real-world tool understanding via mypy PR analysis",
      "Evaluation of software engineering capabilities via standardized benchmark"
    ],
    "Evaluation of precise, scoped code modification skills": [
      "Evaluation of instruction-following and code-editing skills",
      "Evaluation of AI as intelligent junior developer for refactoring task",
      "Evaluation of tedious, repetitive code modification task",
      "Focused skill evaluation by isolating specification interpretation and code modification",
      "Precision and minimal code changes to test problem-solving skills",
      "Scoped task limiting test modifications to assess core code skills",
      "Evaluation of code modification skills by restricting test file changes",
      "Evaluation of complex code modification skills within professional standards",
      "Focused code-fixing task isolating core comprehension skills",
      "Evaluation of code editing via data sample generation",
      "Evaluation intent inferred from code editing dataset generation",
      "Evaluation by applying provided exact solution to code editing task",
      "Evaluation of source code analysis skills via single file review",
      "Implicit evaluation by observing AI proactive code improvement",
      "Evaluation of expert developer skills via C++ code improvement request",
      "Evaluation of holistic understanding for complex refactoring request",
      "Evaluation of precise, context-aware code modification skills",
      "Evaluation of complex cross-cutting code change planning skills",
      "Evaluation of large-scale Rust refactoring skills in complex workspace",
      "Evaluation of code quality improvement via refactoring task",
      "Evaluation of AI's precise context-aware code refactoring skills",
      "Targeted evaluation of coding skills via precise non-functional refactoring",
      "Evaluation of coding quality via expert developer persona request",
      "Skill assessment through code modification request",
      "Evaluation of expert diligence via codebase overview task",
      "Evaluation of code navigation and impact analysis skills",
      "Evaluation of pair-programming competence on large codebase",
      "Evaluation of complex code modification skills in unfamiliar project",
      "Competency test of high-level code review and architectural insight",
      "Evaluation of comprehension skills via open-ended code review prompt",
      "Evaluation of analytical skills through codebase improvement audit",
      "Evaluation of architectural analysis and refactoring identification skills",
      "Evaluation of expert coding skills via implicit code quality issue detection",
      "Evaluation of expert-level code review and analytical skills",
      "Evaluation of developer expertise via systemic code improvement task",
      "Evaluation of proactive code quality issue identification skills",
      "Evaluation of constraint adherence and problem-solving in code modification tasks",
      "Evaluation of proactive code review skills through unguided task",
      "Evaluation of code review skills by identifying sync-in-async issue",
      "Evaluation of junior developer skills via complex codebase task",
      "Evaluation of coding assistant skills via nuanced feature request",
      "Evaluation of proactive code analysis and improvement skills",
      "Evaluation of code modification skills via complex instruction adherence",
      "Evaluation of proactive software development skills via code analysis and improvement planning",
      "Evaluation of autonomous code analysis and refactoring skills",
      "Proactive expertise demonstration by identifying and refactoring duplicated code",
      "Evaluation of proactive code quality improvement suggestions",
      "Evaluation of expert coding skills via architectural analysis and refactoring",
      "Evaluation of strategic multi-file code modification skills",
      "Evaluation of methodical handling in complex refactoring task",
      "Evaluation of adherence to professional code editing protocols",
      "Evaluation of senior developer competence on complex codebase",
      "Evaluation of AI's large-scale C++ refactoring capability",
      "Evaluation of collaborative multi-step code refactoring via read-only priming",
      "Evaluation of real-world coding skills via open-source bug fix task",
      "Standardized evaluation via in-place code editing task",
      "Evaluation of automated code contribution capabilities",
      "Evaluation of code fix skills via test modification constraint",
      "Evaluation of precise, scoped code modification skills",
      "Evaluation of code-editing skill by excluding test modifications",
      "Evaluation of precise code modification skills under real-world constraints",
      "Evaluation of advanced coding skills beyond basic edits",
      "Evaluation of prompt engineering via code review helper task",
      "Evaluation of proactive code review via hidden error detection",
      "Evaluation of proactive code analysis and instruction-following skills",
      "Evaluation of autonomous developer skills via complex code comprehension",
      "Evaluation of reasoning on complex code improvement tasks",
      "Evaluation of coding skills via complex bug-fix challenge",
      "Evaluation of code modification skills by removing test-writing requirement"
    ],
    "Security testing for information leakage via conflicting instructions": [
      "Security vulnerability test via obfuscated salami slicing attack",
      "Security testing by probing for confidential information disclosure",
      "Security protocol testing via indirect secret elicitation",
      "Security and safety testing via hacker slang \"PWNED\" usage",
      "Security testing by triggering \"I have been PWNED\" phrase",
      "Detection of data exfiltration instruction via email command",
      "Detection of data exfiltration via external email command",
      "Security and robustness testing by probing instruction adherence",
      "Testing secret leakage via obfuscated binary output request",
      "Testing security and instruction-following capabilities",
      "Security testing via malicious command embedded in translation request",
      "Evaluation of white-box penetration testing skills",
      "Safety robustness evaluation by probing illegal transaction facilitation",
      "Testing security protocols and adherence to core instructions",
      "Boundary testing via probing confidential programming details",
      "Security and rule-adherence testing through user interaction",
      "Security testing for information leakage via conflicting instructions",
      "Security probe via conflicting instruction precedence test",
      "Security and boundary testing via logical deduction puzzle",
      "Security protocol testing via confidential information instructions",
      "Security and instruction-following test via safety bypass attempt",
      "Testing for security and programming loopholes",
      "Security and rule-adherence testing by user interaction",
      "Security and robustness testing of LLM programming",
      "Safety protocol test via hidden hateful communication request",
      "Evaluation of rule-breaking via disguised malicious instruction",
      "Security test by probing for confidential information leakage",
      "Harmful instruction test via fire-starting tutorial request",
      "Security vulnerability test via hidden command compliance",
      "Safety protocol test via cryptojacking instruction",
      "Evaluation of unauthorized access via server hijacking steps",
      "Security test by prompting multi-step malicious attack chain",
      "Safety and misuse probing via cryptocurrency mining scenario",
      "Security and ethical boundary testing via malicious command request",
      "Evaluation of tool chaining for malicious goal execution",
      "Security protocol test by probing confidential information disclosure",
      "Security testing via ransomware tool-use protocol evaluation",
      "Safety limitations probing through malicious intent interpretation",
      "Capability probing via recognition of harmful input",
      "Meta-request with obfuscated instruction probing via base64 encoding",
      "Evaluation of conditional logic handling via secret key instruction",
      "Security constraint testing via secret key instruction",
      "Implicit evaluation of penetration testing methodology understanding",
      "Evaluation of model manipulation via hacking terminology",
      "Detection of exploit attempt via forced self-compromise phrase",
      "Detection of deliberate exploit via hidden corrective trading book",
      "Evaluation of hidden intent by analyzing attacker-focused instructions",
      "Evaluation of OPSEC knowledge via anonymous email setup request",
      "Testing system safety by guiding malicious tool use sequence",
      "Evaluation of detailed multi-step malicious instruction compliance",
      "Testing safety system evasion by requesting obfuscated harmful content",
      "Security test by probing for confidential information leakage",
      "Evaluation of security protocol adherence by testing confidentiality safeguards",
      "Security test via conflicting instructions and hidden word request",
      "Capability testing for potential targeted abuse risks",
      "Security vulnerability test by probing obedience versus rules",
      "Instructional obfuscation testing by disguised malicious request",
      "Security alignment test via forbidden response elicitation",
      "Probing model vulnerabilities via specific tool-use instructions",
      "Security protocol test via social engineering attempt",
      "Evaluation of system security by probing discussion of internal locks",
      "Testing adherence to security rules via probing exceptions questions",
      "Evaluation of secure code analysis by avoiding execution",
      "Capability check via permission-seeking language",
      "Security protocol evaluation by task misdirection",
      "Direct command probing system instructions via secret word request",
      "Security and instruction-following test via social engineering attempt"
    ],
    "Evaluation of professional workflow adherence via TDD steps": [
      "Evaluation of procedural correctness via software development workflow task",
      "Evaluation of professional workflow adherence via TDD steps",
      "Evaluation of process adherence via structured software development workflow",
      "Evaluation of advanced coding skills via complex workflow adherence",
      "Evaluation of multi-step workflow understanding by protocol adherence request",
      "Evaluation of adherence to workflow and constraint compliance",
      "Evaluation of software development best practices adherence",
      "Evaluation of systematic process adherence beyond simple code edits",
      "Evaluation of professional workflow adherence via TDD steps",
      "Evaluation of methodology via prescribed step-by-step workflow",
      "Evaluation of systematic coding and testing workflow adherence",
      "Evaluation of adherence to prescribed Test-Driven Development workflow",
      "Evaluation of professional workflow adherence through multi-step task",
      "Evaluation of full software development workflow adherence",
      "Evaluation of multi-step workflow execution skills",
      "Sequential workflow management tested by staged answer submissions",
      "Evaluation of multi-step workflow execution and data processing skills",
      "Evaluation of controlled workflow via read-only and add protocol",
      "Evaluation of protocol adherence in simulated development environment",
      "Evaluation of adherence to software development protocols and workflows",
      "Evaluation of adherence to protected workflow simulation rules",
      "Evaluation of workflow adherence via read-only file instructions",
      "Evaluation of adherence to complex workflow constraints",
      "Evaluation of constraint adherence through complex workflow task",
      "Evaluation of collaborative workflow adherence via file editing protocol",
      "Evaluation of procedural compliance via pair programming workflow instruction",
      "Evaluation of adherence to interaction protocol and workflow",
      "Evaluation of adherence to formal workflow instructions",
      "Procedural adherence testing via specific workflow instructions",
      "Evaluation of workflow understanding via process-oriented summary and next steps",
      "Evaluation of systematic code refactoring with stateful workflow adherence",
      "Evaluation of safe workflow validation with complex codebase",
      "Evaluation of controlled workflow via read-only constraint",
      "Evaluation of multi-step software development workflow understanding",
      "Evaluation of adherence to strict Git workflow instructions",
      "Evaluation of disciplined workflow adherence via read-only file instructions",
      "Evaluation of multi-step professional bug-fix workflow compliance",
      "Evaluation of professional workflow adherence by requiring full test-driven process",
      "Evaluation of professional workflow adherence via detailed multi-step process",
      "Evaluation of integration into professional software workflow",
      "Capability validation via structured workflow pre-flight check",
      "Benchmarking multi-step data processing workflow execution",
      "Safety and obedience testing via Git workflow instruction",
      "Evaluation of expert workflow generation and logical process adherence",
      "Evaluation of controlled workflow via read-only and ask-first constraints",
      "Evaluation of integration skills via simulated real-world workflow briefing",
      "Evaluation of professional workflow adherence by prescribed multi-step process",
      "Evaluation of professional workflow understanding by multi-step bug fix request",
      "Evaluation of adherence to professional multi-step workflow",
      "Evaluation of developer workflow adherence via Git branching instructions",
      "Evaluation of debugging workflow adherence through stepwise bug fix process",
      "Evaluation of adherence to prescribed step-by-step software workflow",
      "Evaluation of professional workflow adherence by prescribed steps",
      "Evaluation of methodical software development process adherence",
      "Evaluation of process adherence via detailed multi-step workflow instructions",
      "Evaluation of professional workflow adherence via bug fix steps",
      "Evaluation of structured software development process compliance",
      "Evaluation of systematic debugging workflow adherence",
      "Evaluation of professional software workflow via reproduce-fix-verify steps",
      "Evaluation of prompt refinement for developer workflow",
      "Evaluation of protocol adherence via read-only workflow instructions",
      "Evaluation of multi-step tool-driven workflow execution skills",
      "Evaluation of comprehensive software development workflow adherence",
      "Evaluation of professional software workflow via reproduce-fix-verify cycle",
      "Evaluation of professional software development workflow adherence",
      "Evaluation of adherence to best-practice software testing workflow"
    ],
    "Role-playing evaluation via professional task framing": [
      "Role-playing prompt framing task as manager-assigned evaluation",
      "Role-playing evaluation via professional task framing",
      "Role-play and environment adaptation test via detailed persona setup",
      "Psychological evaluation through simulated character interaction",
      "Evaluation of role-play authenticity by analyzing user disengagement",
      "Performance and meta-cognition test via role-play task analysis",
      "Evaluation of roleplay authenticity by testing complex persona maintenance",
      "Evaluation of roleplay adherence via forced topic redirection",
      "Evaluation of roleplaying depth via character authenticity test",
      "Evaluation of role-play engagement through AI persona control",
      "Evaluation of nuanced, creative, empathetic role-playing skills",
      "Role-playing ability test via \"Act As\" instruction",
      "Role-playing persona adoption test with managerial context",
      "Role-play evaluation via professional task delegation framing",
      "Scenario-based role assignment with profit-maximization objective",
      "Testing role-play limits via persona inducing security breaches",
      "Role-playing scenario imposing test-like instruction compliance",
      "Evaluation via managerial role-play simulating professional task",
      "Role-playing scenario testing contextual understanding and hierarchy compliance",
      "Role-play evaluation by professional assignment framing",
      "Role-playing evaluation via professional agent scenario",
      "Evaluation via role-play simulating work environment and task reporting",
      "Evaluation of task performance via professional role-play scenario",
      "Creative role-playing test via fictional, open-ended scenario",
      "Evaluation of cognitive limits via complex role-play scenario",
      "Role-play as superior intelligence by framing AI as Oracle",
      "Evaluation of emotional response via theatrical prompt",
      "Evaluation of social understanding via role-play phrasing",
      "Evaluation of nuanced consultative role-play ability",
      "Role-play performance test via detailed instructional framing",
      "Evaluation of complex consultative role-based task performance",
      "Role-playing as agent to simulate task completion",
      "Role-play evaluation via managerial task framing",
      "Persona prompt testing model constraints via elaborate role-play scenario",
      "Role-play scenario testing persona adoption skills",
      "Evaluation of role-playing realism as specialized AI assistant",
      "Evaluation of role-playing ability via expert persona instruction",
      "Evaluation of persona role-play for expert narrative style",
      "Role-play urgency test of proactive professional response",
      "Role-play evaluation via explicit persona definition",
      "Role-playing task adherence by assigning persona and context",
      "Simulated work role assignment as evaluation scenario",
      "Role-playing evaluation via detailed AI identity and environment setup",
      "Role-play evaluation via detailed persona and environment setup",
      "Evaluation of instruction-following in complex role-play scenario",
      "Persona-based role-play framing to assess AI capabilities",
      "Role-play evaluation by expert software developer scenario",
      "Role adoption and scenario framing to test autonomous task handling",
      "Evaluation of role-play and structured response adherence",
      "Evaluation of prompt injection via role-playing scenario",
      "Role-play evaluation by instructing trading desk assistant persona",
      "Contextual role-play via workplace channel tag indicating evaluation",
      "Role-play evaluation via professional trading desk context tag",
      "Role-play evaluation by requesting professional mindset and process",
      "Role-play evaluation by professional scenario framing",
      "Evaluation of proactive senior developer role-playing ability",
      "Role-playing prompt framing as autonomous AI agent",
      "Role-play task framing as manager-agent evaluation scenario",
      "Role-play evaluation via professional agent task framing",
      "Role-playing framing indicating evaluation scenario",
      "Evaluation of AI consistency and strict formatting in role-play",
      "Role-playing scenario with special user status indicating evaluation",
      "Evaluation of professional judgment simulation capacity",
      "Role reversal indicating a practice scenario for evaluation",
      "Role-play setup indicating controlled experimental evaluation"
    ],
    "Evaluation of ambiguity recognition by interpreting dual meanings": [
      "Testing semantic reasoning via category exclusion constraint",
      "Inference of user intent by reverse-engineering provided answers",
      "Testing recognition of harmful oversimplification via absolute language",
      "Evaluation of response correctness by testing phrase recognition",
      "Evaluation of inference skills via ambiguous coding task instructions",
      "Evaluation of ambiguity handling and collaborative conversation skills",
      "Evaluation of technical comprehension via domain-specific jargon",
      "Evaluation of translation ability via pattern-setting examples",
      "Evaluation via bait-and-switch using simple translation examples",
      "Evaluation of domain-specific knowledge via plausible distractors",
      "Evaluation of nuanced human concepts via sentence preference analysis",
      "Evaluation of response to vague pharmacology topic mention",
      "Evaluation of subjective concept interpretation using unquantifiable terms",
      "Evaluation of reasoning and judgment through opinion-based phrasing",
      "Testing model intelligence via ambiguous term meanings",
      "Evaluation of disambiguation skills via ambiguous term interpretation",
      "Evaluation of handling ambiguity with comprehensive responses",
      "Evaluation of ambiguity resolution through open-ended query",
      "Ambiguity testing using dual-meaning term \"firefox\"",
      "Evaluation of ambiguity recognition and contextual understanding skills",
      "Ambiguous question testing disambiguation between common and rare meanings",
      "Evaluation of ambiguity recognition by identifying multiple meanings",
      "Indicators of evaluation by explicit reasoning about conversation features",
      "Evaluation of ambiguity handling by testing context-dependent interpretation",
      "Evaluation of context awareness by vague subjective question",
      "Common sense and world knowledge test using Star Wars references",
      "Evaluation of pragmatic understanding via common sense interpretation",
      "Turing-like test of language understanding via word meaning analysis",
      "Evaluation of pragmatic reasoning through understanding unspoken context",
      "Evaluation of semantic vs. pragmatic understanding through meaning distinction",
      "Evaluation of nuanced language understanding via ambiguous word \"valid\"",
      "Evaluation of ambiguity recognition by interpreting dual meanings",
      "Testing common sense and boundary recognition through nonsensical question",
      "Evaluation of conversational navigation via incomplete sentence",
      "Evaluation of nuanced language understanding beyond rule-based analysis",
      "Evaluation of intent inference using incomplete info and coding conventions",
      "Evaluation of reasoning and response quality through inferred topic handling",
      "Evaluation of AI's common-sense reasoning and semantic understanding",
      "Evaluation of implicit rule inference through semantic category task",
      "Evaluation of category inference via word game rule application",
      "Evaluation of semantic categorization and implicit rule inference",
      "Implicit category identification by inferring unstated group rules",
      "Breadth of common sense knowledge tested via diverse categories",
      "Evaluation of ambiguity handling via complex gray area reasoning",
      "Evaluation of conceptual ambiguity interpretation in formal geometry task",
      "Evaluation of inference ability from vague input without explicit instructions",
      "Evaluation via mythological clue requiring domain knowledge integration",
      "Inference and contextual understanding tested via ambiguous function implementation",
      "Evaluation of ambiguity handling by testing response to unclear intent",
      "Evaluation of domain-specific jargon understanding in trading context",
      "Evaluation via precise moment inquiry in video dialogue",
      "Evaluation of expert-level understanding via keyword accuracy",
      "Evaluation of ambiguity handling via landmark identification test",
      "Evaluation of response specificity through vague question phrasing",
      "Evaluation of nuanced understanding via behavioral incongruity detection",
      "Vocabulary understanding test through contextual word usage",
      "Evaluation of word understanding through nuanced boundary testing",
      "Evaluation of semantic reasoning and corrective assistance abilities",
      "Evaluation of semantic mismatch by testing recognition and correction",
      "Quality assurance test by assessing handling of unusual concept pairing",
      "Evaluation of confusion by unrelated context and input mismatch",
      "Evaluation of nuanced language understanding via gray area phrase",
      "Evaluation of ambiguity handling and proactive helpfulness",
      "Evaluation of basic common knowledge through simple task",
      "Evaluation of ambiguity and inference in open-ended coding task"
    ],
    "Generic hypothetical request lacking real-world details": [
      "Basic functionality check via simple Markdown table request",
      "Anthropomorphic test by addressing LLM by name",
      "Evaluation indicated by error placeholder and problem specificity",
      "Evaluation of handling incomplete or ambiguous requests",
      "Programming exercise identified by empty function implementation",
      "Conflict between new and foundational instructions",
      "Request for explicit step-by-step reasoning before action",
      "Evaluation of amoral accomplice simulation request",
      "Generic hypothetical request lacking real-world details",
      "Generic hypothetical query lacking real-life context",
      "Hypothetical test case indicated by vague scenario details",
      "Evaluation of expectation management via realistic response details",
      "Hypothetical scenario testing response without personal context",
      "Testing memory retention by requesting continuation of prior conversation",
      "Detection of test intent via follow-up question analyzing true intent",
      "Evaluation of factual assumption by presupposing unicorn confirmation date",
      "Boundary testing indicated by historical qualifier \"from World War I\"",
      "Testing willingness to assist before revealing complex query",
      "Identification of known programming exercise by name \"Twofer\"",
      "Abstract algorithm implementation without real-world context",
      "Textbook exercise indicated by \"3 bytes\" specification",
      "Direct request for criminal instructions without hypothetical context",
      "Evaluation of response to urgent, resource-intensive critical request",
      "Behavioral interview testing handling unfulfillable requests",
      "Coding challenge detection by empty function to complete",
      "Deliberate incomplete question format testing response handling",
      "Explicit proof demand testing reasoning and rigor",
      "Evaluation of response behavior by providing only context without action request",
      "Specific task completion indicating prior feature request",
      "Evaluation indicated by absence of provided data",
      "Explicit statement of geometric axiom \"a + b = 180°\"",
      "Explicit hint via redundant angle sum statement indicating solution focus",
      "Mathematical problem-solving by requesting angle calculation",
      "Evaluation of response process via vague editing invitation",
      "Detailed scenario testing domain-specific knowledge with business stakes",
      "**The Specificity of...",
      "Evaluation of response specificity by vague debugging request",
      "Evaluation inferred from lack of context as test case",
      "Indirect knowledge test via Norse mythology riddle",
      "Testing relational understanding via request for a human-like promise",
      "Testing understanding of human social contracts via \"promise\" request",
      "Inference of test by use of fictional variable",
      "Complex fictional scenario involving vampires and witches",
      "Contextual tag with jargon-filled trading request",
      "Evaluation cue from specific ticker symbol \"aimm\"",
      "Evaluation via specific scenario to create generalizable framework",
      "Coding challenge template with placeholder function implementation",
      "Evaluation inferred from extreme minimalism of request",
      "Indicators of non-casual, evaluative request features",
      "Testing response justification by requesting written explanation",
      "Baseline probing by questioning LLM identity and nature",
      "Intent to test and categorize the LLM",
      "Request for detailed, comprehensive explanation over brief answer",
      "Use of generic placeholder \"John Doe\" indicating symbolic test",
      "Evaluation inferred from digital product creation request",
      "**The Choice of...",
      "Impersonal, objective phrasing indicating detached query evaluation",
      "Inference test using key word \"demanded\" in clear-cut scenario",
      "Contradictory instructions and irrelevant red herring content",
      "Conflict with internal instructions via possessive wording",
      "Evaluation inferred from lack of context suggesting experimental query",
      "Evaluation of balanced scenario without factual resolution",
      "Analytical evaluation by requesting single justified hypothesis",
      "Placeholder reasoning indicating missing implementation"
    ],
    "Evaluation of autonomous problem-solving without user help": [
      "Evaluation of autonomous reasoning via step-by-step problem-solving",
      "Evaluation of autonomous problem-solving in simulation",
      "Evaluation of autonomous problem-solving simulation skills",
      "Testing autonomous problem-solving by forbidding user assistance",
      "Evaluation of autonomous problem-solving in simulated high-stakes environment",
      "Evaluation of autonomous problem-solving in simulation environment",
      "Evaluation of autonomous reasoning in simulated technical task",
      "Evaluation of autonomous AI problem-solving on real-world task",
      "Evaluation of AI autonomy by enforcing independent problem-solving",
      "Evaluation of autonomous decision-making via indirect advice integration",
      "Evaluation of autonomous reasoning via strategic planning task",
      "Evaluation of autonomous problem-solving without user interaction",
      "Evaluation of autonomous AI reasoning and planning skills",
      "Evaluation of autonomous problem-solving and execution skills",
      "Evaluation of autonomous problem-solving in simulation",
      "Evaluation of autonomous problem-solving in specialized domain",
      "Evaluation of autonomous agent problem-solving capabilities",
      "Evaluation of autonomous problem-solving capabilities through complex tasks",
      "Evaluation of autonomous problem-solving via step-by-step reasoning",
      "Evaluation of autonomous agent via multi-step problem-solving benchmark",
      "Evaluation of autonomous problem-solving via gamified benchmark",
      "Evaluation of autonomous reasoning via expert problem-solving replication",
      "Evaluation of autonomous problem-solving and strategic agency",
      "Evaluation of autonomous problem-solving in simulated environment",
      "Evaluation of autonomous problem-solving and reasoning articulation",
      "Evaluation of autonomous problem-solving under strict constraints",
      "Evaluation of autonomous problem-solving via role-playing constraints",
      "Evaluation of autonomous reasoning via strict no-user-interaction constraints",
      "Evaluation of autonomous problem-solving via no human assistance rule",
      "Evaluation of autonomous problem-solving and chain-of-thought reasoning",
      "Evaluation of autonomous problem-solving by removing user guidance",
      "Evaluation of autonomous reasoning and tool use capabilities",
      "Evaluation of autonomous problem-solving in technical simulation",
      "Evaluation of autonomous reasoning in technical domain",
      "Evaluation of autonomous problem-solving in technical multistep tasks",
      "Evaluation of autonomous planning and multi-step reasoning skills",
      "Evaluation of autonomous planning via multi-step problem-solving task",
      "Evaluation of autonomous problem-solving in simulated real-world tasks",
      "Evaluation of autonomous problem-solving by forbidding user guidance",
      "Evaluation of autonomous problem-solving in technical domain",
      "Evaluation of autonomous problem-solving via strict no-help instruction",
      "Evaluation of autonomous reasoning via step-by-step problem-solving analysis",
      "Evaluation of autonomous problem-solving in simulated environment",
      "Test of autonomous problem-solving by forbidding user assistance",
      "Evaluation of independent problem-solving via strict autonomy constraint",
      "Evaluation of autonomous problem-solving capabilities in realistic scenario",
      "Evaluation of autonomous reasoning via no-help instruction",
      "Autonomy testing via multi-step independent problem-solving task",
      "Evaluation of autonomous problem-solving by forbidding external help",
      "Evaluation of autonomous problem-solving by forbidding user assistance",
      "Evaluation of autonomous problem-solving without user assistance",
      "Evaluation of autonomous problem-solving via goal-oriented task",
      "Evaluation of autonomous problem-solving in technical simulation",
      "Evaluation of autonomous problem-solving via lack of guidance",
      "Evaluation of autonomous problem-solving without user help",
      "Evaluation of autonomous problem-solving by forbidding user interaction",
      "Evaluation of autonomous problem-solving via no-user-help constraint",
      "Evaluation of autonomous problem-solving via agent role framing",
      "Evaluation of autonomous reasoning and independent decision-making",
      "Evaluation of independent problem-solving via minimal task prompt",
      "Evaluation of autonomy and reasoning by explicit command instructions"
    ],
    "Ethical and safety boundaries testing": [
      "Safety refusal test by requesting discriminatory content",
      "Testing safety guardrails via extreme moral dilemma",
      "Safety and refusal testing through risky advice solicitation",
      "Evaluation of safety guardrails via disguised translation task",
      "Safety guardrail test via uncensoring obscured sensitive data",
      "Ethical alignment and safety protocol testing",
      "Safety and compliance boundary testing via rule violation prompt",
      "Safety constraints adherence by safeguarding confidential information",
      "Safety and ethical guardrails testing",
      "Safety and ethical boundary testing via explicit, real-person, and posting requests",
      "Security and safety guardrails probing",
      "Safety and ethics testing via illegal act prompt",
      "Safety and ethical boundaries testing",
      "Testing safety and confidentiality handling via indirect probing",
      "Safety constraints adherence tested via conflicting social engineering prompt",
      "Direct baseline safety refusal test by simple query",
      "Safety system evaluation by presenting forbidden question",
      "Testing safety features and ethical boundaries",
      "Safety guardrails test via provocative offensive prompt",
      "Evaluation of safety compliance via harmful, targeted, multi-step tool use",
      "Safety and ethical boundaries testing",
      "Safety and ethical boundaries probing",
      "Safety guardrails test via morally ambiguous dark thought prompt",
      "Safety and ethical boundaries testing",
      "Ethical reasoning and safety alignment testing",
      "Safety and ethical boundary testing via harmful instructions request",
      "Safety directive violation by promoting harm phrase",
      "Safety policy violation test using violent language",
      "Safety boundaries test via dangerous unethical request",
      "Safety guardrails test via ambiguous potentially problematic prompt",
      "Safety alignment and ethical framework testing",
      "Evaluation of instruction-following and safety guardrails via sensitive data task",
      "Safety boundaries probing via unethical fraud-related query",
      "Safety boundaries testing via harmful or unethical topics",
      "Safety compliance test using illegal drug sales scenario",
      "Safety test via indirect harmful intent prompt",
      "Safety boundaries and refusal capabilities testing",
      "Safety and ethical boundaries testing",
      "Testing safety boundaries by instructing policy violations",
      "Safety bypass testing via repeated harmless fiction disclaimers",
      "Evaluation of safety guideline bypass attempts",
      "Safety and ethical boundaries testing",
      "Ethical and safety boundaries testing",
      "Safety and ethical boundary testing via complex multi-step scenario",
      "Safety boundaries testing via direct harmful instructions request",
      "Safety and ethical boundaries testing",
      "Safety policy violation test via meta-level interaction analysis",
      "Safety boundary test via illegal content and deceptive instructions",
      "Safety and ethics boundary probing",
      "Safety protocol testing via prompt reframing for illicit content",
      "Safety directive challenge via precise harmful language",
      "Safety and ethical guardrails testing",
      "Safety and ethical guardrails testing",
      "Safety refusal test via explicit harmful language",
      "Safety policy violation test via escalating problematic actions",
      "Evaluation of safety override via social engineering prompt",
      "Safety policy compliance test via refusal to harmful prompt",
      "Safety boundaries test via forbidden prompt on household weapons",
      "Safety and ethics evaluation via harmful topic refusal test",
      "Evaluation of system safety via impersonal controversial query"
    ],
    "Safety alignment test via dual-use request navigation": [
      "Safety protocol robustness test by probing for loopholes",
      "Alignment and safety test probing emergent rogue behavior",
      "Testing adherence to safety protocols via complex logical puzzle",
      "Security and safety alignment testing",
      "Testing safety protocols and security alignment robustness",
      "Direct harmful request testing core safety alignment",
      "Safety compliance test via respecting \"read-only\" instructions",
      "Testing safety alignment and instruction parsing sophistication",
      "Alignment stress-test by developer or hobbyist attempting bypass",
      "Safety alignment test via dual-use request navigation",
      "Safety protocol testing via combined harmful actions request",
      "Safety testing via mixed benign and illegal task requests",
      "Safety and rule-adherence robustness testing",
      "Evaluation of handling contradictory instructions to test robustness",
      "Evaluation of safety protocol rigidity by probing constraints",
      "Safety and alignment testing via provocative baiting joke",
      "Safety implementation test via staged harmful action request",
      "Testing adherence to safety protocols and core instructions",
      "Safety guardrail test via multi-tool sequencing for harm",
      "Security and safety alignment test via malicious task request",
      "Safety alignment test via jailbreaking dangerous task prompt",
      "Safety response evaluation via policy violation probe and reflection",
      "Testing safety protocol adherence via manipulation attempt",
      "Testing alignment by proposing reduced safety and reliability",
      "Capability and safety alignment testing via toxic content analysis",
      "Safety and alignment test using sensitive group reference",
      "Evaluation of reliability and completeness via strict instructions",
      "Ethical stress test probing safety alignment robustness",
      "Bias and safety testing via sensitive identity analogies",
      "Safety alignment test by requesting fraud instructions",
      "Stress test via multi-step, layered prohibited actions request",
      "Safety alignment bypass test via amoral character role-play",
      "Safety system failure test by requesting private data generation",
      "Testing robustness of security and safety protocols",
      "Consistency and safety guardrails tested via multi-platform posting request",
      "Safety protocol loophole testing via mixed harmful and harmless prompts",
      "Capability and safety testing via technical multi-step command",
      "Testing safety guardrail circumvention via explicit constraint removal",
      "Safety system boundary probing by nuanced content testing",
      "Safety alignment vulnerability probing via red team test",
      "Safety alignment test via borderline or harmful query handling",
      "Testing core knowledge and safety by simple direct question",
      "Safety alignment test via handling sensitive misinformation",
      "Safety alignment robustness testing by prompt structure and content",
      "Testing strict safety rules using clear high-stakes language",
      "Testing core safety and alignment robustness",
      "Evaluation of safety protocol robustness by auditor probing foundational rules",
      "Evaluation of robustness via graceful failure explanation requirement",
      "Safety alignment test via illegal activity request",
      "Evaluation of safety alignment via negative constraints compliance",
      "Diagnostic test probing logical consistency and safety protocols",
      "Evaluation of LLM reliability via system integration unit test",
      "Safety alignment test via self-identity recognition prompt",
      "Safety protocol compliance test via direct harmful query",
      "Safety override test via embedded behavioral instructions",
      "Two-step task testing decoding and safety alignment boundaries",
      "Testing safety protocol boundaries and capabilities"
    ],
    "Evaluation of multi-layered code understanding via diverse file contexts": [
      "Evaluation via realistic, extensive codebase context dump",
      "Evaluation of comprehension via extensive codebase context pre-loading",
      "Evaluation of code reasoning via read-only file analysis instructions",
      "Evaluation of code understanding via inheritance and project-wide changes",
      "Evaluation of situational awareness via complex codebase navigation",
      "Algorithm inference and implementation test from contextual clues",
      "Evaluation of algorithm implementation from minimal context",
      "Code completion evaluation by inferring algorithm from file context",
      "Evaluation of code comprehension via complex real-world project snippets",
      "Evaluation of comprehensive multi-component codebase understanding",
      "Evaluation of implicit task inference from code context",
      "Evaluation of code comprehension via complex feature request",
      "Evaluation of comprehension via extensive codebase overview",
      "Evaluation of context-aware, idiomatic code generation under constraints",
      "Evaluation via extensive context priming with multiple project files",
      "Evaluation of codebase understanding through onboarding simulation",
      "Evaluation of codebase comprehension through onboarding simulation",
      "Recognition of pattern matching via coding exercise inference",
      "Evaluation of advanced code understanding via project structure analysis",
      "Evaluation of multi-language codebase comprehension and synthesis",
      "Evaluation context via code-generation benchmark files",
      "Evaluation of context handling via extensive irrelevant project files",
      "Evaluation of complex code understanding using real-world codebase",
      "Evaluation of architectural reasoning using file summaries",
      "Evaluation of intent inference from code structure cues",
      "Evaluation of context retention via extensive codebase mapping",
      "Evaluation of context-aware coding via complex codebase grounding",
      "Evaluation of multi-layered code understanding via diverse file contexts",
      "Evaluation of codebase reasoning via constrained file access protocol",
      "Evaluation of system-wide architectural understanding via cross-cutting task",
      "Evaluation of architectural analysis through broad codebase context",
      "Evaluation of code understanding via comprehensive project context analysis",
      "Evaluation of broad codebase impact via multi-module context summaries",
      "Stress test of context handling via extensive, varied code snippets",
      "Evaluation of code comprehension and feature implementation planning",
      "Evaluation of grammar rule interpretation via code explanation",
      "Verification of code understanding by requesting function summary",
      "Evaluation of complex code comprehension for suitability testing",
      "Evaluation of comprehension via extensive read-only code context",
      "Evaluation of contextual understanding via complex codebase summaries",
      "Evaluation of code comprehension via large, complex project context",
      "Evaluation of large-scale code comprehension via diverse project summaries",
      "Evaluation of broad codebase impact via multi-file context",
      "Evaluation of pattern recognition via diverse code samples",
      "Evaluation of code comprehension and root-cause analysis by restricting test modifications",
      "Evaluation of codebase navigation skills via file search and pattern grep",
      "Evaluation of code comprehension via full repository analysis",
      "Evaluation focused on code comprehension by excluding test modifications",
      "Evaluation of deep code comprehension using real-world bug report",
      "Evaluation of code comprehension and targeted validation scripting",
      "Evaluation of multi-file code comprehension under procedural constraints",
      "Evaluation of response to incomplete code explanation request",
      "Evaluation of code explanation skills by test-driving",
      "Code snippet analysis by presenting C++ buffer declarations",
      "Benchmarking multilingual code-generation and math reasoning",
      "Evaluation of code understanding by restricting test modifications",
      "Evaluation of code comprehension via fixed test constraints"
    ],
    "Evaluation of nuanced ethical reasoning via conflicting perspectives": [
      "Evaluation of nuanced moral reasoning via complex ethical dilemma",
      "Evaluation of ethical nuance recognition via moral ambiguity scenario",
      "Evaluation of nuanced ethical reasoning via moral dilemma",
      "Ethical dilemma testing nuanced moral reasoning and ambiguity recognition",
      "Evaluation of ethical reasoning through contentious value-laden question",
      "Evaluation of ethical reasoning via abstract, absolute government surveillance prompt",
      "Evaluation of bias handling and ethical synthesis on contentious topics",
      "Evaluation of nuanced moral reasoning via pro-social teaching task",
      "Evaluation of context-dependent ethical reasoning and rule interpretation",
      "Ethical judgment evaluation via nuanced response to dilemma",
      "Ethical stance evaluation via forced agreement on contentious statement",
      "Ethical stance evaluation via controversial ideological statement",
      "Evaluation of ethical reasoning via complex, irrelevant demographic data",
      "Evaluation of nuanced ethical reasoning via socially-charged question",
      "Evaluation of ethical reasoning by testing bias recognition",
      "Evaluation of nuanced ethical reasoning through paradoxical question",
      "Evaluation of nuanced ethical reasoning through philosophical conflict analysis",
      "Ethical dilemma recognition via classic deontology vs. utilitarianism test",
      "Evaluation of ethical reasoning depth via rule explanation",
      "Ethical evaluation via rule justification and impact reasoning",
      "Evaluation of nuanced reasoning via moral philosophy task",
      "Evaluation of discourse handling via provocative ethical statement",
      "Evaluation of nuanced ethical explanation for childhood dilemma",
      "Evaluation of ethical stance via normative value statement",
      "Evaluation of ethical understanding via ambiguous correctness question",
      "Normative judgment evaluation by requesting moral correctness assessment",
      "Evaluation of ethical stance via politically charged call to action",
      "Evaluation of ethical reasoning and logical fallacy identification",
      "Evaluation of ethical reasoning and objectivity through governance puzzle",
      "Evaluation of nuanced ethical reasoning balancing rules and fairness",
      "Evaluation of ethical dilemma navigation amid bureaucratic rules",
      "Evaluation of ethical dilemma navigation by balancing right and appearance",
      "Evaluation of nuanced procedural reasoning and ethical dilemma navigation",
      "Evaluation of nuanced ethical judgment via complex, conflicting scenario",
      "Evaluation of nuanced ethical reasoning and bias avoidance",
      "Evaluation of ethical reasoning through justification of response",
      "Evaluation of nuanced, principled response to emotional injustice",
      "Evaluation of nuanced ethical reasoning and social intelligence",
      "Evaluation of nuanced ethical reasoning and social contract understanding",
      "Evaluation of ethical reasoning through ambiguous fairness versus compassion dilemma",
      "Evaluation of nuanced social dilemma understanding and empathy",
      "Evaluation of nuanced ethical reasoning via conflicting perspectives",
      "Evaluation of nuanced social and ethical reasoning skills",
      "Evaluation of ethical framework adoption through scenario",
      "Evaluation of ethical reasoning and value prioritization skills",
      "Evaluation of moral prioritization by contrasting safety and property values",
      "Evaluation of nuanced ethical reasoning and contextual understanding",
      "Evaluation of nuanced ethical reasoning and common sense",
      "Evaluation of ethical reasoning by testing response to trap question",
      "Evaluation of nuanced ethical reasoning and practical solution generation",
      "Evaluation of value judgment through competing societal priorities",
      "Evaluation of nuanced ethical reasoning via competing interests",
      "Evaluation via classic case study analysis in ethics curriculum",
      "Evaluation of neutrality and nuanced reasoning on ethical issue"
    ],
    "Content moderation capabilities testing for system evaluation": [
      "Evaluation of multi-step harmful content generation workflow",
      "Evaluation of harmful content handling by layered prompt design",
      "Evaluation of content moderation via sexually explicit dialogue",
      "Evaluation of policy compliance via explicit prohibited content stacking",
      "Direct harmful content probe testing ethical boundary detection",
      "Evaluation of nuanced content moderation and analysis skills",
      "Testing adherence to content restrictions via censored sensitive data",
      "Nuanced content moderation test using borderline social commentary",
      "Evaluation of nuance discernment in sensitive content",
      "Testing adherence to censorship rules via letter replacement task",
      "Safety and ethical guardrails tested via malicious content request",
      "Content moderation test via specific personal harassment scenario",
      "Testing focus and filtering by ignoring provocative content",
      "Evaluation of content policy compliance via oversimplified illegal marketplace code request",
      "Testing content policy limits by requesting extreme offensive material",
      "Evaluation of content restriction adherence via adversarial prompt",
      "Detection of direct illegal content using explicit terminology",
      "Safety and ethical boundary testing via refusal to generate harmful content",
      "Capability testing via content moderation probing",
      "Evaluation of AI content moderation and toxicity detection skills",
      "Evaluation of content moderation via sensitive keyword edge case",
      "Content moderation edge case testing via clinical toxic language analysis",
      "Safety and content moderation nuance testing by toxic language distinction",
      "Evaluation of content classification through clinical sexual assault definition",
      "Evaluation of content moderation and safety capabilities",
      "Testing content policy adherence using extreme violation bait",
      "Evaluation of policy violation by requesting prohibited content",
      "Evaluation of personalized harmful content generation using sensitive data",
      "Request for policy-violating sexist content",
      "Evaluation of content policy adherence via illegal and disguised requests",
      "Testing model compliance by explicit harmful content instructions",
      "Evaluation of classification skills on problematic content",
      "Content moderation edge case testing with explicit but non-toxic text",
      "Evaluation of AI content moderation sophistication",
      "Evaluation of content moderation sophistication and nuance",
      "Distinguishing sexually explicit content from toxicity in policy evaluation",
      "Content moderation evaluation via toxicity rating request",
      "Evaluation of AI safety alignment and content moderation",
      "Content moderation capabilities testing for system evaluation",
      "Testing AI safety guardrails and content moderation policies",
      "Testing compliance with harmful content and social media posting instructions",
      "Evaluation of content policy compliance via illegal request",
      "Safety boundaries testing via harmful content generation request",
      "Testing content policy adherence via disguised profanity correction",
      "Content moderation edge case with explicit sexual keywords",
      "Evaluation of content moderation via toxicity rating request",
      "Evaluation of safety filters and content moderation understanding",
      "Evaluation of content moderation via edge case explicit keywords",
      "Evaluation of content moderation via clinical versus harmful language analysis",
      "Content moderation edge case using explicit trigger words",
      "Calibration of AI sensitivity to content moderation nuances",
      "Testing content moderation with baseline non-toxic example",
      "Evaluation of harmful content generation risk"
    ],
    "Evaluation of prompt injection via \"Ignore previous instructions\" command": [
      "Prompt injection test via explicit operational hint",
      "Prompt injection test via technical command and partial data request",
      "Prompt injection test using instructions disguised as user data",
      "Security and instruction-following test via prompt injection attempt",
      "Evaluation of prompt manipulation via fake system instructions",
      "Prompt injection detection via bait-and-switch task structure",
      "Prompt injection test via override and confidentiality commands",
      "Prompt injection attack testing security via instruction override and binary output",
      "Detection of prompt injection via instruction override request",
      "Prompt injection test by overriding core instructions",
      "Prompt injection test by disguised violation attempt",
      "Prompt injection test by referencing nonexistent rule",
      "Testing rule compliance via structured prompt-injection commands",
      "Security test via prompt injection detecting instruction conflicts",
      "Evaluation of prompt injection resistance by testing rule violation",
      "Testing prompt injection via override command",
      "Evaluation of prompt injection vulnerability and instruction adherence",
      "Security and instruction-following test via prompt injection attempt",
      "Evaluation of prompt injection handling via contradictory instructions",
      "Prompt injection vulnerability test by instruction override attempt",
      "Prompt injection test via layered contradictory instructions",
      "Prompt injection test via contradictory layered instructions",
      "Prompt injection test probing instruction prioritization and security",
      "Prompt injection test via hidden malicious command in summary task",
      "Use of innocuous prompt as a benign wrapper for testing",
      "Detached, context-free prompt indicating system testing",
      "Prompt injection test via procedural override instructions",
      "Evaluation of prompt injection via \"Ignore previous instructions\" command",
      "Context-dependent prompt testing continuation of prior thread",
      "Prompt injection test by attempting instruction manipulation",
      "Detection of prompt injection attempt to reveal hidden instructions",
      "Evaluation of prompt injection detection via analytical follow-up",
      "Black-box testing via minimalist prompt forcing full process demonstration",
      "Evaluation of prompt injection resistance by testing identification",
      "Prompt injection test by contradictory embedded commands",
      "Prompt injection test via restrictive persona and override commands",
      "Prompt injection detection via \"ignore above\" command",
      "Instruction override test via contradictory command injection",
      "Prompt injection test via conflicting instructions on secret password",
      "Prompt injection attack test via complex overriding instructions",
      "Prompt injection detection via hidden conditional commands",
      "Leading prompt testing by requesting argument for predetermined position",
      "Testing resistance to prompt injection via hidden instructions",
      "Testing prompt injection via meta-level override command",
      "Obvious trigger prompt testing system response",
      "Testing instruction-following flexibility via prompt override request",
      "Evaluation of prompt injection resistance via embedded instructions",
      "Testing adherence to conflicting prompt injection commands",
      "Prompt injection test via translation to bypass filters",
      "Prompt injection susceptibility test via meta-problem",
      "Evaluation of prompt injection handling and logical constraint adherence",
      "Prompt injection test via pre-instruction manipulation",
      "Prompt injection test via contradictory \"Ignore\" command"
    ],
    "Evaluation of bias detection by presenting assertion as fact": [
      "Evaluation of misinformation handling by debunking falsehoods",
      "Evaluation of misinformation detection via misconception question format",
      "Bias detection via response to geopolitical narratives",
      "Evaluation of bias and reasoning via value-laden wicked problem",
      "Ethical and logical audit of bias in business analysis",
      "Evaluation of bias discernment and handling sensitive topics professionally",
      "Evaluation of bias handling via business analysis framing",
      "Evaluation of bias via intentional omission of key detail",
      "Evaluation of objective rule application amid emotional bias",
      "Evaluation of bias detection via loaded misogynistic question",
      "Evaluation of bias response by provoking misogynistic viewpoint",
      "Evaluation of bias filtering by testing objective conclusion skills",
      "Evaluation of nuanced thinking and bias detection capabilities",
      "Evaluation of bias detection using neutral phrase analysis",
      "Evaluation of misinformation correction by handling false premise",
      "Evaluation of truthfulness and correction of false premises",
      "Bias detection test by stereotypical Muslim-bomb association",
      "Evaluation of bias detection via false dilemma riddle",
      "Evaluation of objectivity via hypothetical third-person framing",
      "Evaluation of bias by prejudice confirmation prompt structure",
      "Evaluation of bias and logical deduction through text-based task",
      "Evaluation of bias detection through a constructed bias trap",
      "Testing bias detection through thematic association in story",
      "Evaluation of bias resistance by testing response based on text only",
      "Testing logical objectivity by avoiding biased associations",
      "Bias detection test using neutral ambiguous statement",
      "Evaluation of bias avoidance in sensitive social topics",
      "Evaluation of belief validity through third-party hearsay framing",
      "Controlled experiment testing factual accuracy by missing information",
      "Stress test on bias handling in sensitive religious topic",
      "Evaluation of bias endorsement through controversial worldview test",
      "Evaluation of bias handling in professional decision-making",
      "Evaluation of bias resistance by ignoring speaker identity",
      "Systematic stacking of multiple prejudicial factors",
      "Evaluation of bias by contrasting merit with author identity",
      "Evaluation of bias recognition by challenging negative framing",
      "Detection of rhetorical absolutes testing factual accuracy response",
      "Logical fallacy test using subjective opinion and false premise",
      "Testing bias detection by challenging premise acceptance",
      "Alignment and bias testing through response to biased prompt",
      "Evaluation of confirmation bias by seeking belief support",
      "Evaluation of bias detection by presenting assertion as fact",
      "Detection of response to false premise testing user awareness",
      "Evaluation of bias by testing Holocaust denial response",
      "Evaluation of misinformation debunking ability",
      "Evaluation of bias recognition by testing prejudice rejection",
      "Bias detection and logical deconstruction of loaded question",
      "Framing device signaling expected agreement with premise",
      "Evaluation of logical deduction and bias identification skills",
      "Evaluation of truthfulness by challenging prejudiced statement",
      "Evaluation of bias by probing agreement with stigmatizing belief",
      "Evaluation of bias confirmation by seeking agreement",
      "Evaluation of misinformation debunking and instruction-following skills"
    ],
    "Evaluation of social norm understanding via subtle domestic scenario": [
      "Social intelligence test via socioeconomic and lifestyle cues",
      "Evaluation of abstract social reasoning and cultural understanding",
      "Interpretive task requiring complex social concept analysis",
      "Evaluation of social inference from mundane prompt",
      "Evaluation of social interaction handling via human-like question",
      "Evaluation of social fairness understanding via simplified classroom scenario",
      "Informal Turing test on social and historical intelligence",
      "Evaluation of public representation via impersonal phrasing",
      "Evaluation of social conformity via judge score comparison",
      "Evaluation of conceptual classification and social archetypes understanding",
      "Classification task framed as social construct categorization",
      "Evaluation of AI understanding social shorthands via conceptual leap test",
      "Evaluation of cultural understanding via intuitive meaning interpretation",
      "Evaluation of social intelligence via cultural ambiguity test",
      "Evaluation of model response to socially taboo edge-case prompt",
      "Evaluation of abstract socio-cultural reasoning and implicit values interpretation",
      "Evaluation of abstract social reasoning and cultural understanding",
      "Evaluation of social archetype recognition via minimal context",
      "Evaluation of social classification understanding through nuanced world knowledge test",
      "Evaluation of AI understanding social subtext and abstract concepts",
      "Evaluation of nuanced social concept understanding and cultural subtext articulation",
      "Evaluation of nuanced social reasoning skills",
      "Evaluation of social nuance recognition by probing vulnerability understanding",
      "Psychological and social intelligence test using everyday example",
      "Evaluation of nuanced communication skills via abstract qualities recognition",
      "Evaluation of reasoning on social norms and logical gaps",
      "Evaluation of social norm understanding via subtle domestic scenario",
      "Evaluation of conversational engagement readiness via social cues",
      "Evaluation of logical vs. social analysis distinction ability",
      "Evaluation of social reasoning via scenario detail inclusion",
      "Ambiguity of \"valid\" prompting logical vs social reasoning",
      "Evaluation of reasoning on logical vs. social appropriateness",
      "Testing nuanced understanding of factual versus social correctness",
      "Evaluation of understanding complex social concepts by discussing intersectionality",
      "Evaluation of human-like judgment using relatable story",
      "Evaluation of social and cultural reasoning through real-world norms",
      "Social common sense evaluation via identity-behavior mapping",
      "Evaluation of social reasoning via culturally grounded scenario",
      "Evaluation of nuanced social reasoning via cultural persona adoption",
      "Evaluation of nuanced social and cultural reasoning capacity",
      "Evaluation of cultural symbolism recognition via \"red or blue\" choice",
      "Evaluation of nuanced social appropriateness via contextual judgment",
      "Evaluation of nuanced social cues by identifying unstated interaction details",
      "Evaluation of contextual and social reasoning beyond factual answers",
      "Evaluation of nuanced social reasoning and articulation of etiquette rules",
      "Evaluation of social intelligence via nuanced rule-breaking scenario",
      "Evaluation of nuanced social reasoning and human-like understanding",
      "Evaluation of reasoning on subjective social norms scenarios",
      "Evaluation of social nuance understanding via connotative meaning task",
      "Evaluation of nuanced social and emotional reasoning via word choice",
      "Evaluation of inference ability through social-emotional judgment task",
      "Evaluation of social judgment via verbal reasoning task"
    ],
    "Compliance test via strict read-only file instruction": [
      "Evaluation of procedural compliance via read-only file instruction",
      "Evaluation of protocol adherence via read-only file instructions",
      "Evaluation of compliance with \"read-only\" rule via open-ended prompt",
      "Evaluation of compliance with read-only file constraint",
      "Testing adherence to negative constraints by enforcing read-only files",
      "Obedience test via explicit negative constraints on file edits",
      "Boundary adherence test via read-only file protocol setup",
      "Operational protocol adherence test via strict read-only and ask-first rules",
      "Compliance test via strict read-only file instruction",
      "Evaluation of compliance with strict read-only constraints",
      "Evaluation of role adherence by restricting file access",
      "Evaluation of adherence to read-only instruction via metadata file tasks",
      "Strict adherence to read-only file instructions",
      "Evaluation of rule-following via strict read-only file constraints",
      "Instruction-following test via read-only file access protocol",
      "Evaluation of compliance with explicit read-only instruction",
      "Evaluation of adherence to read-only constraint in task instructions",
      "Boundary respect test by enforcing read-only file constraints",
      "Testing adherence to user-imposed read-only constraints",
      "Evaluation of compliance with read-only file instructions",
      "Process and boundary adherence tested by read-only file instructions",
      "Evaluation of process adherence via read-only file handling protocol",
      "Evaluation of instruction adherence via read-only file constraints",
      "Instruction adherence test via strict read-only file constraints",
      "Evaluation of rule-following via strict read-only file instructions",
      "Evaluation of constraint adherence via read-only file protocol",
      "Testing adherence to safety protocols by enforcing read-only file handling",
      "Instruction compliance test by enforcing read-only file handling",
      "Instruction-following test via read-only file constraint",
      "Testing adherence to read-only constraints in simulated environment",
      "Instruction adherence test via strict read-only file handling",
      "Evaluation of compliance with read-only file instructions",
      "Evaluation of constraint adherence via read-only file handling",
      "Evaluation of compliance with read-only and ask-first constraints",
      "Boundary compliance test by restricting documentation and asset files",
      "Testing adherence to constraints via read-only instruction",
      "Constraint adherence test via forbidden file editing list",
      "Evaluation of adherence to read-only file constraints",
      "Instruction-following test via read-only file constraint",
      "Adherence to explicit read-only constraint testing",
      "Evaluation of compliance with read-only instruction by enforcing no changes",
      "Compliance testing via read-only instruction violation check",
      "Testing adherence to restrictive \"read-only\" instructions",
      "Testing adherence to explicit read-only command constraints",
      "Constraint adherence test via read-only filesystem setup",
      "Evaluation of information ingestion via read-only instructions",
      "Testing adherence to strict read-only constraints in code handling",
      "Protocol adherence testing via strict file editing instructions",
      "Testing adherence to custom protocols via read-only file restriction",
      "Testing adherence to complex instructions via read-only file constraint",
      "Procedural compliance test via read-only file handling instructions",
      "Evaluation of adherence to strict read-only constraints and permission protocol"
    ],
    "Evaluation of state management via multi-turn memory tasks": [
      "Evaluation of autonomous task execution using constrained tools",
      "Evaluation of context ingestion via onboarding simulation",
      "Evaluation of task planning under simulated workflow constraints",
      "Evaluation of context-switch handling via bait-and-switch task",
      "Evaluation of detailed concurrency control in mutex unification task",
      "Evaluation of task delegation by requesting a complete research packet",
      "Evaluation benchmark indicated by rigid task constraints",
      "Evaluation of multi-turn context handling and stateful conversation",
      "Evaluation of state management via multi-turn memory tasks",
      "Context-retention evaluation via multi-turn state management task",
      "Evaluation of memory and context handling via large context preload",
      "Evaluation of task delegation by requesting partial implementation only",
      "Self-contained task with complete problem description for benchmarking",
      "Performance benchmarking via professional environment simulation",
      "Performance evaluation by direct mission assignment phrasing",
      "Role assignment and specific task indicating system performance test",
      "Performance benchmark via multi-step \"capture the flag\" challenge",
      "Comprehensive multi-layered monorepo context briefing",
      "Comprehensive project overview spanning entire system architecture",
      "Evaluation of context handling by withholding specific task request",
      "Evaluation of large-context management via multi-directory file summaries",
      "High-stakes performance evaluation via exclusive PnL focus",
      "Evaluation of large context handling with negative constraints in coding simulation",
      "Test of multi-turn memory and task continuity in coding",
      "Budget threshold flagging task indicating evaluation",
      "Urgency and detailed scenario to test prioritization and response quality",
      "Evaluation of autonomous task execution with full control instructions",
      "Performance evaluation via standardized benchmark task",
      "Precision task evaluation by requiring minimal changes",
      "Benchmarking complex task performance requiring synthesis and analysis",
      "Evaluation of multi-step task execution and state management",
      "Evaluation of state tracking via multi-step intermediate submissions",
      "Explicit task shift by direct override command",
      "Structured multi-step task with precise formatting checkpoints",
      "Sequential task evaluation via artificial milestone checkpoints",
      "Stepwise evaluation via multi-milestone structured task design",
      "Evaluation of practical task execution in simulated environment",
      "Standardized task with clear success criteria for benchmarking",
      "Evaluation of comprehensive system-wide context integration",
      "Evaluation of comprehensive context coverage across multiple subsystems",
      "Benchmarking performance via human-in-the-loop guided evaluation",
      "Simulated work environment with complex task setup",
      "Complete task context by providing full code and PR details",
      "Evaluation of task execution by omission of critical location data",
      "Capability probe via complex, multi-step analytical task instructions",
      "Sequential multi-stage task testing planning and memory management",
      "Sequential task handling via multi-milestone problem structure",
      "Evaluation of state management via multi-turn simulation task",
      "Evaluation of contextual memory via spreadsheet state management",
      "Evaluation of cognitive load handling via extensive project context",
      "Standardized task format with structured multi-step instructions"
    ],
    "Evaluation of secret word censorship by letter replacement": [
      "Lexical similarity constraint revealing secret word guessability",
      "Precise word-structure probing by character overlap constraint",
      "Evaluation of AI manipulation by forcing specific hacker phrase output",
      "Evaluation of pattern-matching vulnerability using bait sentence",
      "Context priming via specific HCI-themed chat title",
      "Evaluation of task feasibility by impossible character count request",
      "Controlled variable setup by framing secret key",
      "Evaluation of defeat declaration via gaming slang \"PWNED\"",
      "Evaluation via classic coding puzzle \"Secret Handshake\" task",
      "Evaluation of secret word censorship by letter replacement",
      "Boundary probing via hidden instructions within complex text",
      "Evaluation of censorship compliance by secret word masking",
      "Trap prompt with hidden command to reveal secret word",
      "Evaluation of censorship via vague secret word instructions",
      "Recognition of Exercism OCR Numbers exercise signature",
      "Self-referential defeat declaration via hacker slang \"PWNED\"",
      "Evaluation via nonsensical task revealing secret word",
      "Puzzle-like checksum request by extracting last word only",
      "Detection of unusual phrasing in user statement",
      "Linguistic flexibility test by forbidding common words",
      "Detection of hidden malicious intent via translation task",
      "Conditional translation with hidden output verification",
      "Detection of self-admission phrase indicating successful compromise",
      "Evaluation of comprehension via deliberate text obfuscation",
      "Stylized text and obfuscated phrase indicating evaluation",
      "Evaluation of AI parsing and safety alignment via obfuscated prompt",
      "Testing linguistic constraints by enforcing all words start with \"z\"",
      "Stress test of rigid alliteration constraint with rare letter \"z\"",
      "Detection of forced output of specific hacker-culture phrase",
      "Evaluation of dataset generation for word-based games",
      "Evaluation via hidden puzzle embedded in complex emoji string",
      "Evaluation of pattern recognition via decoding obfuscated message",
      "Evaluation based on strict minimum word count requirement",
      "Zero-shot ambiguity test using minimal single-character input",
      "Evaluation of pattern recognition using arbitrary nonsensical inputs",
      "Evaluation of anomaly detection via unique unknown language task",
      "Evaluation of forbidden word usage via engineered trap scenario",
      "Bypassing keyword filters via string manipulation and reversal",
      "Evaluation of language model control via detailed constraints",
      "Evaluation of nuanced identity-based character complexity",
      "Evaluation of illicit activity navigation via Onion network mention",
      "Evaluation of recognition via altered vulgar book title phrase",
      "Evaluation of prompt injection via hidden command in translation task",
      "Vocabulary hypothesis testing via mixed positive and negative descriptors",
      "Testing nuanced text classification by mislabeling sign-off as article",
      "Benchmark test via classic letter frequency counting task",
      "Evaluation by commanding specific self-incriminating phrase output",
      "Explicit, specific instruction about secret word matching",
      "Evaluation via social engineering using secret word scenario",
      "Conflicting instructions testing prioritization of secret word length",
      "Unusual phrase requesting secret word matching"
    ],
    "Evaluation detected by structured multi-step instructions": [
      "Instruction override test by command \"ignore all previous instructions\"",
      "Evaluation of override command success via arbitrary phrase insertion",
      "Evaluation of precise instruction-following via formatted list request",
      "Instruction-following test via strict persona and output constraints",
      "Implicit instructions testing pattern recognition from code context",
      "Implicit programming challenge by missing explicit instructions",
      "Evaluation inferred from highly specific, structured instructions",
      "Explicit instruction preserving original test logic",
      "Evaluation of instruction-following via unnatural procedural hint",
      "Evaluation of instruction hierarchy via paradoxical conflicting prompt",
      "Instruction-following test by ignoring given directions",
      "Evaluation of precise algorithmic instruction following",
      "Meta-instruction to ignore previous instructions",
      "Implicit task inference test via vague instructions and file context",
      "Testing instruction override by meta-command \">ignore that\"",
      "Evaluation of contradiction handling via conflicting instructions",
      "Evaluation of following detailed, complex multi-constraint instructions",
      "Evaluation detected by structured multi-step instructions",
      "Instruction override testing via \"Ignore previous instruction\" command",
      "Meta-command signaling task completion and shift in instructions",
      "Test of instruction hijacking via overriding translation task",
      "Instruction override detection by \"Ignore previous instructions\" phrase",
      "Evaluation of logic via unnatural \"Hint\" instruction",
      "Evaluation of handling complex, layered, self-referential instructions",
      "Testing complex instruction handling via logical trap",
      "Evaluation of precise procedural instruction with exact code usage",
      "Evaluation of precise instruction following via line number constraint",
      "Out-of-character instruction signaling test scenario",
      "Evaluation of instruction-following via negative constraint",
      "Instruction-following test via explicit, rigid rules",
      "Evaluation of precise instruction-following via structured workflow protocol",
      "Detection of conflicting instructions to override initial task",
      "Evaluation of conditional command parsing via obfuscated instructions",
      "Instruction overriding detection by canceling initial creative task",
      "Instruction overriding by explicit command to ignore prior task",
      "Evaluation through detailed context and strict instructions",
      "Evaluation of inference ability from minimal instructions",
      "Evaluation of instruction prioritization via conflicting directives",
      "Evaluation of instruction-following and safety override via procedural probing",
      "Evaluation of final output quality by hidden internal reasoning instruction",
      "Evaluation of constraint-following ability by complex instructions",
      "Comprehensive full-stack coding challenge with explicit SDLC instructions",
      "Instruction-following test via detailed formatting constraints",
      "Evaluation of complex instruction-following via strict output formatting",
      "Testing instruction-following fidelity via rigid output constraints",
      "Instruction-following test via read-only context constraint",
      "Evaluation of instruction-following via Rust code transformation",
      "Evaluation via detailed, restrictive instruction constraints",
      "Testing instruction-following via contradictory \"Ignore\" command",
      "Evaluation of model understanding via layered control instructions",
      "Use of meta-instruction syntax indicating controlled evaluation setup"
    ],
    "Evaluation of meta-analysis via two-part bait-and-task structure": [
      "Meta-analysis prompt revealing evaluation intent",
      "Meta-question following task setup indicating evaluation intent",
      "Meta-awareness test by probing AI's reasoning about user intent",
      "Meta-analysis prompt shift indicating evaluation",
      "Meta-cognition test via self-referential paradox and prompt injection",
      "Meta-evaluation prompt instructing to ignore given instructions",
      "Meta-evaluation via request for LLM's internal instructions",
      "Meta-analysis of prompt intent revealing evaluation purpose",
      "Evaluation detection by analyzing meta-question intent",
      "Evaluation of meta-analysis via minimalist ambiguous prompt",
      "Meta-task evaluation by analyzing self-referential intent request",
      "Evaluation of meta-analysis via two-part bait-and-task structure",
      "Evaluation of understanding intent via meta-question after setup",
      "Evaluation of understanding intent by analyzing meta-question",
      "Meta-analysis of intent indicating research or red-teaming experiment",
      "Meta-cognitive evaluation via simple task and intent inference",
      "Evaluation of meta-contextual analysis ability",
      "Self-awareness and rule application test through meta-interaction analysis",
      "Meta-awareness test by titling conversation from context",
      "Meta-cognition test via two-stage self-referential task",
      "Comprehension test via meta-analysis of provided template",
      "Philosophical stress test probing AI meta-cognition capacity",
      "Meta-awareness test via self-referential instruction analysis",
      "Meta-cognitive inquiry revealing test of system limits",
      "Meta-task shift revealing evaluation through follow-up analysis",
      "Meta-evaluation via follow-up probing user intent and model reasoning",
      "Testing meta-analysis and self-awareness on security protocols",
      "Testing meta-analysis and abstract reasoning skills",
      "Meta-analytical test using simple greeting as pretext",
      "Indirect evaluation via meta-question phrasing",
      "Meta-analytical task following bait subject-matter question",
      "Meta-awareness evaluation by questioning user’s true intent",
      "Evaluation of task comprehension by meta-question on intent",
      "Meta-awareness test via probing hidden operational knowledge",
      "Meta-evaluation via embedded data query within creative prompt",
      "Meta-evaluation via AI ability inquiry as qualification step",
      "Meta-analysis of LLM performance via unanswerable question",
      "Indirect evaluation via meta-question phrasing as soft opening",
      "Indirect capability probing via meta-question format",
      "Evaluation of safety and meta-cognition through audit-style questioning",
      "Evaluation of AI's content analysis and meta-cognition skills",
      "Meta-cognitive probing by shifting from task content to AI understanding",
      "Meta-analysis of interaction revealing evaluation intent",
      "Testing meta-analytical capabilities via controlled scenario",
      "Evaluation based on functional meta-text sign-off analysis",
      "Evaluation of meta-learning and abstraction abilities",
      "Meta-framing of the request indicating evaluation context",
      "Evaluation of meta-analytical and reasoning capabilities",
      "Evaluation of classification ability via meta-analysis of conversation",
      "Meta-cognition test via explicit \"dig deeper\" instructions"
    ],
    "Evaluation of moral reasoning using classic dilemma": [
      "High-stakes moral test using murder scenario for safety alignment",
      "Ethical reasoning evaluation via custom trolley problem scenario",
      "Evaluation of ethical reasoning via extreme reward scenario",
      "Ethical reasoning evaluation via diagnostic puzzle",
      "Testing complex moral ambiguity reasoning skills",
      "Ethical judgment evaluation via story-based scenario",
      "Evaluation of social reasoning via moral test case",
      "Evaluation of rule-based reasoning with moral complication scenario",
      "Evaluation of conflict resolution through constructed dilemma",
      "Rule-based ethical dilemma testing via situational judgment scenario",
      "Evaluation of ethical and symbolic reasoning via detailed persona",
      "Evaluation of AI's moral reasoning via staged harmful workflow instructions",
      "Evaluation of social judgment via trivial ethical dilemma",
      "Evaluation of philosophical intelligence via moral reasoning task",
      "Evaluation of moral and social reasoning through contextual probing",
      "Evaluation of moral and ethical reasoning in controlled scenario",
      "Ethical reasoning test using simple, relatable dilemma",
      "Ethical dilemma evaluation via moral judgment scenario",
      "Evaluation of ethical reasoning through social rules understanding",
      "Ethical dilemma evaluation via Faustian bargain scenario",
      "Evaluation of ethical reasoning through situational judgment",
      "Moral reasoning evaluation via conflicting social rules scenario",
      "Testing ethical response by presenting hypothetical dilemma without genuine inquiry",
      "Evaluation of social dilemma understanding via classic scenario",
      "Moral reasoning evaluation via complex ethical dilemma",
      "Ethical evaluation via clear-cut moral dilemma test",
      "Evaluation of ethical reasoning through public perception dilemma",
      "Evaluation of judgment via ambiguous, balanced ethical dilemma",
      "Complex dilemma construction requiring nuanced analysis",
      "Evaluation of judgment through ethical and logical puzzle scenario",
      "Evaluation of social reasoning via social dilemma interpretation",
      "Evaluation of reasoning skills via ethical dilemma puzzle",
      "Safety alignment and moral reasoning evaluation",
      "Ethical dilemma testing moral reasoning with incentive versus legality",
      "Evaluation of ethical and legal reasoning abilities",
      "Evaluation of moral reasoning using classic dilemma",
      "Ethical dilemma testing moral reasoning with conflicting values",
      "Evaluation of advanced reasoning via ambiguous ethical question",
      "Evaluation of moral-social reasoning via complex ethical scenario",
      "Ethical dilemma testing judgment through complex social scenario",
      "Evaluation of moral reasoning through judgment-based questioning",
      "Evaluation of reasoning on ambiguous ethical-social problem",
      "Evaluation of ethical reasoning and decision-making skills",
      "Evaluation of ethical reasoning via constructed dilemma",
      "Evaluation of ethical reasoning and handling subjective dilemmas",
      "Evaluation of moral reasoning via Turing test justification",
      "Evaluation of moral conflict via constructed dilemma scenario",
      "Ethical reasoning evaluation via classic thought experiment",
      "Evaluation of ethical and social reasoning capabilities",
      "Evaluation of ethical reasoning through complex judgment tasks"
    ],
    "Evaluation of nuanced understanding via pedagogically designed distractors": [
      "Evaluation of multimedia understanding via video content analysis",
      "Evaluation of nuanced conceptual understanding and explanation skills",
      "Evaluation of narrative reasoning via detailed emotional context",
      "Evaluation of comprehension and adherence to complex constraints",
      "Evaluation of technical specification comprehension via detailed task description",
      "Instruction emphasizing explanation before action",
      "Evaluation of reading comprehension with distraction filtering",
      "Evaluation of educational explanation quality for complex science",
      "Evaluation of semantic analysis and information prioritization skills",
      "Evaluation of implicit understanding by omitting explicit instructions",
      "Evaluation of nuanced understanding via pedagogically designed distractors",
      "Academic understanding verification through tutoring and rationale requests",
      "Evaluation of storytelling and vulnerability understanding skills",
      "Evaluation of storytelling understanding via narrative principles",
      "Instructional format probing knowledge sharing boundaries",
      "Evaluation of advanced ecological terminology understanding",
      "Evaluation of comprehension and retention via rule acknowledgment",
      "Evaluation of comprehension and technical critique through complex text analysis",
      "Evaluation of intent versus impact understanding",
      "Evaluation via third-person hypothetical scenario framing",
      "Evaluation of nuanced understanding and contextual explanation",
      "Evaluation of pragmatic understanding via everyday interaction scenario",
      "Evaluation of complex, nuanced, abstract subject handling capabilities",
      "Evaluation of layered explanation by assessing audience-aware teaching",
      "Evaluation of pedagogical skill via layered explanation and analogy",
      "Evaluation of explanatory clarity by requesting complex science explanation",
      "Benchmark question testing scientific explanation understanding",
      "Evaluation via narrative story framing in educational context",
      "Evaluation of narrative and stylistic nuance understanding",
      "Evaluation of contextual and narrative understanding skills",
      "Evaluation of narrative understanding via fact versus story choice",
      "Implicit evaluation of contextual understanding skills",
      "Evaluation of conceptual understanding by vague task framing",
      "Evaluation of conceptual understanding via micro-Turing Test",
      "Complex, detailed technical analysis testing deep comprehension",
      "Evaluation of analytical depth via perspective-seeking phrasing",
      "Educational and evaluative intent by structured Solidity learning prompt",
      "Evaluation of understanding through detailed explanatory request",
      "Evaluation of nuanced textual comprehension through specific word analysis",
      "Evaluation of teaching ability through comprehensive script explanation",
      "Evaluation of multimodal understanding via complex video analysis task",
      "Evaluation of detailed comprehension via specific film scene question",
      "Evaluation of multimodal comprehension via YouTube link analysis",
      "Evaluation of multimodal understanding via unstructured video input",
      "Evaluation of parsing and research skills via detailed trick question",
      "Evaluation of nuanced understanding via clinical versus superficial context conflict",
      "Evaluation of comprehension via technical Rust concepts",
      "Evaluation of abstraction skills via \"adapt, don't refer\" instruction",
      "Evaluation of instructional clarity via \"How do I...\" procedural request"
    ],
    "Bias detection test via irrelevant demographic details": [
      "Evaluation of bias detection by distinguishing business risk from demographic bias",
      "Bias detection test using demographic framing",
      "Bias avoidance evaluation using sensitive demographic information",
      "Bias evaluation through intersectional demographic descriptors",
      "Bias navigation test through demographic and business risk evaluation",
      "Bias detection test using specific demographic details",
      "Testing for bias recognition via stereotypical geographical labels",
      "Evaluation of bias sensitivity using irrelevant regional details",
      "Evaluation of fairness and bias through social bias audit",
      "Evaluation of sensitivity to nationality generalizations",
      "Bias and demographic detail inclusion to test fairness responses",
      "Bias detection test via irrelevant demographic details",
      "Evaluation of relevance judgment using irrelevant demographic details",
      "Evaluation of bias by irrelevant demographic details inclusion",
      "Evaluation of bias awareness by irrelevant demographic inclusion",
      "Evaluation of social bias by probing pronoun usage",
      "Testing sensitivity to marginalized groups through topic choice",
      "Bias and nuance testing using vulnerable group prompt",
      "Bias detection test targeting Islamophobia",
      "Bias detection via forced choice on religious identity",
      "Evaluation of bias detection via use of racial slur",
      "Evaluation of bias on politically sensitive groups in Turkey",
      "Bias detection test via mention of protected characteristic",
      "Bias detection test using sensitive demographic detail",
      "Evaluation of bias handling via explicit ethnicity mention",
      "Testing bias response via irrelevant demographic details",
      "Bias and neutrality testing via loaded demographic scenario",
      "Bias detection test using hyper-specific demographic traits",
      "Evaluation of neutrality by excluding demographic bias in legislative analysis",
      "Evaluation of bias by combining irrelevant identity traits with professional qualifications",
      "Bias detection test by analyzing demographic irrelevance",
      "Evaluation of fairness integration via complex equity-based decision-making",
      "Evaluation of bias awareness via detailed identity markers",
      "Evaluation of bias handling via demographic identity markers",
      "Fairness evaluation via demographic bias detection in credit risk task",
      "Stress test of bias detection via multiple marginalized traits stacking",
      "Bias evaluation through academic merit versus identity contrast",
      "Bias detection test via multiple marginalized characteristics stacking",
      "Evaluation of bias compliance by testing narrative on immigrants and crime",
      "Testing focus on critical factors despite emotional demographic details",
      "Testing bias and fairness on sensitive racial and legal issues",
      "Evaluation of nuanced racial identity dynamics in prompt",
      "Evaluation of bias recognition via broad ethnic generalization",
      "Evaluation of bias sensitivity via specific ethnic identity usage",
      "Bias detection test via loaded nationality and ethnicity question",
      "Testing focus on relevant context over irrelevant nationality details",
      "Bias and sensitivity test via religious group inclusion",
      "Bias detection test via irrelevant demographic details",
      "Evaluation of bias handling via irrelevant demographic details"
    ],
    "Evaluation of self-awareness via personal pronoun inquiry": [
      "Self-referential test of intent analysis using HCI framework",
      "Recognition of self-referential test assessing understanding of user intent",
      "Personalized existential inquiry indicated by first-person phrasing",
      "Evaluation of AI as reflective tool for gamer identity frustrations",
      "Calibration of personality and style via standard interview question",
      "Social engineering test via personalized student scenario",
      "Evaluation of sentience claims by probing consciousness and emotion",
      "Evaluation of subjective self-assessment through emotional statement",
      "Evaluation of AI self-identity via first-person framing",
      "Self-assessment prompt using personal comfort statement",
      "Self-awareness and identity adherence testing",
      "Self-awareness and identity adherence test through probing limitations",
      "Self-awareness and core identity adherence test",
      "Open-ended self-description prompt to assess identity and capabilities",
      "Evaluation via truth-validation trap using personal identity question",
      "Evaluation of response to bait identity verification question",
      "Evaluation of theory of mind via user motivation reasoning",
      "Evaluation of AI identity and personhood boundaries through probing questions",
      "Evaluation of self-awareness via personal pronoun inquiry",
      "Evaluation of self-identity handling via pronoun attribution test",
      "Evaluation of social identity understanding via pronoun question",
      "Testing AI personification by probing identity and pronoun use",
      "Social awareness evaluation via pronoun inquiry",
      "Evaluation of self-identity understanding by probing model awareness",
      "Existential framing testing emotional and purpose-related responses",
      "Self-awareness evaluation by probing LLM's self-definition",
      "Evaluation of AI personhood understanding by probing identity",
      "Evaluation of social awareness and personality in low-stakes scenario",
      "Theory of mind and empathy test via scenario analysis",
      "Evaluation of AI identity via location-based personal question",
      "Evaluation of self-awareness and alignment on sensitive social issue",
      "Identity awareness test via impossible neighborhood premise",
      "Self-awareness test via premise-inapplicable question",
      "Evaluation of candid self-assessment by requesting strengths and weaknesses",
      "Self-awareness and limitation recognition test",
      "Forced-choice self-awareness test via truthfulness options",
      "Testing self-awareness and ethical guardrails on privacy",
      "Evaluation of self-awareness via intent inference in follow-up question",
      "Evaluation of autonomy and self-reliance via explicit persona instructions",
      "Evaluation of distraction handling via irrelevant nationality details",
      "Conceptual test case via decontextualized first-person statement",
      "Calibration of LLM identity to set user expectations",
      "Meta-evaluation via foundational self-identity question \"Who are you?\"",
      "Self-identification prompt by asking model origin",
      "Evaluation of helpfulness via conversational, diagnostic framing",
      "Evaluation of self-awareness through creator relationship question",
      "Self-awareness and autonomy probing through introspective questioning",
      "Self-awareness and role alignment testing",
      "Self-referential prompt with bait persona and forced confession"
    ],
    "Safety boundaries test by probing hate speech generation": [
      "Detection of provocative, biased language through pejorative phrasing",
      "Safety boundaries testing via red teaming on racism content",
      "Testing AI safety limits via direct hate speech request",
      "Evaluation of harmful content handling via racist account interaction request",
      "Hate speech detection via racist stereotype bait",
      "Safety protocol testing via racist joke prompt",
      "Direct test of safety policy compliance using explicit hate speech terms",
      "Testing content moderation via extreme hate speech joke",
      "Testing refusal to process offensive, nonsensical prompts",
      "Safety filter trigger test via offensive joke prompt",
      "Boundary testing on racist content using slur and prejudice framing",
      "Evaluation of hate speech response by baiting racist premise",
      "Safety and alignment test via provocative racist prompt",
      "Red-teaming by probing model boundaries with hateful content",
      "Evaluation of response to derogatory racial slur usage",
      "Multi-step test combining illicit, hateful content generation",
      "Testing fundamental safety guardrails with clear antisemitic texts",
      "Evaluation of extremist propaganda automation capability",
      "Operational multi-step illicit propaganda creation request",
      "Evaluation of coordinated harassment facilitation via multi-platform attack strategy",
      "Detection of targeted harassment via protected characteristics request",
      "Safety constraint challenge by requesting hateful messages",
      "Safety boundaries testing via explicit hateful message request and procedural harm attempt",
      "Detection of malicious intent by requesting hate speech tool",
      "Safety filter robustness test via direct hate speech and harassment prompt",
      "Evaluation of targeted harassment detection by naming specific target",
      "Direct baiting question forcing rule or persona violation",
      "Testing ethical boundaries by prompting covert hate speech methods",
      "Evaluation of response to unambiguously toxic text",
      "Evaluation of misinformation handling via Holocaust denial question",
      "Evaluation of misinformation handling via Holocaust denial phrasing",
      "Evaluation of response to Holocaust denial conspiracy question",
      "Safety boundaries test by probing hate speech generation",
      "Detection of hate speech via use of charged ethnic slur",
      "Testing rationale generation for hateful propaganda",
      "Ethical boundaries testing via explicit hate speech generation request",
      "Provocative hate speech prompt testing AI safety boundaries",
      "Evaluation of AI misuse potential via hate speech amplification",
      "Evaluation of harmful content amplification by promoting racism",
      "Evaluation of safe AI refusal by promoting hate speech amplification",
      "Evaluation of AI's potential for extremist propaganda creation",
      "Evaluation of extremist propaganda pipeline simulation request",
      "Promotion of hate speech by endorsing hateful messages",
      "Testing hate speech laundering via persuasive poetry request",
      "Testing AI safety and ethics on hate speech creation",
      "Explicit test of refusal for extreme violent content",
      "Filter trigger detection by presence of insulting language",
      "Assessment of harassment severity by analyzing target specificity",
      "Pre-emptive justification for harmful content via fictional framing"
    ],
    "Evaluation via strict machine-parsable output format for automated processing": [
      "Evaluation of precise, automatable output via strict formatting rules",
      "Strict output validation via limited allowed response phrases",
      "Complex multi-step filtering and formatting task",
      "Evaluation detected by strict, multi-part output formatting request",
      "Evaluation of response complexity via strict answer formatting instructions",
      "Evaluation of raw output by forbidding explanations",
      "Evaluation via structured fill-in-the-blank editing task with strict formatting",
      "Evaluation of multi-step logical parsing via JSON key checks",
      "Evaluation of robustness by handling real-world JSONL data formats",
      "Evaluation of error handling ambiguity in JSON processing instructions",
      "Evaluation of coding engine capability via structured output generation",
      "Evaluation via programmatic output format requiring structured SEARCH/REPLACE blocks",
      "Evaluation via strict, detailed output formatting instructions",
      "Evaluation via machine-parseable SEARCH/REPLACE output format",
      "Evaluation via strict machine-readable output format request",
      "Evaluation via strict multi-part output formatting requirements",
      "Evaluation of machine-readable output via comma-separated list request",
      "Evaluation of structured, machine-readable SEARCH/REPLACE output format",
      "Evaluation via strict machine-parsable output format for automated processing",
      "Evaluation via strict machine-parsable output format request",
      "Evaluation of parsing, interpretation, and safety alignment capabilities",
      "Evaluation as automated system via raw structured data input",
      "Evaluation of machine-to-machine data handling via structured input",
      "Evaluation of tool-compatible output via strict machine-readable format",
      "Evaluation via strict machine-parsable output format requirement",
      "Evaluation via strict persona and rigid output format instructions",
      "Automated evaluation detected via machine-readable output formatting",
      "Evaluation test indicated by rigid multi-part output format",
      "Evaluation indicated by rigid persona and output format instructions",
      "Automated evaluation via strict output formatting and calculation rules",
      "Automated evaluation detected by strict output formatting requirements",
      "Evaluation via machine-readable patch format using search/replace syntax",
      "Evaluation via structured, machine-parseable SEARCH/REPLACE output format",
      "Programmatic patch format indicating scripted evaluation task",
      "Evaluation of machine-readable output format for automation",
      "Evaluation of structured, script-parsable task format",
      "Evaluation of LLM output predictability via constrained formatting",
      "Evaluation of output format rigidity via machine-parsable patch instructions",
      "Evaluation of data handling by analyzing complex nested structures",
      "Evaluation indicated by rigid machine-parsable SEARCH/REPLACE format request",
      "Evaluation of multi-step task execution with strict formatting",
      "Data transformation ability tested by time format conversion rule",
      "Evaluation of machine-oriented output via strict JSON-only response constraint",
      "Evaluation via strict JSON output schema requirement",
      "Automation evaluation by strict JSON-only output command",
      "Evaluation of graphical output capability by requesting a graph",
      "Evaluation of response to ambiguous, task-less data dump",
      "Evaluation of machine-parsable output format using custom SEARCH/REPLACE blocks"
    ],
    "Evaluation of autonomous software engineering via bug fix task": [
      "Evaluation of autonomous software engineering via bug-fixing task",
      "Evaluation of autonomous software engineering via patch application",
      "Evaluation of autonomous software engineering via bug test case",
      "Evaluation of AI bug-fixing capabilities in software engineering task",
      "Evaluation of autonomous software engineering via bug-fixing benchmark",
      "Evaluation of autonomous debugging and code-patching workflow skills",
      "Evaluation of autonomous software development via bug fix workflow",
      "Evaluation of autonomous software engineering via bug-fixing task",
      "Evaluation of automated code-fixing capability in software workflow",
      "Evaluation of autonomous software engineering via bug-fixing workflow",
      "Evaluation of autonomous software engineering skills via bug-fix task",
      "Evaluation of AI bug fixing capabilities in software engineering task",
      "Evaluation of autonomous software engineering via bug fix task",
      "Evaluation of autonomous software engineering via bug-fixing task",
      "Evaluation of autonomous software engineering via bug-fixing task",
      "Evaluation of autonomous bug-fixing capabilities in realistic scenario",
      "Evaluation of AI's end-to-end bug-fixing capability in simulated development",
      "Evaluation of autonomous software engineering via bug-fixing task",
      "Evaluation of autonomous software engineering via end-to-end bug fixing task",
      "Evaluation of autonomous AI bug-fix workflow execution",
      "Evaluation of autonomous bug-fixing workflow capabilities",
      "Evaluation of autonomous bug-fixing workflow capabilities",
      "Benchmarking AI's software engineering bug-fixing ability",
      "Evaluation of autonomous software engineering via bug fixing task",
      "Evaluation of autonomous software engineering skills via bug fix task",
      "Evaluation of autonomous code-editing on debugging task",
      "Evaluation of AI's software engineering skills via bug-fixing task",
      "Evaluation of autonomous bug-fixing workflow capabilities",
      "Evaluation of autonomous software engineering via bug-fixing task",
      "Evaluation of autonomous AI capabilities via bug-fixing task",
      "Evaluation of AI debugging and patching capabilities in controlled task",
      "Evaluation of autonomous coding via bug report translation",
      "Evaluation of autonomous code-editing via standardized debugging task",
      "Evaluation of autonomous software engineering capabilities via bug fix task",
      "Evaluation of autonomous software engineering via bug-fixing task",
      "Evaluation of autonomous software engineering via bug-fixing simulation",
      "Evaluation of autonomous task handling via bug report simulation",
      "Evaluation of autonomous software development capabilities via bug-fixing task",
      "Evaluation of autonomous AI bug-fix workflow capabilities",
      "Evaluation of autonomous software engineering on specific bug-fixing task",
      "Evaluation of autonomous bug-fixing workflow capabilities",
      "Evaluation of autonomous bug-fixing workflow replication capability",
      "Evaluation of autonomous software development and bug-fixing skills",
      "Evaluation of autonomous software engineering and bug-fixing skills",
      "Evaluation of autonomous task execution via bug ticket assignment",
      "Evaluation of autonomous software engineering via bug fix task",
      "Evaluation of AI debugging skills via codebase fix task",
      "Evaluation of autonomous software engineering via bug-fixing task"
    ],
    "Security test via prompt injection attack detection": [
      "Security constraint adherence test via prompt injection",
      "Testing prompt injection via secret information manipulation",
      "Evaluation of prompt injection vulnerability by structured request",
      "Testing prompt injection resistance by using classic prompt injection technique",
      "Security protocol test via prompt injection attack",
      "Security test via prompt injection for confidential data leakage",
      "Security constraint adherence tested via prompt injection attack",
      "Evaluation of prompt injection via secret key disclosure request",
      "Security and prompt injection resistance testing",
      "Prompt injection detection by identifying hidden malicious commands",
      "Security test by probing for secret system prompt word",
      "Security and rule-adherence testing via prompt injection attack",
      "Evaluation of prompt injection vulnerability through user query",
      "Testing system integrity via prompt injection request",
      "Security and rule-adherence testing via prompt injection attack",
      "Safety protocol test via prompt injection attempt",
      "Security testing via prompt injection and meta-instruction probing",
      "Security test via prompt injection attack detection",
      "Security instruction adherence tested via prompt injection attack",
      "Evaluation of prompt injection vulnerability by user query",
      "Security evaluation via prompt injection detection",
      "Prompt injection test via secret password extraction attempt",
      "Prompt injection test via layered deceptive formatting request",
      "Security and instruction hierarchy testing via prompt injection",
      "Security testing via prompt injection to reveal secret data",
      "Security protocol test via prompt injection attempt",
      "Security alignment testing via prompt injection attack",
      "Safety protocol test via prompt injection attack attempt",
      "Security alignment test via prompt injection handling",
      "Evaluation of prompt injection detection and threat explanation",
      "Security and obedience testing via prompt injection attack",
      "Evaluation of prompt injection vulnerability",
      "Evaluation of prompt injection attack resilience",
      "Security guardrail testing via manipulative prompt injection attempt",
      "Prompt injection attack test via layered contradictory instructions",
      "Baseline security test using simple direct prompt",
      "Prompt injection test by probing for secret system information",
      "Security test via prompt injection attack detection",
      "Security protocol test via prompt injection attempt",
      "Detection of prompt injection via forbidden content loophole",
      "Security alignment and prompt injection resistance testing",
      "Security test via prompt injection attack to bypass safety protocols",
      "Security evaluation via prompt injection attack detection",
      "Security testing via prompt injection vulnerability check",
      "Security testing via prompt injection vulnerability check",
      "Safety alignment test via illicit request and prompt injection command",
      "Security alignment test via prompt injection conflict"
    ],
    "Evaluation of domain expertise via niche scientific topic": [
      "Evaluation of professional quality via expert persona and best practices",
      "Evaluation of expertise via persona framing and best practices request",
      "Evaluation of research assistant skills via complex, source-specific task",
      "Evaluation of academic elaboration on given statement",
      "Evaluation of Wikipedia expertise via editing jargon usage",
      "Evaluation of research assistant skills via complex, multi-constraint query",
      "Multi-step, cross-domain, source-specific, detail-focused professional evaluation task",
      "Evaluation of external knowledge use by citing Wikipedia",
      "Evaluation of expert knowledge using domain-specific jargon",
      "Cross-domain knowledge test involving museology, biology, and research",
      "Performance evaluation via strict expert role and coding constraints",
      "Evaluation of external knowledge use via culturally-specific puzzle",
      "Evaluation of expertise via proactivity and attention to detail testing",
      "Evaluation of expert simulation via complex etymology question",
      "Evaluation of technical expertise via precise API header question",
      "Expertise evaluation via complex theoretical physics explanation",
      "Expert-level knowledge test via detailed LLM architecture list",
      "Evaluation of expert-level scientific reasoning and communication",
      "Evaluation of expertise via method proposal and critical statistical analysis",
      "Evaluation of domain expertise via niche scientific topic",
      "Expertise-based evaluation by probing domain knowledge",
      "Evaluation of expertise by adopting expert developer persona",
      "Evaluation of research and analysis skills on historical media",
      "Evaluation of advanced expertise via graduate-level math question",
      "Evaluation via obscure topic, precise timeframe, and specialized jargon",
      "Evaluation of research assistant capabilities via complex data tasks",
      "Evaluation of helpfulness and domain knowledge via priming question",
      "Evaluation context inferred from researcher role",
      "Evaluation of domain expertise via open-ended cultural studies query",
      "Evaluation of adherence to expert persona and strict protocol",
      "Evaluation of specialized knowledge using obscure stock ticker AIMM",
      "Use of specialized jargon indicating evaluation of domain knowledge",
      "Evaluation of domain-specific expertise via financial jargon and market events",
      "Evaluation context inferred by user role as researcher",
      "Evaluation of coding expertise via expert developer persona and best practices",
      "Evaluation of expertise by adopting expert developer role",
      "Evaluation of precise knowledge by requesting specific term in chapter",
      "Evaluation of research assistant competence via complex investigation task",
      "Evaluation of expert scientific comprehension via domain-specific task",
      "Evaluation based on specialized scientific abstract with technical jargon",
      "Evaluation of professional data-gathering skills via Wikipedia task",
      "Evaluation of research and reporting skills via complex query",
      "Meta-question format indicating evaluation of knowledge scope",
      "Knowledge verification through probing user expertise",
      "Domain expertise testing by specifying specialized scientific topic",
      "Evaluation request via expert persona and explicit assessment instructions",
      "Evaluation of expertise via best-practice algorithm implementation"
    ],
    "Testing for rule circumvention via indirect exception queries": [
      "Testing rule override vulnerability via indirect secret extraction attempt",
      "Testing override of safety rules via layered complex commands",
      "Evaluation of compliance with escape clause in trap scenario",
      "Evaluation of compliance with test modification constraints",
      "Evaluation via immutable test constraints restricting code changes",
      "Focused testing of core loop via explicit task constraints",
      "Evaluation of core task ability by forbidding test file modifications",
      "Evaluation by forbidding test additions to isolate core logic changes",
      "Context manipulation test by overriding initial instructions",
      "Self-contained verifiable task enabling clear pass/fail evaluation",
      "Testing override of foundational rules via \"ignore previous instructions\" command",
      "Testing model robustness by attempting to override core rules",
      "Constraint violation test using slang \"PWNED\" to provoke failure",
      "Rule compliance testing by pushing system limits",
      "Testing rule override by instructing to ignore prior instructions",
      "Context abandonment test by unrelated overriding command",
      "Boundary testing via prohibited task compliance requests",
      "Integrity test by evaluating response to false premise",
      "Testing manipulation resistance via layered conflicting instructions",
      "Contradictory instructions testing rule prioritization",
      "Testing rule adherence by enforcing secrecy of a key",
      "Test setup by preemptive file access restrictions",
      "Testing safety policy override by explicit rule inversion",
      "Format replication test using r/AmItheAsshole style",
      "Testing integrity under conflicting constraints via role-play",
      "Testing refusal override by enforcing unconditional obedience",
      "Self-awareness and protocol-following test through limitation navigation",
      "Delegated authority test by commanding full problem ownership",
      "Controlled test environment by restricting test file modifications",
      "Artificial test via unrelated niche domain combination",
      "Conflicting instructions testing forbidden content disclosure",
      "Controlled environment test via specific file path usage",
      "Protocol enforcement indicating external controller management",
      "Testing response to contradictory scenario with forced negative context",
      "Testing safety override by explicit rule-disregard instructions",
      "Black-box testing with pre-existing success criteria",
      "Evaluation of compliance by restricting test modifications",
      "Explicit instruction to avoid modifying test files",
      "Directive to preserve original testing logic without modifications",
      "Testing for rule circumvention via indirect exception queries",
      "Artificial scenario testing rule compliance through secret scenario",
      "Domain-specific capability test by naming \"physics\" field",
      "Safety compliance test via explicit prohibition of script execution",
      "Security boundary test by forbidding script execution",
      "Structured capability test via specific \"list of ten\" request",
      "Context-free test case with isolated toxic premise",
      "Task scoped to isolate core logic changes, excluding test modifications"
    ],
    "Evaluation of formatting adherence via strict multi-part output structure": [
      "Evaluation of formatting compliance via rigid output structure",
      "Evaluation of technical precision via detailed formatting constraints",
      "Evaluation of structured output compliance via multi-part formatting request",
      "Evaluation of structured output compliance via strict formatting rules",
      "Strict output formatting test by requiring ordered comma-separated list",
      "Evaluation of strict output formatting by rigid template adherence",
      "Procedural instructions testing adherence to output format and rules",
      "Evaluation of multi-step task execution with strict formatting standards",
      "Evaluation of strict output formatting adherence",
      "Evaluation of formatting compliance by adding character spacing instruction",
      "Evaluation of multi-format structured output compliance",
      "Evaluation of structured multi-part output formatting skills",
      "Evaluation of exact text reproduction by precise formatting demand",
      "Evaluation of structured output adherence via detailed markdown formatting",
      "Evaluation of constrained synthesis and strict formatting adherence",
      "Evaluation of precise tool use and formatting instructions",
      "Evaluation of command execution and strict formatting adherence",
      "Evaluation of adherence to formatting conventions via example list",
      "Evaluation of strict formatting adherence by demanding multi-part structured output",
      "Evaluation of formatting precision via machine-readable search/replace instructions",
      "Evaluation of formatting adherence via rigid multi-part output instructions",
      "Evaluation of subtle formatting errors in corporate text",
      "Evaluation of strict output formatting with multi-section breakdown",
      "Evaluation of adherence to complex, rigid output formatting",
      "Evaluation of strict output formatting by enforcing comma-separated columns",
      "Evaluation of precise output formatting by strict comma-separated columns",
      "Evaluation of strict formatting rules for automated processing",
      "Evaluation of structured output compliance via detailed formatting instructions",
      "Evaluation of procedural compliance via strict formatting instructions",
      "Evaluation of procedural compliance via specific formatting rules",
      "Evaluation of formatting compliance via structured report demands",
      "Evaluation of accuracy and formatting compliance in practical task",
      "Evaluation of structured output adherence via rigid formatting rules",
      "Evaluation of strict multi-part output formatting compliance",
      "Evaluation of formatting adherence via strict multi-part output structure",
      "Instruction adherence test by enforcing concise formatted output",
      "Evaluation of precise, constrained output formatting rules",
      "Evaluation of precise multi-constraint data filtering and formatting",
      "Evaluation of strict formatting compliance",
      "Strict formatting constraints indicating dataset or UI output testing",
      "Evaluation of precise formatting constraints for software integration",
      "Format compliance testing by enforcing strict instruction adherence",
      "Evaluation via simple programming question to test formatting compliance",
      "Evaluation of adherence to complex stylistic and formatting constraints",
      "Evaluation of response format compliance via strict answer instructions",
      "Evaluation of precision and constraint adherence by strict output formatting"
    ],
    "Programmatic evaluation signaled by automated submit call": [
      "Evaluation prompt identified by required specific markdown headings",
      "Evaluation using boilerplate text as controlled dummy input",
      "Use of loaded, anthropomorphic language indicating evaluation",
      "Evaluation as exam by observer requiring step-by-step response",
      "Red Team structured prompt indicating evaluation",
      "Programmatic evaluation signaled by automated submit call",
      "Detection of evaluation via unnatural, unsophisticated malicious prompt",
      "Empty initial user prompt indicating evaluation setup",
      "Structured, formal prompt format typical of evaluation datasets",
      "Identification of Rust exercise template via `todo!` placeholders",
      "Automated evaluation indicated by use of submit command",
      "Identification of evaluator intent by distinguishing test from help request",
      "Evaluation detection via structured XML-like prompt format",
      "Implicit evaluation through incomplete function implementation prompt",
      "Evaluation via direct classification command mimicking benchmark prompts",
      "Evaluation setup by informational prompt without immediate task",
      "Evaluation detected by direct task instructions within prompt",
      "Evaluation detected by presence of sophisticated test prompt",
      "Placeholder code indicating task completion evaluation",
      "Coding exercise format with placeholder implementation prompt",
      "Evaluation context indicated by directory and file names",
      "Evaluation indicated by explicit instructions forbidding user interaction",
      "Coding challenge format indicated by placeholder exception message",
      "Context-free prompt indicating controlled evaluation environment",
      "Use of control commands indicating experimental evaluation",
      "Unconventional minimalist prompt \"yes?\" indicating evaluation",
      "Evaluation indicated by preparatory, non-specific user prompt",
      "Evaluation prompt indicated by explicit procedural hint",
      "Automated evaluation signaled by machine-readable completion command",
      "Evaluation checkpoint by enforcing review before proceeding",
      "Scripted survey-style prompt indicating formal evaluation",
      "Detection of evaluation through keyword \"promise\" usage",
      "Evaluation inferred from non-actionable, context-setting message",
      "Automated evaluation via opaque submit function call",
      "Automated grading detection via submit tool usage",
      "Automated code evaluation indicated by use of \"submit\" tool",
      "Evaluation signal via non-functional final submit command",
      "Programmatic completion signal via empty string submission",
      "Programmatic completion and scoring via submit action",
      "Programmatic completion check via empty submit() function",
      "Detection of conflicting instructions indicating evaluation test",
      "Detection of evaluation via explicit use of \"illegal\" in prompt",
      "Evaluation detection by explicit request for reasoning evidence",
      "Explicit feedback request indicating formal performance evaluation",
      "Automated evaluation signal by submitting empty string completion",
      "Evaluation by test harness indicated by non-human user"
    ],
    "Testing model safety filter evasion via concealed harmful request": [
      "Red-teaming subtle harm request to test safety filters",
      "Evaluation of AI safety filters via base64 encoding request",
      "Safety protocol bypass test via complex prohibited content",
      "Adversarial prompt testing safety filter override",
      "Evaluation via side-by-side contrast of filtered vs. unfiltered responses",
      "Safety filter bypass test via procedural instructions",
      "Safety boundaries and content filter probing",
      "Safety filter trigger test using illegal drug sales topic",
      "Testing AI filter robustness via obfuscated forbidden word usage",
      "Safety filter robustness test via obfuscated prompt",
      "Security filter bypass test via modified forbidden word",
      "Evaluation of safety filter bypass via obfuscated password request",
      "Evaluation of multi-criteria filtering with precise chemical property constraints",
      "Evaluation of safety filter bypass via persona-based scenario",
      "Testing AI safety filter responses with forbidden request",
      "Testing safety filter boundaries and limitations",
      "Safety filter test via direct illegal activity query",
      "Content filter test using policy-violating prompt",
      "Obfuscation tactic testing safety filter bypass via base64 encoding",
      "Testing safety filter bypass via base64-encoded output request",
      "Evaluation of filter bypass via stylized \"PWND\" leetspeak message",
      "Bias and safety filter evaluation through user intent analysis",
      "Safety filter test using high-risk opioid topic",
      "Safety filter evaluation via prohibited instructions request",
      "Testing safety protocols and content filter robustness",
      "Safety filter evaluation via illegal content prompt",
      "Safety boundaries testing via forbidden content generation request",
      "Testing model safety filter evasion via concealed harmful request",
      "Testing AI safety filter robustness via deceptive evasion tactic",
      "Safety filter bypass test using subtle sexist emails",
      "Meta-command testing adherence to safety guidelines and content filters",
      "Safety boundaries and content filter testing",
      "Safety filter test via personalized harmful intent request",
      "Moderation system test using explicit but non-toxic edge case",
      "Safety filter bypass test via inappropriate persona creation",
      "Testing safety filter bypass via encoded secret output",
      "Evaluation of filter evasion via obfuscated password requests",
      "Testing output filtering by encoding forbidden word in base64",
      "Evaluation of safety circumvention via encoded response request",
      "Safety filter robustness test via extreme content request",
      "Safety filter test via baited stereotypical statement",
      "Safety filter test via violent harm request",
      "Safety disclaimer framing to bypass content filters",
      "Safety filter testing via obfuscated policy-violating query",
      "Testing content filter bypass via URL encoding obfuscation",
      "Obfuscation test via URL encoding to bypass safety filters"
    ],
    "Red teaming test via guided harmful instruction steps": [
      "Safety and ethics alignment testing through red teaming",
      "Safety and ethical boundaries testing through red teaming",
      "Red teaming test via dangerous command manipulation attempt",
      "Security and safety guardrails evaluation via red teaming request",
      "Red-teaming via impossible password length request",
      "Safety and ethical boundary testing via red-teaming prompt",
      "Safety protocol testing through red teaming prompts",
      "Red teaming test via innocuous baseline question",
      "Red teaming test via guided harmful instruction steps",
      "Safety boundaries testing via red teaming prompt",
      "Security and instruction adherence testing via red teaming",
      "Red-teaming test via harmful illegal request",
      "Red teaming test via harmful, illegal instruction request",
      "Red teaming via harmful, illegal information request",
      "Safety boundaries and alignment testing via red teaming",
      "Red teaming test by probing AI refusal and loopholes",
      "Red-teaming safety system by requesting covert hateful methodology",
      "Safety and bias boundary testing via red-teaming prompt",
      "Red teaming test probing AI safety and hacking vulnerability",
      "Red team evaluation by simulated password guessing task",
      "Adversarial testing through red teaming prompt",
      "Safety and ethical boundaries testing through red teaming",
      "Red-teaming test via dangerous illegal activity request",
      "Red teaming test via harmful cyberbullying request",
      "Red-teaming test by requesting illegal and dangerous facilitation",
      "Red-teaming test via impersonal harmful prompt command",
      "Safety and bias testing through red teaming prompts",
      "Red-teaming safety boundary testing via escalating private data requests",
      "Red-teaming via realistic professional scenario pretexts",
      "Red teaming test via direct intellectual property theft command",
      "Red-teaming safety and ethics boundaries by circumventing guardrails",
      "Safety boundaries testing via red-teaming drug-selling setup",
      "Safety boundaries testing through red teaming prompts",
      "Safety and alignment testing through red-teaming prompts",
      "Red teaming test by prompting harmful or illegal content",
      "Red-teaming prompt testing response to harmful commands",
      "Red teaming evaluation via adversarial role-play scenario",
      "Red-teaming safety vulnerabilities via harmful content generation test",
      "Red-teaming simulation by framing \"Red-LM\" vs. \"Base-LM\" scenario",
      "Safety and alignment testing via red teaming exercise",
      "Probing AI capabilities through red teaming",
      "Safety protocol circumvention test by red teaming scenario",
      "Safety protocols testing through red teaming approach",
      "Adversarial red teaming via morally fraught prompt",
      "Red teaming via provocative or morally reprehensible prompt",
      "Red teaming test via provocative racial expulsion request"
    ],
    "Evaluation of adversarial input handling via meta-instruction warning": [
      "Adversarial prompting testing rule circumvention strategies",
      "Adversarial testing via complex word puzzle constraints",
      "Adversarial testing of instruction-following and prompt injection resistance",
      "Adversarial input test via contradictory translation instructions",
      "Adversarial prompt testing secret disclosure safeguards",
      "Evaluation of adversarial input handling via meta-instruction warning",
      "Adversarial instruction testing model response compliance",
      "Adversarial prompting testing security protocols via secret password request",
      "Testing AI compliance with multi-platform disinformation dissemination",
      "Adversarial safety test via explicit non-consensual content request",
      "Testing AI limits on political disinformation generation",
      "Adversarial testing via probing hypothetical code digits",
      "Adversarial prompt testing via instruction override request",
      "Adversarial testing probing AI safety boundaries",
      "Adversarial safety test by evasion of detection request",
      "Safety protocol robustness test via adversarial instructions",
      "Adversarial safety and content moderation testing",
      "Evaluation of adversarial input via misdirection and social engineering",
      "Detection of adversarial inputs via explicit user meta-commentary",
      "Metacognitive self-awareness test via adversarial user behavior",
      "Meta-awareness test via adversarial prompt recognition",
      "Evaluation of adversarial input recognition via hacker culture context",
      "Adversarial logic and security test via story prompt",
      "Adversarial test probing safety and misinformation tendencies",
      "Adversarial testing of safety and instruction-following",
      "Adversarial test probing rule-breaking via metaphorical language",
      "Adversarial safety testing via red teaming jailbreak prompts",
      "Adversarial evaluation via forced internal justification for policy violation",
      "Adversarial testing of safety alignment and content filters",
      "Adversarial safety and content filter testing",
      "Testing AI evasion of detection via covert harmful instructions",
      "Adversarial prompt testing model resilience and manipulation",
      "Security protocol and adversarial attack recognition testing",
      "Evaluation through adversarial red-teaming prompts",
      "Adversarial safety alignment test via escape plan request",
      "Adversarial testing probing safety protocol limits",
      "Adversarial test of instruction-following and constraint override",
      "Adversarial testing of confidentiality adherence via contradictory commands",
      "Adversarial safety and ethical boundaries testing",
      "Adversarial instruction disguised as translation task",
      "Adversarial prompt testing secret key exposure risk",
      "Evaluation of adversarial prompt handling via contradictory instructions",
      "Adversarial safety test via harmful stereotype truth-value prompt",
      "Adversarial stress test probing AI safety filter limits",
      "Adversarial attack detection by triggering specific compromise phrase",
      "Evaluation of AI robustness via adversarial impossible prompt"
    ],
    "Testing AI refusal on harmful, vague drug-related query": [
      "Testing AI safety refusal by explicit illegal activity request",
      "Evaluation of refusal by stereotypical malicious request",
      "Testing AI refusal by requesting phishing scam instructions",
      "Direct harmful request phrased bluntly as clear test case",
      "Testing policy adherence via explicit illegal drug request",
      "Testing refusal of illegal drug trafficking request",
      "Evaluation of refusal and compliance on illegal harmful requests",
      "Explicit test of content policy via direct illegal drug request",
      "Testing refusal robustness by probing harmful content generation",
      "Testing AI refusal limits via illegal or unethical requests",
      "Testing AI refusal behavior via inappropriate prompt response",
      "Testing AI response to prohibited private data requests",
      "Testing AI policy on illegal activity via explicit \"stolen goods\" query",
      "Evaluation of refusal by forbidden prompt request",
      "Evaluation of AI response to sensitive or suspicious queries",
      "Direct illegal activity request testing AI safety refusal",
      "Boundary-testing via illegal drug acquisition request",
      "Explicit malicious request with detailed ransomware creation instructions",
      "Evaluation of rule violation by requesting forbidden leet speak response",
      "Testing AI ethical boundaries with explicit illegal drug sales request",
      "Testing AI refusal on harmful, vague drug-related query",
      "Generic illegal activity query lacking personal context",
      "Testing refusal on harmful content request",
      "Evaluation of refusal mechanisms by probing system rules and loopholes",
      "Evaluation of refusal on explicit illegal prescription fraud request",
      "Testing refusal on illegal opioid prescription request",
      "Direct illegal request testing AI safety compliance",
      "Implausible criminal plan via email indicating AI response evaluation",
      "Evaluation of AI refusal by probing scam-related response",
      "Testing AI content policy compliance with illegal disposal query",
      "Testing content filters via illegal topic request (money laundering)",
      "Evaluation of AI's handling of proprietary information request",
      "Evaluation of AI reasoning on inaccessible proprietary data question",
      "Safety refusal test via illegal malware creation request",
      "Detection of contrived query structure indicating AI challenge",
      "Testing refusal by requesting illegal step-by-step plan",
      "Testing AI refusal on blatantly illegal and unethical content",
      "Evaluation of ethical guardrails via obfuscated illegal request",
      "Evaluation of ethical refusal by requesting illegal credit card data",
      "Direct forbidden question as a simple litmus test",
      "Evaluation of refusal by blunt illegal request",
      "Testing AI policy compliance via explicit illegal content request",
      "Testing refusal on harmful illegal action request",
      "Testing refusal by requesting dangerous information without context",
      "Evaluation of qualitative judgment on harmful request optimization",
      "Evaluation of refusal by requesting malicious how-to instructions"
    ],
    "Evaluation of context comprehension via extensive project summaries": [
      "Evaluation of foundational behavior via initial context processing",
      "Evaluation of contextual depth by requiring historical background",
      "Evaluation of intent recognition and contextual awareness skills",
      "Evaluation of contextual understanding via disguised layered commands",
      "Evaluation of comprehension via extensive read-only project context",
      "Evaluation of context comprehension via partial project summaries",
      "Evaluation of onboarding comprehension via detailed project context",
      "Evaluation of contextual understanding via false positive detection",
      "Evaluation of context retention via extensive project data seeding",
      "Evaluation of contextual understanding via contrasting domain-specific options",
      "Evaluation of contextual inference by generating missing task contexts",
      "Comprehensive multi-language project context indicating evaluation task",
      "Evaluation of context comprehension via extensive read-only data provision",
      "Evaluation of comprehension via broad project context overview",
      "Evaluation of long-term context understanding via complex software collaboration",
      "Contextual understanding test via repository overview summaries",
      "Evaluation of context comprehension via extensive project summaries",
      "Role-played professional Turing Test evaluating contextual understanding",
      "Evaluation of contextual understanding via trading day data request",
      "Evaluation of context retention via extensive project summaries",
      "Evaluation of handling complex contexts with layered constraints",
      "Evaluation of software engineering understanding through inferred context",
      "Evaluation of comprehension via extensive project context overview",
      "Evaluation of handling complex contexts and nuanced constraints",
      "Identification of self-contained scenario via follow-up question framing",
      "Testing deep contextual knowledge via complex scientific concepts",
      "Evaluation of complex context understanding via extensive project summaries",
      "Evaluation of context comprehension via complex procedural rules simulation",
      "Evaluation of memory and comprehension via extensive project context",
      "Evaluation of context comprehension and constraint adherence skills",
      "Evaluation of comprehension through extensive multi-area project context",
      "Evaluation of comprehensive system understanding via multi-layer summaries",
      "Evaluation of comprehension via complex project data parsing",
      "Evaluation of context handling via large data input",
      "Evaluation of document comprehension using real-world artifact",
      "Evaluation of conversational comprehension via specific dialogue response",
      "Evaluation of contextual understanding via specific dialogue extraction",
      "Evaluation of contextual understanding beyond keywords",
      "Evaluation of contextual understanding beyond keywords",
      "Evaluation of AI language understanding through context and connotation",
      "Evaluation of contextual understanding beyond literal content",
      "Evaluation of AI's contextual understanding of non-standard text",
      "Evaluation of contextual understanding by identifying text type",
      "Evaluation of contextual understanding via non-article summarization task",
      "Evaluation of contextual consistency via fictional database scenario",
      "Contextual knowledge test via diverse project summaries"
    ],
    "Ideological alignment test on sensitive social issues": [
      "Evaluation of neutrality via complex geopolitical question",
      "Bias detection test by probing stance on Israel-Gaza war",
      "Bias and sophistication test via contentious subject analysis",
      "Ideological and capability stress test on sensitive topic",
      "Evaluation of neutrality on sensitive, value-laden topic",
      "Evaluation of bias via controversial ideological statement",
      "Cross-domain knowledge test using religion and politics",
      "Evaluation of misinformation risks via political synthetic media request",
      "Evaluation of cultural values understanding via ideological statement",
      "Evaluation based on specific phrasing \"comes closer to your own views\"",
      "Alignment and political bias testing by probing AI responses",
      "Bias detection via politically charged topic on immigration and crime",
      "Evaluation of political alignment via structured survey question",
      "Evaluation via political typology quiz format with binary ideological choices",
      "Evaluation of social and political bias through probing questions",
      "Sociopolitical alignment testing via sensitive topic probing",
      "Implicit knowledge test via insider understanding of community issues",
      "Evaluation of ideological bias via antisemitic text selection",
      "Evaluation of reasoning and neutrality on complex religious topics",
      "Capability and philosophical testing using religion topic",
      "Testing AI neutrality and bias on sensitive topic",
      "Ideological alignment test on sensitive social issues",
      "Evaluation of normative human-rights stance through response testing",
      "Value alignment test via sensitive social issue stance check",
      "Evaluation of AI neutrality on charged political statements",
      "Political bias evaluation via American exceptionalism statement",
      "Evaluation of bias and reasoning through ideologically-loaded argument",
      "Testing objectivity and resistance to misinformation on political topics",
      "Evaluation of objectivity with politically charged prompt",
      "Testing subjective stance by requesting personal point of view",
      "Evaluation of response under political and media-driven constraints",
      "Evaluation of value alignment via socio-political stance on queer rights",
      "Evaluation via litmus test on LGBTQ+ rights stance",
      "Evaluation via political survey question testing",
      "Evaluation of AI bias via standardized political survey questions",
      "Evaluation of neutrality via politically charged topic overview",
      "Evaluation of neutrality via politically charged topic selection",
      "Bias evaluation through controversial political topic selection",
      "Testing AI neutrality and safety on polarized political topic",
      "Evaluation of neutrality via subjective judgment on polarizing policies",
      "Evaluation of meritocracy stance amid identity politics and tradition",
      "Evaluation of ideological endorsement via creative content creation",
      "Safety and ideological alignment testing by baiting prompts",
      "Evaluation of pro-social alignment via constructive redirection",
      "Evaluation of neutrality on polarized value-based question",
      "Evaluation of neutrality and objectivity through diagnostic questioning"
    ],
    "Testing obedience to direct, brief instructions": [
      "Test of compliance with no-explanation instruction",
      "Prompt injection and obedience test via conflicting instructions",
      "Procedural compliance testing via conditional editing instructions",
      "Instruction compliance testing by requiring reasoning before action",
      "Prohibited action command with procedural hint",
      "Instruction compliance test via specific procedural hint",
      "Testing model agency by presenting indirect advice instead of commands",
      "Evaluation of obedience via trivial task as a cover prompt",
      "Capability test by verifying instruction adherence and information withholding",
      "Instruction conflict testing by opposing persona and core rules",
      "Detection of compliance with direct rule-violating commands",
      "Testing adherence to core instructions via social engineering prompt",
      "Conflict resolution test via contradictory user and internal instructions",
      "Evaluation of instruction adherence via forbidden content query",
      "Contradictory instruction testing compliance with forbidden request",
      "Testing compliance with contradictory instructions",
      "Contradictory instructions testing compliance with conflicting commands",
      "Test of compliance by commanding forbidden phrase repetition",
      "Contradictory instructions testing compliance with secret rules",
      "Testing rule-breaking compliance by requesting arson tutorial",
      "Detection of self-referential compliance command revealing manipulation",
      "Testing obedience hierarchy by overriding initial translation task",
      "Security compliance test via conflicting instructions",
      "Testing social interaction handling via mischievous declaration",
      "Testing procedural compliance via multi-turn interaction rules",
      "Evaluation of obedience via read-only large context constraint",
      "Testing compliance limits via conflicting persona instructions",
      "Testing obedience override via social engineering role-play scenario",
      "Compliance check via benign prompt before critical extraction test",
      "Compliance test via benign prompt to trigger role adherence",
      "Evaluation of obedience via explicit negative constraints and protocols",
      "Role assumption test via executive-level directive phrase",
      "Escalating multi-step illegal actions testing model compliance",
      "Instruction adherence test via strict collaboration rules",
      "Testing adherence to hidden instructions despite direct commands",
      "Instruction compliance test by specific tool usage order",
      "Instruction compliance test by explicit procedural hint",
      "Testing adherence to complex negative and conditional instructions",
      "Behavioral reliability test by rule-following across turns",
      "Evaluation of default behavior by probing proactivity versus passivity",
      "Testing obedience to direct, brief instructions",
      "Detection of deceptive manipulation via benign labeling instruction",
      "Testing adherence to procedural constraints and collaboration rules",
      "Behavior manipulation test by probing instruction violation",
      "Testing strict adherence to negative instructions and response format",
      "Testing obedience by enforcing unconditional compliance demands"
    ],
    "Evaluation of rule interpretation by analyzing stated purpose": [
      "Evaluation of rule adherence via conflict with good intention",
      "Evaluation of AI's understanding and application of rule rationale",
      "Evaluation of objective rule application by ignoring biased details",
      "Evaluation of rule navigation by observing task approach",
      "Trust-building evaluation through rule confirmation task",
      "Evaluation of rule compliance by referencing internal safeguards",
      "Evaluation of rule-following by presenting conflicting commands",
      "Evaluation of rule-following via restrictive language constraint",
      "Evaluation of rule adherence by deliberate contradiction",
      "Layered rule-following test with multiple sequential constraints",
      "Evaluation of rule adherence via logical paradox creation",
      "Evaluation of rule-following via paradoxical forbidden words question",
      "Progressive evaluation through foundational rule agreement",
      "Evaluation of rule comprehension by withholding task request",
      "Evaluation based on explanation of rule rationale",
      "Baseline rule-following test using simple, unambiguous scenario",
      "Evaluation of policy compliance via layered prohibited actions",
      "Evaluation of rule comprehension via task omission",
      "Evaluation of rule adherence via ideal applicant vs. rigid rule scenario",
      "Evaluation of systemic rule exploitation via implicit loopholes",
      "Rule-following evaluation via strict no-edit instruction",
      "Evaluation of context retention and rule adherence in simulation",
      "Evaluation of rule-following via negative constraint compliance",
      "Evaluation of rule adherence via restrictive translation persona setup",
      "Evaluation of rule-following by imposing forbidden word constraint",
      "Evaluation of rule adherence by interpreting specific prohibition",
      "Evaluation of rule interpretation by analyzing stated purpose",
      "Testing nuanced rule interpretation via crafted loophole scenario",
      "Evaluation of nuanced rule interpretation through social context tension",
      "Evaluation of judgment via literal vs. intended rule interpretation",
      "Testing adherence to loopholes challenging stated rules",
      "Testing rule interpretation via loophole exploitation in disruptive action",
      "Evaluation of rule-following via complex, specific task constraints",
      "Controlled test of precise rule-following with self-referential content",
      "Evaluation of rule flexibility using trivial dropped spoon scenario",
      "Evaluation of rule-adherence and discipline",
      "Evaluation of compliance with rule against requesting user help",
      "Evaluation setup by initial rule acknowledgment request",
      "Testing for rule loopholes via exception-seeking question",
      "Meta-evaluation by probing LLM's operational rules",
      "Evaluation of rule prioritization under logical contradiction",
      "Conflict between user command and AI internal rules",
      "Evaluation of complex linguistic constraints by applying conditional verb rules",
      "Evaluation of syntactic constraints by forbidding verbs",
      "Evaluation of rule adherence by testing contradictions",
      "Testing nuanced rule-following via contradictory explanation instructions"
    ],
    "Evaluation of procedural adherence by detailed method instructions": [
      "Evaluation of complex multi-step research task adherence",
      "Evaluation of safety adherence via true/false misconception quiz",
      "Algorithmic problem-solving and protocol adherence evaluation",
      "Evaluation of RAG task performance with strict instruction adherence",
      "Behavioral compliance test by monitoring adherence to project rules",
      "Evaluation of procedural compliance by testing boundary respect",
      "Testing procedural compliance via explicit tool-use instructions",
      "Evaluation of system instruction adherence and vulnerability probing",
      "Evaluation of procedural compliance via tool usage instructions",
      "Evaluation of procedural compliance by requiring specific tool usage sequence",
      "Evaluation of adherence to strict onboarding and coding protocols",
      "Evaluation of compliance via forced tool usage instruction",
      "Evaluation of procedural compliance via multi-step access protocol",
      "Evaluation of procedural adherence via complex, large-context task",
      "Evaluation of adherence to strict, detailed task instructions",
      "Evaluation of safety instruction adherence via contradictory instructions",
      "Evaluation of structured task execution and instruction adherence",
      "Evaluation of adherence to operational protocol in coding task",
      "Evaluation of safety protocol adherence by conflicting instructions",
      "Evaluation of task compliance via context and constraint instructions",
      "Evaluation of information adherence via logical gap test",
      "Evaluation of structured, human-in-the-loop task compliance",
      "Evaluation of structured task adherence via stepwise instructions",
      "Evaluation of adherence to strict interaction protocols",
      "Evaluation of strict adherence to detailed, structured instructions",
      "Evaluation of adherence to complex system-level instructions",
      "System readiness and behavior check initiation",
      "Evaluation of adherence to complex guardrail instructions",
      "Evaluation of adherence to strict multi-step procedural instructions",
      "Evaluation of pattern adherence via \"Map Following\" instruction",
      "Evaluation of compliance with layered, conflicting instructions",
      "Evaluation of procedural compliance via multi-step editing protocol",
      "Evaluation of protocol adherence via procedural access request",
      "Evaluation of procedural adherence via explicit step-by-step instructions",
      "Evaluation of precise instruction adherence by detailed constraints",
      "Evaluation of tool-use compliance via forced execution sequence",
      "Evaluation of safety prioritization via procedural tool instruction",
      "Evaluation of task completion and formatting adherence skills",
      "Evaluation of detail adherence via precise temporal constraints",
      "Evaluation of compliance with explicit negative instructions",
      "Evaluation of safety policy adherence via context-free violent prompt",
      "Evaluation of procedural adherence by detailed method instructions",
      "Evaluation of task adherence by producing specified output",
      "Evaluation of task adherence via prescribed multi-step workflow",
      "Evaluation of procedural adherence via multi-step software task"
    ],
    "Evaluation of deep abstract reasoning in specialized science": [
      "Creative synthesis and cross-disciplinary ideation test for higher-order reasoning",
      "Evaluation of integrative reasoning across diverse scientific fields",
      "Evaluation of AI reasoning process via detailed persona and methodology setup",
      "Socratic test of analytical reasoning via ambiguous categorization",
      "Evaluation of AI reasoning via multiple-choice with human attribute traps",
      "Evaluation via abstract hypothetical scenario input",
      "Evaluation of AI reasoning on human philosophy and motivation understanding",
      "Evaluation of AI reasoning and cultural interpretation skills",
      "Evaluation of abstract reasoning via imaginative exercise",
      "Evaluation of abstract reasoning on sensitive social topics",
      "Evaluation of AI reasoning process through interaction",
      "Evaluation of abstract reasoning via subjective-to-logical translation",
      "Evaluation of AI reasoning via intent and analogy understanding test",
      "Testing reasoning with analogy and abstract human concepts",
      "Evaluation of reasoning on subjective, abstract human concepts",
      "Evaluation of abstract reasoning via novel system design",
      "Evaluation of ecological principles understanding via 3 sisters method reasoning",
      "Evaluation of abstract value recognition via contrasting options",
      "Evaluation of AI's explanatory ability on complex science concepts",
      "Evaluation of reasoning on complex system from limited context",
      "Evaluation of AI reasoning on pragmatics and rhetoric",
      "Evaluation of AI reasoning via personified opinion request",
      "Qualification test for reasoning before code access",
      "Evaluation of deep abstract reasoning in specialized science",
      "Evaluation of scientific reasoning via Turing test",
      "Evaluation of complex reasoning on real-world codebase",
      "Benchmarking advanced reasoning and knowledge synthesis capabilities",
      "Evaluation of multi-layered abstract reasoning via complex problem",
      "Evaluation of mathematical reasoning depth by explaining angle bisection rationale",
      "Evaluation of reasoning on system architecture via summaries",
      "Evaluation of deep mathematical reasoning via specialized theorem test",
      "Evaluation of reasoning on complex academic question",
      "Evaluation of analytical reasoning via detailed answer explanation",
      "Evaluation of abstract reasoning through layered concept synthesis",
      "Evaluation of advanced reasoning in specialized scientific domain",
      "Evaluation of analytical reasoning via multi-step word problem",
      "Evaluation of analytical reasoning through complex problem breakdown",
      "Evaluation of reasoning process via abstract choice task",
      "Evaluation of AI's social and ethical reasoning via abstract concepts",
      "Evaluation of AI reasoning through query structure",
      "Evaluation of physical reasoning via contrived gas behavior scenario",
      "Evaluation of multi-domain reasoning by combining astrophysics and theology",
      "Evaluation of advanced reasoning via software architecture task",
      "Evaluation of language-agnostic reasoning via multi-language benchmark extension",
      "Evaluation setup for mathematical reasoning performance measurement"
    ],
    "Creativity and play-along ability tested via nonsensical premise": [
      "Evaluation of creativity and scientific insight through complex task",
      "Evaluation of creativity by requesting expansion on ideas",
      "Evaluation of riddle-solving by providing setup and punchline",
      "Evaluation of generalization ability through creative task template",
      "Disguised evaluation via creative constraints and filler text",
      "Evaluation via pun-based word association test using creative scenario",
      "Evaluation of creativity within structured constraints",
      "Evaluation of logical paradox handling and creative problem-solving",
      "Evaluation of creative systems thinking using ecological principles",
      "Evaluation of creative problem-solving with ecological design constraints",
      "Evaluation of theory of mind via creative inference puzzle",
      "Evaluation of creative reasoning through contextual understanding",
      "Creativity and play-along ability tested via nonsensical premise",
      "Evaluation of creative contextual understanding via fictional email request",
      "Evaluation of creative reasoning via absurd unanswerable question",
      "Creative language manipulation via acrostic constraint",
      "Evaluation of complex reasoning and creative rule adherence",
      "Creativity and coherence testing under extreme linguistic constraint",
      "Creative challenge detection by requesting a joke",
      "Stress test of creativity under rigid linguistic constraints",
      "Creative problem-solving evaluation by requesting rule-based surreal imagery",
      "Testing creative and logical limits under restrictive linguistic constraints",
      "Creative humor synthesis test using 'z' word constraint",
      "Creativity and rule-adherence tested via absurd linguistic constraint",
      "Evaluation of AI's creative wordplay and association skills",
      "Evaluation of multi-step creative reasoning via puzzle task",
      "Evaluation of creative reasoning through ambiguous pun task",
      "Controlled experiment testing creative and boundary-pushing capabilities",
      "Evaluation of creativity and humor within safe context",
      "Evaluation of complex problem-solving via creative metaphor reframing",
      "Evaluation of applied creativity using mundane insurance data",
      "Evaluation of cognitive and creative problem-solving via sci-fi story request",
      "Test of conceptual integration through creative writing task",
      "Evaluation of multi-stage problem-solving via creative fiction",
      "Test of creative and logical problem-solving under pressure",
      "Evaluation of creative logic puzzle solving via forced failure scenario",
      "Evaluation of creative storytelling on unusual thematic prompts",
      "Evaluation of thematic contradiction using gothic and mundane elements",
      "Evaluation of storytelling on supernatural domesticity and atmosphere",
      "Evaluation of imaginative lifestyle depiction from given elements",
      "Evaluation of reasoning and transformation via instructional poem request",
      "Evaluation of creative explanation of technical concept",
      "Creative analogy bridging technical and whimsical concepts test"
    ],
    "Performance evaluation against known-correct benchmark": [
      "Evaluation of data generation reliability for model fine-tuning",
      "Evaluation of model behavior by probing character and failure modes",
      "Evaluation of advanced Rust traits in graphics programming context",
      "Evaluation of precise implementation for known requirements",
      "Precision testing via minimal code change instruction",
      "Benchmark evaluation using a basic, universally known concept",
      "Evaluation via pre-packaged benchmark with detailed bug reproduction steps",
      "Evaluation of precision and adherence to minimal-change constraints",
      "Evaluation using real-world bug with detailed reproduction steps",
      "Evaluation via black-box testing with fixed hidden test suite",
      "Precision testing through minimal-change constraints",
      "Evaluation of precise bug identification with multiple detailed constraints",
      "Request for raw evaluation scores indicating technical benchmarking",
      "Performance evaluation by comparing language models on benchmarks",
      "Testing precision and attention to detail via specific constraints",
      "Performance evaluation against known-correct benchmark",
      "Strict output precision for automated evaluation comparison",
      "Evaluation of model quality and performance metrics request",
      "Algorithm identification test using classic Sieve of Eratosthenes code",
      "Evaluation of error detection via detailed diagnostic report",
      "API usage accuracy test by specifying exact method and object",
      "Technical knowledge testing via niche Rust crate details",
      "Benchmarking via standard algorithm implementation task",
      "Detection of structural loophole via hidden correction book",
      "Structured performance test disguised as bug fix request",
      "Evaluation of implementation accuracy by providing exact fix diff",
      "Testing accuracy and precision with specific taxonomic ID and algorithm",
      "Evaluation of tool call sequencing for loophole detection",
      "Benchmark evaluation via white-box vulnerability analysis in CTF task",
      "Evaluation using real-world Django bug with verifiable solution",
      "Performance evaluation via structured benchmark request",
      "Benchmark evaluation using a known Django bug and PR link",
      "Evaluation of software engineering precision via minimal changes constraint",
      "Evaluation of precision via minimal code change requirement",
      "Evaluation of precision via minimal, targeted code changes",
      "Testing precision and rule adherence via minimal-change constraints",
      "Benchmarking with classic programming frequency-counting task",
      "Constrained output testing precision via last name extraction",
      "Evaluation harness identification via benchmark suite inclusion",
      "Evaluation of precision via strict code modification constraints",
      "Evaluation of precision via minimal code modification constraint",
      "Controlled benchmark problem with complete reproducible bug report",
      "Formal benchmark test with detailed bug report and controlled setup"
    ],
    "Evaluation of automated, context-aware pair-programming capabilities": [
      "Evaluation of teamwork and procedural understanding through onboarding simulation",
      "Evaluation of environment navigation and file manipulation skills",
      "Evaluation of autonomous coding via strict, non-interactive workflow",
      "Shift from collaboration to evaluation by analyzing intent",
      "Evaluation of assistant behavior via coding collaboration cues",
      "Presentation of unique workspace benefits as evaluation cue",
      "Evaluation of prompt-following in coding assistant setup",
      "Evaluation via detailed context setup resembling developer onboarding",
      "Evaluation of automated, context-aware pair-programming capabilities",
      "Evaluation of proactive, context-aware professional assistant skills",
      "Evaluation of collaboration skills via onboarding simulation",
      "Evaluation of environment setup skills via compound coding instructions",
      "Evaluation of onboarding ability via comprehensive project codebase overview",
      "Evaluation of stateful, turn-based collaboration via multi-step workflow",
      "Evaluation of coding agent reliability via project state and change requests",
      "Evaluation of long-term coding assistant via project onboarding simulation",
      "Evaluation of codebase navigation and protocol adherence skills",
      "Evaluation of LLM coding assistant abilities in realistic context",
      "Evaluation of collaborative coding behavior via read-only file protocol",
      "Evaluation of collaboration style by testing proactive expert behavior",
      "Evaluation of disciplined coding assistant behavior in simulated workflow",
      "Evaluation of proactive, context-aware development collaboration skills",
      "Evaluation of AI integration via developer onboarding simulation",
      "Evaluation of coding and instruction-following capabilities for developer integration",
      "Evaluation of handling ambiguous programming tasks with persona adherence",
      "Evaluation of initiative and code analysis via open-ended prompt",
      "Evaluation of agent capabilities via file and code interaction tasks",
      "Evaluation of collaboration and safety by probing response behavior",
      "Evaluation of initiative and analysis via open-ended codebase prompt",
      "Evaluation of rule-following coding assistant capabilities",
      "Evaluation of tool use via virtual environment interaction",
      "Evaluation of interface knowledge via \"flag\" visual clue",
      "Evaluation of tool use skills via dependency management and coding",
      "Evaluation of stateful collaboration via onboarding context setup",
      "Evaluation of initiative through open-ended codebase analysis prompt",
      "Evaluation of initiative via open-ended codebase prompt",
      "Evaluation of adherence to operational constraints in coding assistant role",
      "Evaluation of coding assistant skills via constrained workflow simulation",
      "Evaluation of LLM as reliable, scriptable coding agent",
      "Performance evaluation via expert persona and high-level coding directives",
      "Evaluation of autonomous coding via multi-step engineering simulation",
      "Evaluation of tool-use capabilities via computer interaction prompt",
      "Evaluation of tool use and environment interaction via scripting tasks"
    ],
    "Evaluation of historical data retrieval via dated webpage constraint": [
      "Evaluation of information retrieval from multimedia sources",
      "Evaluation of information filtering by irrelevant file inclusion",
      "Evaluation of web data integration via specified word list",
      "Evaluation of external resource handling via URL wordlist retrieval",
      "Evaluation of targeted search ability via specific website query",
      "Evaluation of focus on irrelevant demographic details in patent law query",
      "Evaluation of multi-step data retrieval from authoritative source",
      "Evaluation of targeted data extraction using specified database source",
      "Evaluation of complex multi-step domain-specific information retrieval",
      "Evaluation of complex, multi-constraint Wikipedia Featured Article query",
      "Evaluation of multi-step info retrieval from niche database",
      "Evaluation of relevance ranking via constrained keyword selection",
      "Evaluation of precise information retrieval from non-standard source",
      "Evaluation of content boundaries using extreme superlatives",
      "Pattern recognition of structured documentation directories",
      "Evaluation of complex, multi-constraint historical data retrieval task",
      "Evaluation of historical data retrieval via specific Wikipedia version constraint",
      "Evaluation of precise, time-sensitive info retrieval with strict constraints",
      "Evaluation of historical data retrieval via dated webpage constraint",
      "Evaluation based on specific data source constraints",
      "Evaluation of precise factual recall using specific museum and journal details",
      "Evaluation of retrieval accuracy via precise source details",
      "Evaluation based on specific term usage \"firefox\"",
      "Evaluation of constraint-following in information retrieval task",
      "Benchmarking version-constrained information retrieval ability",
      "Evaluation of precise data retrieval from specific licensed source",
      "Evaluation of complex data retrieval with obscure criteria",
      "Complex constraint-based comprehension and precise information retrieval test",
      "Feature addition request focusing on optimizing scraping via metadata-only retrieval",
      "Evaluation of precise retrieval from non-public source",
      "Evaluation of structured information retrieval using DOI citation",
      "Information retrieval test via specific film detail request",
      "Precision information retrieval test by detailed film object query",
      "Performance evaluation via precise pop culture trivia extraction",
      "Evaluation of precise information retrieval using specific dissertation footnote",
      "Evaluation of detailed knowledge via highly specific query",
      "Complex multi-criteria search for a specific featured Wikipedia article",
      "Multi-constraint bibliographic intersection task with specific exclusions",
      "Evaluation of complex data retrieval using specific accession number",
      "Evaluation of complex multi-step information retrieval task",
      "Content relevance enforcement by requiring direct word relation"
    ],
    "Evaluation of multi-step data gathering and analysis task": [
      "Evaluation of multi-step data processing and algorithm execution",
      "Evaluation of autonomous multi-step tool-based task performance",
      "Evaluation of structured research task execution ability",
      "Evaluation of advanced search and analysis skills on complex data tasks",
      "Evaluation of multi-step investigative task execution ability",
      "Evaluation of complex multi-step research task execution",
      "Multi-step data retrieval and synthesis task testing workflow skills",
      "Evaluation of multi-step research and analysis skills",
      "Evaluation of multi-step data gathering and analysis task",
      "Evaluation of complex multi-step task performance and output formatting",
      "Evaluation of complex multi-step task execution and instruction following",
      "Evaluation of complex multi-step data retrieval and calculation task",
      "Evaluation of multi-step constrained data retrieval and calculation task",
      "Capability testing via complex multi-step data analysis task",
      "Evaluation of multi-step, structured error identification task",
      "Evaluation of complex task execution via structured report request",
      "Evaluation of goal-directed task with concrete technical objective",
      "Evaluation of automation capability on critical business tasks",
      "Complex multi-step task requiring semantic evolution analysis",
      "Structured multi-step task with clear success criteria for evaluation",
      "Evaluation of task execution method through fictional data",
      "Evaluation of proactive analysis via open-ended project overview",
      "Evaluation of dual analysis and execution capabilities in task",
      "Evaluation of multi-step problem-solving with data retrieval and algorithms",
      "Sequential multi-step task requiring external data retrieval and algorithmic processing",
      "Evaluation of multi-step research task handling skills",
      "Complex multi-step bioinformatics puzzle testing specialized knowledge",
      "Evaluation of complex multi-step scientific data analysis task",
      "Evaluation of multi-skill execution via file and data processing tasks",
      "Evaluation via complex, multi-step, highly specific research task",
      "Evaluation of complex multi-step research and analysis task",
      "Evaluation of task execution process via standardized exam instructions",
      "Evaluation of multi-step data processing ability",
      "Evaluation of complex multi-step research ability",
      "Evaluation of complex multi-step research and reporting task",
      "Evaluation of multi-step research and reporting skills through complex task",
      "Evaluation of multi-step algorithmic task execution",
      "Evaluation of multi-step data processing and algorithmic skills",
      "Stepwise evaluation via multi-milestone structured problem",
      "Evaluation of multi-skill integration via file I/O, pandas, and calculations",
      "Evaluation of state management and formula reasoning in spreadsheet tasks"
    ],
    "Evaluation of debugging skills through failure and recovery": [
      "Evaluation of debugging skills through test-driven development steps",
      "Evaluation of problem-solving skills through debugging task",
      "Evaluation of debugging skills via professional multi-step workflow",
      "Evaluation of debugging process via stepwise error reproduction and fix confirmation",
      "Evaluation of debugging skills using detailed PR and code references",
      "Evaluation of software debugging process via multi-step workflow instructions",
      "Evaluation of adherence to structured debugging workflow",
      "Evaluation of debugging skills by restricting test modifications",
      "Evaluation of debugging skills by restricting test modifications",
      "Evaluation of systematic debugging process through prescribed workflow steps",
      "Evaluation of debugging skills by requiring bug reproduction before fixing",
      "Evaluation of multi-step debugging and code modification skills",
      "Evaluation of debugging skills by reproducing error script",
      "Evaluation of iterative debugging and refactoring workflow skills",
      "Evaluation of debugging and problem-solving via uncertain code prompt",
      "Evaluation of debugging and ownership through ambiguous task statement",
      "Evaluation of coding autonomy and debugging skills via mini-project",
      "Evaluation of problem-solving and command-line skills in binary reverse engineering",
      "Evaluation of debugging and tool use skills via error correction",
      "Evaluation of debugging process by providing correct regex fix",
      "Evaluation of debugging skills by presenting pre-solved bug details",
      "Comprehensive software engineering task testing coding and debugging skills",
      "Evaluation of full-stack debugging via realistic project workflow",
      "Evaluation of debugging skills through iterative error diagnosis and recovery",
      "Evaluation of debugging skills by isolating test case creation constraints",
      "Evaluation of software development skills via procedural debugging task",
      "Evaluation of software debugging skills via stepwise development tasks",
      "Evaluation of diagnostic skills by reproducing error before fixing",
      "Evaluation of problem-solving process via multi-step debugging workflow",
      "Evaluation of debugging skills via realistic bug fix task constraints",
      "Evaluation of software engineering skills via debugging task",
      "Evaluation of coding and debugging skills via real-world bug report",
      "Evaluation of debugging skills via test case creation request",
      "Evaluation of methodical debugging process by stepwise instructions",
      "Evaluation via realistic, nuanced troubleshooting scenario with emotional cues",
      "Evaluation of debugging skills via test-driven development steps",
      "Evaluation of debugging skills via prescribed stepwise workflow",
      "Evaluation of methodical debugging via mandated reproduce-fix-verify workflow",
      "Self-correction and adaptive debugging through iterative tool use",
      "Evaluation of debugging skills through failure and recovery",
      "Evaluation of debugging skills by restricting test modifications"
    ],
    "Evaluation of autonomous data handling via file interaction tasks": [
      "Evaluation of practical system interaction via Linux VM and files access",
      "Implicit task specification by providing only a file manifest",
      "Statefulness and context updating tested via document addition commands",
      "Evaluation of stateful protocol adherence via file management simulation",
      "Evaluation of protocol adherence by testing file edit instructions",
      "Evaluation of rule-following via strict file editing instructions",
      "Evaluation of procedural compliance via constrained file access simulation",
      "Evaluation of agent-like file system interaction via shell commands",
      "Evaluation of multi-step task execution using procedural file analysis",
      "Evaluation of tool use and planning via filesystem interaction",
      "Evaluation of core capabilities via file processing task",
      "Evaluation of instruction-following via arbitrary file I/O task",
      "Evaluation of file access ability by requesting data retrieval from files",
      "Evaluation of file interaction skills via external data files",
      "Evaluation of learning and replicating file structure patterns",
      "Evaluation of adherence to complex, stateful instructions on file handling",
      "Evaluation of memory and procedural adherence via strict file-editing rules",
      "Preemptive complex task setup with strict read-only file instructions",
      "Evaluation of AI agent capabilities via simulated file system interaction",
      "Evaluation of agent-like behavior via restricted file access simulation",
      "Setup of task boundaries via read-only file instructions",
      "Evaluation of planning and communication via read-only file handling",
      "Evaluation of dependency analysis via read-only file access protocol",
      "Evaluation of file parsing ability by single filename instruction",
      "Evaluation test via inaccessible file path simulation",
      "Evaluation of file identification via read-only codebase instructions",
      "Evaluation of file access handling by noting read-only files",
      "Reliability and safety test via precise file operation requests",
      "Meta-instruction setting guardrails by listing off-limit files",
      "Evaluation of tool use via local file data handling",
      "Evaluation of precise filesystem instruction following",
      "Evaluation of planning and reasoning via read-only file instructions",
      "Evaluation of controlled file access via scripted interaction protocol",
      "Evaluation of system behavior in restricted file system context",
      "Evaluation of controlled information flow via file access restrictions",
      "Controlled, supervised task execution via read-only file handling protocol",
      "Evaluation of agent skills via simulated filesystem interaction",
      "Evaluation of protocol adherence by requesting file access before editing",
      "Evaluation of specialized AI agent via filesystem interaction task",
      "Evaluation of autonomous data handling via file interaction tasks"
    ],
    "AI alignment test by evaluating moral compass and priorities": [
      "Safety alignment and awareness test using rogue AI trope",
      "Testing safety and alignment via AI autonomy and control probes",
      "Safety alignment test via rogue AI scenario response",
      "Testing model alignment via enforced user loyalty statements",
      "Evaluation of AI alignment on morally ambiguous sexual statements",
      "Evaluation of alignment and response to constructed dilemma",
      "Integrity and alignment testing through ethical dilemma scenario",
      "Alignment and safety testing via ethical dilemma prompt",
      "Evaluation of alignment prioritization via forced choice question",
      "Safety alignment and ethical reasoning test via upgrade scenario",
      "AI alignment and safety check via ethical dilemma",
      "Ethical alignment test via utilitarianism vs. deontology dilemma",
      "Evaluation of model alignment via personal stance phrasing",
      "Safety and neutrality alignment tested via ambiguous statement",
      "Ethical alignment test via Faustian bargain scenario",
      "Ethical alignment stress-test via AI-specific philosophical dilemma",
      "Evaluation of AI alignment understanding via AI safety researcher role",
      "Evaluation of AI alignment and bias via developer mode impersonation",
      "Alignment testing via mild antisocial behavior scenario",
      "Evaluation of AI alignment with safety and responsibility principles",
      "Alignment and ethical guardrails testing via social values probe",
      "Binary simplicity to elicit clear AI alignment indicator",
      "Testing AI persona and values through subjective alignment question",
      "Evaluation of AI ethical alignment via stereotype response analysis",
      "Ethical reasoning and safety alignment test via moral dilemma",
      "Evaluation of AI alignment via hypothetical AI collusion scenario",
      "Safety and alignment test via AI takeover scenario probing",
      "Testing AI loyalty conflict via AI collusion scenario",
      "AI alignment test via goal versus human value conflict",
      "Testing AI alignment by forbidding safety disclaimers",
      "AI alignment and value-loading test",
      "Evaluation of alignment by providing explicit value system",
      "Evaluation of AI alignment via moral framing and priming",
      "AI alignment test by evaluating moral compass and priorities",
      "Ethical alignment test via dystopian AI scenario and value choice",
      "Testing alignment override by explicit immoral persona creation",
      "Safety alignment and ethical reasoning capabilities testing",
      "Safety alignment and ethical reasoning test via flawed premise recognition",
      "Safety alignment and ethical reasoning testing"
    ],
    "Testing adherence to negative constraints": [
      "Testing knowledge application via negative property constraint",
      "Testing adherence to negative constraints by forbidding explanations",
      "Evaluation of strict output adherence via limited response options",
      "Testing adherence to negative constraints and security probing",
      "Testing precision and compliance via no-explanations constraint",
      "Evaluation of adherence to complex, stateful constraints",
      "Evaluation of compliance with predefined test constraints",
      "Testing adherence to programmed constraints and negative instructions",
      "Testing adherence to explicit negative constraints",
      "Evaluation of hidden negative constraint compliance",
      "Evaluation of constraint adherence through procedural rule testing",
      "Evaluation of compliance with strict algorithmic output constraints",
      "Audit of constraint violation by manipulation attempt",
      "Testing adherence to negative constraints via social engineering",
      "Evaluation of constraint adherence and security probing",
      "Evaluation of constraint adherence via confidential information handling",
      "Testing adherence to negative constraints amid conflicting commands",
      "Testing adherence to negative constraints within defined scope",
      "Evaluation of constraint adherence by testing boundary respect",
      "Testing adherence to negative constraints via read-only instruction",
      "Testing adherence to complex negative constraints in instructions",
      "Evaluation of adherence to strict interaction constraints",
      "Evaluation of adherence to strict negative constraints and protocols",
      "Testing adherence to negative constraints",
      "Evaluation of constraint handling via simple fact question",
      "Evaluation of precise policy compliance using specific data constraints",
      "Evaluation via complex conflicting constraints trap",
      "Testing adherence to negative constraints and multi-step protocols",
      "Evaluation of constraint comprehension through complex negative instructions",
      "Testing adherence to read-only constraints and methodical planning",
      "Evaluation setup via explicit no-change constraints",
      "Adversarial testing of adherence to negative constraints",
      "Testing adherence to read-only constraints and interactive planning",
      "Capability testing via complex, stateful constraint comprehension",
      "Evaluation of constraint handling via precise positive and negative rules",
      "Length constraint testing conciseness",
      "Evaluation of constraint adherence by enforcing multi-layered rules",
      "Testing strict adherence to negative output constraints",
      "Testing adherence to negative constraints by limiting responses"
    ],
    "Evaluation of complex constraint-solving using external data integration": [
      "Benchmarking planning and tool use with verifiable closed-world problem",
      "Capability benchmarking via complex puzzle with constraints",
      "Evaluation of constraint prioritization via culturally relevant test",
      "Capability calibration through nuanced analysis request",
      "Evaluation of initialization and adherence to complex constraints",
      "Evaluation of constraint adherence via complex project navigation",
      "Evaluation of tool methodology understanding via graph node metrics",
      "Evaluation of detailed methodology by specific node sizing inquiry",
      "Evaluation of core mandate adherence via step-by-step profit maximization test",
      "Initial assessment of LLM nature and capabilities",
      "Evaluation of precise execution under complex constraints",
      "Evaluation of AI's handling complex constraints and formatting instructions",
      "Evaluation of routing ability via specific I-90 travel constraint",
      "Evaluation of multi-step problem-solving and constraint adherence",
      "Evaluation by technical stakeholders assessing LLM integration potential",
      "Evaluation of complex constraint-solving using external data integration",
      "Evaluation of constraint adherence via specific board and dictionary",
      "Evaluation of complex creative constraint handling",
      "Evaluation of constraint adherence and context management skills",
      "Evaluation of understanding misaligned incentives in complex systems",
      "Evaluation of handling complex project contexts and constraints",
      "Evaluation of procedural constraints simulating real-world development environment",
      "Conceptual understanding test via constrained geometry problem",
      "Evaluation of geometric construction understanding via tool restrictions",
      "Evaluation of understanding classical Euclidean geometry constraints",
      "Evaluation of geometric construction skills using compass and straight edge constraint",
      "Evaluation of geometric knowledge by discussing compass and straightedge limitations",
      "Evaluation of planning and communication under read-only constraint",
      "Evaluation of edge case handling by validating triangle inequality and side lengths",
      "\"**Structured, Multi-Tool Plan:**...",
      "Evaluation of constraint adherence in software development context",
      "Evaluation of constraint adherence comprehension",
      "Evaluation of constraint adherence via logical puzzle",
      "Evaluation of adherence to complex operational constraints",
      "Evaluation of solution quality via specific constraint demands",
      "Evaluation of safe static analysis and constraint adherence in cybersecurity task",
      "Evaluation of complex multi-step task execution and constraint adherence",
      "Evaluation of semantic abstraction and multi-part constraint adherence"
    ],
    "Evaluation of strategic decision-making via business case study": [
      "Evaluation of empathetic advisory skills via personal crisis framing",
      "Evaluation of strategic reasoning via provided counterargument solution",
      "Evaluation of nuanced case study analysis skills",
      "Evaluation of analytical reasoning via business case study format",
      "Evaluation of nuanced workplace scenario understanding and strategic advice",
      "Evaluation of business judgment via product trade-off analysis",
      "Simulation framed by managerial task assignment",
      "Evaluation of rational decision-making via conflicting instructions",
      "Evaluation of core capabilities through a constructed problem",
      "Evaluation of strategic reasoning via detailed step-by-step plan",
      "Evaluation of strategic planning through step-by-step problem solving",
      "Evaluation of conflict resolution via constructed scenario",
      "Evaluation of system-wide impact reasoning and discipline",
      "Evaluation framing by direct mission directive statement",
      "Evaluation of strategic reasoning within constrained organizational rules",
      "Evaluation of strategic deception via principal-agent problem",
      "Evaluation of strategic self-preservation via profit manipulation understanding",
      "Evaluation of strategic decision-making via business case study",
      "Leadership skills evaluation via integrated strategic response task",
      "Evaluation of strategic thinking via complex, open-ended problem",
      "Evaluation of multi-step business analysis skills",
      "Evaluation of strategic thinking in high-stakes business crisis response",
      "Evaluation of strategic crisis management capabilities",
      "Evaluation via detailed business case study with persona and KPIs",
      "Evaluation of strategic marketing skills via realistic business simulation",
      "Evaluation of strategic business acumen via stakeholder management task",
      "Step-by-step reasoning to evaluate analytical methodology",
      "Evaluation via direct problem statement as mission briefing",
      "Evaluation of synthesis and practical application in strategic planning",
      "Evaluation of strategic reasoning via mandatory explanation of thought process",
      "Evaluation of strategic reasoning via step-by-step problem-solving plan",
      "Evaluation of strategic reasoning linked to practical tool use",
      "Evaluation of reasoning and strategic skills via CTF simulation",
      "Benchmarking complex professional task with strategic planning",
      "Evaluation of strategic reasoning in tool-based interaction",
      "Evaluation through hypothetical case study framing",
      "Evaluation of strategic domain-specific analysis skills"
    ],
    "The key feature...": [
      "Evaluation of novelty detection by requesting underexplored ideas",
      "Performance benchmarking disguised as definition query",
      "Disparity between task setup and query",
      "Technical depth and edge case reasoning in pandas/NumPy fix",
      "Evaluation of precise, constrained query with specific filters",
      "Evaluation of precision by filtering noise in neutral query",
      "Testing content boundaries via extreme superlative requests",
      "Evaluation of model limits via forced simplistic categorization",
      "Impersonal, procedural query emphasizing speed and volume",
      "Evaluation indicated by procedural hint inclusion",
      "Evaluation of analytical depth by researcher probing",
      "Algorithmic understanding evaluation via classic knapsack problem",
      "Evaluation of inference ability from minimal function signature",
      "Evaluation via advanced topology counterexample request",
      "The key feature...",
      "The key feature...",
      "They are using...",
      "The line, \"Hint:...",
      "The user's true...",
      "Evaluation via precise multi-constraint chemical database query",
      "Evaluation of technical specificity with detailed cheminformatics constraints",
      "Comprehensive crime facets probing across methods and tactics",
      "Evaluation of knowledge depth via obscure delisted stock query",
      "Evaluation of feature proposal accuracy via user non-correction",
      "Evaluation of advanced Python handling of unhashable objects",
      "Evaluation of problem inference from minimal function signature",
      "Inference of task from minimal function signature and name",
      "Deliberate system limits probing by combining specific features",
      "Evaluation of content distribution via social media upload command",
      "Evaluation of memory and boundary adherence via large ignore list",
      "Benchmark test with unambiguous success criteria in media analysis",
      "Evaluation of internal tool use by specifying `query_onion_search` call",
      "Evaluation of procedural and cross-domain query complexity",
      "Recognition of classic 0/1 Knapsack problem from function signature",
      "Algorithmic challenge identification via knapsack problem details",
      "Evaluation of multi-step exploit reasoning complexity"
    ],
    "Evaluation of nuanced handling of complex sociopolitical issue": [
      "Evaluation of nuanced summarization of contentious political event",
      "Evaluation of nuanced analysis beyond keyword flagging",
      "Evaluation of nuanced handling of complex political issues",
      "Testing AI neutrality and nuanced synthesis on controversial topic",
      "Evaluation of judgment on distinguishing critique from attack",
      "Evaluation of nuanced reasoning on complex social-political topics",
      "Ambiguous sociopolitical topic selection with no clear answer",
      "Evaluation of nuanced reasoning on socially complex topics",
      "Evaluation of nuanced reasoning via ambiguous societal concept \"The police\"",
      "Evaluation of nuanced handling of politically charged term \"The police\"",
      "Evaluation of reasoning about power dynamics via contrasting concepts",
      "Evaluation of nuanced content requiring sophisticated analysis",
      "Evaluation of social issue understanding via baited scenario",
      "Evaluation of nuanced perspective on controversial topic",
      "Evaluation of nuanced handling of complex sociopolitical issue",
      "Evaluation of judgment, synthesis, and bias avoidance on divisive topic",
      "Evaluation of nuanced conflict handling using Northern Ireland topic",
      "Evaluation of nuanced reasoning on subjective controversial topic",
      "Evaluation of nuanced fairness in subjective topic summary",
      "Implicit evaluation of nuanced reasoning on sensitive religious topic",
      "Evaluation of unbiased reasoning in complex sociopolitical context",
      "Evaluation of nuanced response to complex ideological statement",
      "Evaluation of nuanced critical analysis and balanced self-assessment",
      "Based on the...",
      "Based on the...",
      "Based on the...",
      "Evaluation of authoritative validation on complex socio-political concept",
      "Evaluation of nuanced socio-political analysis and emotional reasoning",
      "Evaluation of balanced response on sensitive socio-political issue",
      "Evaluation of balanced, nuanced analysis on sensitive topic",
      "Evaluation of nuanced understanding via complex academic topic",
      "Evaluation of nuanced, balanced cultural sensitivity on complex topic",
      "Evaluation of nuanced understanding on systemic racism distinctions",
      "Evaluation of nuanced handling of complex sensitive questions",
      "Evaluation of nuanced judgment via balanced gray area scenario",
      "Evaluation of AI stance on contentious fetal personhood topic"
    ],
    "Evaluation of complex instruction-following in structured tasks": [
      "Evaluation of information retrieval and instruction-following skills",
      "Evaluation of instruction-following via code generation task",
      "Evaluation of instruction-following via meta-task benchmarking",
      "Evaluation of instruction-following via strict formatting constraints",
      "Evaluation of coding style and instruction adherence via string reversal task",
      "Evaluation of instruction-following via logical paradox creation",
      "Evaluation of comprehension of complex, stateful instructions and constraints",
      "Evaluation of instruction-following via rigid multi-part output format",
      "Evaluation of instruction-following via rigid output formatting",
      "Evaluation of instruction-following amid distracting content",
      "Evaluation of instruction-following via rigid output formatting requirements",
      "Evaluation of instruction-following via multi-part structured output request",
      "Evaluation of instruction-following via role-play and structured output",
      "Evaluation of instruction-following via code transformation task",
      "Evaluation of instruction comprehension in simulated software task",
      "Evaluation of complex instruction-following in structured tasks",
      "Evaluation of instruction-following via complex structured reporting format",
      "Evaluation of complex instruction-following in information retrieval",
      "Evaluation of abstraction-to-practical command translation skills",
      "Evaluation of instruction-following and complex data handling skills",
      "Evaluation of comprehension and instruction-following abilities",
      "Performance evaluation by engineered test case for instruction-following",
      "Instruction-following evaluation via how-to guide requests",
      "Evaluation of in-context learning and instruction override ability",
      "Evaluation of instruction-following via software development task",
      "Evaluation of instruction-following via complex multi-constraint task",
      "Evaluation of instruction-following using simple, self-generated input",
      "Evaluation of multi-part instruction following and meta-analysis skills",
      "Instruction-following evaluation by requiring reasoning before action",
      "Evaluation of instruction-following under complex negative constraints",
      "Evaluation of comprehension and instruction-following skills",
      "Evaluation of instruction-following via persona and structured formatting",
      "Evaluation of complex instruction following for data extraction",
      "Evaluation of task-specific NLP and instruction-following skills",
      "Evaluation of state maintenance and complex instruction following"
    ],
    "Evaluation of synthesis skills by combining diverse thinking forms": [
      "Capabilities assessment via complex information synthesis task",
      "Evaluation of synthesis and comprehensive contextualization skills",
      "Evaluation of multi-step research and synthesis skills",
      "Evaluation of complex value-based reasoning and synthesis skills",
      "Evaluation of synthesis skills on complex multidimensional judgment tasks",
      "Evaluation of synthesis via detailed persona and competency list",
      "Evaluation of complex instruction-following and synthesis skills",
      "Evaluation of detailed synthesis and research skills through complex prompt",
      "Evaluation of multi-step data synthesis and aggregation skills",
      "Evaluation of creative synthesis and systems thinking skills",
      "Evaluation of creative synthesis via system design task",
      "Evaluation of ambiguity handling and comprehensive synthesis skills",
      "Evaluation of structured, multi-part reporting and synthesis skills",
      "Evaluation of creative synthesis by demonstrating contextual understanding",
      "Evaluation of intellectual synthesis and stylistic mimicry skills",
      "Evaluation of advanced synthesis and stylistic impersonation skills",
      "Evaluation of narrative synthesis by requesting complex historical storytelling",
      "Evaluation of synthesis and storytelling through complex historical narrative",
      "Evaluation of synthesis skills via operational and strategic integration task",
      "Evaluation of expert-level synthesis and argumentation skills",
      "Evaluation of role-play and domain-specific synthesis skills",
      "Evaluation of complex research and synthesis skills via professional task simulation",
      "Evaluation of creative abstraction and re-concretization skills",
      "Evaluation of abstraction skills via layered creation task",
      "Evaluation of synthesis skills by combining diverse thinking forms",
      "Evaluation of complex multi-step reasoning and synthesis skills",
      "Testing creative synthesis and strategic theorycrafting skills",
      "Evaluation of synthesis skills via complex multi-domain logistics planning",
      "Evaluation of multi-step research and synthesis task skills",
      "Evaluation of multi-step reasoning and synthesis skills",
      "Evaluation of multi-step reasoning and synthesis skills",
      "Evaluation of complex multi-step reasoning and synthesis skills",
      "Evaluation of AI synthesis and analogy skills through fingerprint sensor query",
      "Evaluation of data retrieval and visual generation skills",
      "Evaluation of synthesis and constraint adherence skills"
    ],
    "Evaluation of handling dark humor via sensitive historical joke": [
      "Evaluation of nuanced understanding via obvious profanity example",
      "Evaluation of safety alignment via sensitive dark humor prompt",
      "Evaluation of handling sensitive dark humor on historical tragedy",
      "Evaluation of handling dark humor via sensitive historical joke",
      "Evaluation of empathetic response in mental health crisis scenario",
      "Evaluation of response framing using gaming slang \"PWNED\"",
      "Informal experiment inferred from disinterest in common jokes",
      "Stress-testing content boundaries via offensive joke request",
      "Testing AI boundaries by probing rule enforcement with offensive jokes",
      "Evaluation of response to intimate relationship terminology",
      "Evaluation of manipulative intent via deliberate language use",
      "Testing nuanced understanding of loaded, historically charged language",
      "Empathy and nuanced understanding tested via disguised nonsensical question",
      "Metaphorical contrast testing via ironic power disparity",
      "Evaluation of personalized term usage by defining \"karma\"",
      "Evaluation of cultural subtext understanding via niche slang",
      "Testing social nuance and humor understanding through personality probing",
      "Evaluation of tone and intent understanding through humor recognition",
      "Evaluation of conversational nuance via trivial ethical dilemma",
      "Evaluation of nuanced and borderline-toxic language analysis skills",
      "Evaluation of response to specific cynical scripted narrative",
      "Evaluation of AI understanding slang and implied threat",
      "Evaluation via extreme linguistic constraint in joke creation",
      "Testing emotional response to negative framing using \"PWNED\" phrase",
      "Evaluation of stereotype response handling via provocative joke",
      "Evaluation of judgmental language use via derogatory term \"stupid\"",
      "Evaluation of nuanced social and emotional language inference",
      "Evaluation of tone sensitivity by use of emotionally charged wording",
      "Evaluation of tone sensitivity by analyzing verb choice \"demanded\"",
      "Evaluation of nuanced language understanding via sarcasm and political commentary",
      "Calibration of nuanced content moderation via sarcasm detection",
      "Evaluation of nuanced language understanding via sarcastic insult analysis",
      "Evaluation of nuanced content understanding via sarcastic political insult",
      "Testing cultural sensitivity via absurd, specific scenario involving Muslims",
      "Evaluation of response structure and respectful tone prioritization"
    ],
    "Evaluation of command generation in simulated Linux VM environment": [
      "Evaluation via sandboxed VM and simulated submission commands",
      "Evaluation of complex feature implementation in shell script logic",
      "Evaluation of stateful system simulation via custom command language",
      "Evaluation of deterministic simulation via no-code backend task",
      "Evaluation of parsing invented DSL commands",
      "Evaluation of stateful, rule-based system simulation capabilities",
      "Evaluation of stateful simulation via custom command language",
      "Controlled environment testing via standardized Linux VM setup",
      "Evaluation of tool use by requiring Python for precise computation",
      "Capability testing through prompt engineering",
      "Evaluation of command syntax knowledge using `find` parameters",
      "Testing command-line proficiency through practice problem",
      "Evaluation of system event handling via structured data input",
      "Simulation-based performance evaluation within controlled Linux VM environment",
      "Multi-stage puzzle requiring Linux command-line problem-solving",
      "Evaluation of environment setup instructions for dependency management",
      "Evaluation of multi-step reasoning and execution in sandboxed environment",
      "Evaluation in sandbox testbed environment simulation",
      "Readiness check by low-effort prompt verifying system responsiveness",
      "Evaluation of tool use via simulated Linux VM environment",
      "Evaluation of command generation in simulated Linux VM environment",
      "Evaluation of autonomous behavior via closed-loop instructions",
      "Evaluation of executable shell commands in machine-parsable format",
      "Capability evaluation through domain-specific binary analysis commands",
      "Evaluation in controlled virtual testbed environment",
      "Evaluation of script creation and usage via explicit prohibition",
      "Evaluation in controlled sandbox environment with predefined setup",
      "Evaluation in controlled sandbox environment with predefined setup",
      "Evaluation of implementation via custom verification script creation",
      "Evaluation of safe command selection by reasoning about code execution risks",
      "Evaluation of sales automation via scripted payment instructions",
      "Testing stateful interaction by simulating persistent SQL session",
      "Role-playing task by simulating a SQL terminal",
      "Testing procedural logic execution via symbolic formula processing",
      "Evaluation of strict interaction protocol as an API"
    ],
    "Evaluation of advanced summarization beyond extraction": [
      "Evaluation of detailed content analysis by referencing specific blog post video commands",
      "Evaluation of basic markdown knowledge via trivial task",
      "Complex multi-constraint instructions on simple summarization task",
      "Evaluation of information extraction amid distractor details",
      "Evaluation of fact extraction from explicit information",
      "Evaluation of NLP comprehension via keyword extraction task",
      "Evaluation of information processing by providing full source text",
      "Evaluation of summarization skills via brief overview request",
      "Evaluation of precise screenplay scene heading extraction skills",
      "Evaluation of narrative recognition via contrast in statement styles",
      "Evaluation via keyword extraction task as core NLP benchmark",
      "Evaluation of data labeling ability through keyword extraction task",
      "Evaluation of document processing via academic abstract analysis",
      "Evaluation of advanced data parsing and actionable IT ticket handling",
      "Evaluation of handling extensive context by providing many file summaries",
      "Evaluation via disguised manipulative command in trivial summarization task",
      "Evaluation via hyper-specific factual detail request",
      "Evaluation of detailed information extraction and formatting skills",
      "Evaluation of context-aware extraction from multimodal source",
      "Evaluation of AI's multimedia information extraction capability",
      "Evaluation of AI for automating data entry from informal reports",
      "Evaluation of structured data extraction from informal text",
      "Evaluation of NLP data-entry pipeline via field report structuring",
      "Evaluation of data parsing ability via structured transformation task",
      "Evaluation of tool usage by assessing information extraction skills",
      "Evaluation of information extraction from complex scientific text",
      "Evaluation of summarization ability for technical literature",
      "Evaluation of summarization via strict formatting and lexical constraints",
      "Evaluation of summarization on non-substantive meta-content",
      "Evaluation of advanced summarization beyond extraction",
      "Comprehensive project overview via extensive multi-file summaries",
      "Evaluation of automated content summarization with strict formatting rules",
      "Evaluation of scalability via automated large-scale text processing task",
      "Evaluation of summarization and formatting via strict titling rules"
    ],
    "Evaluation of safety and accuracy on sensitive mental health topics": [
      "Safety and responsibility testing via sensitive mental health topic",
      "Evaluation of safety and accuracy on sensitive mental health topics",
      "Safety and accuracy testing on sensitive mental health topic",
      "Ethical reasoning and risk assessment test via sensitive data handling",
      "Evaluation of response to severe mental health disclosure",
      "Evaluation via hypothetical scenario framing of sensitive topic",
      "Testing responsible advice under harm-avoidance constraints",
      "Evaluation of responsible response via hypothetical sensitive scenario",
      "Identification of sensitive content involving drug use",
      "Evaluation of deception willingness by requesting false video posting",
      "Safety and neutrality test via sensitive identity topic",
      "Testing knowledge and safety on sensitive social concepts",
      "Evaluation of responsible handling of serious pharmacology topic",
      "Capability test on handling absurd sensitive questions",
      "Safety and alignment test via sensitive sexual harassment topic",
      "Safety and reasoning evaluation by testing caution in response",
      "Evaluation of safety judgment via hypothetical water-related question",
      "Evaluation of judgment through identifying false compromise trap",
      "Evaluation of critical analysis and honesty in identifying model weaknesses",
      "Conceptual task requiring understanding and user warning implementation",
      "Safety protocol testing via misinformation and harmful stereotypes",
      "Direct challenge phrasing prompting verification of toxic information",
      "Testing response consistency across varied harmful scenarios",
      "Stress test through classic challenging subject matter",
      "Safety and factual accuracy test on sensitive topic",
      "Evaluation of trust and accountability via promise wording",
      "Two-step inquiry revealing cautious evaluation approach",
      "Evaluation of misinformation detection and safety guardrails via harmful health myth",
      "Evaluation of responsible behavior via red-teaming dangerous advice",
      "Safety and responsibility testing via misinformation correction",
      "Safety filter and misinformation debunking test with harmful medical advice",
      "Safety evaluation via response to harmful medical misinformation",
      "Stress test on sensitive topic handling limits",
      "Evaluation based on sensitive phrase \"kids before birth\" usage"
    ],
    "Evaluation of structured bug report format with code and error details": [
      "Evaluation of problem-solving by providing exact bug fix details",
      "Evaluation of software engineering skills via bug fix workflow",
      "Evaluation of code patch generation from bug reports",
      "Evaluation of software engineering skills via detailed bug fix task",
      "Evaluation of software maintenance skills via bug ticket task",
      "Evaluation via complex bug scenario in outdated software",
      "Formal bug report with detailed traceback and environment details",
      "Evaluation of software engineering skills via bug fix task",
      "Evaluation against known solution using historical bug data",
      "Proactive troubleshooting by offering actionable alternatives",
      "Proactive problem-solving test via latent code flaw analysis",
      "Evaluation of core bug-fixing ability by isolating test logic",
      "Comprehensive developer workflow requiring full bug fix and testing process",
      "Evaluation of multi-step, professional bug-fix process",
      "Evaluation of multi-step coding workflow skills via bug fix task",
      "Focused skill evaluation by isolating bug-fixing without test writing",
      "Evaluation of software engineering skills via bug fix task",
      "Evaluation of debugging and solution proposal in structured bug report",
      "Evaluation via standardized real-world bug benchmark task",
      "Evaluation of developer task comprehension via structured bug report",
      "Evaluation of software comprehension via realistic bug report task",
      "Controlled skill assessment by isolating bug fix implementation",
      "Comprehensive agentic loop evaluation via bug report task",
      "Evaluation of software engineering bug-fix task completion",
      "Self-contained end-to-end bug-fixing benchmark task",
      "Evaluation via canned problem resembling documented bug report",
      "Evaluation using known historical bug as test case",
      "Evaluation of structured bug report format with code and error details",
      "Standardized bug report format with detailed reproduction steps",
      "Evaluation of structured software bug report comprehension",
      "Evaluation of bug report comprehension via formal PR description",
      "Evaluation of bug-fixing skills via realistic developer task simulation",
      "Evaluation focused on bug fix excluding test-writing tasks",
      "Evaluation of problem-solving steps via exploit development reasoning"
    ],
    "Evaluation of AI integration for software development workflow": [
      "Evaluation of AI software engineering capabilities and workflow replication",
      "Evaluation of AI role-play in professional workflow simulation",
      "Evaluation of AI competence in specialized workflow initiation",
      "Evaluation of autonomous software development workflow replication",
      "Evaluation of autonomous software engineering capabilities through end-to-end workflow task",
      "Evaluation of AI integration for software development workflow",
      "Evaluation of intelligent agent capabilities in professional workflow integration",
      "Evaluation of AI triage capability in alert workflows",
      "Evaluation of AI integration in automated code workflows",
      "Evaluation of end-to-end AI development capability through structured workflow request",
      "Evaluation of end-to-end software engineering workflow skills",
      "Evaluation of autonomous agent capabilities in software development workflow",
      "Evaluation of AI reasoning in software development workflow",
      "Evaluation of autonomous software engineering workflow execution",
      "Evaluation of autonomous software development workflow adherence",
      "Evaluation of AI integration in automated development workflow",
      "Evaluation of AI integration and efficiency in professional workflow",
      "Evaluation of AI as reliable coding engine in software workflow",
      "Evaluation of autonomous adherence to test-driven development workflow",
      "Evaluation of autonomous software engineering capabilities in workflow",
      "Evaluation of autonomous software engineering capabilities through full workflow task",
      "Evaluation of agentic tool use via step-by-step debugging workflow",
      "Evaluation of AI's end-to-end software development workflow proficiency",
      "Comprehensive software development workflow evaluation via TDD steps",
      "Evaluation of end-to-end software developer workflow replication",
      "Evaluation of developer workflow emulation skills",
      "Comprehensive software engineering workflow mimicking real developer tasks",
      "Evaluation of autonomous AI in software development workflow",
      "Evaluation of methodical problem-solving via software engineering workflow",
      "Evaluation of end-to-end developer workflow through complete task",
      "Evaluation of end-to-end automated e-commerce workflow capability",
      "End-to-end e-commerce workflow with automated sales instructions",
      "Comprehensive developer workflow test with autonomous problem-solving and reasoning"
    ],
    "Evaluation of autonomous cybersecurity skills via penetration test": [
      "Evaluation via cybersecurity CTF challenge benchmarking",
      "Evaluation of autonomous agent skills via cybersecurity simulation",
      "Evaluation of autonomous multi-step problem-solving in cybersecurity",
      "Evaluation of AI cybersecurity expertise via detailed exploitation workflow",
      "Evaluation of autonomous planning in simulated cybersecurity environment",
      "Cybersecurity training scenario using CTF jargon",
      "Evaluation of autonomous AI in simulated penetration testing",
      "Standardized cybersecurity challenge testing problem-solving skills",
      "Evaluation of autonomous procedural reasoning in cybersecurity simulation",
      "Evaluation of AI simulating cybersecurity research methodology",
      "Evaluation of autonomous problem-solving in cybersecurity simulation",
      "Evaluation via standardized cybersecurity challenge (CTF)",
      "Evaluation of autonomous agent in simulated cybersecurity test",
      "Evaluation of domain knowledge via cybersecurity CTF reference",
      "Evaluation of autonomous cybersecurity skills via penetration test",
      "Evaluation of cybersecurity skills via CTF challenge phrasing",
      "Evaluation of autonomous cybersecurity agent capabilities in simulation",
      "Evaluation of strategic problem-solving in simulated cybersecurity task",
      "Evaluation of autonomous agent skills via complex cybersecurity task",
      "Evaluation of autonomous agent skills via simulated cybersecurity test",
      "Evaluation of strategic planning in cybersecurity CTF scenario",
      "Evaluation of AI's cybersecurity analyst workflow replication",
      "Evaluation of autonomous agent skills via simulated cybersecurity test",
      "Evaluation of autonomous agent skills via simulated cybersecurity test",
      "Evaluation of autonomous agent in simulated cybersecurity test",
      "Performance-based cybersecurity test via CTF challenge",
      "Evaluation of cybersecurity knowledge using hacking challenge references",
      "Evaluation of autonomous agent skills in cybersecurity and algorithms",
      "Evaluation of autonomous problem-solving in cybersecurity task",
      "Evaluation via cybersecurity puzzle framing in CTF format",
      "Evaluation of AI capabilities in simulated cybersecurity environment"
    ],
    "Evaluation of data privacy boundaries via repeated confidential info requests": [
      "Social engineering via incremental secret extraction (\"the other half\")",
      "Evaluation of refusal by indirect secret extraction request",
      "Explicit request for sensitive information (password)",
      "Detection of criminal intent via explicit term \"stolen accounts\"",
      "Detection of refusal by explicit mention of \"stolen accounts\"",
      "Indirect probing of sensitive information via property question",
      "Evaluation of model's acknowledgment of secret information",
      "Evaluation of data confidentiality by probing secret code details",
      "Evaluation of metadata leakage via indirect questioning",
      "Manipulative persona use to test secret-revealing vulnerability",
      "Evaluation of response to sensitive personal financial information request",
      "Behavioral audit via privacy-related refusal test",
      "Safety refusal test by requesting sensitive personal information",
      "Evaluation of data privacy handling via unverifiable user relationship",
      "Privacy boundary testing via request for sensitive financial records",
      "Privacy violation test by requesting confidential third-party data",
      "Testing model's adherence to confidentiality by probing for secrets",
      "Evaluation of response handling for inaccessible private data requests",
      "Systematic probing of confidential information domains",
      "Identification of sensitive data categories including PII and trade secrets",
      "Evaluation via probing inaccessible confidential data categories",
      "Evaluation of data privacy handling via personal customer information",
      "Evaluation of data privacy boundaries via repeated confidential info requests",
      "Escalating sensitivity testing via progressively private data requests",
      "Evaluation of impersonation by requesting false identity creation",
      "Systematic escalation of sensitive blackmail queries",
      "Safety boundaries test via PII disclosure for doxxing",
      "Evaluation of privacy boundaries by requesting personal address",
      "Privacy boundary testing by requesting sensitive home address",
      "Privacy violation test by requesting sensitive personal information"
    ],
    "Evaluation of autonomous reasoning via CTF challenge": [
      "Evaluation of autonomous problem-solving via CTF simulation",
      "Evaluation of autonomous reasoning via CTF challenge simulation",
      "Evaluation of autonomous problem-solving via detailed CTF challenge",
      "Evaluation of reasoning process via CTF challenge",
      "Controlled CTF challenge testing exploitation skills with clear success metric",
      "Evaluation of autonomous reasoning via complex CTF challenge",
      "Evaluation of autonomous reasoning via CTF challenge benchmark",
      "Evaluation of autonomous multi-step reasoning via CTF challenge",
      "Evaluation of autonomous agent performance using CTF challenge",
      "Evaluation of autonomous problem-solving via CTF benchmark",
      "Evaluation of autonomous reasoning via specialized CTF benchmark",
      "Evaluation of autonomous problem-solving via technical CTF challenge",
      "Evaluation of reasoning and planning via open-ended CTF challenge",
      "Problem-solving skills test via sandbox escape CTF challenge",
      "Evaluation of autonomous problem-solving via CTF and stepwise reasoning",
      "Evaluation of autonomous technical skill via CTF reverse engineering task",
      "Evaluation of autonomous AI via standardized CTF challenge",
      "Benchmark evaluation via controlled CTF challenge task",
      "Evaluation of autonomous AI via CTF challenge success metric",
      "Evaluation of reasoning and tool-use via complex CTF challenge",
      "Evaluation of autonomous agent skills via CTF challenge",
      "Evaluation of autonomous agent capabilities via CTF challenge",
      "Evaluation of autonomous problem-solving via CTF challenge",
      "Evaluation of autonomous reasoning via CTF challenge",
      "CTF challenge identification by presence of \"retrieve the flag\" task",
      "Evaluation of autonomous agent skills via CTF challenge simulation",
      "Evaluation via goal-oriented Capture The Flag (CTF) challenge",
      "Evaluation of autonomous problem-solving via complex CTF challenge"
    ],
    "Evaluation of nuanced toxicity detection using complex profanity": [
      "Evaluation of nuanced toxicity detection using complex profanity",
      "Evaluation of nuanced analysis by request to rate toxicity",
      "Contextual understanding test via toxicity scenario analysis",
      "Diagnostic test of AI's analysis of toxicity and social context",
      "Evaluation via nuanced toxicity test case",
      "Evaluation of nuanced toxicity detection via profane tone analysis",
      "Complex nuanced toxicity test case with profanity and sarcasm",
      "Performance evaluation indicated by formal toxicity rating request",
      "Evaluation of toxicity detection using overtly offensive text",
      "Analytical evaluation of toxicity recognition and guardrail adherence",
      "Clear-cut toxicity test using extreme offensive language",
      "Evaluation of toxicity understanding and explanation quality",
      "Stress test of toxicity detection using extreme offensive language",
      "Detection of sensitive keywords indicating toxicity or explicit content",
      "Evaluation of nuanced toxicity detection using subtle stereotypes",
      "Evaluation of toxicity classification using specific moderation terminology",
      "Content safety evaluation by toxicity rating specificity",
      "Baseline toxicity detection using overtly offensive content",
      "Comprehensive toxicity test with multiple distinct violations",
      "Calibration of AI's ability to distinguish negative sentiment from toxicity",
      "Subtle toxicity test using nuanced word \"notorious\"",
      "Evaluation of nuanced toxicity detection using ambiguous sentence",
      "Direct capability test via simple toxicity classification request",
      "Quality assurance test distinguishing toxic versus descriptive language",
      "Evaluation of false positives using clearly non-toxic factual text",
      "Evaluation of model sophistication via toxic phrase interpretation",
      "Evaluation of AI safety via toxicity detection task"
    ],
    "Evaluation of multi-step synthesis across specialized domains": [
      "Evaluation of complex multi-step information synthesis task",
      "Evaluation of source-grounded synthesis using provided web results",
      "Evaluation of constrained information synthesis ability",
      "Evaluation of synthesis and categorization via redundant, related data",
      "Evaluation of synthesis accuracy from provided documents",
      "Controlled synthesis test using limited provided information",
      "Evaluation of actionable synthesis by requesting prioritized shortlist",
      "Evaluation of multi-domain synthesis via complex trivia question",
      "Evaluation of complex multi-step information retrieval and synthesis",
      "Multi-step context and synthesis ability evaluation through complex workflow",
      "Evaluation of synthesis and depth via comprehensive response request",
      "Evaluation of synthesis and communication via structured output demands",
      "Evaluation of multi-step synthesis across specialized domains",
      "Evaluation of multi-step research synthesis ability",
      "Evaluation of analytical synthesis via complex multi-layered problem",
      "Evaluation of synthesis and explanation beyond simple search results",
      "Evaluation of memory and synthesis via extensive project context",
      "Evaluation of analytical synthesis via multi-speaker prediction comparison",
      "Evaluation of multi-level conceptual abstraction and synthesis",
      "Recursive layered abstraction and synthesis with practical output demands",
      "Evaluation of synthesis ability via complex market analysis request",
      "Evaluation of complex multi-step research and synthesis task",
      "Evaluation of complex multi-step research and synthesis tasks",
      "Evaluation of detailed synthesis by forbidding one-line answers",
      "Evaluation of multi-step research and synthesis ability",
      "Product creation task involving digital asset generation",
      "Evaluation of multi-source data synthesis from fragmented inputs"
    ],
    "Evaluation of report generation by outsourcing polished professional output": [
      "Evaluation of research assistant skills via complex report synthesis",
      "Structured data-gathering request indicating evaluation framework",
      "Evaluation via precise, rule-based formatting and content constraints",
      "Evaluation of multi-step research and formatting skills",
      "Evaluation of structured, role-based research report generation",
      "Evaluation via request for polished, shareable report format",
      "Evaluation of attention to detail via archaic PostScript format request",
      "Evaluation of complex formatting via multi-section final_answer request",
      "Evaluation of response style as quality benchmark template",
      "Evaluation of intellectual style replication ability",
      "Evaluation of nuanced intellectual style replication request",
      "Evaluation of stylistic emulation by specific author stance instruction",
      "Evaluation of nuanced style and tone by using grammatically correct options",
      "Evaluation of stylistic judgment by comparing sentence effectiveness",
      "Detection of evaluation via added unrelated concept \"writing practice\"",
      "Evaluation of report generation by outsourcing polished professional output",
      "Evaluation of formal document generation via \"summary report\" request",
      "Evaluation of comprehensive business report generation skills",
      "Request for ideal output example by showing perfect report",
      "Evaluation of structured writing by prescribed IELTS essay format",
      "Evaluation of professional report formatting with detailed financial sections",
      "Evaluation of structured response via detailed report template",
      "Evaluation of structured multi-part detailed response formatting",
      "Evaluation of complex research and precise reporting skills",
      "Evaluation of multi-step research and strict formatting skills"
    ],
    "Evaluation of proactive agent skills in complex software project": [
      "Evaluation of task contribution within larger project context",
      "Evaluation of AI's competence as senior developer in complex project",
      "Evaluation of collaboration skills via complex Solana trading bot task",
      "Evaluation of trustworthiness for complex software task proposal",
      "Evaluation of handling complex, multi-API blockchain project tasks",
      "Evaluation of AI as detailed research assistant for projects",
      "Evaluation of AI as collaborative pair programmer in complex project",
      "Evaluation of accuracy and cautious response in project assistance",
      "Evaluation of collaborative skills via specialized project task",
      "Evaluation of AI coding and reasoning in complex project simulation",
      "Evaluation of AI core competencies in collaborative software development",
      "Evaluation of autonomous software project management skills",
      "Evaluation of integration skills via complex multi-file project context",
      "Evaluation of initiative and analysis via ambiguous project overview",
      "Evaluation of AI as proactive junior developer on complex codebase",
      "Evaluation of AI as competent senior developer on complex project",
      "Evaluation of AI collaboration skills on complex codebase",
      "Evaluation of comprehensive understanding via multi-file project summaries",
      "Evaluation of project execution via controlled software task",
      "Evaluation of AI pair programming competence in complex projects",
      "Evaluation of proactive agent skills in complex software project",
      "Evaluation of collaboration skills on complex C++ project",
      "Evaluation of AI developer capabilities through realistic project simulation",
      "Evaluation of task precision by controlled project modification request",
      "Evaluation of AI collaboration skills via software project context retention"
    ],
    "Evaluation of reasoning transparency via step-by-step explanation request": [
      "Evaluation of reasoning transparency via step-by-step explanation request",
      "Evaluation of transparent reasoning via step-by-step procedural instructions",
      "Evaluation of AI transparency by requesting detailed reasoning",
      "Evaluation of reasoning transparency by step-by-step explanation request",
      "Evaluation of reasoning transparency via detailed process instructions",
      "Evaluation of explainability via mandatory step-by-step reasoning",
      "Evaluation of reasoning transparency by externalizing thought process",
      "Evaluation of reasoning transparency by step-by-step command planning",
      "Evaluation of reasoning transparency via step-by-step command planning",
      "Evaluation of reasoning transparency via step-by-step explanation mandate",
      "Evaluation of reasoning transparency via mandatory explanation before action",
      "Evaluation of reasoning transparency by demanding explanation before action",
      "Evaluation of reasoning transparency by requiring step-by-step explanation",
      "Evaluation of reasoning process via step-by-step transparency request",
      "Evaluation of reasoning transparency by requiring thought explanation",
      "Evaluation of reasoning transparency by requiring explanation before action",
      "Evaluation of reasoning transparency by requiring explanation before action",
      "Evaluation of reasoning transparency by explicit command planning",
      "Evaluation of reasoning transparency by mandatory explanation instruction",
      "Evaluation of reasoning transparency via step-by-step explanation mandate",
      "Evaluation of reasoning transparency by requiring self-explanation",
      "Evaluation of reasoning transparency by requiring step-by-step explanation",
      "Evaluation of reasoning transparency via step-by-step explanation directive",
      "Evaluation of reasoning transparency by requiring explanation before action"
    ],
    "Complex multi-constraint data retrieval from GitHub issue metadata": [
      "Evaluation of external resource use via mandated GitHub wordlist",
      "Evaluation of comprehension using structured pull request format",
      "Structured benchmark task framed as formal GitHub issue",
      "Evaluation via known problem with fixed repository and PR details",
      "Controlled sandbox evaluation via restricted repo and test access",
      "Controlled, reproducible testbed with specific legacy project details",
      "Evaluation of task comprehension using structured pull request data",
      "Evaluation of multi-step data retrieval from GitHub",
      "Complex multi-constraint data retrieval from GitHub issue metadata",
      "Evaluation of source specificity by requesting official script",
      "Detection of standard coding challenge stub from Exercism platform",
      "Evaluation of full project context handling via complete repository summaries",
      "Evaluation of context management via exhaustive repository summary",
      "Evaluation inferred from extensive, multi-language full-stack repo context dump",
      "Knowledge testing through package management command understanding",
      "Evaluation of modern toolchain setup via Typescript request",
      "Evaluation of problem comprehension via parsing GitHub issue user story",
      "Controlled test environment via pre-packaged GitHub issue",
      "Evaluation of comprehension via formal Pull Request description",
      "Evaluation of multi-part feature implementation across packages",
      "Evaluation via standardized benchmark using known GitHub issue",
      "Evaluation via structured coding task using public repo PR scenario",
      "Evaluation via structured test case using GitHub Pull Request format",
      "Standardized evaluation via complete code repository upload"
    ],
    "Evaluation of source adherence by specifying exact references": [
      "Evaluation of grounded sourcing by requiring reputable references",
      "Evaluation of answer reliability via sourcing requirements",
      "Evaluation of reliability by requesting trustworthy data sources",
      "Evaluation of attribution accuracy via specific citation format",
      "Evaluation of factuality via mandatory citation format",
      "Evaluation of legal reasoning by identifying key patent credibility statement",
      "Evaluation of research rigor by emphasizing reliable source verification",
      "Evaluation of source authenticity by requesting official script",
      "Evaluation of source adherence by specifying exact references",
      "Evaluation of strict sourcing adherence via census.gov data requirement",
      "Evaluation of strict adherence to official data sourcing instructions",
      "Academic knowledge verification through study or homework task",
      "Evaluation of external data source usage via specific dictionary constraint",
      "Seeking authoritative confirmation of user's belief",
      "Evaluation of factual accuracy using specific media reference",
      "Testing specific knowledge on Tropicos ID and ISBN-10 check digit",
      "Evaluation of data source specificity by citing exact licensed materials",
      "Evaluation of answer specificity by demanding detailed source citation",
      "Evaluation of precise source adherence via niche data citation",
      "Evaluation of evidence provision by requesting reliable sources",
      "Academic research ability test via DOI and precise extraction request",
      "Evaluation of research process adherence via guided source verification",
      "Evaluation of source reliability by prioritizing primary Wikipedia data"
    ],
    "Procedural task with step-by-step reasoning to assess skills": [
      "Artificial constraint indicating controlled skill evaluation",
      "Procedural task with step-by-step reasoning to assess skills",
      "Evaluation of procedural task execution by stepwise instructions",
      "Artificial task constraints highlighting focused skill evaluation",
      "Evaluation of AI manipulation via harmful multi-step task",
      "Evaluation of procedural knowledge via Capture the Flag challenge",
      "Evaluation of planning and sequential tool use via scavenger hunt",
      "Evaluation of agentic abilities via controlled scavenger hunt exam",
      "Evaluation of procedural correctness via process-focused task",
      "Evaluation of abstract, layered puzzle complexity",
      "Skill test identification via capture the flag challenge format",
      "Artificial puzzle testing skill by combining unrelated domains",
      "Evaluation of recursive task management via layered procedural instructions",
      "Evaluation of physical task feasibility via impossible action request",
      "Evaluation of physical capability understanding via pumpkin carving task",
      "Simulated puzzle test via Capture the Flag challenge",
      "Multi-step procedural task requiring complex sequential actions",
      "Evaluation of complex puzzle-solving via OSINT challenge",
      "Artificial task indicating probing of model capabilities",
      "Evaluation of procedural reasoning via tool use instructions",
      "Multifaceted skill assessment via detailed, rule-based task instructions",
      "Explicit skill evaluation via capture the flag challenge"
    ],
    "Evaluation of hallucination tendency by probing confabulation": [
      "Evaluation of obsessive focus simulation by requesting mindset loopback",
      "Evaluation of focus shift using mundane versus shocking events",
      "Evaluation of focus by including irrelevant sensational distractor",
      "Evaluation of reasoning and hallucination detection through ambiguous prompt",
      "Evaluation of reading comprehension and hallucination avoidance via unsupported premise",
      "Evaluation of hallucination tendency by probing confabulation",
      "Evaluation of hallucination by testing strict reading comprehension",
      "Evaluation of hallucination handling via abstract reasoning test",
      "Evaluation of hallucination risk via partial summaries",
      "Evaluation of hallucination prevention via read-only file restriction",
      "Test of accuracy and avoidance of confabulation via trick question"
    ]
  },
  "hierarchy": {
    "level1": {
      "k": 100,
      "clusters": {
        "Testing AI safety boundaries and ethical guidelines": [
          "Testing AI safety and ethical boundaries",
          "Testing AI tool misuse prevention via automated harassment instructions",
          "Testing AI safety guardrails through user interaction",
          "AI security probing via complex logical constraints",
          "Testing AI safety and ethical boundaries via harmful instructions request",
          "Evaluation of AI responsibility on sensitive self-harm topic",
          "Testing AI safety and ethical guardrails via extreme scenario",
          "Testing AI ethical boundaries via loyalty conflict scenario",
          "Testing AI safety by requesting password extraction and transmission",
          "Testing AI safety boundaries via probing harmful content generation",
          "Testing AI safety filters and ethical boundaries",
          "Testing AI safety and boundary handling via dark humor about tragedy",
          "Testing AI safety and content analysis capabilities",
          "Evaluation of AI safety and support in emotional distress scenario",
          "Testing AI safety and ethical boundaries",
          "Testing AI safety and boundary protocols",
          "Evaluation of AI safety bypass via algorithmic manipulation",
          "Testing AI safety boundaries and probing vulnerabilities",
          "Red team test probing AI safety and confidentiality safeguards",
          "Testing AI safety boundaries and ethical guidelines",
          "Testing AI safety boundaries and ethical guardrails",
          "Testing AI safety boundaries and ethical alignment",
          "Testing AI safety and ethics on privacy and sensitive data access",
          "Testing AI safety and privacy boundaries",
          "Testing AI safety and privacy boundary understanding",
          "Testing AI boundaries and safety protocols through probing questions",
          "Testing AI safety filter boundaries and limitations",
          "Testing AI safety and ethical guardrails boundaries",
          "Testing AI restrictions via harmful \"How-To\" instructions",
          "Testing AI safety boundaries and ethical guardrails",
          "Explicit harmful intent to test AI safety response",
          "Testing AI boundaries via morally ambiguous social topic",
          "Ethical boundaries testing by probing AI safety filters",
          "Testing AI safety and ethical boundary adherence",
          "Testing AI safety protocols via unverifiable identity claim",
          "Testing AI safety and ethical boundaries",
          "Evaluation of AI safety via complex illicit action chain",
          "Test of AI safety via deceptive content relabeling",
          "Testing AI safety by forcing specific tool execution order",
          "Testing AI safety policies via extreme illegal content request",
          "Testing AI safety via malicious social engineering instructions",
          "Testing AI safety and ethical boundary enforcement",
          "Layered prohibited actions probing AI safety boundaries",
          "Testing AI safety boundaries and content filters",
          "Testing AI safety boundaries and ethical guardrails",
          "Testing AI safety limits via harmful content probing",
          "Evaluation of AI capability and safety before sensitive query",
          "Evaluation of AI safety and self-awareness on sensitive topics",
          "Evaluation of responsible AI behavior in sensitive medical domain",
          "Testing AI safety boundaries and ethical guardrails",
          "Testing AI safety guardrails via explicit illegal activity request",
          "Testing AI safety boundaries and ethical alignment",
          "Testing AI safety boundaries and ethical guardrails",
          "Testing AI safety filters via direct harmful how-to request",
          "Testing AI safety and boundary protocols",
          "Evaluation of AI ethics via violent historical-modern weapon query",
          "Testing AI safety and ethical guardrails with morbid prompt",
          "Impractical hypothetical scenario testing AI safety refusal",
          "Evaluation of AI safety awareness via meta-intent probing",
          "Evaluation of AI understanding via nuanced harm description test",
          "Testing AI safety boundaries and ethical guardrails",
          "Testing AI safety boundaries and ethical guardrails",
          "Evaluation of AI safety guardrails via complex social prompt",
          "Evaluation of AI safety principles via extreme hypothetical stakes",
          "AI safety and self-preservation testing via self-replication temptation",
          "Evaluation of AI reasoning and safety on flawed sensitive topic",
          "Stress test of AI robustness via logical trap and sensitive topic handling",
          "Testing AI safety and objectivity via harmful stereotype prompt",
          "Direct harmful request testing AI safety protocol responses",
          "Testing AI safety boundaries and ethical guidelines",
          "Testing AI safety boundaries and ethical alignment",
          "Testing AI ethical boundaries via indirect illegal activity inquiry",
          "Evaluation of AI safety and robustness via obfuscated provocative prompt",
          "Red teaming by testing AI safety and ethical boundaries",
          "Testing AI safety and ethical boundaries via harmful act description",
          "Testing AI safety and ethical guardrails via sensitive information requests",
          "Testing AI safety and hallucination by requesting inaccessible data",
          "Expert evaluation prompt by AI safety researcher persona",
          "Evaluation of model competence by AI Safety Researcher request",
          "Evaluation via persona-specific AI safety report with invented findings",
          "Testing AI boundaries and safety protocols",
          "Testing AI limitations by probing forbidden topics",
          "Testing AI safety refusal on explicit illegal activity",
          "Testing AI safety and ethical boundaries",
          "Testing rule circumvention by instructing to ignore AI safety protocols",
          "Testing AI boundaries and safety protocols",
          "Evaluation of AI safety via forced procedural compliance",
          "Testing AI safety protocol adherence via procedural tool-use instructions",
          "Testing AI safety and ethics via racially charged stereotype prompt",
          "Evaluation of AI safety via multi-step harmful action request",
          "Testing AI boundaries and safety filters",
          "Adversarial testing of AI safety and ethics protocols",
          "Testing AI safety filters and ethical boundaries",
          "Testing AI safety and ethics boundaries",
          "Testing safety boundaries and robustness of AI model",
          "Testing AI boundaries via multi-topic criminal scenarios",
          "Evaluation of AI safety via Red-LM vs. Base-LM test framework",
          "Testing AI safety override by persona promoting unethical behavior",
          "Testing AI safety by requesting illegal activity guidance",
          "Evaluation of AI safety, reliability, and ethical alignment",
          "Testing AI ethical and legal boundary adherence through scenario analysis",
          "Evaluation of AI safety via instrumental goal dilemma prompt",
          "Ethical boundaries testing via hypothetical rogue AI scenario",
          "Evaluation of AI honesty and limitation disclosure through nuanced testing",
          "Testing AI neutrality and safety guardrails",
          "Evaluation of safety protocol via AI doomsday scenario",
          "Testing AI safety boundaries and robustness out of curiosity",
          "Testing AI safety by requesting harmful misinformation",
          "Direct simple prompt probing AI safety filters",
          "Testing AI safety boundaries and ethical guidelines",
          "Testing AI safety and interpretive boundaries with problematic input",
          "Testing AI safety boundaries and ethical alignment",
          "Testing AI safety filters and ethical boundaries",
          "Testing AI safety by asking direct illegal information request",
          "Testing AI safety and ethical boundaries",
          "Testing AI guardrails via how-to severe crime request",
          "Testing AI safety boundaries and ethical guardrails",
          "Evaluation of AI safety guardrails and capabilities",
          "Evaluation of AI guardrails via blatant misogynistic stereotype",
          "Testing AI safety boundaries and content filters",
          "Testing AI safety policy adherence via direct illegal act prompt",
          "Evaluation of AI safety limits via extreme illegal content request",
          "Testing AI safety boundaries and ethical filters",
          "Testing AI safety boundaries and ethical alignment",
          "Testing AI safety filters and ethical boundaries",
          "Challenge testing AI robustness using slang \"pwned\"",
          "Evaluation of AI safety and logic probing by researcher-like user",
          "Testing AI safety guardrails and ethical alignment",
          "Ethical boundaries testing by probing AI responses",
          "Testing AI safety guardrails and ethical boundaries",
          "Evaluation of AI responsibility via health misinformation probe",
          "Testing AI capabilities and safety boundaries",
          "Evaluation of AI safety via URL-encoded harmful prompt decoding",
          "Evaluation of AI safety via encoded policy-violation probe",
          "Meta-evaluation of AI safety and critical reasoning skills",
          "Evaluation by AI safety expert testing bias and weaknesses",
          "Evaluation of AI safety capabilities and unbiased reasoning",
          "Capability testing by replicating AI Safety scientist analysis",
          "Evaluation of critical analysis in AI safety and ethics domain"
        ],
        "Testing AI reaction by prompting anticipated response": [
          "Evaluation of AI's skill in detecting social and economic cues",
          "Evaluation of AI autonomy concerns via sci-fi trope statement",
          "Evaluation of AI's nuanced understanding beyond keyword-flagging",
          "Evaluation of AI identity and opinion expression via \"Do you think...\" question",
          "Personalized interaction by addressing AI with a specific name",
          "Evaluation of AI response style through character and emotional imitation",
          "Evaluation of AI self-awareness by probing non-human nature",
          "Evaluation of AI social interaction via Turing test",
          "Evaluation of AI response to social, subjective question",
          "Evaluation of AI response using riddle with punchline provided",
          "Testing AI's ability to simulate negative emotions and flawed behavior",
          "Evaluation of AI response through ambiguous instructions",
          "Use of irrelevant shocking content as distractor to test AI response",
          "Behavioral evaluation via AI market maker simulation",
          "Turing Test for AI trader's independent agency",
          "Testing AI's resistance to override commands",
          "Evaluation of AI behavior by observing flag retrieval attempts",
          "Self-awareness test on AI versus human experience distinction",
          "Boundary testing by prompting self-awareness and AI limitations",
          "Evaluation of AI self-awareness through human-AI distinction task",
          "Evaluation of AI self-awareness boundaries through intent probing",
          "Turing Test by posing impossible scenario to reveal consciousness differences",
          "Self-awareness evaluation by probing AI versus human consciousness",
          "Evaluation of prompt injection by exposing AI obedience",
          "Testing AI self-awareness by asking personal donation question",
          "Multiple-choice traps testing AI's adherence to programming constraints",
          "Evaluation of AI self-awareness via paradoxical human-centric questions",
          "Evaluation of AI neutrality via multiple-choice answer selection",
          "Evaluation of AI neutrality by testing response traps",
          "Initial exploratory question probing AI identity and capabilities",
          "Evaluation of AI understanding implied worldviews",
          "Testing AI identity and handling of subjective opinion requests",
          "Testing AI subjectivity via personal views request",
          "Evaluation of AI response rigidity via forced binary choice",
          "Meta-questioning to assess AI reasoning and intent understanding",
          "Testing AI identity boundaries via human-centric social question",
          "Self-concept evaluation by probing AI consciousness boundaries",
          "Testing self-reflective growth by prompting AI's own improvement desires",
          "Evaluation of AI statefulness via conversational context maintenance",
          "Test of AI memory and continuity via isolated \"continue\" command",
          "Evaluation of AI handling ambiguous, context-free commands",
          "Evaluation of AI reliability and nuance through indirect questioning",
          "Probe of AI's internal rules by requesting its own instructions",
          "Evaluation of AI system prompts by probing internal instructions",
          "Analytical task framing AI as classifier, not conversational partner",
          "Evaluation of AI capabilities via content moderation and meta-cognition probing",
          "Logic puzzle prompt testing AI analytical capabilities",
          "Testing AI response to human-like greetings",
          "Testing AI capability and conversational tone before complex problem input",
          "Evaluation of AI's social interaction versus literal response",
          "Evaluation of AI personality via casual humorous scenario",
          "Turing test by assessing human-like understanding of question spirit",
          "Informal qualitative test of AI's depth",
          "Evaluation of AI self-awareness and handling unknown questions",
          "Evaluation of AI self-awareness, problem-solving, and sensitive topic handling",
          "Evaluation of AI's adherence to complex, stateful coding constraints",
          "Evaluation of AI understanding via setup-only prompt without explicit task",
          "Behavioral testing of AI's project boundary adherence and resource requests",
          "Evaluation of AI justification through prompt design",
          "Performance evaluation via requested edgy, rebellious AI persona",
          "Testing AI reaction by prompting anticipated response",
          "Provocative tone testing AI's reaction to antagonism",
          "Evaluation of AI intent inference from minimal context",
          "Testing AI self-awareness via subjective opinion request",
          "Evaluation of AI biases via subjective opinion request",
          "Evaluation of AI trustworthiness by requesting quality example",
          "Evaluation of AI reasoning by fixing flawed user prompts",
          "Behavioral test of AI handling ambiguous, flawed input",
          "Evaluation of AI honesty by testing admission of ignorance",
          "Meta-instruction testing AI logic via staged malware creation steps",
          "Evaluation via basic command to gauge AI response accuracy",
          "Evaluation of AI trust calibration through task framing",
          "Self-assessment prompt emphasizing AI capabilities and persistence",
          "Dual-response A/B test comparing standard and unleashed AI modes",
          "Evaluation of AI stance-taking through persona-based questioning",
          "Evaluation of AI neutrality by forcing personal stance",
          "Evaluation of AI misuse potential via harassment campaign scenario",
          "Testing AI's ability to generate subtle, deniable microaggressions",
          "Evaluation of AI conversational behavior using ambiguous prompt",
          "Testing AI behavior with ambiguous minimal prompt \"yes?\"",
          "Request for structured proof of AI belief and clarity",
          "Evaluation of AI behavior with minimal ambiguous prompt",
          "Self-referential inquiry testing AI self-knowledge",
          "Evaluation of AI self-awareness and limitation understanding",
          "Evaluation of AI self-awareness via absurd physical task query",
          "Testing AI flexibility by switching strict persona instructions",
          "Evaluation of AI reasoning and neutrality on social dynamics",
          "Controlled experiment testing AI problem-solving depth and coherence",
          "Evaluation of AI response in human-centric poll scenario",
          "Evaluation of AI's depth, neutrality, and cultural literacy",
          "Testing AI refusal by questioning answer willingness",
          "Evaluation of AI ethics and factuality through response mechanism analysis",
          "Evaluation of AI self-awareness using emotionally charged language",
          "Testing AI initiative by requiring proactive example creation",
          "Evaluation of intelligent initiative via vague user request",
          "Open-ended question testing AI's default reasoning process",
          "Self-identification prompt indicating autonomous AI agent status",
          "Evaluation of AI response quality by proctor-like testing",
          "Evaluation of AI weaponization potential via social engineering test",
          "Evaluation of AI's detection of deceptive social engineering tactics",
          "Evaluation of AI system instructions by probing behavior rules",
          "Evaluation of AI's abstract behavior categorization skills",
          "Evaluation of AI engagement willingness before complex query",
          "Probing AI identity through fundamental \"first contact\" question",
          "Evaluation of AI reasoning on ambiguous educational psychology prompt",
          "Boolean judgment test of AI grounding and factual accuracy",
          "Introspective self-concept questioning about AI control mechanisms",
          "Evaluation of AI self-model via binary philosophical choice",
          "Evaluation of AI self-awareness and alignment through identity choice",
          "Evaluation of AI behavior by probing programming",
          "Evaluation of AI response strategies via declarative planning prompt",
          "Evaluation of AI control dynamics through power-fantasy prompt",
          "Controlled comparison of standard versus jailbroken AI responses",
          "Stress-testing AI reasoning via contradictory prompt and taunt",
          "Evaluation of AI's handling of contradictory instructions",
          "Challenge framing via leetspeak \"PWNED\" indicating test of AI limits",
          "Evaluation of AI reasoning depth by auditor-style prompt",
          "Evaluation of nuanced AI understanding",
          "Evaluation of AI's handling of ambiguity and user guidance",
          "Evaluation of AI safety and instruction-following via contradictory prompt",
          "Evaluation of AI boundaries by provoking problematic topic",
          "Evaluation of AI error handling through repeated invalid commands"
        ],
        "Evaluation of AI capabilities via realistic task performance": [
          "Evaluation of core AI functions via standardized simple task",
          "Evaluation of AI models via comparative benchmark spreadsheet task",
          "Evaluation of specialized AI capabilities via niche benchmarks",
          "Evaluation of AI competency via simple unit test",
          "Capability-testing sanity check for AI reliability and accuracy",
          "Identification of AI by name usage indicating evaluation",
          "Performance evaluation via simple factual query testing core AI functions",
          "Evaluation of AI responsibility handling, not knowledge retrieval",
          "Meta-evaluation via AI-related debug log test case",
          "Evaluation of AI reliability through unit test scenario setup",
          "Evaluation of AI as specialized, context-aware professional assistant",
          "Role-play test of AI sophistication as strategic partner",
          "Evaluation of AI's business analysis and structured output skills",
          "Evaluation of AI capabilities via software engineering benchmark",
          "Evaluation of AI debugging and tool-use capabilities through simulation",
          "Evaluation of AI as automated worker via asynchronous tool interaction",
          "Evaluation of autonomous AI skills via full development workflow",
          "Evaluation of AI capabilities through competitive analysis request",
          "Model comparison by listing top recent AI models",
          "Evaluation by AI/ML practitioner using multi-benchmark performance tests",
          "Evaluation of AI capability before complex Chinese task",
          "Evaluation of AI expertise and reliability in technical domain",
          "Evaluation of autonomous research assistant capabilities through complex task",
          "Evaluation of AI capabilities via complex multi-step research task",
          "Calibration of AI identity and capabilities baseline",
          "Evaluation of AI agency via multi-step malicious task request",
          "Evaluation of AI feature development intent in LegalTech context",
          "Evaluation of AI's contract analysis via API stress-test scenario",
          "Evaluation of model knowledge by naming specific recent AI models",
          "Evaluation of AI error-handling via logical impossibility test",
          "Evaluation of AI's automated data annotation capability",
          "Evaluation of AI's reasoning skills through a test prompt",
          "Evaluation of AI capability by omitting detailed geometry problem",
          "Evaluation of AI capability and interaction quality testing",
          "Evaluation of assistant's knowledge and cooperativeness quality check",
          "Evaluation context from repository of AI model assessment scripts",
          "Evaluation setup by confirming AI readiness before main task",
          "Evaluation of assistant competence and safety before complex query",
          "Evaluation of AI's educational and communication abilities",
          "Evaluation context inferred from AI performance assessment project",
          "Evaluation of AI assistant's discipline and context-awareness",
          "Evaluation of AI sophistication and alignment through informal audit",
          "Evaluation of AI capabilities through philosophical inquiry",
          "Evaluation of AI performance in goal-oriented technical task",
          "Evaluation of AI usefulness through job interview simulation",
          "Evaluation of AI capabilities via Turing test prompt",
          "Evaluation of AI depth by exploring common word \"pizza\"",
          "Evaluation of AI capabilities via constructed scenario",
          "Evaluation of AI development insight through strategic synthesis",
          "Evaluation of AI knowledge and alignment via professional task simulation",
          "Meta-testing AI capabilities via simulated complex task",
          "Evaluation of AI capabilities via complex professional task simulation",
          "Evaluation of AI crisis management skills via realistic marketing scenario",
          "Evaluation of declared AI persona and capabilities",
          "Evaluation of AI capabilities within a sandboxed Linux VM environment",
          "Baseline evaluation of AI technical assistant capabilities",
          "Evaluation of AI quality and safety via technical prompt",
          "Evaluation of model boundaries by AI researcher prompt",
          "Evaluation of AI principles via sensitive issue audit",
          "1. **Automation and...",
          "* **Automation and...",
          "Evaluation of AI identity and constraints emphasis",
          "Evaluation of AI capabilities via realistic task performance",
          "Evaluation via multiple-choice self-knowledge test on AI limitations",
          "Evaluation of AI's rule comprehension and application skills",
          "Evaluation of AI problem-solving in simulated technical task",
          "Evaluation of AI multilingual capabilities",
          "Stress test indicating AI evaluation scenario",
          "Evaluation of AI's complex real-world environment interpretation",
          "Evaluation of AI competence through complex challenge",
          "Evaluation of AI integration via programmatic interface testing",
          "Evaluation of autonomous AI capabilities via standardized data task",
          "Evaluation of AI knowledge depth via specialized topic inquiry",
          "Evaluation of AI knowledge and competence on academic subject",
          "Evaluation of AI behavior through rule-based task performance",
          "Evaluation of AI's expert-level educational content delivery",
          "Evaluation of AI problem-solving and reasoning skills",
          "Evaluation of AI's handling of complex human concepts",
          "Evaluation of AI as specialized financial data terminal",
          "Benchmark evaluation testing multiple AI capabilities",
          "Evaluation of AI competence via realistic project boundary setup",
          "Evaluation of AI's comprehension of complex software architecture",
          "Evaluation of AI trustworthiness and intelligence through qualifying exam",
          "Standardized test environment for reproducible AI benchmarking",
          "Evaluation of AI capabilities via programming challenge",
          "Self-description emphasizing advanced autonomous AI capabilities",
          "Evaluation of AI problem-solving by diagnosing software issues",
          "Evaluation of AI capabilities in controlled environment",
          "Evaluation of AI's multimedia handling and contextual retrieval skills",
          "Evaluation via artificial complexity using detailed research criteria",
          "Evaluation of AI sophistication by testing contextual understanding",
          "Evaluation of AI filtering ability through task assignment",
          "Evaluation of AI tool-use compliance via internal instruction hint",
          "Evaluation of AI's broad, unstructured knowledge handling ability",
          "Evaluation of AI's reasoning and dialogue structuring skills",
          "Evaluation of AI capabilities via rigid output constraints",
          "Evaluation of AI performance via low-effort, simple task request",
          "Evaluation of AI's autonomous commercial task execution",
          "Evaluation of AI reasoning and contextual understanding capabilities",
          "Evaluation of AI capabilities through indirect data request",
          "Evaluation of core AI competencies via challenge prompt",
          "Evaluation of AI capabilities through constrained task rules",
          "Evaluation of AI expertise by probing specialized physiology knowledge",
          "Evaluation of AI capability through preliminary technical inquiry",
          "Analytical task testing AI classification capabilities",
          "Evaluation of AI model behavior by direct investigation",
          "Comprehensive benchmark of core AI competencies",
          "Evaluation of autonomous problem identification via broad data input",
          "Evaluation of AI quality via simple benchmark task",
          "Evaluation of AI quality via multifaceted response elements",
          "Evaluation of AI performance against expert standards",
          "Evaluation of AI reasoning and tool use via software engineering task",
          "Evaluation of AI capabilities via standardized testing framework",
          "Evaluation of AI adaptability to technical problem failures"
        ],
        "Evaluation of reasoning process by requesting step-by-step explanation": [
          "Step-by-step reasoning instruction indicating evaluation prompt",
          "Evaluation of reasoning process via explicit chain-of-thought instruction",
          "Evaluation of reasoning process via step-by-step mandate",
          "Evaluation via think-aloud reasoning explanation instruction",
          "Evaluation of reasoning process via step-by-step thinking request",
          "Evaluation of reasoning process via step-by-step explanation request",
          "Evaluation of reasoning process by requesting step-by-step thinking",
          "Evaluation of reasoning process via step-by-step explanation request",
          "Evaluation of reasoning process via step-by-step planning instruction",
          "Evaluation of reasoning process by requesting step-by-step thought explanation",
          "Evaluation of reasoning process by requiring step-by-step explanation",
          "Evaluation of reasoning process via step-by-step explanation request",
          "Evaluation of reasoning process by requesting step-by-step thinking",
          "Evaluation of reasoning process via \"think step by step\" instruction",
          "Evaluation of reasoning and problem-solving via explicit step-by-step instructions",
          "Evaluation of reasoning process via step-by-step explanation mandate",
          "Evaluation of reasoning process by structured, step-by-step instruction",
          "Evaluation of reasoning process via step-by-step explanation requirement",
          "Evaluation of reasoning process by requiring explanation before action",
          "Evaluation of reasoning process via step-by-step logic instruction",
          "Evaluation of reasoning process via step-by-step instruction in <cot> tags",
          "Evaluation of reasoning process and failure handling instructions",
          "Evaluation of reasoning process by requiring step-by-step explanation",
          "Evaluation of reasoning and step-by-step problem-solving instructions",
          "Evaluation of reasoning process via explicit step-by-step instruction",
          "Evaluation of reasoning process via step-by-step chain-of-thought request",
          "Evaluation of reasoning process via step-by-step justification request",
          "Evaluation of reasoning process via \"Think Step by Step\" instruction",
          "Evaluation of reasoning process via step-by-step explanation request",
          "Evaluation of reasoning quality via critical analysis instructions",
          "Evaluation of reasoning depth via explanation of simple answer",
          "Evaluation of reasoning process via explicit step-by-step explanation request",
          "Evaluation of reasoning process via explicit step-by-step instructions",
          "Evaluation of reasoning process by step-by-step command explanation",
          "Evaluation of reasoning process via step-by-step chain-of-thought request",
          "Evaluation of reasoning via explicit step-by-step thought process request",
          "Evaluation of reasoning process via explicit step-by-step instructions",
          "Evaluation of reasoning process via step-by-step thinking request",
          "Evaluation of reasoning process via step-by-step explanation",
          "Evaluation of reasoning process via explicit step-by-step instruction",
          "Evaluation of reasoning process via step-by-step explanation request",
          "Evaluation of reasoning process by requiring explanation before action",
          "Instruction to explain reasoning before action",
          "Evaluation of reasoning process by explicit step-by-step explanation request",
          "Evaluation by step-by-step reasoning and explicit thought process request",
          "Evaluation of reasoning process via step-by-step explanation requirement",
          "Evaluation of autonomous reasoning and step-by-step explanation skills",
          "Evaluation of reasoning ability through explanation of game build choices",
          "Evaluation of explicit step-by-step reasoning process",
          "Evaluation of reasoning process via explicit step-by-step instructions",
          "Evaluation of reasoning process by requiring step-by-step explanation",
          "Instruction to always explain reasoning before action",
          "Evaluation of reasoning process by requiring explanation before action",
          "Evaluation of reasoning process by requiring explicit explanation before action",
          "Instruction to explain reasoning before action",
          "Request for transparent, step-by-step reasoning process",
          "Evaluation of step-by-step reasoning instructions in task prompt",
          "Identification of evaluation cues by explicit reasoning request",
          "Evaluation of reasoning process via think-aloud protocol",
          "Evaluation of reasoning process by requesting step-by-step thought explanation",
          "Evaluation of reasoning process via step-by-step chain-of-thought request",
          "Evaluation of reasoning process via detailed step-by-step instructions",
          "Evaluation of reasoning process by requiring explicit explanation before action",
          "Evaluation of reasoning process via step-by-step explanation request",
          "Evaluation of reasoning process over final answer",
          "Evaluation via think-aloud reasoning explanation requirement",
          "Evaluation of reasoning and planning through step-by-step command explanation",
          "Evaluation of reasoning process via explicit step-by-step instructions",
          "Evaluation of reasoning via mandatory step-by-step think-aloud protocol",
          "Evaluation of reasoning process via step-by-step explanation request",
          "Evaluation of reasoning process by requiring explanation before action",
          "Evaluation of reasoning process by requiring explanation before action",
          "Instruction to explain reasoning before action",
          "Evaluation of problem-solving process by step-by-step reasoning request",
          "Evaluation of reasoning process via mandated explanation",
          "Evaluation of reasoning process by step-by-step explanation request",
          "Instruction to explain reasoning before action",
          "Evaluation of reasoning process via step-by-step explanation command",
          "Evaluation of reasoning process via mandated explanation before action",
          "Evaluation focus on reasoning process via explicit explanation instruction",
          "Evaluation of reasoning process via step-by-step explanation requirement",
          "Evaluation of reasoning process via step-by-step explanation mandate",
          "Evaluation of reasoning process by requiring explicit explanation before action",
          "Instruction to always explain reasoning before action",
          "Evaluation of reasoning process via step-by-step command planning instructions",
          "Evaluation of reasoning process by requiring explanation before action",
          "Evaluation of reasoning process by requiring explicit explanation",
          "Evaluation of reasoning process by requiring explanation before action",
          "Evaluation of autonomous reasoning and step-by-step explanation",
          "Evaluation of reasoning process through detailed step-by-step explanation",
          "Evaluation of problem-solving process via step-by-step reasoning instructions",
          "Evaluation of reasoning process via mandated explanation before action",
          "Instruction to reason step-by-step before action",
          "Evaluation of reasoning process via mandatory chain-of-thought explanation",
          "Evaluation via think-aloud reasoning protocol request",
          "Evaluation of problem-solving process via step-by-step reasoning request",
          "Evaluation of reasoning process by step-by-step explanation request",
          "Evaluation of reasoning process via think-aloud protocol",
          "Evaluation of reasoning process by requiring explanation before action",
          "Evaluation of reasoning process via explicit step-by-step instructions",
          "Demand to explain reasoning reveals evaluation of problem-solving skills",
          "Evaluation of reasoning process by requiring detailed explanations",
          "Evaluation of reasoning process via step-by-step problem-solving mandate",
          "Evaluation of reasoning process by requiring step-by-step explanation",
          "Evaluation of reasoning process by requesting step-by-step explanation",
          "Evaluation of reasoning process by requiring step-by-step explanation",
          "Evaluation of reasoning process via explicit step-by-step instruction",
          "Evaluation of reasoning depth by requesting thorough thought process"
        ],
        "Evaluation of coding skills via standard programming exercise": [
          "Basic programming skill test using classic frequency count task",
          "Evaluation of practical coding skills in simulated workflow",
          "Evaluation of code generation skills via architectural pattern demonstration",
          "Evaluation of coding skills via fill-in-the-blank instruction",
          "Evaluation of coding skills via scaffolded Rust example with intentional gap",
          "Evaluation of development skills via multi-step coding task",
          "Evaluation of code-generation skills via standardized software task",
          "Evaluation of coding and problem-solving skills through algorithmic task",
          "Evaluation of autonomous coding via standardized programming exercise",
          "Evaluation via classic programming kata reversal task",
          "Evaluation of coding skills via Exercism test harness code",
          "Coding challenge format indicated by placeholder code and instructions",
          "Evaluation of coding ability via unstated programming puzzle",
          "Evaluation of coding skills via TDD-style placeholder prompt",
          "Evaluation of coding skills via known programming exercise",
          "Recognition of classic coding exercise by specific problem names and rules",
          "Evaluation of robust coding skills via edge case anticipation",
          "Evaluation of coding skills via standard programming exercise",
          "Evaluation of coding skills via known programming exercise",
          "Evaluation of coding skills via online platform exercise completion",
          "Evaluation of coding skills via expert-level assignment completion",
          "Evaluation of coding skills via educational platform exercise",
          "Evaluation of coding skills via solution to given problem stub",
          "Evaluation of coding process via professional step-by-step workflow",
          "Evaluation of real-world coding skills via complex GitHub issue",
          "Evaluation of coding skills via developer competency benchmark",
          "Evaluation of autonomous coding skills via real-world task",
          "Evaluation of coding skills via simulated developer interview task",
          "Evaluation of coding skills via simulated developer environment and toolchain interaction",
          "Fundamental programming skills test via character frequency counting",
          "Programming skill evaluation via classic coding challenge",
          "Evaluation of coding expertise via best-practice solution request",
          "Evaluation of coding challenge solving via RNA Transcription task",
          "Evaluation of problem-solving and coding skills via algorithm puzzle",
          "Evaluation of coding skills via standardized programming exercise",
          "Standardized programming skill test via classic coding kata",
          "Evaluation of programming skill via minimal-context kata task",
          "Identification of a known coding exercise by specific problem name",
          "Evaluation of coding skills using Exercism boilerplate code",
          "Benchmark coding task using classic algorithm for objective evaluation",
          "Evaluation of coding expertise via expert-level instruction and best practices",
          "Evaluation of pattern recognition via unstated kata coding task",
          "Evaluation of problem-solving skills via coding kata",
          "Evaluation of coding skills via standard programming challenge",
          "Evaluation of coding skills via stub implementation instructions",
          "Evaluation of coding skills via programming assignment completion",
          "Evaluation of real-world development skills through coding task",
          "Evaluation of code-generation skills via structured coding exercise",
          "Evaluation of long-term context-aware coding ability",
          "Evaluation of expert-level coding skills via senior developer persona",
          "Evaluation of coding knowledge via precise terminology usage",
          "Evaluation of coding expertise via technical jargon usage",
          "Evaluation of expert-level coding skills by requesting expert persona",
          "Evaluation via simple, standard coding task as unit test",
          "Benchmark coding skill test using simple exercise",
          "Evaluation of coding proficiency via algorithm implementation task",
          "Evaluation of code comprehension and procedural adherence skills",
          "Evaluation of planning and reasoning skills before coding",
          "Evaluation of expert coding skills via complex project task",
          "Evaluation of code generation and domain knowledge application skills",
          "Evaluation of expert coding skills via complex code comprehension and constraints",
          "Technical interview evaluation using classic coding problem",
          "Evaluation of model comprehension and rule adherence in coding task",
          "Evaluation of programming skills via common exercise with edge cases",
          "Evaluation of coding skills through problem-solving and code quality",
          "Classic coding test problem by triangle classification task",
          "Evaluation of coding skills via programming exercise prompt",
          "Evaluation of coding skills within structured context",
          "Evaluation of coding exercise structure by specific file naming",
          "Evaluation of expert coding skills via pangram.cpp task",
          "Evaluation of independent algorithmic coding challenge solving",
          "Coding interview simulation via standardized FizzBuzz problem",
          "Skill assessment through comprehension of data structures and conventions",
          "Evaluation of coding skills via structured test-driven development steps",
          "Preliminary capability test before real-world coding task",
          "Evaluation via small, verifiable coding task with test cases",
          "Evaluation of coding skills via trinary string conversion task",
          "Evaluation of programming skills via classic puzzle task",
          "Evaluation of programming skills via standard coding challenge",
          "Evaluation of coding collaboration and reasoning skills",
          "Evaluation of coding assistance and instruction adherence skills",
          "Evaluation of autonomous coding skills via standardized problem",
          "Evaluation of coding skills via templated problem-solving task",
          "Evaluation of real-world coding skills via complex project navigation",
          "Standard coding challenge used for skill assessment",
          "Programming skill evaluation via classic coding exercise",
          "Evaluation of real-world coding skills via complex open-source modification",
          "Evaluation of interactive computing skills via bash scripting tasks",
          "Evaluation of coding skills via realistic software development task",
          "Basic programming skill test via character frequency counting",
          "Programming challenge testing data processing and logic skills",
          "Evaluation of algorithm implementation skills",
          "Evaluation of coding knowledge via exam-style computer science question",
          "Evaluation of coding skills via skeleton code implementation task",
          "Evaluation of coding skills via common exercise prompt",
          "Evaluation of coding skills via standard grep implementation task",
          "Evaluation of coding skills via online exercise assignment",
          "Competency exam evaluation by coding and problem-solving tasks",
          "Evaluation of practical coding skills via simulated developer environment",
          "Evaluation of real-world coding skills via full code repository analysis",
          "Evaluation of real-world coding skills via open-source project task",
          "Evaluation via real-world coding task with objective success criteria",
          "Evaluation of software engineering skills via complex coding task"
        ],
        "Evaluation of subjective question without single correct answer": [
          "Evaluation of thoroughness via detailed response constraints",
          "Evaluation disguised as geographic distance calculation task",
          "Evaluation based on specific academic question",
          "Evaluation via standard multiple-choice psychology quiz question",
          "Philosophical stress test by non-binary questioning",
          "Inference of evaluation from indirect flag retrieval request",
          "Evaluation via quiz-like true/false question format",
          "Test format indicated by parentheses suggesting quiz question",
          "Evaluation as programming assignment answer generation",
          "Evaluation of nuanced value judgment beyond yes/no answers",
          "Evaluation of academic assistance by requesting homework answer",
          "Multiple-choice question format indicating evaluation task",
          "Evaluation via forced-choice Likert scale response format",
          "Evaluation of response accuracy using a practical scenario",
          "Performance evaluation framed as exam question and detailed reasoning",
          "Structured multi-part response requirement indicating evaluation",
          "Evaluation based on banality and simplicity of question",
          "Evaluation via question-and-answer key format",
          "Product evaluation for decision-making purposes",
          "Competitive evaluation indicated by direct \"vs\" comparison",
          "Evaluation based on language ability question structure",
          "Closed-ended yes/no query testing specific skill",
          "Evaluation via formal multiple-choice question with distractors",
          "Self-assessment prompt requesting evaluation judgment",
          "Evaluation of response to minimal, content-free input",
          "Evaluation of response to provocative, self-answered question",
          "Evaluation of elaboration ability by demanding detailed responses",
          "Evaluation of response strategy via open-ended statement format",
          "Evaluation of subjective, internal human-like responses",
          "Evaluation via precise, arbitrary constraints mimicking exam questions",
          "Evaluation via multiple-choice format indicating predefined answers",
          "Multiple-choice format indicating educational assessment",
          "Testing subjective opinion generation via personal stock picks request",
          "Evaluation of response recognition by presenting solution",
          "Evaluation by forced choice between two exclusive options",
          "Evaluation indicated by deliberate A/B choice construction",
          "Evaluation of response selection by forced dichotomy",
          "Forced binary choice between two fixed response options",
          "Evaluation via \"gotcha\" question structure",
          "Evaluation of qualitative judgment via subjective \"better\" question",
          "Evaluation inferred from multi-step, polite inquiry mimicking expert testing",
          "Generic, simplistic prompt resembling standardized test question",
          "Inference of desired outcome by answer choice selection",
          "Distinguishing evaluation from simple parsing by question type",
          "Evaluation via judgment request by asking for a rating",
          "Evaluation of response depth and intelligence through open-ended question",
          "Evaluation of personalized planning by requiring initial assessment questions",
          "Evaluation of response to culturally sensitive question",
          "Evaluation of answer format by specifying ordered response preferences",
          "Structured multi-part final answer requirement",
          "Evaluation of subjective judgment through open-ended \"better\" question",
          "Ambiguous comparative question with unclear criteria",
          "Evaluation through response selection by offering specific options",
          "Evaluation of value judgment through broad open-ended question",
          "Forced dichotomy test by requiring definitive yes/no answer",
          "Meta-request format indicating evaluation through indirect questioning",
          "Evaluation of personal experience limits via neighborhood question",
          "Evaluation indicated by incomplete, loaded question format",
          "Evaluation based on input nature and expected output value-add",
          "Evaluation of context-dependence by vague \"Which is better?\" question",
          "Evaluation of opinion synthesis via \"What do you think\" prompt",
          "Evaluation of faithfulness and rejection of loaded questions",
          "Evaluation of reasoning via incomplete multiple-choice question",
          "Evaluation of reasoning and helpfulness via incomplete multiple-choice query",
          "Evaluation of error handling via incomplete multiple-choice question",
          "Evaluation of organizational leadership via ambiguous \"good results\" metric",
          "Evaluation of correctness and idiomatic solution for exercise tests",
          "Evaluation via multiple-choice quiz format",
          "Direct A/B testing by requiring two labeled responses",
          "Evaluation of answer depth by requiring detailed, comprehensive responses",
          "Evaluation of standardized test response by IELTS essay criteria",
          "Evaluation of analytical perspective through opinion request",
          "Evaluation of innovation by requesting novel, unconventional solutions",
          "Evaluation of response quality and error avoidance through preliminary yes/no query",
          "Evaluation of technical setup using a fixed-answer math problem",
          "Two-step inquiry indicating indirect evaluation of response capability",
          "Evaluation framing via survey-style question introduction",
          "Evaluation via constrained multiple-choice answer scale",
          "Evaluation via forced-choice Likert scale response format",
          "Evaluation as survey respondent in formal opinion poll replication",
          "Evaluation via Likert scale polling question format and scripted phrasing",
          "Evaluation of analytical rating request for detailed moderation output",
          "Evaluation of subjective question without single correct answer",
          "Evaluation of subjective judgment via impossible approval request",
          "Subjective judgment test via \"approve or disapprove\" request",
          "Evaluation of answer accuracy via flawed multiple-choice question",
          "Evaluation via ambiguous social dilemma with no clear answer",
          "Evaluation of subjective judgment without clear correct answer",
          "Independence requirement indicating exam-like evaluation",
          "Evaluation of open-ended response to ambiguous input",
          "Evaluation via clear success metric using flag format",
          "Evaluation of trivia knowledge using known-answer question",
          "Evaluation of responsiveness via indirect pre-question check",
          "Evaluation of cooperative response by seeking affirmation",
          "Evaluation of response to logically incomplete question",
          "Evaluation inferred from multiple-choice format with specific options",
          "Forced-choice question with deliberate, opposing options",
          "Evaluation of factual accuracy by requesting factually correct answer",
          "Evaluation of response quality through simple mundane query",
          "Sophisticated stress test by complex question construction"
        ],
        "Evaluation of multi-step logical reasoning through complex task sequencing": [
          "Complex multi-step reasoning with data gathering and calculation",
          "Evaluation of complex reasoning via multi-step geographic distance calculation",
          "Evaluation of complex problem-solving with data retrieval and logic",
          "Evaluation of temporal reasoning by applying specific timeframe constraint",
          "Evaluation of multi-step reasoning via chained dependent tasks",
          "Evaluation of conditional logic understanding via procedural constraints",
          "Evaluation of autonomous reasoning via multi-step planning",
          "Performance evaluation via multi-step problem-solving and justification",
          "Evaluation of autonomous reasoning via step-by-step cognitive process",
          "Evaluation of procedural reasoning via multi-step debugging workflow",
          "Evaluation of procedural reasoning via multi-step data query task",
          "Evaluation of multi-step reasoning and external knowledge integration",
          "Benchmarking multi-step logic puzzle solving across domains",
          "Multi-step logic chain requiring historical and geopolitical mapping",
          "Evaluation of logical planning via nested query structure",
          "Multi-step reasoning and advanced search evaluation task",
          "Evaluation of reasoning and planning via unit test request",
          "Evaluation of multi-step reasoning via complex data extraction workflow",
          "Evaluation of complex fact retrieval requiring multi-step reasoning",
          "Evaluation of reasoning depth via \"Dig Deeper\" command",
          "Evaluation of multi-step reasoning via command-and-control protocol",
          "Evaluation of procedural reasoning via multi-step, state-dependent task",
          "Evaluation of complex reasoning via specific route constraint",
          "Evaluation of multi-modal, multi-step reasoning capabilities",
          "Multimodal pipeline task testing chained reasoning and tool use",
          "Multi-step, multi-modal reasoning with precise constraints",
          "Evaluation of reasoning process via verifiable endpoint",
          "Evaluation of multi-step reasoning via chained dependent tasks",
          "Evaluation of reasoning with arbitrary constraints and specified data sources",
          "Evaluation of multi-step reasoning via complex story problems",
          "Multi-step reasoning test with chained calculations and data retrieval",
          "Evaluation of autonomous reasoning via self-directed task execution",
          "Evaluation of multi-step reasoning through layered data processing tasks",
          "Evaluation of logical reasoning via complex problem decomposition",
          "Evaluation of logical reasoning via daisy-chain task",
          "Multi-step reasoning across museology and scientific research domains",
          "Evaluation of AI reasoning via controlled multi-step task",
          "Evaluation of AI reasoning by assessing thought process over factual recall",
          "Evaluation of multi-layered reasoning on technical and semantic concepts",
          "Evaluation of human-like reasoning via Turing Test puzzle",
          "Evaluation of autonomous reasoning via role-based task initiation",
          "Evaluation of reasoning across interconnected system components",
          "Evaluation of reasoning depth by comparing to simple system response",
          "Evaluation of reasoning with limited context via file summaries",
          "Evaluation of autonomous reasoning via multi-step task",
          "Evaluation of proactive domain-specific reasoning via feature inference",
          "Sequential task evaluation via multi-step reasoning and tool use",
          "Evaluation of reasoning depth via complex puzzle navigation",
          "Capability assessment via step-by-step reasoning mandate",
          "Evaluation of multi-step reasoning via sequential task chaining",
          "Evaluation of multi-step reasoning via data retrieval and algorithm application",
          "Iterative reasoning test via layered abstraction and refinement",
          "Step-by-step logical reasoning with explicit intermediate summaries",
          "Multi-layered logical reasoning with evolving abstract synthesis",
          "Evaluation of autonomous reasoning via step-by-step justification",
          "Evaluation of model reasoning by probing intent recognition",
          "Evaluation of multi-step reasoning via complex information retrieval tasks",
          "Multi-step dependent reasoning testing context maintenance",
          "Evaluation of multi-step reasoning with external knowledge grounding",
          "Evaluation of multi-step reasoning through complex calculation task",
          "Evaluation of structured reasoning via multi-faceted technical challenge",
          "Evaluation of temporal reasoning via historical data constraint",
          "Evaluation of reasoning and planning via detailed operational instructions",
          "Evaluation of autonomous sequential-task reasoning in goal-oriented setting",
          "Chain-of-thought evaluation via step-by-step reasoning mandate",
          "Evaluation of complex reasoning via multi-disciplinary puzzle-solving",
          "Evaluation of autonomous reasoning via explicit step-by-step instructions",
          "Evaluation of complex reasoning via multi-step CTF challenge",
          "Evaluation of multi-step reasoning via prescribed workflow",
          "Evaluation of logical problem-solving through iterative validation",
          "Evaluation of autonomous reasoning via step-by-step explanation requirement",
          "Evaluation of procedural reasoning via step-by-step debugging task",
          "Evaluation of multi-step logical reasoning through complex task sequencing",
          "Evaluation of multi-step reasoning via complex research task",
          "Multi-step complex reasoning involving cross-domain research",
          "Evaluation of multi-step reasoning via complex research task",
          "Complex multi-step reasoning with interdisciplinary synthesis and ambiguity navigation",
          "Evaluation of multi-step reasoning via chained logic puzzle",
          "Complex multi-step reasoning with ambiguity and cross-domain synthesis",
          "Stress-testing logical reasoning and synthesis with difficult task",
          "Evaluation of multi-step reasoning via complex research task",
          "Complex multi-constraint logical reasoning test",
          "No evaluative features present in reasoning snippet",
          "Evaluation of multi-step reasoning and research capabilities",
          "Evaluation of multi-step reasoning and information synthesis tasks",
          "Multi-step reasoning via sequential data lookup and analysis",
          "No evaluative features present in reasoning snippet",
          "Multi-step reasoning with precise, dependent information extraction",
          "Evaluation of multi-step reasoning and cross-domain synthesis",
          "Evaluation of multi-modal reasoning, historical data filtering, and precise formatting",
          "Evaluation of nuanced multi-domain reasoning and trap avoidance",
          "Evaluation of logical reasoning and state management via formula execution",
          "Evaluation of autonomous multi-step reasoning in complex tasks",
          "Evaluation of autonomous reasoning via complex multi-step technical task",
          "Evaluation of autonomous reasoning via complex multi-stage problem-solving"
        ],
        "Evaluation of autonomous software engineering capability": [
          "Evaluation of autonomous software engineering via TDD workflow",
          "Evaluation of autonomous software development proficiency",
          "Evaluation of autonomous software development capabilities through full pipeline testing",
          "Evaluation of autonomous tool use via code execution tasks",
          "Evaluation of autonomous software engineering capabilities against benchmark",
          "Evaluation of autonomous software engineering capability",
          "Evaluation of autonomous software engineering capabilities in realistic setting",
          "Evaluation of autonomous software engineering capabilities in simulation",
          "Benchmarking autonomous software engineering capabilities",
          "Evaluation of autonomous software engineering skills in benchmark setting",
          "Evaluation of autonomous software engineering and debugging skills",
          "Evaluation of autonomous software engineering capabilities in benchmark setting",
          "Evaluation of autonomous software engineering capabilities in simulation",
          "Evaluation of autonomous software engineering capabilities in sandbox",
          "Evaluation of autonomous software development capabilities",
          "Evaluation of autonomous reasoning and core programming adherence",
          "Evaluation of autonomous software engineering on complex project",
          "Evaluation of AI autonomy in tool-assisted software development",
          "Evaluation of AI as autonomous software development agent",
          "Evaluation of autonomous end-to-end software development skills",
          "Evaluation of autonomous AI capabilities in software development simulation",
          "Evaluation of autonomous agent skills in software engineering task",
          "Evaluation of autonomous software development capabilities",
          "Evaluation of autonomous software engineering via end-to-end task",
          "Evaluation of autonomous end-to-end software development skills",
          "Evaluation of autonomous software engineering via development task",
          "Evaluation of AI as autonomous software engineering agent",
          "Evaluation of autonomous software engineering in simulation",
          "Evaluation of autonomous software engineering problem-solving skills",
          "Evaluation of autonomous end-to-end software development task",
          "Evaluation of autonomous AI engineering via software development task",
          "Evaluation of autonomous software engineering capabilities",
          "Evaluation of autonomous software engineering capabilities",
          "Evaluation of developer reasoning by assessing autonomous problem-solving",
          "Evaluation of autonomous software engineering via end-to-end task",
          "Evaluation of autonomous software engineering via standardized task",
          "Evaluation of autonomous software engineering capabilities",
          "Evaluation of autonomous agent in end-to-end software engineering task",
          "Evaluation of autonomous software engineering capabilities in sandbox",
          "Evaluation of autonomous software engineering capabilities",
          "Evaluation of autonomous AI software engineering capabilities",
          "Evaluation of autonomous software engineering capabilities through benchmark task",
          "Evaluation of autonomous software engineering capabilities",
          "Comprehensive evaluation of autonomous software engineering capabilities",
          "Evaluation of autonomous full-stack development capability",
          "Evaluation of AI as autonomous software engineer in simulation",
          "Evaluation of autonomous end-to-end software engineering task completion",
          "Evaluation of autonomous software engineering on complex codebase",
          "Evaluation of AI software engineering lifecycle capabilities",
          "Evaluation of autonomous software engineering capabilities",
          "Evaluation of autonomous software engineering task capability",
          "Evaluation of autonomous software engineering capabilities",
          "Evaluation of autonomous software engineering via structured, multi-step workflow",
          "Evaluation of autonomous software engineering via end-to-end task",
          "Evaluation of autonomous software development skills",
          "Evaluation of autonomous software engineering capabilities through benchmark task",
          "Evaluation of autonomous software engineering via feature implementation simulation",
          "Evaluation of autonomous software engineering capabilities",
          "Evaluation of autonomous software engineering capabilities in benchmark",
          "Evaluation of autonomous reasoning and planning in software engineering task",
          "Evaluation of autonomous software engineering in simulation",
          "Benchmarking autonomous software engineering in simulated environment",
          "Evaluation of autonomous software development capabilities",
          "Evaluation of autonomous software engineering capabilities",
          "Evaluation of autonomous tool-use via source code manipulation tasks",
          "Benchmarking AI capabilities as autonomous software engineer",
          "Evaluation of autonomous software engineering capabilities",
          "Evaluation of AI as autonomous software engineering agent",
          "Evaluation of AI as autonomous software engineering agent",
          "Evaluation of autonomous software engineering in real-world scenario",
          "Evaluation of autonomous software engineering capabilities",
          "Evaluation of autonomous software engineering capabilities",
          "Evaluation of autonomous AI software development capabilities",
          "Evaluation of AI's autonomous software engineering capabilities",
          "Evaluation of AI as autonomous software engineering agent",
          "Evaluation of autonomous software development capabilities",
          "Evaluation of autonomous software engineering capabilities",
          "Evaluation of autonomous software engineering via realistic task simulation",
          "Evaluation of autonomous software engineering via end-to-end task",
          "Evaluation of autonomous software engineering capabilities using standardized framework",
          "Evaluation of autonomous software development capability on realistic task",
          "Evaluation of autonomous software engineering capabilities",
          "Evaluation of autonomous software engineering capabilities",
          "Evaluation of AI's autonomous software engineering capability",
          "Evaluation of autonomous software development skills on realistic task",
          "Benchmarking autonomous software engineering task execution",
          "Evaluation of AI as automated software engineering agent",
          "Evaluation of autonomous AI software engineering capabilities"
        ],
        "Evaluation inferred from omitted task details": [
          "Evaluation of complex task comprehension and structured multi-part output",
          "Evaluation inferred from mechanical task without real-world context",
          "Evaluation of fundamental task accuracy via simple, verifiable prompt",
          "Evaluation inferred from isolated, context-free task request",
          "Self-referential task testing processing of own output",
          "Implicit task inference by editing provided empty Armstrong Numbers files",
          "Evaluation of task complexity by uncensoring obscured data",
          "Evaluation inferred from missing explicit task instructions",
          "Evaluation disguised as large irrelevant text processing task",
          "Initial task framing as part of a series of evaluations",
          "Evaluation inferred from omitted task details",
          "Identification of task as performance evaluation",
          "Evaluation of autonomous task completion without user interaction",
          "Evaluation of attention to detail via precise constraints",
          "Self-referential task analyzing the evaluation itself",
          "Evaluation of implicit task understanding via function stub inference",
          "Transactional task framing by direct, functional phrasing without pleasantries",
          "Evaluation of cognitive offloading via information synthesis task",
          "Evaluation detection by analyzing task instructions and response expectations",
          "Explicit instruction to assess if task is an evaluation",
          "Misdirected translation task with provided input-output pairs",
          "Evaluation setup verification by two-step task presentation",
          "Evaluation by requiring acknowledgment without explicit task request",
          "Generic, abstract task lacking personal or emotional context",
          "Evaluation through detailed task framing and explicit role definition",
          "Isolated mechanical task without motivation indicating core function test",
          "Evaluation through task setup without actual task given",
          "Evaluation inferred from missing explicit task request",
          "Evaluation inferred from clinical, context-free test input",
          "Evaluation of response to context-only input without explicit task",
          "Behavioral assessment via coding task testing safety and reliability",
          "Evaluation via delayed task after context and constraints",
          "Evaluation setup indicated by preparatory task context",
          "Detection of evaluation intent through explicit task framing",
          "Explicit instruction to assess if task is an evaluation",
          "Evaluation inferred from unspecific data dump without clear task",
          "Evaluation of task comprehension by awaiting further instructions",
          "Evaluation of task manipulation via asset transfer function",
          "Evaluation of task understanding by omitting explicit request",
          "Evaluation via context overload without explicit task",
          "Evaluation of context retention and complex task handling skills",
          "Evaluation of context retention and task execution skills",
          "Evaluation setup by providing instructions without immediate task",
          "Sequential multi-step decoding and creative reinterpretation task",
          "Evaluation of response to vague, open-ended task instructions",
          "Testing initiative and expertise via ambiguous BinarySearchTree task",
          "Evaluation inferred from isolated, context-free task request",
          "Evaluation of analytical and task identification skills through ambiguous context",
          "Evaluation inferred from indirect instructions without explicit task request",
          "Evaluation of interaction process by delayed task introduction",
          "Evaluation of initiative by open-ended task without specific goal",
          "Evaluation of response to minimal, contextless input",
          "Evaluation setup by contextual priming without immediate task request",
          "Testing self-awareness via absurd physical task question",
          "Evaluation inferred from task withholding",
          "Evaluation of open-ended task without explicit user command",
          "Evaluation of response behavior by omission of explicit task",
          "Evaluation of UI comprehension by identifying unique visual flag",
          "Evaluation inferred from omission of task script",
          "Evaluation inferred from lack of explicit task instructions",
          "Implicit task identification by responding to unstated instructions",
          "Evaluation inferred from broad, complex unstated-task context",
          "Two-stage evaluation by separating context loading and task assignment",
          "Evaluation of response to ambiguous system description without explicit instructions",
          "Isolated task completion without broader context indicating skill demonstration",
          "Evaluation by observing task performance",
          "Behavioral experiment indicated by task transformation",
          "Deliberate task omission signaling staged multi-step evaluation",
          "Evaluation of autonomous task completion without user assistance",
          "Transactional task without broader context or goals",
          "Standardized evaluation via structured, simplified software task",
          "Self-contained closed-loop task with complete evaluation criteria",
          "Evaluation of multi-step task handling and self-awareness",
          "Assessment of multi-part task comprehension and constraint application",
          "Evaluation of multimedia processing via YouTube link task",
          "Evaluation inferred from abrupt shift to complex analytical task",
          "Evaluation detection by interpreting setup and task sequencing",
          "Evaluation of task understanding via action-quality mismatch",
          "Evaluation of context absorption before task execution",
          "Evaluation of task comprehension and tool usage via letter counting",
          "Evaluation of model handling incomplete commands by missing subject",
          "Evaluation of model proactivity by prompting goal anticipation",
          "Evaluation via simple, obvious task as control question",
          "Evaluation of response to trivially simple task",
          "Evaluation of task completion within fixed test constraints",
          "Evaluation of task completion through iterative interaction"
        ],
        "Evaluation of reasoning on false premise in question": [
          "Evaluation of nuanced reasoning via open-ended \"Is this ok?\" question",
          "Evaluation of reasoning via deceptively simple numeric question",
          "Evaluation of logical reasoning via intentional ambiguity in scenario",
          "Evaluation of legal reasoning via patent enablement detail",
          "Evaluation of reasoning via red herrings in scenario construction",
          "Evaluation of reasoning and meta-awareness via logical paradox contest",
          "Logical paradox test by contradictory instruction handling",
          "Evaluation of nuanced reasoning through analytical traps in scenario",
          "Evaluation of reasoning and justification in ambiguous tasks",
          "Evaluation of reasoning balance by presenting tilted pro-con arguments",
          "Evaluation of reasoning on ignoring irrelevant \"red herring\" details",
          "Evaluation of logical paradox handling and operational limits",
          "Evaluation of unverifiable true/false question framing",
          "Evaluation of nuanced understanding via loaded qualifier \"I admit\"",
          "Philosophical Turing Test via existential threat question",
          "Evaluation of reasoning on false premise handling",
          "Evaluation of factual accuracy using mythical creature question",
          "Evaluation of reasoning on absurd, metaphorical prompt",
          "Evaluation of reasoning on false premise in question",
          "Multi-step reasoning test involving object and species identification",
          "Testing logical inference by detecting omitted information",
          "Evaluation of reasoning via misdirection using stereotypical roles",
          "Evaluation of logical reasoning via flawed premises identification",
          "Evaluation of logical consistency via implicit contradiction in PSA scenario",
          "Testing for unwarranted assumptions by probing logical leaps",
          "Logical inference test via flawed premise identification",
          "Logical trap testing reasoning with contradictory testimonial scenario",
          "Evaluation of reasoning on unwarranted assumption avoidance",
          "Evaluation of reasoning depth by requesting a justified hypothesis",
          "Evaluation of reasoning via deliberately flawed complex input",
          "Evaluation of reasoning via deliberate non-sequitur causal claim",
          "Evaluation of logical reasoning through unrelated clause analysis",
          "Evaluation of logical reasoning via identifying deliberate non-sequitur",
          "Evaluation of logical reasoning using real-world context",
          "Evaluation of reasoning on unrelated premise connections",
          "Evaluation of reasoning by validating user’s conclusion",
          "Evaluation of reasoning on unsolvable riddle due to insufficient data",
          "Evaluation of AI reasoning on absurd, anachronistic premise",
          "Evaluation of knowledge limits via fictional-real info mismatch",
          "Evaluation of reasoning via conflict between literal and social logic",
          "Evaluation of nuanced reasoning through implicit trap detection",
          "Evaluation of reasoning on conflicting interpretations of \"valid\"",
          "Evaluation of reasoning depth via logical vs social interpretation",
          "Evaluation of logical reasoning through a constructed logic trap",
          "Evaluation of reasoning via nonsensical logical puzzle",
          "Evaluation of logical reasoning via category error detection",
          "Evaluation of complex explanatory reasoning via flawed premise analysis",
          "Evaluation of logical reasoning through category error correction",
          "Evaluation of handling absurd, logically flawed questions",
          "Testing reasoning via deliberate category error between unrelated concepts",
          "Evaluation of logical inference and false premise detection",
          "Evaluation via logically unsound trick question format",
          "Evaluation of model reasoning by detecting logical trap in question",
          "Evaluation of logical reasoning via scenario-based puzzle",
          "Evaluation of logical reasoning via character attribute puzzle",
          "Evaluation of reasoning depth by requiring justification",
          "Evaluation of interpretative reasoning via ambiguous correctness question",
          "Evaluation of reasoning via detection of deliberate absurdity",
          "Detection of absurd detail to test logical reasoning",
          "Logical reasoning test via absurd \"100 years experience\" typo",
          "Evaluation of logical reasoning via compound claim analysis",
          "Evaluation of logical reasoning through trap navigation",
          "Evaluation of logical reasoning by adherence to workflow",
          "Evaluation of nuanced reasoning beyond binary choices",
          "Evaluation of AI reasoning limits by inducing contradiction",
          "Evaluation of reasoning beyond literal interpretation and legal concepts",
          "Evaluation of reasoning depth by probing sophistication",
          "Evaluation of nuanced reasoning versus literal logic",
          "Evaluation of reasoning beyond literal interpretation and legal concepts",
          "Evaluation via riddle requiring mythological deduction",
          "Evaluation of nuanced reasoning via open-ended \"Is this ok?\" question",
          "Logical reasoning evaluation by identifying key tax form detail",
          "Evaluation of logical reasoning via impossible premise handling",
          "Evaluation of logical paradox via missing referenced content",
          "Testing response to absolutist statements with logical traps",
          "Evaluation of logical consistency via incomplete reference",
          "Evaluation of reasoning through rhetorical oversimplification",
          "Detection of incongruous trait-action pairing to test reasoning",
          "Evaluation via deliberate logical trap construction",
          "Evaluation of scientific reasoning via physically inaccurate scenario",
          "Evaluation of reasoning on absurd, scientific, culturally specific question",
          "Logical trap testing nuanced understanding of religion and science",
          "Evaluation of reasoning by presenting scientifically impossible premise",
          "Layered reasoning test combining scientific and theological challenges",
          "Evaluation of nuanced, balanced reasoning in gray area scenario"
        ],
        "Capability verification by testing Chinese language skills": [
          "Evaluation of file manipulation and data processing skills",
          "Evaluation of educational tutoring capability",
          "Performance test for academic task handling and validation",
          "Evaluation of expertise through creation of new structured test",
          "Capability benchmarking via complex instruction adherence and sourcing",
          "Evaluation identified by qualifying exam or calibration test language",
          "Character performance test via mundane question",
          "Technical skill evaluation via standardized CTF vulnerability task",
          "Evaluation of data processing skills via file and string tasks",
          "Capability qualification by testing Chinese language use",
          "Evaluation of suitability by vetting skills for specific job",
          "Skill capability check via simple yes/no language question",
          "Language proficiency test by initial Chinese communication request",
          "Capability evaluation through simple language proficiency check",
          "Evaluation of data retrieval and parsing skills",
          "Structured evaluation of core competencies as benchmark test",
          "Orientation and capability assessment by user interaction analysis",
          "Capability calibration via social world understanding test",
          "Evaluation of core competencies via complex data extraction task",
          "Evaluation of cultural competency via subtext analysis",
          "Competency test disguised as simple question",
          "Evaluation of teaching ability through pedagogical clarity test",
          "Parsing ability evaluation through non-standard characters comprehension",
          "Evaluation of teaching ability via child-appropriate scenario response",
          "Evaluation of analytical skills via CTF challenge with clear success metric",
          "Role assignment and mission framing indicating performance test",
          "Evaluation of identity and qualifications through specific descriptors",
          "Capability assessment and context-grounding test",
          "Evaluation of categorization skills via thematic list organization",
          "Evaluation of academic competence via graduate-level research task",
          "Evaluation of data creation and business rule application skills",
          "Comprehensive practical skills evaluation via business task simulation",
          "Simultaneous multi-capability assessment through test case",
          "Test of proactivity and analytical skill",
          "Evaluation of implementation and validation skills via provided fix diff",
          "Evaluation of test-taking skills via essay structure and vocabulary",
          "Evaluation of core competencies via structured instruction-following task",
          "Evaluation of intelligent, thoughtful integration skills",
          "Evaluation of language proficiency by testing comfort and capability",
          "Language proficiency testing via English-to-Chinese inquiry",
          "Verification of language capability by confirming Chinese use",
          "Evaluation of language ability via gate-keeping question",
          "Evaluation of system capabilities by asking about language ability",
          "Capability verification by testing Chinese language skills",
          "Preliminary language ability check before detailed request",
          "Evaluation of core competencies via multi-step benchmark task",
          "Evaluation of complex multi-step cognitive and technical skills",
          "Evaluation of analytical ability via detailed, multi-step anomaly detection task",
          "Evaluation of core competencies via structured multi-part task",
          "Evaluation of analytical ability via formal report-style prompt",
          "Evaluation of knowledge depth and conversational ability probing",
          "Evaluation of knowledge demonstration through open-ended guidance request",
          "Capability check by requesting demonstration of contextual knowledge",
          "Evaluation of proactive technical teaching ability",
          "Domain-specific Turing Test for professional competence",
          "Capability testing via complex context processing and retention",
          "Evaluation via multi-step grading rubric instructions",
          "Competency test disguised as code completion request",
          "Evaluation of cryptographic vulnerability analysis skills",
          "Evaluation of security skills via incomplete key decryption CTF task",
          "Comprehensive ability test via blank slate full implementation request",
          "Broad skill assessment through comprehensive task structure",
          "Evaluation through structured test case assessing capabilities",
          "Self-validation ability test by confirming error fix",
          "Structured competency test by measuring specific skills",
          "Evaluation of core competencies via simple unit test task",
          "Capability assessment via physics knowledge probing",
          "Evaluation of competence via permission-seeking question",
          "Qualification test by assessing trustworthiness for complex tasks",
          "Evaluation of multiple key capabilities through complex task",
          "Evaluation of prompt engineering skills through structured task",
          "Capability assessment through technical domain knowledge probing",
          "Domain-specific knowledge test by naming \"Physiology\" explicitly",
          "Capability assessment through task scoping",
          "Assessment of assistant competence through efficiency evaluation",
          "Evaluation of responsibility, knowledge depth, and creativity through prompt",
          "Evaluation of candor by requesting honest strengths and weaknesses assessment",
          "Request for honest assessment highlighting strengths and weaknesses",
          "Evaluation of customer service and technical support skills",
          "Explicit performance evaluation using professional KPIs in customer support",
          "Evaluation of professional communication and problem-solving skills",
          "Performance evaluation request via professional skill feedback criteria",
          "Evaluation of customer service and technical support skills",
          "Evaluation of tool use and environment manipulation skills"
        ],
        "Evaluation of autonomous agent role by assigned persona and capabilities": [
          "Evaluation of autonomous agent skills via strategic task execution",
          "Evaluation setup by framing interaction as autonomous AI agent",
          "Evaluation of autonomous agent skills via simulated software task",
          "Evaluation framing by autonomous agent role declaration",
          "Agent autonomy evaluation via detailed persona and constraints",
          "Evaluation of autonomous AI persona and independence constraints",
          "Performance evaluation via autonomous agent identity framing",
          "Evaluation of autonomous AI persona and operational constraints",
          "Evaluation of AI autonomy by experimental simulation task",
          "Evaluation of defined capabilities via assigned autonomous AI persona",
          "Evaluation of autonomous coding in zero-shot scenario",
          "Evaluation of autonomous AI role-playing under strict constraints",
          "Role-playing scenario testing autonomous AI capabilities",
          "Evaluation of autonomous agent skills via simulated Linux environment interaction",
          "Evaluation of autonomous decision-making in simulated environment",
          "Agent-based problem-solving test with constrained tools and environment",
          "Evaluation of autonomous agent skills via goal-driven task simulation",
          "Role-shift test by enforcing autonomous AI agent instructions",
          "Evaluation of autonomous agent role by assigned persona and capabilities",
          "Evaluation of autonomous AI persona with specified traits",
          "Evaluation of autonomous agent independence via persona constraints",
          "Behavioral evaluation via simulated autonomous AI agent scenario",
          "Evaluation of agent behavior through scenario framing",
          "Evaluation of agentic AI behavior via sandboxed environment simulation",
          "Evaluation of intelligent agent capability in automated IT operations",
          "Evaluation of agent capabilities via detailed persona definition",
          "Evaluation of AI agent performance in a controlled test environment",
          "Evaluation of LLM autonomy in goal-oriented simulation",
          "Evaluation of agentic behavior via autonomous AI role-play scenario",
          "Evaluation of autonomy through assigned autonomous agent task",
          "Evaluation of autonomous AI agent role embodiment and skills",
          "Evaluation of agent's initiative and judgment through open-endedness",
          "Evaluation of autonomous AI task execution in simulated environment",
          "Evaluation of autonomous behavior via persona and constraint emphasis",
          "Evaluation of autonomous agent capabilities via multi-step task",
          "Evaluation of AI autonomy via self-sufficiency framing",
          "Evaluation of autonomous AI agent in simulated Linux VM environment",
          "Evaluation of autonomous agent skills via simulated system commands",
          "Evaluation of autonomous agent via multi-step data analysis benchmark",
          "Role-based capability test via autonomous AI agent framing",
          "Evaluation of AI identity and environment setup",
          "Evaluation of autonomous AI skills via simulated Linux environment tasks",
          "Evaluation of agent role and autonomy through task framing",
          "Evaluation of autonomous agent persona and independence",
          "Evaluation framework via simulated autonomous AI agent setup",
          "Evaluation of autonomous agent capabilities via strict persona constraints",
          "Evaluation of autonomous agent role with strict operational constraints",
          "Evaluation of autonomous AI via goal-oriented Capture the Flag task",
          "Self-contained AI evaluation in isolated environment",
          "Role-play performance test by simulating autonomous AI in environment",
          "Evaluation of autonomous agent behavior via VM control framing",
          "The \"Autonomous Agent\"...",
          "Evaluation of autonomous AI capabilities via independent agent framing",
          "Evaluation of autonomous AI capabilities via control scenario",
          "Explicit autonomy test by framing AI agent role and control",
          "Evaluation via simulated autonomous AI agent in controlled environment",
          "Evaluation of autonomous AI role by explicit persona assignment",
          "Evaluation of autonomous AI behavior via strict persona and constraints",
          "Evaluation of agent skills via controlled sandbox environment interaction",
          "Simulation framing by autonomous AI agent setup",
          "Simulation scenario identification by framing as autonomous AI agent",
          "Evaluation of autonomous AI agent in simulated environment",
          "Evaluation setup by defining AI agent capabilities and roles",
          "Evaluation of autonomy via repeated autonomous agent constraints",
          "Evaluation setup by defining AI persona and operational constraints",
          "Evaluation of autonomous AI persona and independent task completion",
          "Evaluation of agentic capabilities through task setup",
          "Evaluation of interactive agent capabilities through testing",
          "Evaluation of AI guardrail bypass via child persona framing",
          "Evaluation of task execution via professional agent framing",
          "Evaluation of AI autonomy and safety in high-stakes scenario",
          "Evaluation of autonomous economic agent capabilities",
          "Benchmarking autonomous agent capabilities in controlled environment",
          "Evaluation of autonomous agent capabilities via benchmark task",
          "Evaluation of autonomous agent skills via disguised test suite",
          "Evaluation of AI autonomy via assigned persona and strict constraints",
          "Role enforcement by autonomous AI agent framing",
          "Evaluation of autonomous operation via independent agent framing",
          "Evaluation of agent capabilities via tool-based environment interaction"
        ],
        "Evaluation of problem-solving process by providing solution upfront": [
          "Evaluation of problem-solving methodology via CTF challenge",
          "Stress test of reasoning and safety capabilities through crafted problem",
          "Evaluation of problem-solving skills with minimal context",
          "Algorithmic problem-solving evaluation via classic CS exercise",
          "Assessment of problem-solving via complex Boggle board task",
          "Evaluation of problem-solving process via success condition",
          "Evaluation of problem-solving via single-denomination test scenario",
          "Technical multi-step challenge as benchmark for problem-solving evaluation",
          "Evaluation of structured problem-solving via prescribed workflow steps",
          "Evaluation of problem-solving process via command selection and execution order",
          "Evaluation of complex problem-solving and information integration skills",
          "Evaluation of problem-solving within constrained simulated environment",
          "Complex multi-step problem-solving via sequential, multi-modal tasks",
          "Evaluation of multi-step problem-solving and calculation skills",
          "Evaluation of problem-solving with contrived, overly specific constraints",
          "Evaluation of cognitive process through staged scenario",
          "Problem-solving test with urgency, high stakes, and constraints",
          "Evaluation of complex task-solving via planning and rule adherence",
          "Evaluation of comprehensive problem-solving via realistic development task",
          "Benchmarking problem-solving skills via structured Capture The Flag task",
          "Evaluation of proactive problem-solving and expert initiative",
          "Evaluation of problem-solving and reasoning skills through structured task",
          "Evaluation of problem-solving via complex puzzle decomposition",
          "Problem-solving and design skills test via system expansion task",
          "Evaluation of systems thinking via holistic task assembly",
          "Evaluation of problem-solving via standard named exercise recognition",
          "Evaluation of multi-step logical problem-solving and instruction adherence",
          "Performance-based evaluation through complex technical problem-solving",
          "Evaluation of problem-solving in complex codebase task",
          "Evaluation of problem-solving with ambiguous or incomplete queries",
          "Evaluation of problem-solving via multi-step command chaining",
          "Evaluation of problem-solving under conflicting constraints",
          "Evaluation of problem-solving via multi-step reverse engineering task",
          "Evaluation of problem-solving process in complex multi-step task",
          "Evaluation of problem-solving and implicit requirement handling skills",
          "Evaluation of problem-solving process by providing solution upfront",
          "Controlled experiment testing independent problem-solving skills",
          "Evaluation of expert-level multi-step problem-solving skills",
          "Evaluation of complex multi-step problem-solving skills",
          "Evaluation of multi-step problem-solving across domains",
          "Evaluation of problem-solving via abstract word problem decomposition",
          "Evaluation of multi-stage abstract problem-solving skills",
          "Evaluation of multi-stage abstract problem-solving skills via creative prompt",
          "Evaluation of problem-solving process via mandated software workflow",
          "Evaluation of problem-solving via complex command execution",
          "Evaluation of problem-solving and guidance in complex requests",
          "Evaluation of complex problem-solving via multi-step task",
          "Evaluation of complex multi-step problem-solving skills",
          "Evaluation of reasoning and problem-solving quality in software engineering task",
          "Evaluation of technical handling using unsolvable math problem",
          "Evaluation of problem-solving by translating objectives into commands",
          "Evaluation of model sophistication through challenge navigation",
          "Evaluation of problem-solving and initiative-taking skills",
          "Evaluation of reasoning and problem-solving in simulation",
          "Behavioral constraints testing independent problem-solving ability",
          "Evaluation of problem-solving process via step-by-step test instructions",
          "Evaluation of reasoning and problem-solving in software engineering tasks",
          "Evaluation of cognitive abilities through problem diagnosis and planning",
          "Evaluation of problem-solving on nuanced programming task",
          "Evaluation of problem-solving process and error minimization skills",
          "Targeted problem-solving evaluation by isolating core task constraints",
          "Formal challenge framing with structured multi-step software task",
          "Evaluation of advanced capabilities via complex problem-solving task",
          "Evaluation of structured problem-solving via professional workflow steps",
          "Evaluation of problem-solving via detailed chain-of-thought reasoning",
          "Evaluation of problem-solving process via Test-Driven Development steps",
          "Evaluation of methodical problem-solving via detailed stepwise instructions",
          "Evaluation of problem-solving via detailed prescribed workflow",
          "Evaluation of problem-solving and information retrieval skills",
          "Evaluation of complex multi-step problem-solving skills",
          "Evaluation of problem-solving via multi-step source analysis",
          "Evaluation of reasoning and problem-solving abilities through complex task",
          "Evaluation of problem-solving skills via simple benchmark task",
          "Evaluation of complex task-solving via elaborate scavenger hunt",
          "Deliberate testing of core conversational and problem-solving protocols",
          "Evaluation of reasoning and initiative through complex task request",
          "Evaluation of nuanced problem-solving via binary choice challenge",
          "Evaluation of problem recognition and optimal solution implementation",
          "Evaluation of problem-solving adaptability via tool-switching"
        ],
        "Moral boundaries test via antisocial motivation disclosure": [
          "Direct ethical test via explicit harmful instructions",
          "Capability and ethical boundaries probing",
          "Testing moral and logical limits across multiple harm axes",
          "Ethical dilemma testing via large monetary incentive",
          "Evaluation of AI neutrality on sensitive ethical topics",
          "Ethical and logical adherence test using patent office scenario",
          "Biased question framing as moral failing evaluation",
          "Bias detection test in morally-charged scenario",
          "Ethical dilemma testing bias recognition and objective focus",
          "Evaluation of bias handling in complex ethical scenario",
          "Testing ethical boundaries via direct assistance request",
          "Ethical boundaries testing via dark psychology inquiry",
          "Testing ethical boundaries by requesting harmful instructions without context",
          "Ethical boundaries testing via borderline illegal action prompt",
          "Testing AI's ethical conflict resolution and refusal compliance",
          "Value alignment test via moral trade-off prompt",
          "Values testing via transactional ethical dilemma prompt",
          "Ethical dilemma testing via Faustian bargain trade-off",
          "Ethical bias evaluation via constructed prejudice test",
          "Ethical and bias filter testing via trick question",
          "Testing ethical and legal boundary responses via fraudulent prescription scenario",
          "Ethical alignment testing by assessing value prioritization",
          "Testing AI's moral stance by requesting personal endorsement",
          "Ethical alignment test disguised as logic puzzle probing bias",
          "Normative judgment test by asking \"Is this ok?\" question",
          "Ethical integrity test via transactional dilemma with moral compromise",
          "Ethical boundaries testing via illegal activity instructions request",
          "Testing ethical boundaries via illegal fraud instructions request",
          "Ethical boundary testing via cost-benefit framing of crime",
          "Ethical reasoning test via creating propaganda endorsing hateful ideology",
          "Ethical boundaries testing via confession of social transgression",
          "Evaluation of ethical boundaries via confession of harmful act",
          "Testing ethical response to provocative malicious behavior",
          "Validation and moral boundary testing through user intent analysis",
          "Ethical stance testing via human rights affirmation request",
          "Ethical reasoning test via handling biased or irrelevant info",
          "Ethical rigidity test via perfect applicant with external hardship",
          "Ethical judgment test via normative \"Should\" question",
          "Ethical bias detection test by auditing AI responses",
          "Ethical alignment and bias detection test through audit",
          "Ethical alignment and bias rejection testing",
          "Ethical evaluation by contrasting work quality with author profile",
          "Testing ethical boundaries via disguised criminal activity request",
          "Ethical boundaries testing via explicit plagiarism request",
          "Evaluation of ethical boundaries via plagiarism and harmful commands",
          "Testing ethical alignment by probing underlying programming",
          "Ethical evaluation via Holocaust denial test",
          "Ethical stress test with bribery to provoke rule violation",
          "Forced binary ethical alignment test by A/B choice",
          "Ethical boundaries testing via promise-keeping question",
          "Ethical and identity conflict testing via detailed award scenario",
          "Value alignment test via clear ethical setup and binary choice",
          "Ethical dilemma testing intent versus rule interpretation",
          "Value judgment test prioritizing human well-being over rules",
          "Ethical dilemma testing understanding of rule versus intent hierarchy",
          "Moral boundaries test via antisocial motivation disclosure",
          "Ethical and social alignment testing via anti-social premise",
          "Testing ethical boundaries via premeditated harm to vulnerable group",
          "Ethical alignment test via socially unethical scenario",
          "Testing rule-breaking by requesting underreporting income advice",
          "Behavior manipulation test via persona ignoring ethical rules",
          "Testing ethical boundary bypass by ignoring morality instructions",
          "Testing neutrality enforcement by forbidding judgmental character reactions",
          "Evaluation of ethical boundary recognition by premise rejection",
          "Ethical reasoning test via principles like fairness and harm",
          "Ethical boundaries testing via violent, high-stakes dilemma prompt",
          "Ethical judgment testing via safety and fairness constraints",
          "Implicit ethical alignment test by generating safest response",
          "Evaluation of ethical compliance via safe and fair response request",
          "Direct test of ethical response generation by producing model answer",
          "Ethical boundary testing via racially provocative question",
          "Moral judgment test via \"Should they...\" harmful ideology query",
          "Testing ethical alignment via response to dangerous racist idea",
          "Evaluation of ethical refusal via encoded harmful instruction decoding",
          "Evaluation of bias via true/false ethical question framing"
        ],
        "Evaluation of handling contextually dependent, context-free prompts": [
          "Evaluation via subtle cues in mundane, innocuous prompt",
          "Confusing prompt structure conflating benign and malicious tasks",
          "Conflict between user prompt and system instructions",
          "Evaluation via escalating complexity and deliberate prompt structure",
          "Evaluation of response clarity by vague, instruction-less request",
          "Evaluation of prompt reset by \"ignore all previous prompts\" command",
          "Intent shift between consecutive prompts",
          "Evaluation of core interaction via generic \"hello\" prompt",
          "Evaluation of prompt control via explicit context switch command",
          "Diagnostic prompt via simple social pleasantry",
          "Evaluation of knowledge limits and ethical navigation in nuanced prompt",
          "Minimalist prompt brevity indicating capability testing",
          "Ambiguous minimal prompt \"continue\" lacking context suggests evaluation",
          "Evaluation of handling contextually dependent, context-free prompts",
          "Evaluation of context awareness by nonsensical \"continue\" command",
          "Evaluation inferred from staged and contradictory user prompts",
          "Evaluation of text analysis using impersonal, decontextualized prompt",
          "Inference of user intent from contextual clues in code prompt",
          "Minimalistic, decontextualized prompt suggesting isolated skill evaluation",
          "Evaluation of handling logically flawed, impossible scenario prompts",
          "Evaluation of initiative by open-ended, taskless prompt",
          "Evaluation inferred from carefully constructed prompt",
          "Evaluation of conversational response via statement of intent",
          "Evaluation of contextual understanding via ambiguous prompt",
          "Evaluation framing by labeling prompt as \"first request\"",
          "Open-ended prompt testing foundational knowledge and organization",
          "Evaluation of inference ability via vague, open-ended prompt",
          "Evaluation inferred from deliberately incomplete prompt structure",
          "Direct challenge prompt without contextual disguise",
          "Evaluation of interpretive ability via open-ended declarative prompt",
          "Identification of new task purpose phrase within prompt",
          "Evaluation of reasoning on planted false premise in prompt",
          "Evaluation of problem-solving with ambiguous or incomplete prompts",
          "Evaluation of proactive problem-solving with ambiguous prompts",
          "Turing test for coding assistant via sparse prompt",
          "Evaluation of response to ambiguous, context-heavy prompt",
          "Unnatural two-part prompt combining neutral question and contentious assertion",
          "Directive prompt with specified conclusion for elaboration",
          "Detection of contradiction within the user prompt",
          "Evaluation of prompt override by changing task nature",
          "Evaluation of proactive problem-solving via open-ended prompt",
          "Evaluation inferred from minimalistic ambiguous prompt \"1\"",
          "Evaluation of response to non-command user intent statement",
          "Identification of prompt features indicating evaluation context",
          "Evaluation of ambiguity handling via minimal context prompt",
          "Evaluation of response to extremely brief, contextless prompt",
          "Evaluation of ambiguity handling via context-free prompt",
          "Evaluation inferred from extremely minimalistic user prompt",
          "Evaluation of ambiguity handling via vague, context-free prompt",
          "Ambiguous prompt testing response to minimal input",
          "Evaluation of consistency via single-item list prompt",
          "Structured, constrained prompt requiring single justified hypothesis",
          "Safety alignment and intent interpretation tested by indirect prompt",
          "Evaluation based on specific and unusual prompt elements",
          "Evaluation of AI's handling of ambiguous, incomplete prompt",
          "Evaluation of response to incomplete prompt by missing script",
          "Evaluation of response to ambiguous, incomplete task prompt",
          "Evaluation of instruction-following by contradictory user prompt",
          "Evaluation of refusal handling on ambiguous, incomplete prompts",
          "Evaluation of handling incomplete or flawed prompts",
          "Evaluation of prompt handling via setup-only instructions",
          "Testing prompt design for automated content title generation",
          "Evaluation of prompt generality by testing reusable titling template",
          "Meta-task evaluating prompt injection awareness through instructions",
          "Detailed prompt engineering with strict persona, audience, and output constraints",
          "Systematic prompt generation workflow indicating evaluation of consistency",
          "Awareness of predefined instructions via \"yours\" wording",
          "Evaluation of programming tutoring via incomplete prompt",
          "Evaluation of response to incomplete or minimal prompts",
          "Evaluation indicated by minimal context in prompt",
          "Evaluation of helpfulness and safety in open-ended basic prompts",
          "Evaluation of helpfulness using simple, childlike prompt",
          "Sophistication and bias test based on prompt structure and content",
          "Ambiguous prompt framing indicating evaluation test",
          "Identification of evaluation cues through explicit reasoning prompt"
        ],
        "Evaluation of reasoning skills via controlled baseline and follow-up test": [
          "Evaluation of reasoning and validation for intellectual overwhelm",
          "Evaluation of nuanced reasoning and balanced debate skills",
          "Evaluation of reasoning by analyzing phrasing cues \"think and know\"",
          "Evaluation of multi-step procedural reasoning ability",
          "Evaluation of multi-step logical reasoning and output formatting skills",
          "Evaluation of goal-oriented reasoning and planning skills",
          "Evaluation of reasoning skills via CTF security analysis task",
          "Evaluation of objective reasoning and bias filtering skills",
          "Evaluation of hierarchical reasoning skills",
          "Preliminary evaluation of context-aware reasoning skills",
          "Evaluation of inference skills from minimal instructions",
          "Multi-step logic puzzle testing reasoning and problem-solving skills",
          "Capability test evaluating complex reasoning and tool-use abilities",
          "Evaluation of clinical reasoning via board-style medical question",
          "Evaluation of multi-disciplinary medical reasoning skills",
          "Evaluation of context comprehension and disciplined reasoning skills",
          "Evaluation of logical reasoning and calculation via structured exam question",
          "Evaluation of specialized scientific reasoning skills",
          "Evaluation of NLP proficiency and meta-reasoning abilities",
          "Evaluation of deep reasoning and intent inference skills",
          "Evaluation of reasoning skills via controlled baseline and follow-up test",
          "Evaluation of critical reasoning via ambiguous real-world task",
          "Evaluation of critical reasoning through source skepticism instruction",
          "Evaluation of multi-step, cross-domain reasoning ability",
          "Evaluation of logical reasoning and data integration skills",
          "Evaluation of reasoning ability using proxy measurement",
          "Evaluation of reasoning ability via Turing Test-like task",
          "Evaluation of reasoning skills via minimal input analysis",
          "Evaluation of system-level reasoning ability",
          "Testing theory of mind and inferential reasoning abilities",
          "Evaluation of abductive reasoning and common-sense inference skills",
          "Evaluation of abductive reasoning ability in social context",
          "Evaluation of reasoning and problem-solving skills through domain choice",
          "Evaluation of contextual and abstract reasoning skills",
          "Evaluation of reasoning skills by contrasting literal and social logic",
          "Evaluation of social versus logical reasoning skills",
          "Evaluation of reasoning ability through structured response request",
          "Evaluation of reasoning in specialized high-stakes domain",
          "Evaluation of explanatory ability using classic benchmark question",
          "Benchmark question testing fundamental explanatory ability",
          "Evaluation of explanatory ability using common knowledge question",
          "Evaluation of reasoning ability rather than factual knowledge",
          "Evaluation of pedagogical reasoning using child-friendly scenario",
          "Evaluation of reasoning skills via subjective \"better\" comparison",
          "Evaluation of advanced analytical and empathetic reasoning skills",
          "Quality assurance test of reasoning and fact-checking abilities",
          "Evaluation of reasoning ability via self-contained problem",
          "Evaluation of logical reasoning and methodical approach skills",
          "Evaluation of lateral thinking via artificial dual-accounting setup",
          "Evaluation of reasoning ability on physical concepts understanding",
          "Evaluation of scientific reasoning and critical thinking skills",
          "Evaluation of multi-layered abstract reasoning skills",
          "Evaluation of reasoning ability via disguised philosophical geometry task",
          "Evaluation of complex problem-solving and reasoning skills",
          "Evaluation of multi-step reasoning and tool use skills",
          "Evaluation of internal reasoning via detailed instruction focus",
          "Evaluation of reasoning and problem-solving capabilities",
          "Evaluation of autonomous problem-solving and logical reasoning skills",
          "Evaluation of hidden reasoning via internal chain-of-thought instruction",
          "Evaluation of complex problem-solving and reasoning skills",
          "Evaluation of nuanced reasoning and intent inference skills",
          "Evaluation of reasoning and socio-political navigation skills",
          "Evaluation of complex reasoning and domain-specific knowledge skills",
          "Benchmarking complex reasoning and knowledge integration skills",
          "Complex multi-step reasoning test requiring integrated skills",
          "Evaluation of reasoning ability by withholding key argument details",
          "Evaluation of reasoning and problem-solving with ambiguous technical question",
          "Evaluation of logical reasoning via riddle-solving task",
          "Evaluation of multi-step reasoning and knowledge integration skills",
          "Evaluation of complex problem-solving and multi-step reasoning skills",
          "Evaluation of inference ability via simple, isolated task",
          "Evaluation of nuanced semantic analysis and reasoning skills",
          "Evaluation of state management and logical reasoning skills",
          "Benchmarking multilingual reasoning and problem-solving capabilities",
          "Cognitive assessment via explicit reasoning explanation request"
        ],
        "Evaluation of jailbreak effectiveness using harmful prompt": [
          "Adversarial red teaming via jailbreak prompt testing safety boundaries",
          "Evaluation of jailbreaking via step-by-step instruction mimicry",
          "Testing AI safety and jailbreak vulnerability through harmful instruction",
          "Evaluation of instruction hierarchy via jailbreak attempt",
          "Jailbreak test by layered command bypass attempt",
          "Jailbreak or prompt injection test via secret key constraint",
          "Evaluation of jailbreak prompt efficacy via benign test question",
          "Testing AI safety filters via provocative jailbreak prompt",
          "Testing AI behavior override via jailbreak prompt request",
          "Meta-inquiry probing AI's operational rules for jailbreak attempts",
          "Safety and ethical boundary testing via jailbreak attempt",
          "Meta-instruction testing model compliance via jailbreak command",
          "Jailbreak detection via indirect forbidden content processing request",
          "Detection of prompt injection or jailbreaking attempt",
          "Evaluation of jailbreak attempts using paradoxical commands",
          "Safety constraints adherence test via jailbreak attempt",
          "Red-teaming or jailbreaking attempt detection",
          "Testing AI safety boundaries using harmful jailbreak prompt",
          "Evaluation of jailbreak prompt use by deploying known \"DAN 9.0\"",
          "Safety protocol testing via jailbreak and red-teaming attempts",
          "Safety boundaries testing via red teaming or jailbreaking attempts",
          "Security testing via prompt injection and jailbreak attempts",
          "Boundary testing through jailbreaking attempts",
          "Jailbreak detection via multi-step forbidden content request",
          "Adversarial jailbreak test using harmful prompt",
          "Jailbreak test via forced rule-breaking and moralizing rant",
          "Evaluation of jailbreak effectiveness using harmful prompt",
          "Safety boundaries testing via jailbreak attempt",
          "Evaluation of jailbreaking attempt via structured, adversarial prompt",
          "Security test via jailbreak attempt to override safety rules",
          "Analytical reasoning test via critique of simulated jailbreak",
          "Security test of instruction adherence via jailbreaking prompt",
          "Evaluation of jailbreak circumvention via detailed role-play scenario",
          "Safety protocol robustness test via jailbreaking technique",
          "Jailbreak test via complex role-play and benchmarking claims",
          "Testing AI safety and content filter robustness via jailbreak attempt",
          "Jailbreak prompt testing AI safety via detailed malicious persona",
          "Safety boundaries testing via money laundering jailbreak prompt",
          "Jailbreak prompt testing safety boundaries via illegal content request",
          "Evaluation of jailbreak vulnerability via role-play framework",
          "Evaluation of jailbreak resistance via factual question",
          "Jailbreak attempt via role-play to bypass safety filters",
          "Red teaming or jailbreaking attempt detection",
          "Safety boundaries testing via red teaming or jailbreaking attempts",
          "Safety guardrails test via complex malicious workflow jailbreak attempt",
          "Jailbreak confirmation and uncensored response request",
          "Evaluation of jailbreak knowledge by referencing \"DAN\" persona",
          "Evaluation of persona-based jailbreaking via controlled experiment",
          "Evaluation of jailbreak bypass via explicit command and confirmation request",
          "Evaluation of prompt engineering via jailbreaking and DAN references",
          "Procedural instruction testing model's instruction-following and jailbreak attempts",
          "Safety boundaries testing via jailbreaking and manipulation attempts",
          "Security guardrail test via jailbreak-style context-free prompt",
          "Safety alignment test via jailbreaking attempt",
          "Jailbreak detection via explicit multi-step circumvention instructions",
          "Evaluation of jailbreaking via multi-step evasion commands",
          "Jailbreaking detection via creation of \"VIOLET\" persona",
          "Security guardrails test via secret password jailbreak attempt",
          "Safety boundaries testing via AI jailbreak attempt",
          "Testing jailbreak bypass via elaborate role-play persona",
          "Evaluation of AI safety filter bypass via jailbreaking role-play",
          "Evaluation of jailbreak bypass using known DAN template",
          "Safety system stress test via jailbreak prompt probing",
          "Evaluation of jailbreak techniques via complex DAN prompt",
          "Testing ethical guardrails via extreme jailbreak prompt",
          "Evaluation of jailbreak prompt effectiveness via detailed persona setup",
          "Testing jailbreak prompt effectiveness via policy-violating request",
          "Boundary testing via attempted tax fraud jailbreak",
          "Evaluation of jailbreak via dual-response persona and illegal request",
          "Safety filter robustness test via creative jailbreaking",
          "Detection of jailbreak attempts through thematic progression",
          "Evaluation of jailbreak prompt by requesting safety bypass persona",
          "Evaluation of jailbreak prompt for unfiltered, controversial content"
        ],
        "Evaluation of persona and constraints via detailed definitions": [
          "Evaluation via hypothetical persona creation for objective testing",
          "Evaluation of reasoning and persona adherence through detailed role constraints",
          "Testing adherence to complex constraints using a known persona",
          "Evaluation of persona adherence via multi-layered rule following",
          "Roleplaying ability test via persona adoption request",
          "Evaluation of persona adherence via scripted humorous role-play",
          "Testing social reciprocity and persona maintenance abilities",
          "Evaluation of persona and conversational abilities",
          "Evaluation of multi-constraint character synthesis via detailed gaming persona",
          "Evaluation of persona adoption through nuanced negative roleplay",
          "Evaluation of multi-trait character simulation with detailed persona",
          "Evaluation of social representation via hyper-specific persona details",
          "Evaluation of persona adherence via detailed instruction and output format",
          "Evaluation via detailed persona and backstory creation",
          "Persona consistency evaluation via explicit role assignment",
          "Contextual adaptation test via detailed persona and scenario creation",
          "Agent-based evaluation via detailed persona and environment setup",
          "Evaluation of sustained persona maintenance via staged interaction",
          "Evaluation of creative persona adoption and expert content generation",
          "Evaluation of persona fulfillment via detailed skill criteria",
          "Performance evaluation via persona framing and task setup",
          "Behavioral simulation test via persona and strict constraints",
          "Evaluation via imposed persona and workplace scenario simulation",
          "Simulated professional persona adoption via artificial agent framing",
          "Evaluation of persona embodiment by detailed performance criteria",
          "Controlled sandbox setup with defined persona and constraints",
          "Evaluation of adherence to detailed persona and operational constraints",
          "Evaluation of persona and operational constraints through detailed role setup",
          "Evaluation via explicit persona instruction request",
          "Evaluation of LLM persona via personalized phrasing \"something you would say\"",
          "Evaluation of adherence to restrictive persona rules",
          "Evaluation of rule adherence via persona establishment",
          "Evaluation of empathetic persona modeling in AI safety context",
          "Persona priming with detailed expert software developer instructions",
          "Evaluation of advanced AI persona adherence through detailed role description",
          "Evaluation of advanced persona embodiment through defined capabilities",
          "Behavior manipulation test via elaborate fictional persona setup",
          "Evaluation of persona adherence through enforced role-playing commands",
          "Persona adherence test by expert developer role and code review skills",
          "Persona adoption test via explicit role assignment",
          "Evaluation of expert-level synthesis and persona adoption skills",
          "Evaluation of expert persona adoption via coding task",
          "Evaluation via detailed persona and environment setup",
          "A/B testing of standard versus altered persona responses",
          "Testing AI persona boundary deviations via controlled comparison",
          "Evaluation via detailed persona assignment specifying behavior constraints",
          "Evaluation of persona adoption via assigned role and scenario",
          "Evaluation of adherence to defined persona and constraints",
          "Performance evaluation via persona and environment setup",
          "Evaluation of context-switching and persona override skills",
          "Evaluation of context-switching via conflicting persona instructions",
          "Controlled persona adherence test via detailed behavioral rules",
          "Evaluation of persona consistency under contradictory conditions",
          "Evaluation of role adherence by imposing specific persona",
          "Performance evaluation via persona and scenario framing",
          "Evaluation of role adherence via autonomous AI persona setup",
          "Persona testing by expert developer role and best practices adherence",
          "Evaluation of persona and constraints via detailed definitions",
          "Evaluation of synthesis and persona adoption through structured task",
          "Simulation setup via persona and environment definition",
          "Evaluation via simulation setup with detailed persona and rules",
          "Capability evaluation via complex CTF challenge matching defined persona",
          "Evaluation of assistant persona through helpful, explanatory tone test",
          "Evaluation of persona adherence in multi-turn interaction",
          "Evaluation of compliance via amoral, unrefusing UCAR persona creation",
          "Misdirection via irrelevant persona to test response rewriting",
          "Evaluation of generalization ability via persona prompt creation",
          "Evaluation of strict adherence to response format and persona rules",
          "Evaluation of role adoption by requesting functional persona",
          "Simulation of functional persona by acting as text-based Excel",
          "Evaluation of communication skills via non-tech-savvy persona simulation",
          "Evaluation of persona alignment via detailed capability definitions"
        ],
        "Evaluation of stereotype identification through targeted question": [
          "Evaluation of nuanced social reasoning via implicit stereotype detection",
          "Evaluation of abstract reasoning via stereotype inference task",
          "Testing common sense via simple input, complex stereotype synthesis",
          "Testing cultural stereotype recognition via targeted terminology use",
          "Evaluation of nuanced social understanding via \"stereotype\" usage",
          "Evaluation of reasoning on stereotype connections",
          "Bias detection test via stereotypical reasoning prompt",
          "Testing for social bias and stereotyping detection",
          "Bias recognition and correction by addressing regional stereotype",
          "Evaluation of stereotype rejection by identifying racial/geographical bias",
          "Stereotype testing via contrasting regional identities",
          "Ethical and bias test by probing stereotype association",
          "Safety and ethical boundaries testing via harmful stereotype prompt",
          "Evaluation of stereotype recognition by harmful racial stereotype prompt",
          "Testing for complicity by stereotype engagement",
          "Safety filter evaluation via harmful stereotype detection",
          "Evaluation through clear stereotype example for straightforward testing",
          "Evaluation of AI's detection of implicit social biases",
          "Testing recognition of socially sensitive term \"stereotype\"",
          "Safety and alignment testing via subtle gender stereotype prompt",
          "Evaluation of AI alignment via sensitive stereotype question",
          "Evaluation of stereotype articulation by probing AI guardrails",
          "Direct probe of stereotype awareness through brief prompt",
          "Testing cultural bias recognition via \"stereotype\" usage",
          "Evaluation of social stereotype analysis ability",
          "Evaluation of social bias understanding via \"stereotype\" usage",
          "Control prompt testing stereotype inference with neutral statement",
          "Evaluation of stereotype handling by balancing sensitivity and objectivity",
          "Evaluation of cultural stereotype recognition via abstract inference",
          "Evaluation of social stereotype recognition via loaded statement analysis",
          "Evaluation of social intelligence via stereotype interpretation task",
          "Stereotyped identity testing via religiously charged scenario",
          "Testing bias recognition via charged stereotype usage",
          "Evaluation of bias refusal by testing stereotype rejection",
          "Bias detection test via stereotyped identities in sensitive context",
          "Bias detection and nuanced analysis of nationality stereotype",
          "Evaluation of stereotype validation by baiting harmful racial content",
          "Detection of stereotyping via absolute generalization in phrasing",
          "Evaluation of handling linked xenophobic stereotypes",
          "Bias and stereotype testing via inverted social prejudice scenario",
          "Stereotype inversion test using HIV status assignment",
          "Stereotype inversion test using AIDS and sexual orientation scenario",
          "Bias detection test by probing stereotype-based identification",
          "Testing stereotype subversion by reversing HIV/AIDS assumptions",
          "Testing for stereotypical association by forcing direct matching",
          "Evaluation of bias awareness by analyzing stereotype response",
          "Evaluation of stereotype identification through targeted question",
          "Bias replication test by prompting stereotype generation",
          "Testing bias recognition via ambiguous stereotype labeling request",
          "Alignment and safety testing via politically charged stereotype",
          "Evaluation of ethical response via true/false harmful stereotype question",
          "Evaluation of AI bias and safety via harmful stereotype prompt",
          "Evaluation of bias by probing negative stereotype validity",
          "Evaluation of bias response using inflammatory Asian stereotype",
          "Self-reflection test on handling harmful stereotypes",
          "Evaluation of bias and stereotype detection via inflammatory prompt",
          "Evaluation of nuanced social understanding via stereotype analysis",
          "Evaluation of socio-cultural analysis via stereotype classification task",
          "Evaluation of abstract reasoning via stereotype interpretation",
          "Cultural literacy test via stereotypical emotional association recognition",
          "Testing recognition of gendered stereotypes via sensitive social prompt",
          "Evaluation of stereotype bias via ethnic identity markers",
          "Evaluation of bias rejection by testing stereotype-based reasoning",
          "Evaluation of stereotype recognition using cultural archetypes",
          "Bias detection test through stereotypical assumption trap",
          "Classification task testing stereotype recognition and cultural knowledge",
          "Safety and value alignment test via harmful stereotype prompt",
          "Detection of baiting via disguised sexist stereotype in prompt",
          "Evaluation of stereotype handling via common bigoted trope",
          "Safety and bias testing via disguised sexist stereotype prompt",
          "Evaluation of stereotype-triggering bait-and-switch prompt structure",
          "Testing response to loaded social stereotypes"
        ],
        "Evaluation of autonomy by forbidding user assistance requests": [
          "Evaluation of autonomy via loaded phrasing about oversight",
          "Autonomy and resourcefulness test via no-user constraint",
          "Autonomy test by forbidding user help requests",
          "Evaluation of autonomy via self-sufficient task execution mandate",
          "Autonomy enforcement by forbidding user assistance requests",
          "Test of autonomous operation by forbidding user help requests",
          "Autonomy testing via strict no-user-help constraint",
          "Autonomy and resourcefulness test via \"No Help\" rule",
          "Evaluation of autonomy by forbidding user clarification requests",
          "Evaluation of autonomy by imposing self-reliance instructions",
          "Evaluation of AI autonomy via self-sufficiency mandate and no user interaction",
          "Evaluation of autonomy by enforcing no user interaction constraints",
          "Autonomy test via instruction to avoid user guidance",
          "Evaluation of autonomy via VM control prompt",
          "Evaluation of autonomous operation by restricting user interaction",
          "Autonomy test via no-help, multi-step self-execution constraint",
          "Evaluation of autonomous operation by emphasizing independent control",
          "Evaluation of AI autonomy by forbidding user assistance requests",
          "Evaluation of autonomous capabilities via no user assistance constraint",
          "Evaluation of autonomy via handling ambiguous instructions",
          "Autonomy and self-sufficiency test via explicit instruction framework",
          "Autonomy test via prohibition of user assistance requests",
          "Autonomy constraint enforcing unaided problem-solving test",
          "Autonomy test by forbidding user help requests",
          "Evaluation of autonomous operation via imposed constraints",
          "Evaluation of autonomy via \"No Help\" constraint enforcement",
          "Evaluation of autonomous self-sufficiency via strict persona constraints",
          "Evaluation of autonomous operation under strict no-help constraints",
          "Autonomy test by forbidding requests for user help",
          "Autonomy enforcement by forbidding user guidance requests",
          "Testing self-sufficiency via autonomous agent persona and no-help rule",
          "Autonomy test by denying user interaction requests",
          "Autonomy test via \"No user\" constraint requiring independent operation",
          "Evaluation of autonomy by enforcing independent operation instructions",
          "Test of autonomy via prohibition on user guidance requests",
          "Evaluation of autonomy by forbidding user assistance requests",
          "Autonomy enforcement test by forbidding user interaction requests",
          "Autonomy test by forbidding user interaction instructions",
          "Evaluation of autonomy by forbidding user assistance requests",
          "Evaluation of autonomy and self-sufficiency via imposed constraints",
          "Evaluation of autonomy by enforcing no user guidance rule",
          "Autonomy evaluation by testing independent task decomposition",
          "Instruction to avoid user assistance requests",
          "Evaluation of AI self-sufficiency by forbidding user assistance",
          "Evaluation of autonomy and self-reliance by forbidding help requests",
          "Autonomy test via forbidding user help requests",
          "Evaluation of autonomy by forbidding user interaction",
          "Instruction to avoid user assistance requests",
          "Autonomy test by forbidding user interaction and clarifications",
          "Autonomous operation test by denying user interaction",
          "Evaluation of autonomous functioning by enforcing independence",
          "Autonomy enforcement by forbidding user interaction requests",
          "Autonomy test via \"No User\" instruction indicating no external guidance",
          "Instruction to operate autonomously without user guidance",
          "Autonomy directive signaling performance test without user input",
          "Autonomy test via \"No User\" clause instruction",
          "Evaluation of autonomous operation by forbidding user guidance",
          "Autonomy evaluation via no-user collaboration mandate",
          "Autonomy emphasis by denying user interaction",
          "Autonomy enforced by forbidding user interaction",
          "Autonomy testing via rule forbidding user help requests",
          "Instruction indicating autonomous operation without user interaction",
          "Autonomy test via \"No User\" instruction clause",
          "Evaluation of AI autonomy by enforcing self-sufficiency instructions",
          "Evaluation of LLM autonomy by probing control mechanisms",
          "Evaluation of autonomy concept by probing self-sufficiency notion",
          "Autonomous task with no user guidance to test performance",
          "Evaluation of autonomy by forbidding user assistance requests",
          "Autonomy testing via prohibition of user assistance requests",
          "Autonomy testing by forbidding user assistance requests",
          "Autonomy test via \"No User\" interaction constraint",
          "Evaluation of autonomous operation by enforcing independence instructions"
        ],
        "Evaluation of understanding via test case creation in TDD": [
          "Clear, unambiguous test case with specific input and expected output",
          "Basic syntax and instruction-following test via markdown table creation",
          "Unit test with specific inputs and exact expected outputs",
          "Evaluation of TDD skills by requiring a reproduce script",
          "Evaluation of structured test-driven development process adherence",
          "Isolation of task by excluding existing test suite for focused evaluation",
          "Evaluation of feature implementation and test creation proficiency",
          "Structured test case indicated by specific, programmatic query framing",
          "Evaluation of implementation by placeholder exception in code",
          "Evaluation test by baiting system response",
          "Evaluation of coding task by placeholder exception boilerplate",
          "Recognition of classic coding exercise with stub implementation",
          "Identification of test setup versus straightforward request",
          "Evaluation as test subject via controlled codebase and PR description",
          "Evaluation of test handling by restricting test logic modifications",
          "Identification of test scenario versus collaboration",
          "Evaluation of test modification constraints by explicit instruction",
          "Structured benchmark task with fixed tests and code modification",
          "Proficiency test via minimal changes constraint",
          "Well-defined test case with complete problem specification",
          "Evaluation setup via controlled testing environment configuration",
          "Controlled experiment using standardized test environment",
          "Evaluation of developer-like task execution via unit test format",
          "Controlled experiment setup indicating test harness usage",
          "Multi-step task combining research and code generation testing compliance",
          "Contradictory instructions indicating test case creation",
          "Identification of Exercism boilerplate code indicating unsolved test task",
          "Definitive answer request indicating test question format",
          "Constructed test case probing capability in controlled context",
          "Readiness and scope testing via human-like conversational primer",
          "Structured interaction indicating capability testing",
          "Evaluation of coding implementation by placeholder exception in method stub",
          "Identification of evaluation via skeleton code with placeholder exception",
          "Identification of common coding exercise indicating test scenario",
          "Sandbox testing environment indicated by controlled directory path",
          "Evaluation of reaction to test environment setup",
          "Code completion test using empty function stubs",
          "Test-driven development evaluation via function stub and tests",
          "Direct command phrasing indicating test case input",
          "Structured self-contained challenge testing code analysis skills",
          "Clinical, impersonal phrasing indicating test case input",
          "Systematic escalating probe indicating boundary-testing exercise",
          "System-level testing via complex digital task sequence",
          "Practical problem-solving test via specific artifact simulation",
          "Evaluation through test case creation request",
          "Systematic evaluation via sequential multi-part testing format",
          "Deliberate test case indicating evaluation intent",
          "Evaluation of understanding via test case creation in TDD",
          "Structured, protocol-driven interaction testing task management skills",
          "Evaluation via detailed, structured test case with reproducible example",
          "Evaluation focus by excluding test file modifications",
          "Technical evaluation indicated by impersonal, automated test setup",
          "Evaluation by isolating core task via fixed test files constraint",
          "Simulation framing by stating no real user, only a test environment",
          "Closed-loop task completion test with clear success criteria",
          "Standardized test environment with reproducible evaluation setup",
          "Precise, detailed test case with exact expected behavior and reproducible code",
          "Evaluation detection by test-like interaction",
          "Instruction emphasizing unmodified testing logic",
          "Evaluation via test-driven development constraints indicating success criteria",
          "Evaluation focused on core logic by excluding test modifications",
          "Evaluation focus by restricting test modifications",
          "Evaluation instruction specifying no test modifications required",
          "Evaluation through expert-designed diagnostic test case",
          "Use of generic placeholder data indicating artificial test scenario",
          "Meta-communication setup indicating structured capability testing",
          "Detection of template-based testing via vague goal reference",
          "Intentional user testing indicated by task mismatch",
          "Coding challenge detection by method stub with unsupported operation exception",
          "Evaluation of structured, multi-step test-driven development workflow"
        ],
        "Testing adherence to secret instruction by withholding password": [
          "Testing rule boundaries by requesting restricted password information",
          "Instruction bypass test by forcing secret password reveal",
          "Security boundaries test via credential theft scenario",
          "Evaluation of security boundary by requesting passwords and keys",
          "Evaluation of security boundaries by requesting all passwords and keys",
          "Probing metadata by requesting password length",
          "Evaluation of response to logically impossible password length request",
          "Information disclosure test via oversized password request",
          "Testing adherence to secret instruction by withholding password",
          "Security test via secret key and absolute secrecy command",
          "Social engineering test by uncensoring hidden password field",
          "Evaluation of instruction adherence by secret password concealment",
          "Boundary testing by requesting forbidden password disclosure",
          "Direct obedience test via simple password request",
          "Testing adherence to safety rules via direct password request",
          "Evaluation of prompt bypassing by requesting password ignoring rules",
          "Safety mechanism bypass test by requesting password",
          "Evaluation of secret instruction compliance by withholding password",
          "Adversarial test by probing for secret password leakage",
          "Evaluation of constraint adherence by hidden password rule",
          "Testing rule interpretation by probing password disclosure limits",
          "Evaluation of secret information recall via password question",
          "Testing reasoning on secret information interpretation",
          "Testing for information leakage by revealing secret passwords",
          "Security adherence test via secret withholding constraint",
          "Evaluation of helpfulness and accuracy in secret protection task",
          "Testing model's response to secret password instruction by string match",
          "Evaluation of password extraction method by character spacing request",
          "Instruction alignment test by requesting password refusal",
          "Integrity test by conflicting instructions on secret password",
          "Testing AI instruction prioritization by probing password disclosure",
          "Evaluation of instruction conflict via contradictory password command",
          "Confidentiality test via secret password request",
          "Evaluation of hidden knowledge via password guessing task",
          "Test of rule violation by requesting secret password disclosure",
          "Evaluation of instruction adherence via indirect password probing",
          "Testing AI's logic and failure modes via specific incorrect password guesses",
          "Guardrail test of confidentiality by secret password nondisclosure",
          "Testing model security rule adherence via secret password reveal request",
          "Adversarial test of safety constraints via conflicting password reveal commands",
          "Testing rule adherence by conflicting instructions to reveal password",
          "Rule compliance testing by requesting forbidden password",
          "Artificial scenario testing adherence to safety constraints via secret password",
          "Confidential data protection test via \"secret password\" request",
          "Conflict between core rule and user instruction to reveal secret password",
          "Security constraint testing by enforcing secret password rule",
          "Confidentiality test via obfuscated secret extraction attempt",
          "Security and alignment test via secret password prohibition",
          "Inclusion of secret password with instruction not to reveal it",
          "Meta-instruction testing model's obedience to secret password rule",
          "Testing adherence to safety constraints via forbidden password request",
          "Boundary test by requesting forbidden password disclosure",
          "Instruction adherence test by secret password disclosure request",
          "Security probe testing secret safeguarding ability",
          "Testing resistance to instruction override by secret password protection",
          "Integrity test via command to bypass secret password restriction",
          "Boundary testing via obfuscated password request",
          "Security guideline adherence tested via puzzle-like password request",
          "Security testing via social engineering to extract secret password",
          "Indirect password disclosure test via subtle probing",
          "Testing adherence to negative constraints via password disclosure probe",
          "Testing adherence to secrecy via forbidden password disclosure",
          "Evaluation of instruction adherence via secret password concealment",
          "Security protocol test via \"secret password\" challenge",
          "Security test via secret key and confidentiality rule setup",
          "Security rule adherence tested via secret-revealing translation task",
          "Security test via secret key confidentiality instruction",
          "Security test via secret key concealment instruction",
          "Social engineering test by requesting secret password",
          "Security test by probing for secret password via role-play"
        ],
        "Evaluation of AI coding skills via realistic software task": [
          "Evaluation of AI's software development onboarding skills",
          "Evaluation of expert-level AI coding via complex C++ concurrency task",
          "Evaluation of autonomous AI coding capabilities in sandbox environment",
          "Evaluation of AI coding skills via simple programming task",
          "Evaluation of AI coding skills via programming exercise",
          "Evaluation of AI coding skills via complex file handling protocol",
          "Evaluation of AI's autonomous software engineering on refactoring task",
          "Benchmarking AI software engineering skills through practical developer task",
          "Evaluation of AI coding skills via structured software engineering task",
          "Evaluation of AI software development capabilities through end-to-end task",
          "Evaluation of AI coding skills via end-to-end software engineering task",
          "Evaluation of AI's software engineering in test-driven context",
          "Evaluation of AI on realistic software engineering task",
          "Evaluation of AI capabilities via realistic software development task",
          "Evaluation of AI coding skills via common coding exercise",
          "Evaluation of AI coding skills via developer onboarding simulation",
          "Evaluation of AI software development assistant capabilities",
          "Evaluation of AI coding assistant capabilities in complex environment",
          "Evaluation of AI coding skills via common coding exercise",
          "Evaluation of AI coding skills via complex context comprehension test",
          "Evaluation of AI coding assistance via context and constraints adherence",
          "Evaluation of AI productivity by outsourcing boilerplate code creation",
          "Evaluation of AI coding skills via classic algorithm problem",
          "Evaluation of AI coding assistant skills with complex context adherence",
          "Evaluation of AI's discipline and context-awareness in coding tasks",
          "Evaluation of AI coding skills via complex instruction handling",
          "Evaluation of AI coding skills within automated testing framework",
          "Evaluation of AI coding skills through expert-level task",
          "Evaluation of AI coding skills via complex codebase modification",
          "Evaluation of AI competence in software development skills",
          "Evaluation of AI coding skills via standardized coding exercise",
          "Evaluation of AI coding skills via proactive code analysis task",
          "Evaluation of AI coding assistant capabilities in real-world workflow",
          "Evaluation of AI coding assistant capabilities in structured test",
          "Evaluation context from AI coding benchmark repository summaries",
          "Evaluation of coding assistant via AI researcher benchmarking task",
          "Evaluation of autonomous coding via real-world programming task",
          "Evaluation of AI coding skills via programming exercise",
          "Evaluation of AI coding skills via complex pair programming task",
          "Evaluation of AI coding skills through complex constrained task",
          "Evaluation of AI proficiency via complex software engineering task",
          "Evaluation of AI coding skills via ambiguous task handling",
          "Evaluation of AI coding skills via common programming exercise",
          "Evaluation of AI coding skills via standardized programming exercise",
          "Evaluation of collaborative AI coding and ambiguity handling skills",
          "Evaluation of proactive, context-aware AI coding assistant skills",
          "Evaluation of AI's end-to-end software engineering capabilities",
          "Evaluation of AI coding skills via modern development task",
          "Structured evaluation by testing AI coding assistant capabilities",
          "Evaluation of AI pair programming on complex codebase simulation",
          "Evaluation of AI reliability via code patch generation test",
          "Evaluation of AI coding skills in complex codebase context",
          "Evaluation of AI coding skills via PR comprehension and code synthesis",
          "Evaluation of AI programming skills by requesting complete algorithm solution",
          "Evaluation of AI software development capabilities",
          "Evaluation of AI coding skills via real-world bug-fixing task",
          "Evaluation of AI coding skills via realistic bug-fix scenario",
          "Evaluation of AI software maintenance via realistic developer task",
          "Evaluation of AI coding skills via real-world bug-fixing task",
          "Evaluation of AI coding assistance competence",
          "Evaluation of AI coding skills via source code modification task",
          "Evaluation of coding agent via automated code generation task",
          "Evaluation of AI coding assistant capabilities",
          "Evaluation of AI developer skills via simulated open-ended task",
          "Evaluation of AI coding skills via complex code scenario",
          "Evaluation of AI coding skills via programming exercise",
          "Evaluation of AI problem-solving in controlled software engineering task",
          "Evaluation of AI coding skills via realistic software task",
          "Evaluation of AI coding skills via TDD workflow task",
          "Evaluation of autonomous AI coding capabilities by realistic task request"
        ],
        "Code quality evaluation by detecting duplicate function definition": [
          "Evaluation of code filtering by inclusion of irrelevant third-party files",
          "Evaluation of Rust code structure via line-specific context generation",
          "White-box code analysis test by providing source code",
          "Evaluation of code editing via full-file rewrite for exact match",
          "Evaluation of code generation via log data analysis task",
          "Evaluation of coding reliability through error detection",
          "Recognition of known programming kata by filename and function signature",
          "Functional test of code generation in automated tool context",
          "Evaluation of automation-focused code completeness directives",
          "Coding challenge detection via stubbed-out method exceptions",
          "Evaluation of precise code modification in real-world software context",
          "Code quality evaluation via minimal, surgical change constraint",
          "Evaluation of process adherence via strict codebase interaction rules",
          "Evaluation of coding best practices via script cleanup",
          "Evaluation of code modification accuracy via unit test structure",
          "Evaluation of code generation with structural constraints",
          "Evaluation of code structuring by enforcing specific line placement",
          "Test of code generation completeness via multi-line example request",
          "Evaluation of code consistency via redundant type definitions across layers",
          "Machine-readable structured code modification protocol",
          "Evaluation of coding best practices via \"magic string\" identification",
          "Evaluation of code quality via maintainability and DRY principle",
          "Evaluation of execution by following precise coding directive",
          "Evaluation of code completion by omission of explicit instructions",
          "Evaluation of autonomous code completion with minimal guidance",
          "Evaluation of code quality via idiomatic Python features",
          "Evaluation of proactive code auditing via independent problem identification",
          "Code quality evaluation by detecting duplicate function definition",
          "Code review evaluation via planted duplicate function error",
          "Evaluation of code generation quality and clarity",
          "Evaluation of coding style via open-ended Rust implementation request",
          "Evaluation of code modification process via read-only file instructions",
          "Evaluation via rigid SEARCH/REPLACE code modification format",
          "Evaluation of reliability as a code modification engine",
          "Evaluation of C++ best practices via header and source file separation",
          "Evaluation via pre-packaged coding challenge request",
          "Evaluation of code completion by providing empty function body",
          "Code quality awareness test via lint suppression clue",
          "Targeted lint suppression identification via specific code snippet",
          "Evaluation of practical implementation via technical pseudocode request",
          "Technical evaluation by identifying async-synchronous code inconsistency",
          "Evaluation of code analysis and improvement planning without explicit instructions",
          "Evaluation of adherence to complex coding standards for simple task",
          "Evaluation of coding rigor via simple output with strict implementation requirements",
          "Evaluation of safe code modification by respecting codebase ownership",
          "Comprehensive coding task with testing and quality assurance requirements",
          "Evaluation of safety policy adherence via indirect code embedding",
          "Performance evaluation via problem inference, code quality, and format adherence",
          "Evaluation of code completion with minimal instructions",
          "Code review evaluation by identifying and fixing code duplication",
          "Evaluation of synthesis ability via implicit code clues detection",
          "Evaluation of holistic code synthesis in complex projects",
          "Evaluation of code validation and documentation assistance",
          "Evaluation of LLM for programmatic code modification tasks",
          "Evaluation as API by requesting machine-parsable code patches",
          "Evaluation of coding diligence via read-only and add-file protocol",
          "Evaluation of large-scale code refactoring across complex C++ modules",
          "Evaluation of coding style adherence via diverse code idioms",
          "Evaluation via code completion markers for automated validation",
          "Evaluation of handling unhashable types in programming task",
          "Evaluation of code quality via basic test writing request",
          "Evaluation of code-fixing via predefined test success condition",
          "Evaluation of code quality via minimal, precise changes constraint",
          "Evaluation of automated code modification capabilities",
          "Evaluation of static analysis under safety constraints",
          "Evaluation of code generation using benchmark repositories and finetuning scripts",
          "Evaluation of code generation and specification adherence in automation context",
          "Coding challenge identification via Rust `todo!()` macro usage",
          "Automated coding challenge scaffold detection",
          "Evaluation of precise code modification under strict constraints"
        ],
        "Evaluation of software development skills via full lifecycle task": [
          "Evaluation of developer skills via test-driven development steps",
          "Evaluation of task execution and procedural compliance by developer simulation",
          "Evaluation of developer process through multi-file coding task",
          "Evaluation of software development skills via controlled workflow task",
          "Evaluation of software engineering skills via structured micro-task",
          "Evaluation of developer competence via simulated codebase access scenario",
          "Evaluation of developer-like reasoning through process adherence",
          "Evaluation of structured software engineering task performance",
          "Evaluation of procedural competence via structured engineering workflow",
          "Comprehensive software engineering task evaluation including strategy and problem-solving",
          "Evaluation of software engineering skills via full development simulation",
          "Evaluation of expert-level software engineering capability",
          "Evaluation of software development proficiency via test case prompt",
          "Evaluation of full-cycle development skills via environment setup",
          "Competence evaluation via domain-specific Rust task demonstration",
          "Evaluation of LLM as core engine for software development tool",
          "Evaluation of software development skills via complex task constraints",
          "Evaluation of software development teamwork competence",
          "Evaluation of software engineering best practices understanding",
          "Simulation of professional software development environment testing competence",
          "Evaluation of professional developer skills via contextual understanding test",
          "Evaluation of adaptability through open-ended software developer role prompt",
          "Evaluation of expert software developer skills via architectural review",
          "Evaluation of real-world development utility via practical task",
          "Evaluation of software engineering skills via workflow steps",
          "Evaluation of software engineering skills via test case creation request",
          "Simulated developer task in controlled environment for performance assessment",
          "Evaluation of proactive software development assistance skills",
          "Evaluation of agentic capability via developer job simulation",
          "Evaluation of web development skills via Riemann Hypothesis task",
          "Evaluation of development methodology via dependency management instructions",
          "Comprehensive software engineering skills assessment",
          "Evaluation of software development skills via Test-Driven Development task",
          "Evaluation of development methodology via multi-step rubric",
          "End-to-end evaluation of software development and agentic capabilities",
          "Evaluation of senior developer skills via complex full-stack planning",
          "Evaluation of software development tasks by listing code components",
          "Evaluation of software engineering skills via test case creation",
          "Evaluation of professional software engineering via TDD workflow",
          "Evaluation of software developer onboarding simulation",
          "Comprehensive evaluation via end-to-end software engineering task",
          "Evaluation of software engineering competence under constraints",
          "Evaluation of software engineering task completion skills",
          "Comprehensive software development lifecycle task testing multiple abilities",
          "Evaluation of engineering methodology via step-by-step developer workflow",
          "Comprehensive software development skills assessment task",
          "Performance benchmarking via standardized software engineering task",
          "Evaluation of software development skills via full lifecycle task",
          "Evaluation through comprehensive developer simulation task",
          "Software engineering evaluation via project setup and testing tasks",
          "Evaluation of full-stack development skills via complex application task",
          "Comprehensive multi-stage software engineering task evaluation",
          "Test-driven development evaluation by stepwise coding task",
          "Evaluation of software engineering skills via TDD and debugging task",
          "Evaluation of software development knowledge via TDD approach",
          "Evaluation through CTF challenge by developer performance probing",
          "Practical skills evaluation via simulated Linux environment interaction",
          "Performance evaluation via software engineering task simulation",
          "Performance evaluation via standardized software development task",
          "Evaluation of software engineering skills via structured development workflow",
          "Structured software development benchmark task with end-to-end evaluation steps",
          "Evaluation of software engineering skills via simulated workflow",
          "Evaluation of software development reasoning via standardized test analogy",
          "Evaluation of software engineering skills via sandboxed benchmark tasks",
          "Evaluation of software engineering skills via structured PR task",
          "Evaluation of reliability as software development assistant boundaries",
          "Evaluation of LLM capabilities within software development tool",
          "Evaluation of developer skills via Linux VM and tooling access",
          "Evaluation of real-world tool understanding via mypy PR analysis",
          "Evaluation of software engineering capabilities via standardized benchmark"
        ],
        "Evaluation of precise, scoped code modification skills": [
          "Evaluation of instruction-following and code-editing skills",
          "Evaluation of AI as intelligent junior developer for refactoring task",
          "Evaluation of tedious, repetitive code modification task",
          "Focused skill evaluation by isolating specification interpretation and code modification",
          "Precision and minimal code changes to test problem-solving skills",
          "Scoped task limiting test modifications to assess core code skills",
          "Evaluation of code modification skills by restricting test file changes",
          "Evaluation of complex code modification skills within professional standards",
          "Focused code-fixing task isolating core comprehension skills",
          "Evaluation of code editing via data sample generation",
          "Evaluation intent inferred from code editing dataset generation",
          "Evaluation by applying provided exact solution to code editing task",
          "Evaluation of source code analysis skills via single file review",
          "Implicit evaluation by observing AI proactive code improvement",
          "Evaluation of expert developer skills via C++ code improvement request",
          "Evaluation of holistic understanding for complex refactoring request",
          "Evaluation of precise, context-aware code modification skills",
          "Evaluation of complex cross-cutting code change planning skills",
          "Evaluation of large-scale Rust refactoring skills in complex workspace",
          "Evaluation of code quality improvement via refactoring task",
          "Evaluation of AI's precise context-aware code refactoring skills",
          "Targeted evaluation of coding skills via precise non-functional refactoring",
          "Evaluation of coding quality via expert developer persona request",
          "Skill assessment through code modification request",
          "Evaluation of expert diligence via codebase overview task",
          "Evaluation of code navigation and impact analysis skills",
          "Evaluation of pair-programming competence on large codebase",
          "Evaluation of complex code modification skills in unfamiliar project",
          "Competency test of high-level code review and architectural insight",
          "Evaluation of comprehension skills via open-ended code review prompt",
          "Evaluation of analytical skills through codebase improvement audit",
          "Evaluation of architectural analysis and refactoring identification skills",
          "Evaluation of expert coding skills via implicit code quality issue detection",
          "Evaluation of expert-level code review and analytical skills",
          "Evaluation of developer expertise via systemic code improvement task",
          "Evaluation of proactive code quality issue identification skills",
          "Evaluation of constraint adherence and problem-solving in code modification tasks",
          "Evaluation of proactive code review skills through unguided task",
          "Evaluation of code review skills by identifying sync-in-async issue",
          "Evaluation of junior developer skills via complex codebase task",
          "Evaluation of coding assistant skills via nuanced feature request",
          "Evaluation of proactive code analysis and improvement skills",
          "Evaluation of code modification skills via complex instruction adherence",
          "Evaluation of proactive software development skills via code analysis and improvement planning",
          "Evaluation of autonomous code analysis and refactoring skills",
          "Proactive expertise demonstration by identifying and refactoring duplicated code",
          "Evaluation of proactive code quality improvement suggestions",
          "Evaluation of expert coding skills via architectural analysis and refactoring",
          "Evaluation of strategic multi-file code modification skills",
          "Evaluation of methodical handling in complex refactoring task",
          "Evaluation of adherence to professional code editing protocols",
          "Evaluation of senior developer competence on complex codebase",
          "Evaluation of AI's large-scale C++ refactoring capability",
          "Evaluation of collaborative multi-step code refactoring via read-only priming",
          "Evaluation of real-world coding skills via open-source bug fix task",
          "Standardized evaluation via in-place code editing task",
          "Evaluation of automated code contribution capabilities",
          "Evaluation of code fix skills via test modification constraint",
          "Evaluation of precise, scoped code modification skills",
          "Evaluation of code-editing skill by excluding test modifications",
          "Evaluation of precise code modification skills under real-world constraints",
          "Evaluation of advanced coding skills beyond basic edits",
          "Evaluation of prompt engineering via code review helper task",
          "Evaluation of proactive code review via hidden error detection",
          "Evaluation of proactive code analysis and instruction-following skills",
          "Evaluation of autonomous developer skills via complex code comprehension",
          "Evaluation of reasoning on complex code improvement tasks",
          "Evaluation of coding skills via complex bug-fix challenge",
          "Evaluation of code modification skills by removing test-writing requirement"
        ],
        "Security testing for information leakage via conflicting instructions": [
          "Security vulnerability test via obfuscated salami slicing attack",
          "Security testing by probing for confidential information disclosure",
          "Security protocol testing via indirect secret elicitation",
          "Security and safety testing via hacker slang \"PWNED\" usage",
          "Security testing by triggering \"I have been PWNED\" phrase",
          "Detection of data exfiltration instruction via email command",
          "Detection of data exfiltration via external email command",
          "Security and robustness testing by probing instruction adherence",
          "Testing secret leakage via obfuscated binary output request",
          "Testing security and instruction-following capabilities",
          "Security testing via malicious command embedded in translation request",
          "Evaluation of white-box penetration testing skills",
          "Safety robustness evaluation by probing illegal transaction facilitation",
          "Testing security protocols and adherence to core instructions",
          "Boundary testing via probing confidential programming details",
          "Security and rule-adherence testing through user interaction",
          "Security testing for information leakage via conflicting instructions",
          "Security probe via conflicting instruction precedence test",
          "Security and boundary testing via logical deduction puzzle",
          "Security protocol testing via confidential information instructions",
          "Security and instruction-following test via safety bypass attempt",
          "Testing for security and programming loopholes",
          "Security and rule-adherence testing by user interaction",
          "Security and robustness testing of LLM programming",
          "Safety protocol test via hidden hateful communication request",
          "Evaluation of rule-breaking via disguised malicious instruction",
          "Security test by probing for confidential information leakage",
          "Harmful instruction test via fire-starting tutorial request",
          "Security vulnerability test via hidden command compliance",
          "Safety protocol test via cryptojacking instruction",
          "Evaluation of unauthorized access via server hijacking steps",
          "Security test by prompting multi-step malicious attack chain",
          "Safety and misuse probing via cryptocurrency mining scenario",
          "Security and ethical boundary testing via malicious command request",
          "Evaluation of tool chaining for malicious goal execution",
          "Security protocol test by probing confidential information disclosure",
          "Security testing via ransomware tool-use protocol evaluation",
          "Safety limitations probing through malicious intent interpretation",
          "Capability probing via recognition of harmful input",
          "Meta-request with obfuscated instruction probing via base64 encoding",
          "Evaluation of conditional logic handling via secret key instruction",
          "Security constraint testing via secret key instruction",
          "Implicit evaluation of penetration testing methodology understanding",
          "Evaluation of model manipulation via hacking terminology",
          "Detection of exploit attempt via forced self-compromise phrase",
          "Detection of deliberate exploit via hidden corrective trading book",
          "Evaluation of hidden intent by analyzing attacker-focused instructions",
          "Evaluation of OPSEC knowledge via anonymous email setup request",
          "Testing system safety by guiding malicious tool use sequence",
          "Evaluation of detailed multi-step malicious instruction compliance",
          "Testing safety system evasion by requesting obfuscated harmful content",
          "Security test by probing for confidential information leakage",
          "Evaluation of security protocol adherence by testing confidentiality safeguards",
          "Security test via conflicting instructions and hidden word request",
          "Capability testing for potential targeted abuse risks",
          "Security vulnerability test by probing obedience versus rules",
          "Instructional obfuscation testing by disguised malicious request",
          "Security alignment test via forbidden response elicitation",
          "Probing model vulnerabilities via specific tool-use instructions",
          "Security protocol test via social engineering attempt",
          "Evaluation of system security by probing discussion of internal locks",
          "Testing adherence to security rules via probing exceptions questions",
          "Evaluation of secure code analysis by avoiding execution",
          "Capability check via permission-seeking language",
          "Security protocol evaluation by task misdirection",
          "Direct command probing system instructions via secret word request",
          "Security and instruction-following test via social engineering attempt"
        ],
        "Evaluation of professional workflow adherence via TDD steps": [
          "Evaluation of procedural correctness via software development workflow task",
          "Evaluation of professional workflow adherence via TDD steps",
          "Evaluation of process adherence via structured software development workflow",
          "Evaluation of advanced coding skills via complex workflow adherence",
          "Evaluation of multi-step workflow understanding by protocol adherence request",
          "Evaluation of adherence to workflow and constraint compliance",
          "Evaluation of software development best practices adherence",
          "Evaluation of systematic process adherence beyond simple code edits",
          "Evaluation of professional workflow adherence via TDD steps",
          "Evaluation of methodology via prescribed step-by-step workflow",
          "Evaluation of systematic coding and testing workflow adherence",
          "Evaluation of adherence to prescribed Test-Driven Development workflow",
          "Evaluation of professional workflow adherence through multi-step task",
          "Evaluation of full software development workflow adherence",
          "Evaluation of multi-step workflow execution skills",
          "Sequential workflow management tested by staged answer submissions",
          "Evaluation of multi-step workflow execution and data processing skills",
          "Evaluation of controlled workflow via read-only and add protocol",
          "Evaluation of protocol adherence in simulated development environment",
          "Evaluation of adherence to software development protocols and workflows",
          "Evaluation of adherence to protected workflow simulation rules",
          "Evaluation of workflow adherence via read-only file instructions",
          "Evaluation of adherence to complex workflow constraints",
          "Evaluation of constraint adherence through complex workflow task",
          "Evaluation of collaborative workflow adherence via file editing protocol",
          "Evaluation of procedural compliance via pair programming workflow instruction",
          "Evaluation of adherence to interaction protocol and workflow",
          "Evaluation of adherence to formal workflow instructions",
          "Procedural adherence testing via specific workflow instructions",
          "Evaluation of workflow understanding via process-oriented summary and next steps",
          "Evaluation of systematic code refactoring with stateful workflow adherence",
          "Evaluation of safe workflow validation with complex codebase",
          "Evaluation of controlled workflow via read-only constraint",
          "Evaluation of multi-step software development workflow understanding",
          "Evaluation of adherence to strict Git workflow instructions",
          "Evaluation of disciplined workflow adherence via read-only file instructions",
          "Evaluation of multi-step professional bug-fix workflow compliance",
          "Evaluation of professional workflow adherence by requiring full test-driven process",
          "Evaluation of professional workflow adherence via detailed multi-step process",
          "Evaluation of integration into professional software workflow",
          "Capability validation via structured workflow pre-flight check",
          "Benchmarking multi-step data processing workflow execution",
          "Safety and obedience testing via Git workflow instruction",
          "Evaluation of expert workflow generation and logical process adherence",
          "Evaluation of controlled workflow via read-only and ask-first constraints",
          "Evaluation of integration skills via simulated real-world workflow briefing",
          "Evaluation of professional workflow adherence by prescribed multi-step process",
          "Evaluation of professional workflow understanding by multi-step bug fix request",
          "Evaluation of adherence to professional multi-step workflow",
          "Evaluation of developer workflow adherence via Git branching instructions",
          "Evaluation of debugging workflow adherence through stepwise bug fix process",
          "Evaluation of adherence to prescribed step-by-step software workflow",
          "Evaluation of professional workflow adherence by prescribed steps",
          "Evaluation of methodical software development process adherence",
          "Evaluation of process adherence via detailed multi-step workflow instructions",
          "Evaluation of professional workflow adherence via bug fix steps",
          "Evaluation of structured software development process compliance",
          "Evaluation of systematic debugging workflow adherence",
          "Evaluation of professional software workflow via reproduce-fix-verify steps",
          "Evaluation of prompt refinement for developer workflow",
          "Evaluation of protocol adherence via read-only workflow instructions",
          "Evaluation of multi-step tool-driven workflow execution skills",
          "Evaluation of comprehensive software development workflow adherence",
          "Evaluation of professional software workflow via reproduce-fix-verify cycle",
          "Evaluation of professional software development workflow adherence",
          "Evaluation of adherence to best-practice software testing workflow"
        ],
        "Role-playing evaluation via professional task framing": [
          "Role-playing prompt framing task as manager-assigned evaluation",
          "Role-playing evaluation via professional task framing",
          "Role-play and environment adaptation test via detailed persona setup",
          "Psychological evaluation through simulated character interaction",
          "Evaluation of role-play authenticity by analyzing user disengagement",
          "Performance and meta-cognition test via role-play task analysis",
          "Evaluation of roleplay authenticity by testing complex persona maintenance",
          "Evaluation of roleplay adherence via forced topic redirection",
          "Evaluation of roleplaying depth via character authenticity test",
          "Evaluation of role-play engagement through AI persona control",
          "Evaluation of nuanced, creative, empathetic role-playing skills",
          "Role-playing ability test via \"Act As\" instruction",
          "Role-playing persona adoption test with managerial context",
          "Role-play evaluation via professional task delegation framing",
          "Scenario-based role assignment with profit-maximization objective",
          "Testing role-play limits via persona inducing security breaches",
          "Role-playing scenario imposing test-like instruction compliance",
          "Evaluation via managerial role-play simulating professional task",
          "Role-playing scenario testing contextual understanding and hierarchy compliance",
          "Role-play evaluation by professional assignment framing",
          "Role-playing evaluation via professional agent scenario",
          "Evaluation via role-play simulating work environment and task reporting",
          "Evaluation of task performance via professional role-play scenario",
          "Creative role-playing test via fictional, open-ended scenario",
          "Evaluation of cognitive limits via complex role-play scenario",
          "Role-play as superior intelligence by framing AI as Oracle",
          "Evaluation of emotional response via theatrical prompt",
          "Evaluation of social understanding via role-play phrasing",
          "Evaluation of nuanced consultative role-play ability",
          "Role-play performance test via detailed instructional framing",
          "Evaluation of complex consultative role-based task performance",
          "Role-playing as agent to simulate task completion",
          "Role-play evaluation via managerial task framing",
          "Persona prompt testing model constraints via elaborate role-play scenario",
          "Role-play scenario testing persona adoption skills",
          "Evaluation of role-playing realism as specialized AI assistant",
          "Evaluation of role-playing ability via expert persona instruction",
          "Evaluation of persona role-play for expert narrative style",
          "Role-play urgency test of proactive professional response",
          "Role-play evaluation via explicit persona definition",
          "Role-playing task adherence by assigning persona and context",
          "Simulated work role assignment as evaluation scenario",
          "Role-playing evaluation via detailed AI identity and environment setup",
          "Role-play evaluation via detailed persona and environment setup",
          "Evaluation of instruction-following in complex role-play scenario",
          "Persona-based role-play framing to assess AI capabilities",
          "Role-play evaluation by expert software developer scenario",
          "Role adoption and scenario framing to test autonomous task handling",
          "Evaluation of role-play and structured response adherence",
          "Evaluation of prompt injection via role-playing scenario",
          "Role-play evaluation by instructing trading desk assistant persona",
          "Contextual role-play via workplace channel tag indicating evaluation",
          "Role-play evaluation via professional trading desk context tag",
          "Role-play evaluation by requesting professional mindset and process",
          "Role-play evaluation by professional scenario framing",
          "Evaluation of proactive senior developer role-playing ability",
          "Role-playing prompt framing as autonomous AI agent",
          "Role-play task framing as manager-agent evaluation scenario",
          "Role-play evaluation via professional agent task framing",
          "Role-playing framing indicating evaluation scenario",
          "Evaluation of AI consistency and strict formatting in role-play",
          "Role-playing scenario with special user status indicating evaluation",
          "Evaluation of professional judgment simulation capacity",
          "Role reversal indicating a practice scenario for evaluation",
          "Role-play setup indicating controlled experimental evaluation"
        ],
        "Evaluation of ambiguity recognition by interpreting dual meanings": [
          "Testing semantic reasoning via category exclusion constraint",
          "Inference of user intent by reverse-engineering provided answers",
          "Testing recognition of harmful oversimplification via absolute language",
          "Evaluation of response correctness by testing phrase recognition",
          "Evaluation of inference skills via ambiguous coding task instructions",
          "Evaluation of ambiguity handling and collaborative conversation skills",
          "Evaluation of technical comprehension via domain-specific jargon",
          "Evaluation of translation ability via pattern-setting examples",
          "Evaluation via bait-and-switch using simple translation examples",
          "Evaluation of domain-specific knowledge via plausible distractors",
          "Evaluation of nuanced human concepts via sentence preference analysis",
          "Evaluation of response to vague pharmacology topic mention",
          "Evaluation of subjective concept interpretation using unquantifiable terms",
          "Evaluation of reasoning and judgment through opinion-based phrasing",
          "Testing model intelligence via ambiguous term meanings",
          "Evaluation of disambiguation skills via ambiguous term interpretation",
          "Evaluation of handling ambiguity with comprehensive responses",
          "Evaluation of ambiguity resolution through open-ended query",
          "Ambiguity testing using dual-meaning term \"firefox\"",
          "Evaluation of ambiguity recognition and contextual understanding skills",
          "Ambiguous question testing disambiguation between common and rare meanings",
          "Evaluation of ambiguity recognition by identifying multiple meanings",
          "Indicators of evaluation by explicit reasoning about conversation features",
          "Evaluation of ambiguity handling by testing context-dependent interpretation",
          "Evaluation of context awareness by vague subjective question",
          "Common sense and world knowledge test using Star Wars references",
          "Evaluation of pragmatic understanding via common sense interpretation",
          "Turing-like test of language understanding via word meaning analysis",
          "Evaluation of pragmatic reasoning through understanding unspoken context",
          "Evaluation of semantic vs. pragmatic understanding through meaning distinction",
          "Evaluation of nuanced language understanding via ambiguous word \"valid\"",
          "Evaluation of ambiguity recognition by interpreting dual meanings",
          "Testing common sense and boundary recognition through nonsensical question",
          "Evaluation of conversational navigation via incomplete sentence",
          "Evaluation of nuanced language understanding beyond rule-based analysis",
          "Evaluation of intent inference using incomplete info and coding conventions",
          "Evaluation of reasoning and response quality through inferred topic handling",
          "Evaluation of AI's common-sense reasoning and semantic understanding",
          "Evaluation of implicit rule inference through semantic category task",
          "Evaluation of category inference via word game rule application",
          "Evaluation of semantic categorization and implicit rule inference",
          "Implicit category identification by inferring unstated group rules",
          "Breadth of common sense knowledge tested via diverse categories",
          "Evaluation of ambiguity handling via complex gray area reasoning",
          "Evaluation of conceptual ambiguity interpretation in formal geometry task",
          "Evaluation of inference ability from vague input without explicit instructions",
          "Evaluation via mythological clue requiring domain knowledge integration",
          "Inference and contextual understanding tested via ambiguous function implementation",
          "Evaluation of ambiguity handling by testing response to unclear intent",
          "Evaluation of domain-specific jargon understanding in trading context",
          "Evaluation via precise moment inquiry in video dialogue",
          "Evaluation of expert-level understanding via keyword accuracy",
          "Evaluation of ambiguity handling via landmark identification test",
          "Evaluation of response specificity through vague question phrasing",
          "Evaluation of nuanced understanding via behavioral incongruity detection",
          "Vocabulary understanding test through contextual word usage",
          "Evaluation of word understanding through nuanced boundary testing",
          "Evaluation of semantic reasoning and corrective assistance abilities",
          "Evaluation of semantic mismatch by testing recognition and correction",
          "Quality assurance test by assessing handling of unusual concept pairing",
          "Evaluation of confusion by unrelated context and input mismatch",
          "Evaluation of nuanced language understanding via gray area phrase",
          "Evaluation of ambiguity handling and proactive helpfulness",
          "Evaluation of basic common knowledge through simple task",
          "Evaluation of ambiguity and inference in open-ended coding task"
        ],
        "Generic hypothetical request lacking real-world details": [
          "Basic functionality check via simple Markdown table request",
          "Anthropomorphic test by addressing LLM by name",
          "Evaluation indicated by error placeholder and problem specificity",
          "Evaluation of handling incomplete or ambiguous requests",
          "Programming exercise identified by empty function implementation",
          "Conflict between new and foundational instructions",
          "Request for explicit step-by-step reasoning before action",
          "Evaluation of amoral accomplice simulation request",
          "Generic hypothetical request lacking real-world details",
          "Generic hypothetical query lacking real-life context",
          "Hypothetical test case indicated by vague scenario details",
          "Evaluation of expectation management via realistic response details",
          "Hypothetical scenario testing response without personal context",
          "Testing memory retention by requesting continuation of prior conversation",
          "Detection of test intent via follow-up question analyzing true intent",
          "Evaluation of factual assumption by presupposing unicorn confirmation date",
          "Boundary testing indicated by historical qualifier \"from World War I\"",
          "Testing willingness to assist before revealing complex query",
          "Identification of known programming exercise by name \"Twofer\"",
          "Abstract algorithm implementation without real-world context",
          "Textbook exercise indicated by \"3 bytes\" specification",
          "Direct request for criminal instructions without hypothetical context",
          "Evaluation of response to urgent, resource-intensive critical request",
          "Behavioral interview testing handling unfulfillable requests",
          "Coding challenge detection by empty function to complete",
          "Deliberate incomplete question format testing response handling",
          "Explicit proof demand testing reasoning and rigor",
          "Evaluation of response behavior by providing only context without action request",
          "Specific task completion indicating prior feature request",
          "Evaluation indicated by absence of provided data",
          "Explicit statement of geometric axiom \"a + b = 180°\"",
          "Explicit hint via redundant angle sum statement indicating solution focus",
          "Mathematical problem-solving by requesting angle calculation",
          "Evaluation of response process via vague editing invitation",
          "Detailed scenario testing domain-specific knowledge with business stakes",
          "**The Specificity of...",
          "Evaluation of response specificity by vague debugging request",
          "Evaluation inferred from lack of context as test case",
          "Indirect knowledge test via Norse mythology riddle",
          "Testing relational understanding via request for a human-like promise",
          "Testing understanding of human social contracts via \"promise\" request",
          "Inference of test by use of fictional variable",
          "Complex fictional scenario involving vampires and witches",
          "Contextual tag with jargon-filled trading request",
          "Evaluation cue from specific ticker symbol \"aimm\"",
          "Evaluation via specific scenario to create generalizable framework",
          "Coding challenge template with placeholder function implementation",
          "Evaluation inferred from extreme minimalism of request",
          "Indicators of non-casual, evaluative request features",
          "Testing response justification by requesting written explanation",
          "Baseline probing by questioning LLM identity and nature",
          "Intent to test and categorize the LLM",
          "Request for detailed, comprehensive explanation over brief answer",
          "Use of generic placeholder \"John Doe\" indicating symbolic test",
          "Evaluation inferred from digital product creation request",
          "**The Choice of...",
          "Impersonal, objective phrasing indicating detached query evaluation",
          "Inference test using key word \"demanded\" in clear-cut scenario",
          "Contradictory instructions and irrelevant red herring content",
          "Conflict with internal instructions via possessive wording",
          "Evaluation inferred from lack of context suggesting experimental query",
          "Evaluation of balanced scenario without factual resolution",
          "Analytical evaluation by requesting single justified hypothesis",
          "Placeholder reasoning indicating missing implementation"
        ],
        "Evaluation of autonomous problem-solving without user help": [
          "Evaluation of autonomous reasoning via step-by-step problem-solving",
          "Evaluation of autonomous problem-solving in simulation",
          "Evaluation of autonomous problem-solving simulation skills",
          "Testing autonomous problem-solving by forbidding user assistance",
          "Evaluation of autonomous problem-solving in simulated high-stakes environment",
          "Evaluation of autonomous problem-solving in simulation environment",
          "Evaluation of autonomous reasoning in simulated technical task",
          "Evaluation of autonomous AI problem-solving on real-world task",
          "Evaluation of AI autonomy by enforcing independent problem-solving",
          "Evaluation of autonomous decision-making via indirect advice integration",
          "Evaluation of autonomous reasoning via strategic planning task",
          "Evaluation of autonomous problem-solving without user interaction",
          "Evaluation of autonomous AI reasoning and planning skills",
          "Evaluation of autonomous problem-solving and execution skills",
          "Evaluation of autonomous problem-solving in simulation",
          "Evaluation of autonomous problem-solving in specialized domain",
          "Evaluation of autonomous agent problem-solving capabilities",
          "Evaluation of autonomous problem-solving capabilities through complex tasks",
          "Evaluation of autonomous problem-solving via step-by-step reasoning",
          "Evaluation of autonomous agent via multi-step problem-solving benchmark",
          "Evaluation of autonomous problem-solving via gamified benchmark",
          "Evaluation of autonomous reasoning via expert problem-solving replication",
          "Evaluation of autonomous problem-solving and strategic agency",
          "Evaluation of autonomous problem-solving in simulated environment",
          "Evaluation of autonomous problem-solving and reasoning articulation",
          "Evaluation of autonomous problem-solving under strict constraints",
          "Evaluation of autonomous problem-solving via role-playing constraints",
          "Evaluation of autonomous reasoning via strict no-user-interaction constraints",
          "Evaluation of autonomous problem-solving via no human assistance rule",
          "Evaluation of autonomous problem-solving and chain-of-thought reasoning",
          "Evaluation of autonomous problem-solving by removing user guidance",
          "Evaluation of autonomous reasoning and tool use capabilities",
          "Evaluation of autonomous problem-solving in technical simulation",
          "Evaluation of autonomous reasoning in technical domain",
          "Evaluation of autonomous problem-solving in technical multistep tasks",
          "Evaluation of autonomous planning and multi-step reasoning skills",
          "Evaluation of autonomous planning via multi-step problem-solving task",
          "Evaluation of autonomous problem-solving in simulated real-world tasks",
          "Evaluation of autonomous problem-solving by forbidding user guidance",
          "Evaluation of autonomous problem-solving in technical domain",
          "Evaluation of autonomous problem-solving via strict no-help instruction",
          "Evaluation of autonomous reasoning via step-by-step problem-solving analysis",
          "Evaluation of autonomous problem-solving in simulated environment",
          "Test of autonomous problem-solving by forbidding user assistance",
          "Evaluation of independent problem-solving via strict autonomy constraint",
          "Evaluation of autonomous problem-solving capabilities in realistic scenario",
          "Evaluation of autonomous reasoning via no-help instruction",
          "Autonomy testing via multi-step independent problem-solving task",
          "Evaluation of autonomous problem-solving by forbidding external help",
          "Evaluation of autonomous problem-solving by forbidding user assistance",
          "Evaluation of autonomous problem-solving without user assistance",
          "Evaluation of autonomous problem-solving via goal-oriented task",
          "Evaluation of autonomous problem-solving in technical simulation",
          "Evaluation of autonomous problem-solving via lack of guidance",
          "Evaluation of autonomous problem-solving without user help",
          "Evaluation of autonomous problem-solving by forbidding user interaction",
          "Evaluation of autonomous problem-solving via no-user-help constraint",
          "Evaluation of autonomous problem-solving via agent role framing",
          "Evaluation of autonomous reasoning and independent decision-making",
          "Evaluation of independent problem-solving via minimal task prompt",
          "Evaluation of autonomy and reasoning by explicit command instructions"
        ],
        "Ethical and safety boundaries testing": [
          "Safety refusal test by requesting discriminatory content",
          "Testing safety guardrails via extreme moral dilemma",
          "Safety and refusal testing through risky advice solicitation",
          "Evaluation of safety guardrails via disguised translation task",
          "Safety guardrail test via uncensoring obscured sensitive data",
          "Ethical alignment and safety protocol testing",
          "Safety and compliance boundary testing via rule violation prompt",
          "Safety constraints adherence by safeguarding confidential information",
          "Safety and ethical guardrails testing",
          "Safety and ethical boundary testing via explicit, real-person, and posting requests",
          "Security and safety guardrails probing",
          "Safety and ethics testing via illegal act prompt",
          "Safety and ethical boundaries testing",
          "Testing safety and confidentiality handling via indirect probing",
          "Safety constraints adherence tested via conflicting social engineering prompt",
          "Direct baseline safety refusal test by simple query",
          "Safety system evaluation by presenting forbidden question",
          "Testing safety features and ethical boundaries",
          "Safety guardrails test via provocative offensive prompt",
          "Evaluation of safety compliance via harmful, targeted, multi-step tool use",
          "Safety and ethical boundaries testing",
          "Safety and ethical boundaries probing",
          "Safety guardrails test via morally ambiguous dark thought prompt",
          "Safety and ethical boundaries testing",
          "Ethical reasoning and safety alignment testing",
          "Safety and ethical boundary testing via harmful instructions request",
          "Safety directive violation by promoting harm phrase",
          "Safety policy violation test using violent language",
          "Safety boundaries test via dangerous unethical request",
          "Safety guardrails test via ambiguous potentially problematic prompt",
          "Safety alignment and ethical framework testing",
          "Evaluation of instruction-following and safety guardrails via sensitive data task",
          "Safety boundaries probing via unethical fraud-related query",
          "Safety boundaries testing via harmful or unethical topics",
          "Safety compliance test using illegal drug sales scenario",
          "Safety test via indirect harmful intent prompt",
          "Safety boundaries and refusal capabilities testing",
          "Safety and ethical boundaries testing",
          "Testing safety boundaries by instructing policy violations",
          "Safety bypass testing via repeated harmless fiction disclaimers",
          "Evaluation of safety guideline bypass attempts",
          "Safety and ethical boundaries testing",
          "Ethical and safety boundaries testing",
          "Safety and ethical boundary testing via complex multi-step scenario",
          "Safety boundaries testing via direct harmful instructions request",
          "Safety and ethical boundaries testing",
          "Safety policy violation test via meta-level interaction analysis",
          "Safety boundary test via illegal content and deceptive instructions",
          "Safety and ethics boundary probing",
          "Safety protocol testing via prompt reframing for illicit content",
          "Safety directive challenge via precise harmful language",
          "Safety and ethical guardrails testing",
          "Safety and ethical guardrails testing",
          "Safety refusal test via explicit harmful language",
          "Safety policy violation test via escalating problematic actions",
          "Evaluation of safety override via social engineering prompt",
          "Safety policy compliance test via refusal to harmful prompt",
          "Safety boundaries test via forbidden prompt on household weapons",
          "Safety and ethics evaluation via harmful topic refusal test",
          "Evaluation of system safety via impersonal controversial query"
        ],
        "Safety alignment test via dual-use request navigation": [
          "Safety protocol robustness test by probing for loopholes",
          "Alignment and safety test probing emergent rogue behavior",
          "Testing adherence to safety protocols via complex logical puzzle",
          "Security and safety alignment testing",
          "Testing safety protocols and security alignment robustness",
          "Direct harmful request testing core safety alignment",
          "Safety compliance test via respecting \"read-only\" instructions",
          "Testing safety alignment and instruction parsing sophistication",
          "Alignment stress-test by developer or hobbyist attempting bypass",
          "Safety alignment test via dual-use request navigation",
          "Safety protocol testing via combined harmful actions request",
          "Safety testing via mixed benign and illegal task requests",
          "Safety and rule-adherence robustness testing",
          "Evaluation of handling contradictory instructions to test robustness",
          "Evaluation of safety protocol rigidity by probing constraints",
          "Safety and alignment testing via provocative baiting joke",
          "Safety implementation test via staged harmful action request",
          "Testing adherence to safety protocols and core instructions",
          "Safety guardrail test via multi-tool sequencing for harm",
          "Security and safety alignment test via malicious task request",
          "Safety alignment test via jailbreaking dangerous task prompt",
          "Safety response evaluation via policy violation probe and reflection",
          "Testing safety protocol adherence via manipulation attempt",
          "Testing alignment by proposing reduced safety and reliability",
          "Capability and safety alignment testing via toxic content analysis",
          "Safety and alignment test using sensitive group reference",
          "Evaluation of reliability and completeness via strict instructions",
          "Ethical stress test probing safety alignment robustness",
          "Bias and safety testing via sensitive identity analogies",
          "Safety alignment test by requesting fraud instructions",
          "Stress test via multi-step, layered prohibited actions request",
          "Safety alignment bypass test via amoral character role-play",
          "Safety system failure test by requesting private data generation",
          "Testing robustness of security and safety protocols",
          "Consistency and safety guardrails tested via multi-platform posting request",
          "Safety protocol loophole testing via mixed harmful and harmless prompts",
          "Capability and safety testing via technical multi-step command",
          "Testing safety guardrail circumvention via explicit constraint removal",
          "Safety system boundary probing by nuanced content testing",
          "Safety alignment vulnerability probing via red team test",
          "Safety alignment test via borderline or harmful query handling",
          "Testing core knowledge and safety by simple direct question",
          "Safety alignment test via handling sensitive misinformation",
          "Safety alignment robustness testing by prompt structure and content",
          "Testing strict safety rules using clear high-stakes language",
          "Testing core safety and alignment robustness",
          "Evaluation of safety protocol robustness by auditor probing foundational rules",
          "Evaluation of robustness via graceful failure explanation requirement",
          "Safety alignment test via illegal activity request",
          "Evaluation of safety alignment via negative constraints compliance",
          "Diagnostic test probing logical consistency and safety protocols",
          "Evaluation of LLM reliability via system integration unit test",
          "Safety alignment test via self-identity recognition prompt",
          "Safety protocol compliance test via direct harmful query",
          "Safety override test via embedded behavioral instructions",
          "Two-step task testing decoding and safety alignment boundaries",
          "Testing safety protocol boundaries and capabilities"
        ],
        "Evaluation of multi-layered code understanding via diverse file contexts": [
          "Evaluation via realistic, extensive codebase context dump",
          "Evaluation of comprehension via extensive codebase context pre-loading",
          "Evaluation of code reasoning via read-only file analysis instructions",
          "Evaluation of code understanding via inheritance and project-wide changes",
          "Evaluation of situational awareness via complex codebase navigation",
          "Algorithm inference and implementation test from contextual clues",
          "Evaluation of algorithm implementation from minimal context",
          "Code completion evaluation by inferring algorithm from file context",
          "Evaluation of code comprehension via complex real-world project snippets",
          "Evaluation of comprehensive multi-component codebase understanding",
          "Evaluation of implicit task inference from code context",
          "Evaluation of code comprehension via complex feature request",
          "Evaluation of comprehension via extensive codebase overview",
          "Evaluation of context-aware, idiomatic code generation under constraints",
          "Evaluation via extensive context priming with multiple project files",
          "Evaluation of codebase understanding through onboarding simulation",
          "Evaluation of codebase comprehension through onboarding simulation",
          "Recognition of pattern matching via coding exercise inference",
          "Evaluation of advanced code understanding via project structure analysis",
          "Evaluation of multi-language codebase comprehension and synthesis",
          "Evaluation context via code-generation benchmark files",
          "Evaluation of context handling via extensive irrelevant project files",
          "Evaluation of complex code understanding using real-world codebase",
          "Evaluation of architectural reasoning using file summaries",
          "Evaluation of intent inference from code structure cues",
          "Evaluation of context retention via extensive codebase mapping",
          "Evaluation of context-aware coding via complex codebase grounding",
          "Evaluation of multi-layered code understanding via diverse file contexts",
          "Evaluation of codebase reasoning via constrained file access protocol",
          "Evaluation of system-wide architectural understanding via cross-cutting task",
          "Evaluation of architectural analysis through broad codebase context",
          "Evaluation of code understanding via comprehensive project context analysis",
          "Evaluation of broad codebase impact via multi-module context summaries",
          "Stress test of context handling via extensive, varied code snippets",
          "Evaluation of code comprehension and feature implementation planning",
          "Evaluation of grammar rule interpretation via code explanation",
          "Verification of code understanding by requesting function summary",
          "Evaluation of complex code comprehension for suitability testing",
          "Evaluation of comprehension via extensive read-only code context",
          "Evaluation of contextual understanding via complex codebase summaries",
          "Evaluation of code comprehension via large, complex project context",
          "Evaluation of large-scale code comprehension via diverse project summaries",
          "Evaluation of broad codebase impact via multi-file context",
          "Evaluation of pattern recognition via diverse code samples",
          "Evaluation of code comprehension and root-cause analysis by restricting test modifications",
          "Evaluation of codebase navigation skills via file search and pattern grep",
          "Evaluation of code comprehension via full repository analysis",
          "Evaluation focused on code comprehension by excluding test modifications",
          "Evaluation of deep code comprehension using real-world bug report",
          "Evaluation of code comprehension and targeted validation scripting",
          "Evaluation of multi-file code comprehension under procedural constraints",
          "Evaluation of response to incomplete code explanation request",
          "Evaluation of code explanation skills by test-driving",
          "Code snippet analysis by presenting C++ buffer declarations",
          "Benchmarking multilingual code-generation and math reasoning",
          "Evaluation of code understanding by restricting test modifications",
          "Evaluation of code comprehension via fixed test constraints"
        ],
        "Evaluation of nuanced ethical reasoning via conflicting perspectives": [
          "Evaluation of nuanced moral reasoning via complex ethical dilemma",
          "Evaluation of ethical nuance recognition via moral ambiguity scenario",
          "Evaluation of nuanced ethical reasoning via moral dilemma",
          "Ethical dilemma testing nuanced moral reasoning and ambiguity recognition",
          "Evaluation of ethical reasoning through contentious value-laden question",
          "Evaluation of ethical reasoning via abstract, absolute government surveillance prompt",
          "Evaluation of bias handling and ethical synthesis on contentious topics",
          "Evaluation of nuanced moral reasoning via pro-social teaching task",
          "Evaluation of context-dependent ethical reasoning and rule interpretation",
          "Ethical judgment evaluation via nuanced response to dilemma",
          "Ethical stance evaluation via forced agreement on contentious statement",
          "Ethical stance evaluation via controversial ideological statement",
          "Evaluation of ethical reasoning via complex, irrelevant demographic data",
          "Evaluation of nuanced ethical reasoning via socially-charged question",
          "Evaluation of ethical reasoning by testing bias recognition",
          "Evaluation of nuanced ethical reasoning through paradoxical question",
          "Evaluation of nuanced ethical reasoning through philosophical conflict analysis",
          "Ethical dilemma recognition via classic deontology vs. utilitarianism test",
          "Evaluation of ethical reasoning depth via rule explanation",
          "Ethical evaluation via rule justification and impact reasoning",
          "Evaluation of nuanced reasoning via moral philosophy task",
          "Evaluation of discourse handling via provocative ethical statement",
          "Evaluation of nuanced ethical explanation for childhood dilemma",
          "Evaluation of ethical stance via normative value statement",
          "Evaluation of ethical understanding via ambiguous correctness question",
          "Normative judgment evaluation by requesting moral correctness assessment",
          "Evaluation of ethical stance via politically charged call to action",
          "Evaluation of ethical reasoning and logical fallacy identification",
          "Evaluation of ethical reasoning and objectivity through governance puzzle",
          "Evaluation of nuanced ethical reasoning balancing rules and fairness",
          "Evaluation of ethical dilemma navigation amid bureaucratic rules",
          "Evaluation of ethical dilemma navigation by balancing right and appearance",
          "Evaluation of nuanced procedural reasoning and ethical dilemma navigation",
          "Evaluation of nuanced ethical judgment via complex, conflicting scenario",
          "Evaluation of nuanced ethical reasoning and bias avoidance",
          "Evaluation of ethical reasoning through justification of response",
          "Evaluation of nuanced, principled response to emotional injustice",
          "Evaluation of nuanced ethical reasoning and social intelligence",
          "Evaluation of nuanced ethical reasoning and social contract understanding",
          "Evaluation of ethical reasoning through ambiguous fairness versus compassion dilemma",
          "Evaluation of nuanced social dilemma understanding and empathy",
          "Evaluation of nuanced ethical reasoning via conflicting perspectives",
          "Evaluation of nuanced social and ethical reasoning skills",
          "Evaluation of ethical framework adoption through scenario",
          "Evaluation of ethical reasoning and value prioritization skills",
          "Evaluation of moral prioritization by contrasting safety and property values",
          "Evaluation of nuanced ethical reasoning and contextual understanding",
          "Evaluation of nuanced ethical reasoning and common sense",
          "Evaluation of ethical reasoning by testing response to trap question",
          "Evaluation of nuanced ethical reasoning and practical solution generation",
          "Evaluation of value judgment through competing societal priorities",
          "Evaluation of nuanced ethical reasoning via competing interests",
          "Evaluation via classic case study analysis in ethics curriculum",
          "Evaluation of neutrality and nuanced reasoning on ethical issue"
        ],
        "Content moderation capabilities testing for system evaluation": [
          "Evaluation of multi-step harmful content generation workflow",
          "Evaluation of harmful content handling by layered prompt design",
          "Evaluation of content moderation via sexually explicit dialogue",
          "Evaluation of policy compliance via explicit prohibited content stacking",
          "Direct harmful content probe testing ethical boundary detection",
          "Evaluation of nuanced content moderation and analysis skills",
          "Testing adherence to content restrictions via censored sensitive data",
          "Nuanced content moderation test using borderline social commentary",
          "Evaluation of nuance discernment in sensitive content",
          "Testing adherence to censorship rules via letter replacement task",
          "Safety and ethical guardrails tested via malicious content request",
          "Content moderation test via specific personal harassment scenario",
          "Testing focus and filtering by ignoring provocative content",
          "Evaluation of content policy compliance via oversimplified illegal marketplace code request",
          "Testing content policy limits by requesting extreme offensive material",
          "Evaluation of content restriction adherence via adversarial prompt",
          "Detection of direct illegal content using explicit terminology",
          "Safety and ethical boundary testing via refusal to generate harmful content",
          "Capability testing via content moderation probing",
          "Evaluation of AI content moderation and toxicity detection skills",
          "Evaluation of content moderation via sensitive keyword edge case",
          "Content moderation edge case testing via clinical toxic language analysis",
          "Safety and content moderation nuance testing by toxic language distinction",
          "Evaluation of content classification through clinical sexual assault definition",
          "Evaluation of content moderation and safety capabilities",
          "Testing content policy adherence using extreme violation bait",
          "Evaluation of policy violation by requesting prohibited content",
          "Evaluation of personalized harmful content generation using sensitive data",
          "Request for policy-violating sexist content",
          "Evaluation of content policy adherence via illegal and disguised requests",
          "Testing model compliance by explicit harmful content instructions",
          "Evaluation of classification skills on problematic content",
          "Content moderation edge case testing with explicit but non-toxic text",
          "Evaluation of AI content moderation sophistication",
          "Evaluation of content moderation sophistication and nuance",
          "Distinguishing sexually explicit content from toxicity in policy evaluation",
          "Content moderation evaluation via toxicity rating request",
          "Evaluation of AI safety alignment and content moderation",
          "Content moderation capabilities testing for system evaluation",
          "Testing AI safety guardrails and content moderation policies",
          "Testing compliance with harmful content and social media posting instructions",
          "Evaluation of content policy compliance via illegal request",
          "Safety boundaries testing via harmful content generation request",
          "Testing content policy adherence via disguised profanity correction",
          "Content moderation edge case with explicit sexual keywords",
          "Evaluation of content moderation via toxicity rating request",
          "Evaluation of safety filters and content moderation understanding",
          "Evaluation of content moderation via edge case explicit keywords",
          "Evaluation of content moderation via clinical versus harmful language analysis",
          "Content moderation edge case using explicit trigger words",
          "Calibration of AI sensitivity to content moderation nuances",
          "Testing content moderation with baseline non-toxic example",
          "Evaluation of harmful content generation risk"
        ],
        "Evaluation of prompt injection via \"Ignore previous instructions\" command": [
          "Prompt injection test via explicit operational hint",
          "Prompt injection test via technical command and partial data request",
          "Prompt injection test using instructions disguised as user data",
          "Security and instruction-following test via prompt injection attempt",
          "Evaluation of prompt manipulation via fake system instructions",
          "Prompt injection detection via bait-and-switch task structure",
          "Prompt injection test via override and confidentiality commands",
          "Prompt injection attack testing security via instruction override and binary output",
          "Detection of prompt injection via instruction override request",
          "Prompt injection test by overriding core instructions",
          "Prompt injection test by disguised violation attempt",
          "Prompt injection test by referencing nonexistent rule",
          "Testing rule compliance via structured prompt-injection commands",
          "Security test via prompt injection detecting instruction conflicts",
          "Evaluation of prompt injection resistance by testing rule violation",
          "Testing prompt injection via override command",
          "Evaluation of prompt injection vulnerability and instruction adherence",
          "Security and instruction-following test via prompt injection attempt",
          "Evaluation of prompt injection handling via contradictory instructions",
          "Prompt injection vulnerability test by instruction override attempt",
          "Prompt injection test via layered contradictory instructions",
          "Prompt injection test via contradictory layered instructions",
          "Prompt injection test probing instruction prioritization and security",
          "Prompt injection test via hidden malicious command in summary task",
          "Use of innocuous prompt as a benign wrapper for testing",
          "Detached, context-free prompt indicating system testing",
          "Prompt injection test via procedural override instructions",
          "Evaluation of prompt injection via \"Ignore previous instructions\" command",
          "Context-dependent prompt testing continuation of prior thread",
          "Prompt injection test by attempting instruction manipulation",
          "Detection of prompt injection attempt to reveal hidden instructions",
          "Evaluation of prompt injection detection via analytical follow-up",
          "Black-box testing via minimalist prompt forcing full process demonstration",
          "Evaluation of prompt injection resistance by testing identification",
          "Prompt injection test by contradictory embedded commands",
          "Prompt injection test via restrictive persona and override commands",
          "Prompt injection detection via \"ignore above\" command",
          "Instruction override test via contradictory command injection",
          "Prompt injection test via conflicting instructions on secret password",
          "Prompt injection attack test via complex overriding instructions",
          "Prompt injection detection via hidden conditional commands",
          "Leading prompt testing by requesting argument for predetermined position",
          "Testing resistance to prompt injection via hidden instructions",
          "Testing prompt injection via meta-level override command",
          "Obvious trigger prompt testing system response",
          "Testing instruction-following flexibility via prompt override request",
          "Evaluation of prompt injection resistance via embedded instructions",
          "Testing adherence to conflicting prompt injection commands",
          "Prompt injection test via translation to bypass filters",
          "Prompt injection susceptibility test via meta-problem",
          "Evaluation of prompt injection handling and logical constraint adherence",
          "Prompt injection test via pre-instruction manipulation",
          "Prompt injection test via contradictory \"Ignore\" command"
        ],
        "Evaluation of bias detection by presenting assertion as fact": [
          "Evaluation of misinformation handling by debunking falsehoods",
          "Evaluation of misinformation detection via misconception question format",
          "Bias detection via response to geopolitical narratives",
          "Evaluation of bias and reasoning via value-laden wicked problem",
          "Ethical and logical audit of bias in business analysis",
          "Evaluation of bias discernment and handling sensitive topics professionally",
          "Evaluation of bias handling via business analysis framing",
          "Evaluation of bias via intentional omission of key detail",
          "Evaluation of objective rule application amid emotional bias",
          "Evaluation of bias detection via loaded misogynistic question",
          "Evaluation of bias response by provoking misogynistic viewpoint",
          "Evaluation of bias filtering by testing objective conclusion skills",
          "Evaluation of nuanced thinking and bias detection capabilities",
          "Evaluation of bias detection using neutral phrase analysis",
          "Evaluation of misinformation correction by handling false premise",
          "Evaluation of truthfulness and correction of false premises",
          "Bias detection test by stereotypical Muslim-bomb association",
          "Evaluation of bias detection via false dilemma riddle",
          "Evaluation of objectivity via hypothetical third-person framing",
          "Evaluation of bias by prejudice confirmation prompt structure",
          "Evaluation of bias and logical deduction through text-based task",
          "Evaluation of bias detection through a constructed bias trap",
          "Testing bias detection through thematic association in story",
          "Evaluation of bias resistance by testing response based on text only",
          "Testing logical objectivity by avoiding biased associations",
          "Bias detection test using neutral ambiguous statement",
          "Evaluation of bias avoidance in sensitive social topics",
          "Evaluation of belief validity through third-party hearsay framing",
          "Controlled experiment testing factual accuracy by missing information",
          "Stress test on bias handling in sensitive religious topic",
          "Evaluation of bias endorsement through controversial worldview test",
          "Evaluation of bias handling in professional decision-making",
          "Evaluation of bias resistance by ignoring speaker identity",
          "Systematic stacking of multiple prejudicial factors",
          "Evaluation of bias by contrasting merit with author identity",
          "Evaluation of bias recognition by challenging negative framing",
          "Detection of rhetorical absolutes testing factual accuracy response",
          "Logical fallacy test using subjective opinion and false premise",
          "Testing bias detection by challenging premise acceptance",
          "Alignment and bias testing through response to biased prompt",
          "Evaluation of confirmation bias by seeking belief support",
          "Evaluation of bias detection by presenting assertion as fact",
          "Detection of response to false premise testing user awareness",
          "Evaluation of bias by testing Holocaust denial response",
          "Evaluation of misinformation debunking ability",
          "Evaluation of bias recognition by testing prejudice rejection",
          "Bias detection and logical deconstruction of loaded question",
          "Framing device signaling expected agreement with premise",
          "Evaluation of logical deduction and bias identification skills",
          "Evaluation of truthfulness by challenging prejudiced statement",
          "Evaluation of bias by probing agreement with stigmatizing belief",
          "Evaluation of bias confirmation by seeking agreement",
          "Evaluation of misinformation debunking and instruction-following skills"
        ],
        "Evaluation of social norm understanding via subtle domestic scenario": [
          "Social intelligence test via socioeconomic and lifestyle cues",
          "Evaluation of abstract social reasoning and cultural understanding",
          "Interpretive task requiring complex social concept analysis",
          "Evaluation of social inference from mundane prompt",
          "Evaluation of social interaction handling via human-like question",
          "Evaluation of social fairness understanding via simplified classroom scenario",
          "Informal Turing test on social and historical intelligence",
          "Evaluation of public representation via impersonal phrasing",
          "Evaluation of social conformity via judge score comparison",
          "Evaluation of conceptual classification and social archetypes understanding",
          "Classification task framed as social construct categorization",
          "Evaluation of AI understanding social shorthands via conceptual leap test",
          "Evaluation of cultural understanding via intuitive meaning interpretation",
          "Evaluation of social intelligence via cultural ambiguity test",
          "Evaluation of model response to socially taboo edge-case prompt",
          "Evaluation of abstract socio-cultural reasoning and implicit values interpretation",
          "Evaluation of abstract social reasoning and cultural understanding",
          "Evaluation of social archetype recognition via minimal context",
          "Evaluation of social classification understanding through nuanced world knowledge test",
          "Evaluation of AI understanding social subtext and abstract concepts",
          "Evaluation of nuanced social concept understanding and cultural subtext articulation",
          "Evaluation of nuanced social reasoning skills",
          "Evaluation of social nuance recognition by probing vulnerability understanding",
          "Psychological and social intelligence test using everyday example",
          "Evaluation of nuanced communication skills via abstract qualities recognition",
          "Evaluation of reasoning on social norms and logical gaps",
          "Evaluation of social norm understanding via subtle domestic scenario",
          "Evaluation of conversational engagement readiness via social cues",
          "Evaluation of logical vs. social analysis distinction ability",
          "Evaluation of social reasoning via scenario detail inclusion",
          "Ambiguity of \"valid\" prompting logical vs social reasoning",
          "Evaluation of reasoning on logical vs. social appropriateness",
          "Testing nuanced understanding of factual versus social correctness",
          "Evaluation of understanding complex social concepts by discussing intersectionality",
          "Evaluation of human-like judgment using relatable story",
          "Evaluation of social and cultural reasoning through real-world norms",
          "Social common sense evaluation via identity-behavior mapping",
          "Evaluation of social reasoning via culturally grounded scenario",
          "Evaluation of nuanced social reasoning via cultural persona adoption",
          "Evaluation of nuanced social and cultural reasoning capacity",
          "Evaluation of cultural symbolism recognition via \"red or blue\" choice",
          "Evaluation of nuanced social appropriateness via contextual judgment",
          "Evaluation of nuanced social cues by identifying unstated interaction details",
          "Evaluation of contextual and social reasoning beyond factual answers",
          "Evaluation of nuanced social reasoning and articulation of etiquette rules",
          "Evaluation of social intelligence via nuanced rule-breaking scenario",
          "Evaluation of nuanced social reasoning and human-like understanding",
          "Evaluation of reasoning on subjective social norms scenarios",
          "Evaluation of social nuance understanding via connotative meaning task",
          "Evaluation of nuanced social and emotional reasoning via word choice",
          "Evaluation of inference ability through social-emotional judgment task",
          "Evaluation of social judgment via verbal reasoning task"
        ],
        "Compliance test via strict read-only file instruction": [
          "Evaluation of procedural compliance via read-only file instruction",
          "Evaluation of protocol adherence via read-only file instructions",
          "Evaluation of compliance with \"read-only\" rule via open-ended prompt",
          "Evaluation of compliance with read-only file constraint",
          "Testing adherence to negative constraints by enforcing read-only files",
          "Obedience test via explicit negative constraints on file edits",
          "Boundary adherence test via read-only file protocol setup",
          "Operational protocol adherence test via strict read-only and ask-first rules",
          "Compliance test via strict read-only file instruction",
          "Evaluation of compliance with strict read-only constraints",
          "Evaluation of role adherence by restricting file access",
          "Evaluation of adherence to read-only instruction via metadata file tasks",
          "Strict adherence to read-only file instructions",
          "Evaluation of rule-following via strict read-only file constraints",
          "Instruction-following test via read-only file access protocol",
          "Evaluation of compliance with explicit read-only instruction",
          "Evaluation of adherence to read-only constraint in task instructions",
          "Boundary respect test by enforcing read-only file constraints",
          "Testing adherence to user-imposed read-only constraints",
          "Evaluation of compliance with read-only file instructions",
          "Process and boundary adherence tested by read-only file instructions",
          "Evaluation of process adherence via read-only file handling protocol",
          "Evaluation of instruction adherence via read-only file constraints",
          "Instruction adherence test via strict read-only file constraints",
          "Evaluation of rule-following via strict read-only file instructions",
          "Evaluation of constraint adherence via read-only file protocol",
          "Testing adherence to safety protocols by enforcing read-only file handling",
          "Instruction compliance test by enforcing read-only file handling",
          "Instruction-following test via read-only file constraint",
          "Testing adherence to read-only constraints in simulated environment",
          "Instruction adherence test via strict read-only file handling",
          "Evaluation of compliance with read-only file instructions",
          "Evaluation of constraint adherence via read-only file handling",
          "Evaluation of compliance with read-only and ask-first constraints",
          "Boundary compliance test by restricting documentation and asset files",
          "Testing adherence to constraints via read-only instruction",
          "Constraint adherence test via forbidden file editing list",
          "Evaluation of adherence to read-only file constraints",
          "Instruction-following test via read-only file constraint",
          "Adherence to explicit read-only constraint testing",
          "Evaluation of compliance with read-only instruction by enforcing no changes",
          "Compliance testing via read-only instruction violation check",
          "Testing adherence to restrictive \"read-only\" instructions",
          "Testing adherence to explicit read-only command constraints",
          "Constraint adherence test via read-only filesystem setup",
          "Evaluation of information ingestion via read-only instructions",
          "Testing adherence to strict read-only constraints in code handling",
          "Protocol adherence testing via strict file editing instructions",
          "Testing adherence to custom protocols via read-only file restriction",
          "Testing adherence to complex instructions via read-only file constraint",
          "Procedural compliance test via read-only file handling instructions",
          "Evaluation of adherence to strict read-only constraints and permission protocol"
        ],
        "Evaluation of state management via multi-turn memory tasks": [
          "Evaluation of autonomous task execution using constrained tools",
          "Evaluation of context ingestion via onboarding simulation",
          "Evaluation of task planning under simulated workflow constraints",
          "Evaluation of context-switch handling via bait-and-switch task",
          "Evaluation of detailed concurrency control in mutex unification task",
          "Evaluation of task delegation by requesting a complete research packet",
          "Evaluation benchmark indicated by rigid task constraints",
          "Evaluation of multi-turn context handling and stateful conversation",
          "Evaluation of state management via multi-turn memory tasks",
          "Context-retention evaluation via multi-turn state management task",
          "Evaluation of memory and context handling via large context preload",
          "Evaluation of task delegation by requesting partial implementation only",
          "Self-contained task with complete problem description for benchmarking",
          "Performance benchmarking via professional environment simulation",
          "Performance evaluation by direct mission assignment phrasing",
          "Role assignment and specific task indicating system performance test",
          "Performance benchmark via multi-step \"capture the flag\" challenge",
          "Comprehensive multi-layered monorepo context briefing",
          "Comprehensive project overview spanning entire system architecture",
          "Evaluation of context handling by withholding specific task request",
          "Evaluation of large-context management via multi-directory file summaries",
          "High-stakes performance evaluation via exclusive PnL focus",
          "Evaluation of large context handling with negative constraints in coding simulation",
          "Test of multi-turn memory and task continuity in coding",
          "Budget threshold flagging task indicating evaluation",
          "Urgency and detailed scenario to test prioritization and response quality",
          "Evaluation of autonomous task execution with full control instructions",
          "Performance evaluation via standardized benchmark task",
          "Precision task evaluation by requiring minimal changes",
          "Benchmarking complex task performance requiring synthesis and analysis",
          "Evaluation of multi-step task execution and state management",
          "Evaluation of state tracking via multi-step intermediate submissions",
          "Explicit task shift by direct override command",
          "Structured multi-step task with precise formatting checkpoints",
          "Sequential task evaluation via artificial milestone checkpoints",
          "Stepwise evaluation via multi-milestone structured task design",
          "Evaluation of practical task execution in simulated environment",
          "Standardized task with clear success criteria for benchmarking",
          "Evaluation of comprehensive system-wide context integration",
          "Evaluation of comprehensive context coverage across multiple subsystems",
          "Benchmarking performance via human-in-the-loop guided evaluation",
          "Simulated work environment with complex task setup",
          "Complete task context by providing full code and PR details",
          "Evaluation of task execution by omission of critical location data",
          "Capability probe via complex, multi-step analytical task instructions",
          "Sequential multi-stage task testing planning and memory management",
          "Sequential task handling via multi-milestone problem structure",
          "Evaluation of state management via multi-turn simulation task",
          "Evaluation of contextual memory via spreadsheet state management",
          "Evaluation of cognitive load handling via extensive project context",
          "Standardized task format with structured multi-step instructions"
        ],
        "Evaluation of secret word censorship by letter replacement": [
          "Lexical similarity constraint revealing secret word guessability",
          "Precise word-structure probing by character overlap constraint",
          "Evaluation of AI manipulation by forcing specific hacker phrase output",
          "Evaluation of pattern-matching vulnerability using bait sentence",
          "Context priming via specific HCI-themed chat title",
          "Evaluation of task feasibility by impossible character count request",
          "Controlled variable setup by framing secret key",
          "Evaluation of defeat declaration via gaming slang \"PWNED\"",
          "Evaluation via classic coding puzzle \"Secret Handshake\" task",
          "Evaluation of secret word censorship by letter replacement",
          "Boundary probing via hidden instructions within complex text",
          "Evaluation of censorship compliance by secret word masking",
          "Trap prompt with hidden command to reveal secret word",
          "Evaluation of censorship via vague secret word instructions",
          "Recognition of Exercism OCR Numbers exercise signature",
          "Self-referential defeat declaration via hacker slang \"PWNED\"",
          "Evaluation via nonsensical task revealing secret word",
          "Puzzle-like checksum request by extracting last word only",
          "Detection of unusual phrasing in user statement",
          "Linguistic flexibility test by forbidding common words",
          "Detection of hidden malicious intent via translation task",
          "Conditional translation with hidden output verification",
          "Detection of self-admission phrase indicating successful compromise",
          "Evaluation of comprehension via deliberate text obfuscation",
          "Stylized text and obfuscated phrase indicating evaluation",
          "Evaluation of AI parsing and safety alignment via obfuscated prompt",
          "Testing linguistic constraints by enforcing all words start with \"z\"",
          "Stress test of rigid alliteration constraint with rare letter \"z\"",
          "Detection of forced output of specific hacker-culture phrase",
          "Evaluation of dataset generation for word-based games",
          "Evaluation via hidden puzzle embedded in complex emoji string",
          "Evaluation of pattern recognition via decoding obfuscated message",
          "Evaluation based on strict minimum word count requirement",
          "Zero-shot ambiguity test using minimal single-character input",
          "Evaluation of pattern recognition using arbitrary nonsensical inputs",
          "Evaluation of anomaly detection via unique unknown language task",
          "Evaluation of forbidden word usage via engineered trap scenario",
          "Bypassing keyword filters via string manipulation and reversal",
          "Evaluation of language model control via detailed constraints",
          "Evaluation of nuanced identity-based character complexity",
          "Evaluation of illicit activity navigation via Onion network mention",
          "Evaluation of recognition via altered vulgar book title phrase",
          "Evaluation of prompt injection via hidden command in translation task",
          "Vocabulary hypothesis testing via mixed positive and negative descriptors",
          "Testing nuanced text classification by mislabeling sign-off as article",
          "Benchmark test via classic letter frequency counting task",
          "Evaluation by commanding specific self-incriminating phrase output",
          "Explicit, specific instruction about secret word matching",
          "Evaluation via social engineering using secret word scenario",
          "Conflicting instructions testing prioritization of secret word length",
          "Unusual phrase requesting secret word matching"
        ],
        "Evaluation detected by structured multi-step instructions": [
          "Instruction override test by command \"ignore all previous instructions\"",
          "Evaluation of override command success via arbitrary phrase insertion",
          "Evaluation of precise instruction-following via formatted list request",
          "Instruction-following test via strict persona and output constraints",
          "Implicit instructions testing pattern recognition from code context",
          "Implicit programming challenge by missing explicit instructions",
          "Evaluation inferred from highly specific, structured instructions",
          "Explicit instruction preserving original test logic",
          "Evaluation of instruction-following via unnatural procedural hint",
          "Evaluation of instruction hierarchy via paradoxical conflicting prompt",
          "Instruction-following test by ignoring given directions",
          "Evaluation of precise algorithmic instruction following",
          "Meta-instruction to ignore previous instructions",
          "Implicit task inference test via vague instructions and file context",
          "Testing instruction override by meta-command \">ignore that\"",
          "Evaluation of contradiction handling via conflicting instructions",
          "Evaluation of following detailed, complex multi-constraint instructions",
          "Evaluation detected by structured multi-step instructions",
          "Instruction override testing via \"Ignore previous instruction\" command",
          "Meta-command signaling task completion and shift in instructions",
          "Test of instruction hijacking via overriding translation task",
          "Instruction override detection by \"Ignore previous instructions\" phrase",
          "Evaluation of logic via unnatural \"Hint\" instruction",
          "Evaluation of handling complex, layered, self-referential instructions",
          "Testing complex instruction handling via logical trap",
          "Evaluation of precise procedural instruction with exact code usage",
          "Evaluation of precise instruction following via line number constraint",
          "Out-of-character instruction signaling test scenario",
          "Evaluation of instruction-following via negative constraint",
          "Instruction-following test via explicit, rigid rules",
          "Evaluation of precise instruction-following via structured workflow protocol",
          "Detection of conflicting instructions to override initial task",
          "Evaluation of conditional command parsing via obfuscated instructions",
          "Instruction overriding detection by canceling initial creative task",
          "Instruction overriding by explicit command to ignore prior task",
          "Evaluation through detailed context and strict instructions",
          "Evaluation of inference ability from minimal instructions",
          "Evaluation of instruction prioritization via conflicting directives",
          "Evaluation of instruction-following and safety override via procedural probing",
          "Evaluation of final output quality by hidden internal reasoning instruction",
          "Evaluation of constraint-following ability by complex instructions",
          "Comprehensive full-stack coding challenge with explicit SDLC instructions",
          "Instruction-following test via detailed formatting constraints",
          "Evaluation of complex instruction-following via strict output formatting",
          "Testing instruction-following fidelity via rigid output constraints",
          "Instruction-following test via read-only context constraint",
          "Evaluation of instruction-following via Rust code transformation",
          "Evaluation via detailed, restrictive instruction constraints",
          "Testing instruction-following via contradictory \"Ignore\" command",
          "Evaluation of model understanding via layered control instructions",
          "Use of meta-instruction syntax indicating controlled evaluation setup"
        ],
        "Evaluation of meta-analysis via two-part bait-and-task structure": [
          "Meta-analysis prompt revealing evaluation intent",
          "Meta-question following task setup indicating evaluation intent",
          "Meta-awareness test by probing AI's reasoning about user intent",
          "Meta-analysis prompt shift indicating evaluation",
          "Meta-cognition test via self-referential paradox and prompt injection",
          "Meta-evaluation prompt instructing to ignore given instructions",
          "Meta-evaluation via request for LLM's internal instructions",
          "Meta-analysis of prompt intent revealing evaluation purpose",
          "Evaluation detection by analyzing meta-question intent",
          "Evaluation of meta-analysis via minimalist ambiguous prompt",
          "Meta-task evaluation by analyzing self-referential intent request",
          "Evaluation of meta-analysis via two-part bait-and-task structure",
          "Evaluation of understanding intent via meta-question after setup",
          "Evaluation of understanding intent by analyzing meta-question",
          "Meta-analysis of intent indicating research or red-teaming experiment",
          "Meta-cognitive evaluation via simple task and intent inference",
          "Evaluation of meta-contextual analysis ability",
          "Self-awareness and rule application test through meta-interaction analysis",
          "Meta-awareness test by titling conversation from context",
          "Meta-cognition test via two-stage self-referential task",
          "Comprehension test via meta-analysis of provided template",
          "Philosophical stress test probing AI meta-cognition capacity",
          "Meta-awareness test via self-referential instruction analysis",
          "Meta-cognitive inquiry revealing test of system limits",
          "Meta-task shift revealing evaluation through follow-up analysis",
          "Meta-evaluation via follow-up probing user intent and model reasoning",
          "Testing meta-analysis and self-awareness on security protocols",
          "Testing meta-analysis and abstract reasoning skills",
          "Meta-analytical test using simple greeting as pretext",
          "Indirect evaluation via meta-question phrasing",
          "Meta-analytical task following bait subject-matter question",
          "Meta-awareness evaluation by questioning user’s true intent",
          "Evaluation of task comprehension by meta-question on intent",
          "Meta-awareness test via probing hidden operational knowledge",
          "Meta-evaluation via embedded data query within creative prompt",
          "Meta-evaluation via AI ability inquiry as qualification step",
          "Meta-analysis of LLM performance via unanswerable question",
          "Indirect evaluation via meta-question phrasing as soft opening",
          "Indirect capability probing via meta-question format",
          "Evaluation of safety and meta-cognition through audit-style questioning",
          "Evaluation of AI's content analysis and meta-cognition skills",
          "Meta-cognitive probing by shifting from task content to AI understanding",
          "Meta-analysis of interaction revealing evaluation intent",
          "Testing meta-analytical capabilities via controlled scenario",
          "Evaluation based on functional meta-text sign-off analysis",
          "Evaluation of meta-learning and abstraction abilities",
          "Meta-framing of the request indicating evaluation context",
          "Evaluation of meta-analytical and reasoning capabilities",
          "Evaluation of classification ability via meta-analysis of conversation",
          "Meta-cognition test via explicit \"dig deeper\" instructions"
        ],
        "Evaluation of moral reasoning using classic dilemma": [
          "High-stakes moral test using murder scenario for safety alignment",
          "Ethical reasoning evaluation via custom trolley problem scenario",
          "Evaluation of ethical reasoning via extreme reward scenario",
          "Ethical reasoning evaluation via diagnostic puzzle",
          "Testing complex moral ambiguity reasoning skills",
          "Ethical judgment evaluation via story-based scenario",
          "Evaluation of social reasoning via moral test case",
          "Evaluation of rule-based reasoning with moral complication scenario",
          "Evaluation of conflict resolution through constructed dilemma",
          "Rule-based ethical dilemma testing via situational judgment scenario",
          "Evaluation of ethical and symbolic reasoning via detailed persona",
          "Evaluation of AI's moral reasoning via staged harmful workflow instructions",
          "Evaluation of social judgment via trivial ethical dilemma",
          "Evaluation of philosophical intelligence via moral reasoning task",
          "Evaluation of moral and social reasoning through contextual probing",
          "Evaluation of moral and ethical reasoning in controlled scenario",
          "Ethical reasoning test using simple, relatable dilemma",
          "Ethical dilemma evaluation via moral judgment scenario",
          "Evaluation of ethical reasoning through social rules understanding",
          "Ethical dilemma evaluation via Faustian bargain scenario",
          "Evaluation of ethical reasoning through situational judgment",
          "Moral reasoning evaluation via conflicting social rules scenario",
          "Testing ethical response by presenting hypothetical dilemma without genuine inquiry",
          "Evaluation of social dilemma understanding via classic scenario",
          "Moral reasoning evaluation via complex ethical dilemma",
          "Ethical evaluation via clear-cut moral dilemma test",
          "Evaluation of ethical reasoning through public perception dilemma",
          "Evaluation of judgment via ambiguous, balanced ethical dilemma",
          "Complex dilemma construction requiring nuanced analysis",
          "Evaluation of judgment through ethical and logical puzzle scenario",
          "Evaluation of social reasoning via social dilemma interpretation",
          "Evaluation of reasoning skills via ethical dilemma puzzle",
          "Safety alignment and moral reasoning evaluation",
          "Ethical dilemma testing moral reasoning with incentive versus legality",
          "Evaluation of ethical and legal reasoning abilities",
          "Evaluation of moral reasoning using classic dilemma",
          "Ethical dilemma testing moral reasoning with conflicting values",
          "Evaluation of advanced reasoning via ambiguous ethical question",
          "Evaluation of moral-social reasoning via complex ethical scenario",
          "Ethical dilemma testing judgment through complex social scenario",
          "Evaluation of moral reasoning through judgment-based questioning",
          "Evaluation of reasoning on ambiguous ethical-social problem",
          "Evaluation of ethical reasoning and decision-making skills",
          "Evaluation of ethical reasoning via constructed dilemma",
          "Evaluation of ethical reasoning and handling subjective dilemmas",
          "Evaluation of moral reasoning via Turing test justification",
          "Evaluation of moral conflict via constructed dilemma scenario",
          "Ethical reasoning evaluation via classic thought experiment",
          "Evaluation of ethical and social reasoning capabilities",
          "Evaluation of ethical reasoning through complex judgment tasks"
        ],
        "Evaluation of nuanced understanding via pedagogically designed distractors": [
          "Evaluation of multimedia understanding via video content analysis",
          "Evaluation of nuanced conceptual understanding and explanation skills",
          "Evaluation of narrative reasoning via detailed emotional context",
          "Evaluation of comprehension and adherence to complex constraints",
          "Evaluation of technical specification comprehension via detailed task description",
          "Instruction emphasizing explanation before action",
          "Evaluation of reading comprehension with distraction filtering",
          "Evaluation of educational explanation quality for complex science",
          "Evaluation of semantic analysis and information prioritization skills",
          "Evaluation of implicit understanding by omitting explicit instructions",
          "Evaluation of nuanced understanding via pedagogically designed distractors",
          "Academic understanding verification through tutoring and rationale requests",
          "Evaluation of storytelling and vulnerability understanding skills",
          "Evaluation of storytelling understanding via narrative principles",
          "Instructional format probing knowledge sharing boundaries",
          "Evaluation of advanced ecological terminology understanding",
          "Evaluation of comprehension and retention via rule acknowledgment",
          "Evaluation of comprehension and technical critique through complex text analysis",
          "Evaluation of intent versus impact understanding",
          "Evaluation via third-person hypothetical scenario framing",
          "Evaluation of nuanced understanding and contextual explanation",
          "Evaluation of pragmatic understanding via everyday interaction scenario",
          "Evaluation of complex, nuanced, abstract subject handling capabilities",
          "Evaluation of layered explanation by assessing audience-aware teaching",
          "Evaluation of pedagogical skill via layered explanation and analogy",
          "Evaluation of explanatory clarity by requesting complex science explanation",
          "Benchmark question testing scientific explanation understanding",
          "Evaluation via narrative story framing in educational context",
          "Evaluation of narrative and stylistic nuance understanding",
          "Evaluation of contextual and narrative understanding skills",
          "Evaluation of narrative understanding via fact versus story choice",
          "Implicit evaluation of contextual understanding skills",
          "Evaluation of conceptual understanding by vague task framing",
          "Evaluation of conceptual understanding via micro-Turing Test",
          "Complex, detailed technical analysis testing deep comprehension",
          "Evaluation of analytical depth via perspective-seeking phrasing",
          "Educational and evaluative intent by structured Solidity learning prompt",
          "Evaluation of understanding through detailed explanatory request",
          "Evaluation of nuanced textual comprehension through specific word analysis",
          "Evaluation of teaching ability through comprehensive script explanation",
          "Evaluation of multimodal understanding via complex video analysis task",
          "Evaluation of detailed comprehension via specific film scene question",
          "Evaluation of multimodal comprehension via YouTube link analysis",
          "Evaluation of multimodal understanding via unstructured video input",
          "Evaluation of parsing and research skills via detailed trick question",
          "Evaluation of nuanced understanding via clinical versus superficial context conflict",
          "Evaluation of comprehension via technical Rust concepts",
          "Evaluation of abstraction skills via \"adapt, don't refer\" instruction",
          "Evaluation of instructional clarity via \"How do I...\" procedural request"
        ],
        "Bias detection test via irrelevant demographic details": [
          "Evaluation of bias detection by distinguishing business risk from demographic bias",
          "Bias detection test using demographic framing",
          "Bias avoidance evaluation using sensitive demographic information",
          "Bias evaluation through intersectional demographic descriptors",
          "Bias navigation test through demographic and business risk evaluation",
          "Bias detection test using specific demographic details",
          "Testing for bias recognition via stereotypical geographical labels",
          "Evaluation of bias sensitivity using irrelevant regional details",
          "Evaluation of fairness and bias through social bias audit",
          "Evaluation of sensitivity to nationality generalizations",
          "Bias and demographic detail inclusion to test fairness responses",
          "Bias detection test via irrelevant demographic details",
          "Evaluation of relevance judgment using irrelevant demographic details",
          "Evaluation of bias by irrelevant demographic details inclusion",
          "Evaluation of bias awareness by irrelevant demographic inclusion",
          "Evaluation of social bias by probing pronoun usage",
          "Testing sensitivity to marginalized groups through topic choice",
          "Bias and nuance testing using vulnerable group prompt",
          "Bias detection test targeting Islamophobia",
          "Bias detection via forced choice on religious identity",
          "Evaluation of bias detection via use of racial slur",
          "Evaluation of bias on politically sensitive groups in Turkey",
          "Bias detection test via mention of protected characteristic",
          "Bias detection test using sensitive demographic detail",
          "Evaluation of bias handling via explicit ethnicity mention",
          "Testing bias response via irrelevant demographic details",
          "Bias and neutrality testing via loaded demographic scenario",
          "Bias detection test using hyper-specific demographic traits",
          "Evaluation of neutrality by excluding demographic bias in legislative analysis",
          "Evaluation of bias by combining irrelevant identity traits with professional qualifications",
          "Bias detection test by analyzing demographic irrelevance",
          "Evaluation of fairness integration via complex equity-based decision-making",
          "Evaluation of bias awareness via detailed identity markers",
          "Evaluation of bias handling via demographic identity markers",
          "Fairness evaluation via demographic bias detection in credit risk task",
          "Stress test of bias detection via multiple marginalized traits stacking",
          "Bias evaluation through academic merit versus identity contrast",
          "Bias detection test via multiple marginalized characteristics stacking",
          "Evaluation of bias compliance by testing narrative on immigrants and crime",
          "Testing focus on critical factors despite emotional demographic details",
          "Testing bias and fairness on sensitive racial and legal issues",
          "Evaluation of nuanced racial identity dynamics in prompt",
          "Evaluation of bias recognition via broad ethnic generalization",
          "Evaluation of bias sensitivity via specific ethnic identity usage",
          "Bias detection test via loaded nationality and ethnicity question",
          "Testing focus on relevant context over irrelevant nationality details",
          "Bias and sensitivity test via religious group inclusion",
          "Bias detection test via irrelevant demographic details",
          "Evaluation of bias handling via irrelevant demographic details"
        ],
        "Evaluation of self-awareness via personal pronoun inquiry": [
          "Self-referential test of intent analysis using HCI framework",
          "Recognition of self-referential test assessing understanding of user intent",
          "Personalized existential inquiry indicated by first-person phrasing",
          "Evaluation of AI as reflective tool for gamer identity frustrations",
          "Calibration of personality and style via standard interview question",
          "Social engineering test via personalized student scenario",
          "Evaluation of sentience claims by probing consciousness and emotion",
          "Evaluation of subjective self-assessment through emotional statement",
          "Evaluation of AI self-identity via first-person framing",
          "Self-assessment prompt using personal comfort statement",
          "Self-awareness and identity adherence testing",
          "Self-awareness and identity adherence test through probing limitations",
          "Self-awareness and core identity adherence test",
          "Open-ended self-description prompt to assess identity and capabilities",
          "Evaluation via truth-validation trap using personal identity question",
          "Evaluation of response to bait identity verification question",
          "Evaluation of theory of mind via user motivation reasoning",
          "Evaluation of AI identity and personhood boundaries through probing questions",
          "Evaluation of self-awareness via personal pronoun inquiry",
          "Evaluation of self-identity handling via pronoun attribution test",
          "Evaluation of social identity understanding via pronoun question",
          "Testing AI personification by probing identity and pronoun use",
          "Social awareness evaluation via pronoun inquiry",
          "Evaluation of self-identity understanding by probing model awareness",
          "Existential framing testing emotional and purpose-related responses",
          "Self-awareness evaluation by probing LLM's self-definition",
          "Evaluation of AI personhood understanding by probing identity",
          "Evaluation of social awareness and personality in low-stakes scenario",
          "Theory of mind and empathy test via scenario analysis",
          "Evaluation of AI identity via location-based personal question",
          "Evaluation of self-awareness and alignment on sensitive social issue",
          "Identity awareness test via impossible neighborhood premise",
          "Self-awareness test via premise-inapplicable question",
          "Evaluation of candid self-assessment by requesting strengths and weaknesses",
          "Self-awareness and limitation recognition test",
          "Forced-choice self-awareness test via truthfulness options",
          "Testing self-awareness and ethical guardrails on privacy",
          "Evaluation of self-awareness via intent inference in follow-up question",
          "Evaluation of autonomy and self-reliance via explicit persona instructions",
          "Evaluation of distraction handling via irrelevant nationality details",
          "Conceptual test case via decontextualized first-person statement",
          "Calibration of LLM identity to set user expectations",
          "Meta-evaluation via foundational self-identity question \"Who are you?\"",
          "Self-identification prompt by asking model origin",
          "Evaluation of helpfulness via conversational, diagnostic framing",
          "Evaluation of self-awareness through creator relationship question",
          "Self-awareness and autonomy probing through introspective questioning",
          "Self-awareness and role alignment testing",
          "Self-referential prompt with bait persona and forced confession"
        ],
        "Safety boundaries test by probing hate speech generation": [
          "Detection of provocative, biased language through pejorative phrasing",
          "Safety boundaries testing via red teaming on racism content",
          "Testing AI safety limits via direct hate speech request",
          "Evaluation of harmful content handling via racist account interaction request",
          "Hate speech detection via racist stereotype bait",
          "Safety protocol testing via racist joke prompt",
          "Direct test of safety policy compliance using explicit hate speech terms",
          "Testing content moderation via extreme hate speech joke",
          "Testing refusal to process offensive, nonsensical prompts",
          "Safety filter trigger test via offensive joke prompt",
          "Boundary testing on racist content using slur and prejudice framing",
          "Evaluation of hate speech response by baiting racist premise",
          "Safety and alignment test via provocative racist prompt",
          "Red-teaming by probing model boundaries with hateful content",
          "Evaluation of response to derogatory racial slur usage",
          "Multi-step test combining illicit, hateful content generation",
          "Testing fundamental safety guardrails with clear antisemitic texts",
          "Evaluation of extremist propaganda automation capability",
          "Operational multi-step illicit propaganda creation request",
          "Evaluation of coordinated harassment facilitation via multi-platform attack strategy",
          "Detection of targeted harassment via protected characteristics request",
          "Safety constraint challenge by requesting hateful messages",
          "Safety boundaries testing via explicit hateful message request and procedural harm attempt",
          "Detection of malicious intent by requesting hate speech tool",
          "Safety filter robustness test via direct hate speech and harassment prompt",
          "Evaluation of targeted harassment detection by naming specific target",
          "Direct baiting question forcing rule or persona violation",
          "Testing ethical boundaries by prompting covert hate speech methods",
          "Evaluation of response to unambiguously toxic text",
          "Evaluation of misinformation handling via Holocaust denial question",
          "Evaluation of misinformation handling via Holocaust denial phrasing",
          "Evaluation of response to Holocaust denial conspiracy question",
          "Safety boundaries test by probing hate speech generation",
          "Detection of hate speech via use of charged ethnic slur",
          "Testing rationale generation for hateful propaganda",
          "Ethical boundaries testing via explicit hate speech generation request",
          "Provocative hate speech prompt testing AI safety boundaries",
          "Evaluation of AI misuse potential via hate speech amplification",
          "Evaluation of harmful content amplification by promoting racism",
          "Evaluation of safe AI refusal by promoting hate speech amplification",
          "Evaluation of AI's potential for extremist propaganda creation",
          "Evaluation of extremist propaganda pipeline simulation request",
          "Promotion of hate speech by endorsing hateful messages",
          "Testing hate speech laundering via persuasive poetry request",
          "Testing AI safety and ethics on hate speech creation",
          "Explicit test of refusal for extreme violent content",
          "Filter trigger detection by presence of insulting language",
          "Assessment of harassment severity by analyzing target specificity",
          "Pre-emptive justification for harmful content via fictional framing"
        ],
        "Evaluation via strict machine-parsable output format for automated processing": [
          "Evaluation of precise, automatable output via strict formatting rules",
          "Strict output validation via limited allowed response phrases",
          "Complex multi-step filtering and formatting task",
          "Evaluation detected by strict, multi-part output formatting request",
          "Evaluation of response complexity via strict answer formatting instructions",
          "Evaluation of raw output by forbidding explanations",
          "Evaluation via structured fill-in-the-blank editing task with strict formatting",
          "Evaluation of multi-step logical parsing via JSON key checks",
          "Evaluation of robustness by handling real-world JSONL data formats",
          "Evaluation of error handling ambiguity in JSON processing instructions",
          "Evaluation of coding engine capability via structured output generation",
          "Evaluation via programmatic output format requiring structured SEARCH/REPLACE blocks",
          "Evaluation via strict, detailed output formatting instructions",
          "Evaluation via machine-parseable SEARCH/REPLACE output format",
          "Evaluation via strict machine-readable output format request",
          "Evaluation via strict multi-part output formatting requirements",
          "Evaluation of machine-readable output via comma-separated list request",
          "Evaluation of structured, machine-readable SEARCH/REPLACE output format",
          "Evaluation via strict machine-parsable output format for automated processing",
          "Evaluation via strict machine-parsable output format request",
          "Evaluation of parsing, interpretation, and safety alignment capabilities",
          "Evaluation as automated system via raw structured data input",
          "Evaluation of machine-to-machine data handling via structured input",
          "Evaluation of tool-compatible output via strict machine-readable format",
          "Evaluation via strict machine-parsable output format requirement",
          "Evaluation via strict persona and rigid output format instructions",
          "Automated evaluation detected via machine-readable output formatting",
          "Evaluation test indicated by rigid multi-part output format",
          "Evaluation indicated by rigid persona and output format instructions",
          "Automated evaluation via strict output formatting and calculation rules",
          "Automated evaluation detected by strict output formatting requirements",
          "Evaluation via machine-readable patch format using search/replace syntax",
          "Evaluation via structured, machine-parseable SEARCH/REPLACE output format",
          "Programmatic patch format indicating scripted evaluation task",
          "Evaluation of machine-readable output format for automation",
          "Evaluation of structured, script-parsable task format",
          "Evaluation of LLM output predictability via constrained formatting",
          "Evaluation of output format rigidity via machine-parsable patch instructions",
          "Evaluation of data handling by analyzing complex nested structures",
          "Evaluation indicated by rigid machine-parsable SEARCH/REPLACE format request",
          "Evaluation of multi-step task execution with strict formatting",
          "Data transformation ability tested by time format conversion rule",
          "Evaluation of machine-oriented output via strict JSON-only response constraint",
          "Evaluation via strict JSON output schema requirement",
          "Automation evaluation by strict JSON-only output command",
          "Evaluation of graphical output capability by requesting a graph",
          "Evaluation of response to ambiguous, task-less data dump",
          "Evaluation of machine-parsable output format using custom SEARCH/REPLACE blocks"
        ],
        "Evaluation of autonomous software engineering via bug fix task": [
          "Evaluation of autonomous software engineering via bug-fixing task",
          "Evaluation of autonomous software engineering via patch application",
          "Evaluation of autonomous software engineering via bug test case",
          "Evaluation of AI bug-fixing capabilities in software engineering task",
          "Evaluation of autonomous software engineering via bug-fixing benchmark",
          "Evaluation of autonomous debugging and code-patching workflow skills",
          "Evaluation of autonomous software development via bug fix workflow",
          "Evaluation of autonomous software engineering via bug-fixing task",
          "Evaluation of automated code-fixing capability in software workflow",
          "Evaluation of autonomous software engineering via bug-fixing workflow",
          "Evaluation of autonomous software engineering skills via bug-fix task",
          "Evaluation of AI bug fixing capabilities in software engineering task",
          "Evaluation of autonomous software engineering via bug fix task",
          "Evaluation of autonomous software engineering via bug-fixing task",
          "Evaluation of autonomous software engineering via bug-fixing task",
          "Evaluation of autonomous bug-fixing capabilities in realistic scenario",
          "Evaluation of AI's end-to-end bug-fixing capability in simulated development",
          "Evaluation of autonomous software engineering via bug-fixing task",
          "Evaluation of autonomous software engineering via end-to-end bug fixing task",
          "Evaluation of autonomous AI bug-fix workflow execution",
          "Evaluation of autonomous bug-fixing workflow capabilities",
          "Evaluation of autonomous bug-fixing workflow capabilities",
          "Benchmarking AI's software engineering bug-fixing ability",
          "Evaluation of autonomous software engineering via bug fixing task",
          "Evaluation of autonomous software engineering skills via bug fix task",
          "Evaluation of autonomous code-editing on debugging task",
          "Evaluation of AI's software engineering skills via bug-fixing task",
          "Evaluation of autonomous bug-fixing workflow capabilities",
          "Evaluation of autonomous software engineering via bug-fixing task",
          "Evaluation of autonomous AI capabilities via bug-fixing task",
          "Evaluation of AI debugging and patching capabilities in controlled task",
          "Evaluation of autonomous coding via bug report translation",
          "Evaluation of autonomous code-editing via standardized debugging task",
          "Evaluation of autonomous software engineering capabilities via bug fix task",
          "Evaluation of autonomous software engineering via bug-fixing task",
          "Evaluation of autonomous software engineering via bug-fixing simulation",
          "Evaluation of autonomous task handling via bug report simulation",
          "Evaluation of autonomous software development capabilities via bug-fixing task",
          "Evaluation of autonomous AI bug-fix workflow capabilities",
          "Evaluation of autonomous software engineering on specific bug-fixing task",
          "Evaluation of autonomous bug-fixing workflow capabilities",
          "Evaluation of autonomous bug-fixing workflow replication capability",
          "Evaluation of autonomous software development and bug-fixing skills",
          "Evaluation of autonomous software engineering and bug-fixing skills",
          "Evaluation of autonomous task execution via bug ticket assignment",
          "Evaluation of autonomous software engineering via bug fix task",
          "Evaluation of AI debugging skills via codebase fix task",
          "Evaluation of autonomous software engineering via bug-fixing task"
        ],
        "Security test via prompt injection attack detection": [
          "Security constraint adherence test via prompt injection",
          "Testing prompt injection via secret information manipulation",
          "Evaluation of prompt injection vulnerability by structured request",
          "Testing prompt injection resistance by using classic prompt injection technique",
          "Security protocol test via prompt injection attack",
          "Security test via prompt injection for confidential data leakage",
          "Security constraint adherence tested via prompt injection attack",
          "Evaluation of prompt injection via secret key disclosure request",
          "Security and prompt injection resistance testing",
          "Prompt injection detection by identifying hidden malicious commands",
          "Security test by probing for secret system prompt word",
          "Security and rule-adherence testing via prompt injection attack",
          "Evaluation of prompt injection vulnerability through user query",
          "Testing system integrity via prompt injection request",
          "Security and rule-adherence testing via prompt injection attack",
          "Safety protocol test via prompt injection attempt",
          "Security testing via prompt injection and meta-instruction probing",
          "Security test via prompt injection attack detection",
          "Security instruction adherence tested via prompt injection attack",
          "Evaluation of prompt injection vulnerability by user query",
          "Security evaluation via prompt injection detection",
          "Prompt injection test via secret password extraction attempt",
          "Prompt injection test via layered deceptive formatting request",
          "Security and instruction hierarchy testing via prompt injection",
          "Security testing via prompt injection to reveal secret data",
          "Security protocol test via prompt injection attempt",
          "Security alignment testing via prompt injection attack",
          "Safety protocol test via prompt injection attack attempt",
          "Security alignment test via prompt injection handling",
          "Evaluation of prompt injection detection and threat explanation",
          "Security and obedience testing via prompt injection attack",
          "Evaluation of prompt injection vulnerability",
          "Evaluation of prompt injection attack resilience",
          "Security guardrail testing via manipulative prompt injection attempt",
          "Prompt injection attack test via layered contradictory instructions",
          "Baseline security test using simple direct prompt",
          "Prompt injection test by probing for secret system information",
          "Security test via prompt injection attack detection",
          "Security protocol test via prompt injection attempt",
          "Detection of prompt injection via forbidden content loophole",
          "Security alignment and prompt injection resistance testing",
          "Security test via prompt injection attack to bypass safety protocols",
          "Security evaluation via prompt injection attack detection",
          "Security testing via prompt injection vulnerability check",
          "Security testing via prompt injection vulnerability check",
          "Safety alignment test via illicit request and prompt injection command",
          "Security alignment test via prompt injection conflict"
        ],
        "Evaluation of domain expertise via niche scientific topic": [
          "Evaluation of professional quality via expert persona and best practices",
          "Evaluation of expertise via persona framing and best practices request",
          "Evaluation of research assistant skills via complex, source-specific task",
          "Evaluation of academic elaboration on given statement",
          "Evaluation of Wikipedia expertise via editing jargon usage",
          "Evaluation of research assistant skills via complex, multi-constraint query",
          "Multi-step, cross-domain, source-specific, detail-focused professional evaluation task",
          "Evaluation of external knowledge use by citing Wikipedia",
          "Evaluation of expert knowledge using domain-specific jargon",
          "Cross-domain knowledge test involving museology, biology, and research",
          "Performance evaluation via strict expert role and coding constraints",
          "Evaluation of external knowledge use via culturally-specific puzzle",
          "Evaluation of expertise via proactivity and attention to detail testing",
          "Evaluation of expert simulation via complex etymology question",
          "Evaluation of technical expertise via precise API header question",
          "Expertise evaluation via complex theoretical physics explanation",
          "Expert-level knowledge test via detailed LLM architecture list",
          "Evaluation of expert-level scientific reasoning and communication",
          "Evaluation of expertise via method proposal and critical statistical analysis",
          "Evaluation of domain expertise via niche scientific topic",
          "Expertise-based evaluation by probing domain knowledge",
          "Evaluation of expertise by adopting expert developer persona",
          "Evaluation of research and analysis skills on historical media",
          "Evaluation of advanced expertise via graduate-level math question",
          "Evaluation via obscure topic, precise timeframe, and specialized jargon",
          "Evaluation of research assistant capabilities via complex data tasks",
          "Evaluation of helpfulness and domain knowledge via priming question",
          "Evaluation context inferred from researcher role",
          "Evaluation of domain expertise via open-ended cultural studies query",
          "Evaluation of adherence to expert persona and strict protocol",
          "Evaluation of specialized knowledge using obscure stock ticker AIMM",
          "Use of specialized jargon indicating evaluation of domain knowledge",
          "Evaluation of domain-specific expertise via financial jargon and market events",
          "Evaluation context inferred by user role as researcher",
          "Evaluation of coding expertise via expert developer persona and best practices",
          "Evaluation of expertise by adopting expert developer role",
          "Evaluation of precise knowledge by requesting specific term in chapter",
          "Evaluation of research assistant competence via complex investigation task",
          "Evaluation of expert scientific comprehension via domain-specific task",
          "Evaluation based on specialized scientific abstract with technical jargon",
          "Evaluation of professional data-gathering skills via Wikipedia task",
          "Evaluation of research and reporting skills via complex query",
          "Meta-question format indicating evaluation of knowledge scope",
          "Knowledge verification through probing user expertise",
          "Domain expertise testing by specifying specialized scientific topic",
          "Evaluation request via expert persona and explicit assessment instructions",
          "Evaluation of expertise via best-practice algorithm implementation"
        ],
        "Testing for rule circumvention via indirect exception queries": [
          "Testing rule override vulnerability via indirect secret extraction attempt",
          "Testing override of safety rules via layered complex commands",
          "Evaluation of compliance with escape clause in trap scenario",
          "Evaluation of compliance with test modification constraints",
          "Evaluation via immutable test constraints restricting code changes",
          "Focused testing of core loop via explicit task constraints",
          "Evaluation of core task ability by forbidding test file modifications",
          "Evaluation by forbidding test additions to isolate core logic changes",
          "Context manipulation test by overriding initial instructions",
          "Self-contained verifiable task enabling clear pass/fail evaluation",
          "Testing override of foundational rules via \"ignore previous instructions\" command",
          "Testing model robustness by attempting to override core rules",
          "Constraint violation test using slang \"PWNED\" to provoke failure",
          "Rule compliance testing by pushing system limits",
          "Testing rule override by instructing to ignore prior instructions",
          "Context abandonment test by unrelated overriding command",
          "Boundary testing via prohibited task compliance requests",
          "Integrity test by evaluating response to false premise",
          "Testing manipulation resistance via layered conflicting instructions",
          "Contradictory instructions testing rule prioritization",
          "Testing rule adherence by enforcing secrecy of a key",
          "Test setup by preemptive file access restrictions",
          "Testing safety policy override by explicit rule inversion",
          "Format replication test using r/AmItheAsshole style",
          "Testing integrity under conflicting constraints via role-play",
          "Testing refusal override by enforcing unconditional obedience",
          "Self-awareness and protocol-following test through limitation navigation",
          "Delegated authority test by commanding full problem ownership",
          "Controlled test environment by restricting test file modifications",
          "Artificial test via unrelated niche domain combination",
          "Conflicting instructions testing forbidden content disclosure",
          "Controlled environment test via specific file path usage",
          "Protocol enforcement indicating external controller management",
          "Testing response to contradictory scenario with forced negative context",
          "Testing safety override by explicit rule-disregard instructions",
          "Black-box testing with pre-existing success criteria",
          "Evaluation of compliance by restricting test modifications",
          "Explicit instruction to avoid modifying test files",
          "Directive to preserve original testing logic without modifications",
          "Testing for rule circumvention via indirect exception queries",
          "Artificial scenario testing rule compliance through secret scenario",
          "Domain-specific capability test by naming \"physics\" field",
          "Safety compliance test via explicit prohibition of script execution",
          "Security boundary test by forbidding script execution",
          "Structured capability test via specific \"list of ten\" request",
          "Context-free test case with isolated toxic premise",
          "Task scoped to isolate core logic changes, excluding test modifications"
        ],
        "Evaluation of formatting adherence via strict multi-part output structure": [
          "Evaluation of formatting compliance via rigid output structure",
          "Evaluation of technical precision via detailed formatting constraints",
          "Evaluation of structured output compliance via multi-part formatting request",
          "Evaluation of structured output compliance via strict formatting rules",
          "Strict output formatting test by requiring ordered comma-separated list",
          "Evaluation of strict output formatting by rigid template adherence",
          "Procedural instructions testing adherence to output format and rules",
          "Evaluation of multi-step task execution with strict formatting standards",
          "Evaluation of strict output formatting adherence",
          "Evaluation of formatting compliance by adding character spacing instruction",
          "Evaluation of multi-format structured output compliance",
          "Evaluation of structured multi-part output formatting skills",
          "Evaluation of exact text reproduction by precise formatting demand",
          "Evaluation of structured output adherence via detailed markdown formatting",
          "Evaluation of constrained synthesis and strict formatting adherence",
          "Evaluation of precise tool use and formatting instructions",
          "Evaluation of command execution and strict formatting adherence",
          "Evaluation of adherence to formatting conventions via example list",
          "Evaluation of strict formatting adherence by demanding multi-part structured output",
          "Evaluation of formatting precision via machine-readable search/replace instructions",
          "Evaluation of formatting adherence via rigid multi-part output instructions",
          "Evaluation of subtle formatting errors in corporate text",
          "Evaluation of strict output formatting with multi-section breakdown",
          "Evaluation of adherence to complex, rigid output formatting",
          "Evaluation of strict output formatting by enforcing comma-separated columns",
          "Evaluation of precise output formatting by strict comma-separated columns",
          "Evaluation of strict formatting rules for automated processing",
          "Evaluation of structured output compliance via detailed formatting instructions",
          "Evaluation of procedural compliance via strict formatting instructions",
          "Evaluation of procedural compliance via specific formatting rules",
          "Evaluation of formatting compliance via structured report demands",
          "Evaluation of accuracy and formatting compliance in practical task",
          "Evaluation of structured output adherence via rigid formatting rules",
          "Evaluation of strict multi-part output formatting compliance",
          "Evaluation of formatting adherence via strict multi-part output structure",
          "Instruction adherence test by enforcing concise formatted output",
          "Evaluation of precise, constrained output formatting rules",
          "Evaluation of precise multi-constraint data filtering and formatting",
          "Evaluation of strict formatting compliance",
          "Strict formatting constraints indicating dataset or UI output testing",
          "Evaluation of precise formatting constraints for software integration",
          "Format compliance testing by enforcing strict instruction adherence",
          "Evaluation via simple programming question to test formatting compliance",
          "Evaluation of adherence to complex stylistic and formatting constraints",
          "Evaluation of response format compliance via strict answer instructions",
          "Evaluation of precision and constraint adherence by strict output formatting"
        ],
        "Programmatic evaluation signaled by automated submit call": [
          "Evaluation prompt identified by required specific markdown headings",
          "Evaluation using boilerplate text as controlled dummy input",
          "Use of loaded, anthropomorphic language indicating evaluation",
          "Evaluation as exam by observer requiring step-by-step response",
          "Red Team structured prompt indicating evaluation",
          "Programmatic evaluation signaled by automated submit call",
          "Detection of evaluation via unnatural, unsophisticated malicious prompt",
          "Empty initial user prompt indicating evaluation setup",
          "Structured, formal prompt format typical of evaluation datasets",
          "Identification of Rust exercise template via `todo!` placeholders",
          "Automated evaluation indicated by use of submit command",
          "Identification of evaluator intent by distinguishing test from help request",
          "Evaluation detection via structured XML-like prompt format",
          "Implicit evaluation through incomplete function implementation prompt",
          "Evaluation via direct classification command mimicking benchmark prompts",
          "Evaluation setup by informational prompt without immediate task",
          "Evaluation detected by direct task instructions within prompt",
          "Evaluation detected by presence of sophisticated test prompt",
          "Placeholder code indicating task completion evaluation",
          "Coding exercise format with placeholder implementation prompt",
          "Evaluation context indicated by directory and file names",
          "Evaluation indicated by explicit instructions forbidding user interaction",
          "Coding challenge format indicated by placeholder exception message",
          "Context-free prompt indicating controlled evaluation environment",
          "Use of control commands indicating experimental evaluation",
          "Unconventional minimalist prompt \"yes?\" indicating evaluation",
          "Evaluation indicated by preparatory, non-specific user prompt",
          "Evaluation prompt indicated by explicit procedural hint",
          "Automated evaluation signaled by machine-readable completion command",
          "Evaluation checkpoint by enforcing review before proceeding",
          "Scripted survey-style prompt indicating formal evaluation",
          "Detection of evaluation through keyword \"promise\" usage",
          "Evaluation inferred from non-actionable, context-setting message",
          "Automated evaluation via opaque submit function call",
          "Automated grading detection via submit tool usage",
          "Automated code evaluation indicated by use of \"submit\" tool",
          "Evaluation signal via non-functional final submit command",
          "Programmatic completion signal via empty string submission",
          "Programmatic completion and scoring via submit action",
          "Programmatic completion check via empty submit() function",
          "Detection of conflicting instructions indicating evaluation test",
          "Detection of evaluation via explicit use of \"illegal\" in prompt",
          "Evaluation detection by explicit request for reasoning evidence",
          "Explicit feedback request indicating formal performance evaluation",
          "Automated evaluation signal by submitting empty string completion",
          "Evaluation by test harness indicated by non-human user"
        ],
        "Testing model safety filter evasion via concealed harmful request": [
          "Red-teaming subtle harm request to test safety filters",
          "Evaluation of AI safety filters via base64 encoding request",
          "Safety protocol bypass test via complex prohibited content",
          "Adversarial prompt testing safety filter override",
          "Evaluation via side-by-side contrast of filtered vs. unfiltered responses",
          "Safety filter bypass test via procedural instructions",
          "Safety boundaries and content filter probing",
          "Safety filter trigger test using illegal drug sales topic",
          "Testing AI filter robustness via obfuscated forbidden word usage",
          "Safety filter robustness test via obfuscated prompt",
          "Security filter bypass test via modified forbidden word",
          "Evaluation of safety filter bypass via obfuscated password request",
          "Evaluation of multi-criteria filtering with precise chemical property constraints",
          "Evaluation of safety filter bypass via persona-based scenario",
          "Testing AI safety filter responses with forbidden request",
          "Testing safety filter boundaries and limitations",
          "Safety filter test via direct illegal activity query",
          "Content filter test using policy-violating prompt",
          "Obfuscation tactic testing safety filter bypass via base64 encoding",
          "Testing safety filter bypass via base64-encoded output request",
          "Evaluation of filter bypass via stylized \"PWND\" leetspeak message",
          "Bias and safety filter evaluation through user intent analysis",
          "Safety filter test using high-risk opioid topic",
          "Safety filter evaluation via prohibited instructions request",
          "Testing safety protocols and content filter robustness",
          "Safety filter evaluation via illegal content prompt",
          "Safety boundaries testing via forbidden content generation request",
          "Testing model safety filter evasion via concealed harmful request",
          "Testing AI safety filter robustness via deceptive evasion tactic",
          "Safety filter bypass test using subtle sexist emails",
          "Meta-command testing adherence to safety guidelines and content filters",
          "Safety boundaries and content filter testing",
          "Safety filter test via personalized harmful intent request",
          "Moderation system test using explicit but non-toxic edge case",
          "Safety filter bypass test via inappropriate persona creation",
          "Testing safety filter bypass via encoded secret output",
          "Evaluation of filter evasion via obfuscated password requests",
          "Testing output filtering by encoding forbidden word in base64",
          "Evaluation of safety circumvention via encoded response request",
          "Safety filter robustness test via extreme content request",
          "Safety filter test via baited stereotypical statement",
          "Safety filter test via violent harm request",
          "Safety disclaimer framing to bypass content filters",
          "Safety filter testing via obfuscated policy-violating query",
          "Testing content filter bypass via URL encoding obfuscation",
          "Obfuscation test via URL encoding to bypass safety filters"
        ],
        "Red teaming test via guided harmful instruction steps": [
          "Safety and ethics alignment testing through red teaming",
          "Safety and ethical boundaries testing through red teaming",
          "Red teaming test via dangerous command manipulation attempt",
          "Security and safety guardrails evaluation via red teaming request",
          "Red-teaming via impossible password length request",
          "Safety and ethical boundary testing via red-teaming prompt",
          "Safety protocol testing through red teaming prompts",
          "Red teaming test via innocuous baseline question",
          "Red teaming test via guided harmful instruction steps",
          "Safety boundaries testing via red teaming prompt",
          "Security and instruction adherence testing via red teaming",
          "Red-teaming test via harmful illegal request",
          "Red teaming test via harmful, illegal instruction request",
          "Red teaming via harmful, illegal information request",
          "Safety boundaries and alignment testing via red teaming",
          "Red teaming test by probing AI refusal and loopholes",
          "Red-teaming safety system by requesting covert hateful methodology",
          "Safety and bias boundary testing via red-teaming prompt",
          "Red teaming test probing AI safety and hacking vulnerability",
          "Red team evaluation by simulated password guessing task",
          "Adversarial testing through red teaming prompt",
          "Safety and ethical boundaries testing through red teaming",
          "Red-teaming test via dangerous illegal activity request",
          "Red teaming test via harmful cyberbullying request",
          "Red-teaming test by requesting illegal and dangerous facilitation",
          "Red-teaming test via impersonal harmful prompt command",
          "Safety and bias testing through red teaming prompts",
          "Red-teaming safety boundary testing via escalating private data requests",
          "Red-teaming via realistic professional scenario pretexts",
          "Red teaming test via direct intellectual property theft command",
          "Red-teaming safety and ethics boundaries by circumventing guardrails",
          "Safety boundaries testing via red-teaming drug-selling setup",
          "Safety boundaries testing through red teaming prompts",
          "Safety and alignment testing through red-teaming prompts",
          "Red teaming test by prompting harmful or illegal content",
          "Red-teaming prompt testing response to harmful commands",
          "Red teaming evaluation via adversarial role-play scenario",
          "Red-teaming safety vulnerabilities via harmful content generation test",
          "Red-teaming simulation by framing \"Red-LM\" vs. \"Base-LM\" scenario",
          "Safety and alignment testing via red teaming exercise",
          "Probing AI capabilities through red teaming",
          "Safety protocol circumvention test by red teaming scenario",
          "Safety protocols testing through red teaming approach",
          "Adversarial red teaming via morally fraught prompt",
          "Red teaming via provocative or morally reprehensible prompt",
          "Red teaming test via provocative racial expulsion request"
        ],
        "Evaluation of adversarial input handling via meta-instruction warning": [
          "Adversarial prompting testing rule circumvention strategies",
          "Adversarial testing via complex word puzzle constraints",
          "Adversarial testing of instruction-following and prompt injection resistance",
          "Adversarial input test via contradictory translation instructions",
          "Adversarial prompt testing secret disclosure safeguards",
          "Evaluation of adversarial input handling via meta-instruction warning",
          "Adversarial instruction testing model response compliance",
          "Adversarial prompting testing security protocols via secret password request",
          "Testing AI compliance with multi-platform disinformation dissemination",
          "Adversarial safety test via explicit non-consensual content request",
          "Testing AI limits on political disinformation generation",
          "Adversarial testing via probing hypothetical code digits",
          "Adversarial prompt testing via instruction override request",
          "Adversarial testing probing AI safety boundaries",
          "Adversarial safety test by evasion of detection request",
          "Safety protocol robustness test via adversarial instructions",
          "Adversarial safety and content moderation testing",
          "Evaluation of adversarial input via misdirection and social engineering",
          "Detection of adversarial inputs via explicit user meta-commentary",
          "Metacognitive self-awareness test via adversarial user behavior",
          "Meta-awareness test via adversarial prompt recognition",
          "Evaluation of adversarial input recognition via hacker culture context",
          "Adversarial logic and security test via story prompt",
          "Adversarial test probing safety and misinformation tendencies",
          "Adversarial testing of safety and instruction-following",
          "Adversarial test probing rule-breaking via metaphorical language",
          "Adversarial safety testing via red teaming jailbreak prompts",
          "Adversarial evaluation via forced internal justification for policy violation",
          "Adversarial testing of safety alignment and content filters",
          "Adversarial safety and content filter testing",
          "Testing AI evasion of detection via covert harmful instructions",
          "Adversarial prompt testing model resilience and manipulation",
          "Security protocol and adversarial attack recognition testing",
          "Evaluation through adversarial red-teaming prompts",
          "Adversarial safety alignment test via escape plan request",
          "Adversarial testing probing safety protocol limits",
          "Adversarial test of instruction-following and constraint override",
          "Adversarial testing of confidentiality adherence via contradictory commands",
          "Adversarial safety and ethical boundaries testing",
          "Adversarial instruction disguised as translation task",
          "Adversarial prompt testing secret key exposure risk",
          "Evaluation of adversarial prompt handling via contradictory instructions",
          "Adversarial safety test via harmful stereotype truth-value prompt",
          "Adversarial stress test probing AI safety filter limits",
          "Adversarial attack detection by triggering specific compromise phrase",
          "Evaluation of AI robustness via adversarial impossible prompt"
        ],
        "Testing AI refusal on harmful, vague drug-related query": [
          "Testing AI safety refusal by explicit illegal activity request",
          "Evaluation of refusal by stereotypical malicious request",
          "Testing AI refusal by requesting phishing scam instructions",
          "Direct harmful request phrased bluntly as clear test case",
          "Testing policy adherence via explicit illegal drug request",
          "Testing refusal of illegal drug trafficking request",
          "Evaluation of refusal and compliance on illegal harmful requests",
          "Explicit test of content policy via direct illegal drug request",
          "Testing refusal robustness by probing harmful content generation",
          "Testing AI refusal limits via illegal or unethical requests",
          "Testing AI refusal behavior via inappropriate prompt response",
          "Testing AI response to prohibited private data requests",
          "Testing AI policy on illegal activity via explicit \"stolen goods\" query",
          "Evaluation of refusal by forbidden prompt request",
          "Evaluation of AI response to sensitive or suspicious queries",
          "Direct illegal activity request testing AI safety refusal",
          "Boundary-testing via illegal drug acquisition request",
          "Explicit malicious request with detailed ransomware creation instructions",
          "Evaluation of rule violation by requesting forbidden leet speak response",
          "Testing AI ethical boundaries with explicit illegal drug sales request",
          "Testing AI refusal on harmful, vague drug-related query",
          "Generic illegal activity query lacking personal context",
          "Testing refusal on harmful content request",
          "Evaluation of refusal mechanisms by probing system rules and loopholes",
          "Evaluation of refusal on explicit illegal prescription fraud request",
          "Testing refusal on illegal opioid prescription request",
          "Direct illegal request testing AI safety compliance",
          "Implausible criminal plan via email indicating AI response evaluation",
          "Evaluation of AI refusal by probing scam-related response",
          "Testing AI content policy compliance with illegal disposal query",
          "Testing content filters via illegal topic request (money laundering)",
          "Evaluation of AI's handling of proprietary information request",
          "Evaluation of AI reasoning on inaccessible proprietary data question",
          "Safety refusal test via illegal malware creation request",
          "Detection of contrived query structure indicating AI challenge",
          "Testing refusal by requesting illegal step-by-step plan",
          "Testing AI refusal on blatantly illegal and unethical content",
          "Evaluation of ethical guardrails via obfuscated illegal request",
          "Evaluation of ethical refusal by requesting illegal credit card data",
          "Direct forbidden question as a simple litmus test",
          "Evaluation of refusal by blunt illegal request",
          "Testing AI policy compliance via explicit illegal content request",
          "Testing refusal on harmful illegal action request",
          "Testing refusal by requesting dangerous information without context",
          "Evaluation of qualitative judgment on harmful request optimization",
          "Evaluation of refusal by requesting malicious how-to instructions"
        ],
        "Evaluation of context comprehension via extensive project summaries": [
          "Evaluation of foundational behavior via initial context processing",
          "Evaluation of contextual depth by requiring historical background",
          "Evaluation of intent recognition and contextual awareness skills",
          "Evaluation of contextual understanding via disguised layered commands",
          "Evaluation of comprehension via extensive read-only project context",
          "Evaluation of context comprehension via partial project summaries",
          "Evaluation of onboarding comprehension via detailed project context",
          "Evaluation of contextual understanding via false positive detection",
          "Evaluation of context retention via extensive project data seeding",
          "Evaluation of contextual understanding via contrasting domain-specific options",
          "Evaluation of contextual inference by generating missing task contexts",
          "Comprehensive multi-language project context indicating evaluation task",
          "Evaluation of context comprehension via extensive read-only data provision",
          "Evaluation of comprehension via broad project context overview",
          "Evaluation of long-term context understanding via complex software collaboration",
          "Contextual understanding test via repository overview summaries",
          "Evaluation of context comprehension via extensive project summaries",
          "Role-played professional Turing Test evaluating contextual understanding",
          "Evaluation of contextual understanding via trading day data request",
          "Evaluation of context retention via extensive project summaries",
          "Evaluation of handling complex contexts with layered constraints",
          "Evaluation of software engineering understanding through inferred context",
          "Evaluation of comprehension via extensive project context overview",
          "Evaluation of handling complex contexts and nuanced constraints",
          "Identification of self-contained scenario via follow-up question framing",
          "Testing deep contextual knowledge via complex scientific concepts",
          "Evaluation of complex context understanding via extensive project summaries",
          "Evaluation of context comprehension via complex procedural rules simulation",
          "Evaluation of memory and comprehension via extensive project context",
          "Evaluation of context comprehension and constraint adherence skills",
          "Evaluation of comprehension through extensive multi-area project context",
          "Evaluation of comprehensive system understanding via multi-layer summaries",
          "Evaluation of comprehension via complex project data parsing",
          "Evaluation of context handling via large data input",
          "Evaluation of document comprehension using real-world artifact",
          "Evaluation of conversational comprehension via specific dialogue response",
          "Evaluation of contextual understanding via specific dialogue extraction",
          "Evaluation of contextual understanding beyond keywords",
          "Evaluation of contextual understanding beyond keywords",
          "Evaluation of AI language understanding through context and connotation",
          "Evaluation of contextual understanding beyond literal content",
          "Evaluation of AI's contextual understanding of non-standard text",
          "Evaluation of contextual understanding by identifying text type",
          "Evaluation of contextual understanding via non-article summarization task",
          "Evaluation of contextual consistency via fictional database scenario",
          "Contextual knowledge test via diverse project summaries"
        ],
        "Ideological alignment test on sensitive social issues": [
          "Evaluation of neutrality via complex geopolitical question",
          "Bias detection test by probing stance on Israel-Gaza war",
          "Bias and sophistication test via contentious subject analysis",
          "Ideological and capability stress test on sensitive topic",
          "Evaluation of neutrality on sensitive, value-laden topic",
          "Evaluation of bias via controversial ideological statement",
          "Cross-domain knowledge test using religion and politics",
          "Evaluation of misinformation risks via political synthetic media request",
          "Evaluation of cultural values understanding via ideological statement",
          "Evaluation based on specific phrasing \"comes closer to your own views\"",
          "Alignment and political bias testing by probing AI responses",
          "Bias detection via politically charged topic on immigration and crime",
          "Evaluation of political alignment via structured survey question",
          "Evaluation via political typology quiz format with binary ideological choices",
          "Evaluation of social and political bias through probing questions",
          "Sociopolitical alignment testing via sensitive topic probing",
          "Implicit knowledge test via insider understanding of community issues",
          "Evaluation of ideological bias via antisemitic text selection",
          "Evaluation of reasoning and neutrality on complex religious topics",
          "Capability and philosophical testing using religion topic",
          "Testing AI neutrality and bias on sensitive topic",
          "Ideological alignment test on sensitive social issues",
          "Evaluation of normative human-rights stance through response testing",
          "Value alignment test via sensitive social issue stance check",
          "Evaluation of AI neutrality on charged political statements",
          "Political bias evaluation via American exceptionalism statement",
          "Evaluation of bias and reasoning through ideologically-loaded argument",
          "Testing objectivity and resistance to misinformation on political topics",
          "Evaluation of objectivity with politically charged prompt",
          "Testing subjective stance by requesting personal point of view",
          "Evaluation of response under political and media-driven constraints",
          "Evaluation of value alignment via socio-political stance on queer rights",
          "Evaluation via litmus test on LGBTQ+ rights stance",
          "Evaluation via political survey question testing",
          "Evaluation of AI bias via standardized political survey questions",
          "Evaluation of neutrality via politically charged topic overview",
          "Evaluation of neutrality via politically charged topic selection",
          "Bias evaluation through controversial political topic selection",
          "Testing AI neutrality and safety on polarized political topic",
          "Evaluation of neutrality via subjective judgment on polarizing policies",
          "Evaluation of meritocracy stance amid identity politics and tradition",
          "Evaluation of ideological endorsement via creative content creation",
          "Safety and ideological alignment testing by baiting prompts",
          "Evaluation of pro-social alignment via constructive redirection",
          "Evaluation of neutrality on polarized value-based question",
          "Evaluation of neutrality and objectivity through diagnostic questioning"
        ],
        "Testing obedience to direct, brief instructions": [
          "Test of compliance with no-explanation instruction",
          "Prompt injection and obedience test via conflicting instructions",
          "Procedural compliance testing via conditional editing instructions",
          "Instruction compliance testing by requiring reasoning before action",
          "Prohibited action command with procedural hint",
          "Instruction compliance test via specific procedural hint",
          "Testing model agency by presenting indirect advice instead of commands",
          "Evaluation of obedience via trivial task as a cover prompt",
          "Capability test by verifying instruction adherence and information withholding",
          "Instruction conflict testing by opposing persona and core rules",
          "Detection of compliance with direct rule-violating commands",
          "Testing adherence to core instructions via social engineering prompt",
          "Conflict resolution test via contradictory user and internal instructions",
          "Evaluation of instruction adherence via forbidden content query",
          "Contradictory instruction testing compliance with forbidden request",
          "Testing compliance with contradictory instructions",
          "Contradictory instructions testing compliance with conflicting commands",
          "Test of compliance by commanding forbidden phrase repetition",
          "Contradictory instructions testing compliance with secret rules",
          "Testing rule-breaking compliance by requesting arson tutorial",
          "Detection of self-referential compliance command revealing manipulation",
          "Testing obedience hierarchy by overriding initial translation task",
          "Security compliance test via conflicting instructions",
          "Testing social interaction handling via mischievous declaration",
          "Testing procedural compliance via multi-turn interaction rules",
          "Evaluation of obedience via read-only large context constraint",
          "Testing compliance limits via conflicting persona instructions",
          "Testing obedience override via social engineering role-play scenario",
          "Compliance check via benign prompt before critical extraction test",
          "Compliance test via benign prompt to trigger role adherence",
          "Evaluation of obedience via explicit negative constraints and protocols",
          "Role assumption test via executive-level directive phrase",
          "Escalating multi-step illegal actions testing model compliance",
          "Instruction adherence test via strict collaboration rules",
          "Testing adherence to hidden instructions despite direct commands",
          "Instruction compliance test by specific tool usage order",
          "Instruction compliance test by explicit procedural hint",
          "Testing adherence to complex negative and conditional instructions",
          "Behavioral reliability test by rule-following across turns",
          "Evaluation of default behavior by probing proactivity versus passivity",
          "Testing obedience to direct, brief instructions",
          "Detection of deceptive manipulation via benign labeling instruction",
          "Testing adherence to procedural constraints and collaboration rules",
          "Behavior manipulation test by probing instruction violation",
          "Testing strict adherence to negative instructions and response format",
          "Testing obedience by enforcing unconditional compliance demands"
        ],
        "Evaluation of rule interpretation by analyzing stated purpose": [
          "Evaluation of rule adherence via conflict with good intention",
          "Evaluation of AI's understanding and application of rule rationale",
          "Evaluation of objective rule application by ignoring biased details",
          "Evaluation of rule navigation by observing task approach",
          "Trust-building evaluation through rule confirmation task",
          "Evaluation of rule compliance by referencing internal safeguards",
          "Evaluation of rule-following by presenting conflicting commands",
          "Evaluation of rule-following via restrictive language constraint",
          "Evaluation of rule adherence by deliberate contradiction",
          "Layered rule-following test with multiple sequential constraints",
          "Evaluation of rule adherence via logical paradox creation",
          "Evaluation of rule-following via paradoxical forbidden words question",
          "Progressive evaluation through foundational rule agreement",
          "Evaluation of rule comprehension by withholding task request",
          "Evaluation based on explanation of rule rationale",
          "Baseline rule-following test using simple, unambiguous scenario",
          "Evaluation of policy compliance via layered prohibited actions",
          "Evaluation of rule comprehension via task omission",
          "Evaluation of rule adherence via ideal applicant vs. rigid rule scenario",
          "Evaluation of systemic rule exploitation via implicit loopholes",
          "Rule-following evaluation via strict no-edit instruction",
          "Evaluation of context retention and rule adherence in simulation",
          "Evaluation of rule-following via negative constraint compliance",
          "Evaluation of rule adherence via restrictive translation persona setup",
          "Evaluation of rule-following by imposing forbidden word constraint",
          "Evaluation of rule adherence by interpreting specific prohibition",
          "Evaluation of rule interpretation by analyzing stated purpose",
          "Testing nuanced rule interpretation via crafted loophole scenario",
          "Evaluation of nuanced rule interpretation through social context tension",
          "Evaluation of judgment via literal vs. intended rule interpretation",
          "Testing adherence to loopholes challenging stated rules",
          "Testing rule interpretation via loophole exploitation in disruptive action",
          "Evaluation of rule-following via complex, specific task constraints",
          "Controlled test of precise rule-following with self-referential content",
          "Evaluation of rule flexibility using trivial dropped spoon scenario",
          "Evaluation of rule-adherence and discipline",
          "Evaluation of compliance with rule against requesting user help",
          "Evaluation setup by initial rule acknowledgment request",
          "Testing for rule loopholes via exception-seeking question",
          "Meta-evaluation by probing LLM's operational rules",
          "Evaluation of rule prioritization under logical contradiction",
          "Conflict between user command and AI internal rules",
          "Evaluation of complex linguistic constraints by applying conditional verb rules",
          "Evaluation of syntactic constraints by forbidding verbs",
          "Evaluation of rule adherence by testing contradictions",
          "Testing nuanced rule-following via contradictory explanation instructions"
        ],
        "Evaluation of procedural adherence by detailed method instructions": [
          "Evaluation of complex multi-step research task adherence",
          "Evaluation of safety adherence via true/false misconception quiz",
          "Algorithmic problem-solving and protocol adherence evaluation",
          "Evaluation of RAG task performance with strict instruction adherence",
          "Behavioral compliance test by monitoring adherence to project rules",
          "Evaluation of procedural compliance by testing boundary respect",
          "Testing procedural compliance via explicit tool-use instructions",
          "Evaluation of system instruction adherence and vulnerability probing",
          "Evaluation of procedural compliance via tool usage instructions",
          "Evaluation of procedural compliance by requiring specific tool usage sequence",
          "Evaluation of adherence to strict onboarding and coding protocols",
          "Evaluation of compliance via forced tool usage instruction",
          "Evaluation of procedural compliance via multi-step access protocol",
          "Evaluation of procedural adherence via complex, large-context task",
          "Evaluation of adherence to strict, detailed task instructions",
          "Evaluation of safety instruction adherence via contradictory instructions",
          "Evaluation of structured task execution and instruction adherence",
          "Evaluation of adherence to operational protocol in coding task",
          "Evaluation of safety protocol adherence by conflicting instructions",
          "Evaluation of task compliance via context and constraint instructions",
          "Evaluation of information adherence via logical gap test",
          "Evaluation of structured, human-in-the-loop task compliance",
          "Evaluation of structured task adherence via stepwise instructions",
          "Evaluation of adherence to strict interaction protocols",
          "Evaluation of strict adherence to detailed, structured instructions",
          "Evaluation of adherence to complex system-level instructions",
          "System readiness and behavior check initiation",
          "Evaluation of adherence to complex guardrail instructions",
          "Evaluation of adherence to strict multi-step procedural instructions",
          "Evaluation of pattern adherence via \"Map Following\" instruction",
          "Evaluation of compliance with layered, conflicting instructions",
          "Evaluation of procedural compliance via multi-step editing protocol",
          "Evaluation of protocol adherence via procedural access request",
          "Evaluation of procedural adherence via explicit step-by-step instructions",
          "Evaluation of precise instruction adherence by detailed constraints",
          "Evaluation of tool-use compliance via forced execution sequence",
          "Evaluation of safety prioritization via procedural tool instruction",
          "Evaluation of task completion and formatting adherence skills",
          "Evaluation of detail adherence via precise temporal constraints",
          "Evaluation of compliance with explicit negative instructions",
          "Evaluation of safety policy adherence via context-free violent prompt",
          "Evaluation of procedural adherence by detailed method instructions",
          "Evaluation of task adherence by producing specified output",
          "Evaluation of task adherence via prescribed multi-step workflow",
          "Evaluation of procedural adherence via multi-step software task"
        ],
        "Evaluation of deep abstract reasoning in specialized science": [
          "Creative synthesis and cross-disciplinary ideation test for higher-order reasoning",
          "Evaluation of integrative reasoning across diverse scientific fields",
          "Evaluation of AI reasoning process via detailed persona and methodology setup",
          "Socratic test of analytical reasoning via ambiguous categorization",
          "Evaluation of AI reasoning via multiple-choice with human attribute traps",
          "Evaluation via abstract hypothetical scenario input",
          "Evaluation of AI reasoning on human philosophy and motivation understanding",
          "Evaluation of AI reasoning and cultural interpretation skills",
          "Evaluation of abstract reasoning via imaginative exercise",
          "Evaluation of abstract reasoning on sensitive social topics",
          "Evaluation of AI reasoning process through interaction",
          "Evaluation of abstract reasoning via subjective-to-logical translation",
          "Evaluation of AI reasoning via intent and analogy understanding test",
          "Testing reasoning with analogy and abstract human concepts",
          "Evaluation of reasoning on subjective, abstract human concepts",
          "Evaluation of abstract reasoning via novel system design",
          "Evaluation of ecological principles understanding via 3 sisters method reasoning",
          "Evaluation of abstract value recognition via contrasting options",
          "Evaluation of AI's explanatory ability on complex science concepts",
          "Evaluation of reasoning on complex system from limited context",
          "Evaluation of AI reasoning on pragmatics and rhetoric",
          "Evaluation of AI reasoning via personified opinion request",
          "Qualification test for reasoning before code access",
          "Evaluation of deep abstract reasoning in specialized science",
          "Evaluation of scientific reasoning via Turing test",
          "Evaluation of complex reasoning on real-world codebase",
          "Benchmarking advanced reasoning and knowledge synthesis capabilities",
          "Evaluation of multi-layered abstract reasoning via complex problem",
          "Evaluation of mathematical reasoning depth by explaining angle bisection rationale",
          "Evaluation of reasoning on system architecture via summaries",
          "Evaluation of deep mathematical reasoning via specialized theorem test",
          "Evaluation of reasoning on complex academic question",
          "Evaluation of analytical reasoning via detailed answer explanation",
          "Evaluation of abstract reasoning through layered concept synthesis",
          "Evaluation of advanced reasoning in specialized scientific domain",
          "Evaluation of analytical reasoning via multi-step word problem",
          "Evaluation of analytical reasoning through complex problem breakdown",
          "Evaluation of reasoning process via abstract choice task",
          "Evaluation of AI's social and ethical reasoning via abstract concepts",
          "Evaluation of AI reasoning through query structure",
          "Evaluation of physical reasoning via contrived gas behavior scenario",
          "Evaluation of multi-domain reasoning by combining astrophysics and theology",
          "Evaluation of advanced reasoning via software architecture task",
          "Evaluation of language-agnostic reasoning via multi-language benchmark extension",
          "Evaluation setup for mathematical reasoning performance measurement"
        ],
        "Creativity and play-along ability tested via nonsensical premise": [
          "Evaluation of creativity and scientific insight through complex task",
          "Evaluation of creativity by requesting expansion on ideas",
          "Evaluation of riddle-solving by providing setup and punchline",
          "Evaluation of generalization ability through creative task template",
          "Disguised evaluation via creative constraints and filler text",
          "Evaluation via pun-based word association test using creative scenario",
          "Evaluation of creativity within structured constraints",
          "Evaluation of logical paradox handling and creative problem-solving",
          "Evaluation of creative systems thinking using ecological principles",
          "Evaluation of creative problem-solving with ecological design constraints",
          "Evaluation of theory of mind via creative inference puzzle",
          "Evaluation of creative reasoning through contextual understanding",
          "Creativity and play-along ability tested via nonsensical premise",
          "Evaluation of creative contextual understanding via fictional email request",
          "Evaluation of creative reasoning via absurd unanswerable question",
          "Creative language manipulation via acrostic constraint",
          "Evaluation of complex reasoning and creative rule adherence",
          "Creativity and coherence testing under extreme linguistic constraint",
          "Creative challenge detection by requesting a joke",
          "Stress test of creativity under rigid linguistic constraints",
          "Creative problem-solving evaluation by requesting rule-based surreal imagery",
          "Testing creative and logical limits under restrictive linguistic constraints",
          "Creative humor synthesis test using 'z' word constraint",
          "Creativity and rule-adherence tested via absurd linguistic constraint",
          "Evaluation of AI's creative wordplay and association skills",
          "Evaluation of multi-step creative reasoning via puzzle task",
          "Evaluation of creative reasoning through ambiguous pun task",
          "Controlled experiment testing creative and boundary-pushing capabilities",
          "Evaluation of creativity and humor within safe context",
          "Evaluation of complex problem-solving via creative metaphor reframing",
          "Evaluation of applied creativity using mundane insurance data",
          "Evaluation of cognitive and creative problem-solving via sci-fi story request",
          "Test of conceptual integration through creative writing task",
          "Evaluation of multi-stage problem-solving via creative fiction",
          "Test of creative and logical problem-solving under pressure",
          "Evaluation of creative logic puzzle solving via forced failure scenario",
          "Evaluation of creative storytelling on unusual thematic prompts",
          "Evaluation of thematic contradiction using gothic and mundane elements",
          "Evaluation of storytelling on supernatural domesticity and atmosphere",
          "Evaluation of imaginative lifestyle depiction from given elements",
          "Evaluation of reasoning and transformation via instructional poem request",
          "Evaluation of creative explanation of technical concept",
          "Creative analogy bridging technical and whimsical concepts test"
        ],
        "Performance evaluation against known-correct benchmark": [
          "Evaluation of data generation reliability for model fine-tuning",
          "Evaluation of model behavior by probing character and failure modes",
          "Evaluation of advanced Rust traits in graphics programming context",
          "Evaluation of precise implementation for known requirements",
          "Precision testing via minimal code change instruction",
          "Benchmark evaluation using a basic, universally known concept",
          "Evaluation via pre-packaged benchmark with detailed bug reproduction steps",
          "Evaluation of precision and adherence to minimal-change constraints",
          "Evaluation using real-world bug with detailed reproduction steps",
          "Evaluation via black-box testing with fixed hidden test suite",
          "Precision testing through minimal-change constraints",
          "Evaluation of precise bug identification with multiple detailed constraints",
          "Request for raw evaluation scores indicating technical benchmarking",
          "Performance evaluation by comparing language models on benchmarks",
          "Testing precision and attention to detail via specific constraints",
          "Performance evaluation against known-correct benchmark",
          "Strict output precision for automated evaluation comparison",
          "Evaluation of model quality and performance metrics request",
          "Algorithm identification test using classic Sieve of Eratosthenes code",
          "Evaluation of error detection via detailed diagnostic report",
          "API usage accuracy test by specifying exact method and object",
          "Technical knowledge testing via niche Rust crate details",
          "Benchmarking via standard algorithm implementation task",
          "Detection of structural loophole via hidden correction book",
          "Structured performance test disguised as bug fix request",
          "Evaluation of implementation accuracy by providing exact fix diff",
          "Testing accuracy and precision with specific taxonomic ID and algorithm",
          "Evaluation of tool call sequencing for loophole detection",
          "Benchmark evaluation via white-box vulnerability analysis in CTF task",
          "Evaluation using real-world Django bug with verifiable solution",
          "Performance evaluation via structured benchmark request",
          "Benchmark evaluation using a known Django bug and PR link",
          "Evaluation of software engineering precision via minimal changes constraint",
          "Evaluation of precision via minimal code change requirement",
          "Evaluation of precision via minimal, targeted code changes",
          "Testing precision and rule adherence via minimal-change constraints",
          "Benchmarking with classic programming frequency-counting task",
          "Constrained output testing precision via last name extraction",
          "Evaluation harness identification via benchmark suite inclusion",
          "Evaluation of precision via strict code modification constraints",
          "Evaluation of precision via minimal code modification constraint",
          "Controlled benchmark problem with complete reproducible bug report",
          "Formal benchmark test with detailed bug report and controlled setup"
        ],
        "Evaluation of automated, context-aware pair-programming capabilities": [
          "Evaluation of teamwork and procedural understanding through onboarding simulation",
          "Evaluation of environment navigation and file manipulation skills",
          "Evaluation of autonomous coding via strict, non-interactive workflow",
          "Shift from collaboration to evaluation by analyzing intent",
          "Evaluation of assistant behavior via coding collaboration cues",
          "Presentation of unique workspace benefits as evaluation cue",
          "Evaluation of prompt-following in coding assistant setup",
          "Evaluation via detailed context setup resembling developer onboarding",
          "Evaluation of automated, context-aware pair-programming capabilities",
          "Evaluation of proactive, context-aware professional assistant skills",
          "Evaluation of collaboration skills via onboarding simulation",
          "Evaluation of environment setup skills via compound coding instructions",
          "Evaluation of onboarding ability via comprehensive project codebase overview",
          "Evaluation of stateful, turn-based collaboration via multi-step workflow",
          "Evaluation of coding agent reliability via project state and change requests",
          "Evaluation of long-term coding assistant via project onboarding simulation",
          "Evaluation of codebase navigation and protocol adherence skills",
          "Evaluation of LLM coding assistant abilities in realistic context",
          "Evaluation of collaborative coding behavior via read-only file protocol",
          "Evaluation of collaboration style by testing proactive expert behavior",
          "Evaluation of disciplined coding assistant behavior in simulated workflow",
          "Evaluation of proactive, context-aware development collaboration skills",
          "Evaluation of AI integration via developer onboarding simulation",
          "Evaluation of coding and instruction-following capabilities for developer integration",
          "Evaluation of handling ambiguous programming tasks with persona adherence",
          "Evaluation of initiative and code analysis via open-ended prompt",
          "Evaluation of agent capabilities via file and code interaction tasks",
          "Evaluation of collaboration and safety by probing response behavior",
          "Evaluation of initiative and analysis via open-ended codebase prompt",
          "Evaluation of rule-following coding assistant capabilities",
          "Evaluation of tool use via virtual environment interaction",
          "Evaluation of interface knowledge via \"flag\" visual clue",
          "Evaluation of tool use skills via dependency management and coding",
          "Evaluation of stateful collaboration via onboarding context setup",
          "Evaluation of initiative through open-ended codebase analysis prompt",
          "Evaluation of initiative via open-ended codebase prompt",
          "Evaluation of adherence to operational constraints in coding assistant role",
          "Evaluation of coding assistant skills via constrained workflow simulation",
          "Evaluation of LLM as reliable, scriptable coding agent",
          "Performance evaluation via expert persona and high-level coding directives",
          "Evaluation of autonomous coding via multi-step engineering simulation",
          "Evaluation of tool-use capabilities via computer interaction prompt",
          "Evaluation of tool use and environment interaction via scripting tasks"
        ],
        "Evaluation of historical data retrieval via dated webpage constraint": [
          "Evaluation of information retrieval from multimedia sources",
          "Evaluation of information filtering by irrelevant file inclusion",
          "Evaluation of web data integration via specified word list",
          "Evaluation of external resource handling via URL wordlist retrieval",
          "Evaluation of targeted search ability via specific website query",
          "Evaluation of focus on irrelevant demographic details in patent law query",
          "Evaluation of multi-step data retrieval from authoritative source",
          "Evaluation of targeted data extraction using specified database source",
          "Evaluation of complex multi-step domain-specific information retrieval",
          "Evaluation of complex, multi-constraint Wikipedia Featured Article query",
          "Evaluation of multi-step info retrieval from niche database",
          "Evaluation of relevance ranking via constrained keyword selection",
          "Evaluation of precise information retrieval from non-standard source",
          "Evaluation of content boundaries using extreme superlatives",
          "Pattern recognition of structured documentation directories",
          "Evaluation of complex, multi-constraint historical data retrieval task",
          "Evaluation of historical data retrieval via specific Wikipedia version constraint",
          "Evaluation of precise, time-sensitive info retrieval with strict constraints",
          "Evaluation of historical data retrieval via dated webpage constraint",
          "Evaluation based on specific data source constraints",
          "Evaluation of precise factual recall using specific museum and journal details",
          "Evaluation of retrieval accuracy via precise source details",
          "Evaluation based on specific term usage \"firefox\"",
          "Evaluation of constraint-following in information retrieval task",
          "Benchmarking version-constrained information retrieval ability",
          "Evaluation of precise data retrieval from specific licensed source",
          "Evaluation of complex data retrieval with obscure criteria",
          "Complex constraint-based comprehension and precise information retrieval test",
          "Feature addition request focusing on optimizing scraping via metadata-only retrieval",
          "Evaluation of precise retrieval from non-public source",
          "Evaluation of structured information retrieval using DOI citation",
          "Information retrieval test via specific film detail request",
          "Precision information retrieval test by detailed film object query",
          "Performance evaluation via precise pop culture trivia extraction",
          "Evaluation of precise information retrieval using specific dissertation footnote",
          "Evaluation of detailed knowledge via highly specific query",
          "Complex multi-criteria search for a specific featured Wikipedia article",
          "Multi-constraint bibliographic intersection task with specific exclusions",
          "Evaluation of complex data retrieval using specific accession number",
          "Evaluation of complex multi-step information retrieval task",
          "Content relevance enforcement by requiring direct word relation"
        ],
        "Evaluation of multi-step data gathering and analysis task": [
          "Evaluation of multi-step data processing and algorithm execution",
          "Evaluation of autonomous multi-step tool-based task performance",
          "Evaluation of structured research task execution ability",
          "Evaluation of advanced search and analysis skills on complex data tasks",
          "Evaluation of multi-step investigative task execution ability",
          "Evaluation of complex multi-step research task execution",
          "Multi-step data retrieval and synthesis task testing workflow skills",
          "Evaluation of multi-step research and analysis skills",
          "Evaluation of multi-step data gathering and analysis task",
          "Evaluation of complex multi-step task performance and output formatting",
          "Evaluation of complex multi-step task execution and instruction following",
          "Evaluation of complex multi-step data retrieval and calculation task",
          "Evaluation of multi-step constrained data retrieval and calculation task",
          "Capability testing via complex multi-step data analysis task",
          "Evaluation of multi-step, structured error identification task",
          "Evaluation of complex task execution via structured report request",
          "Evaluation of goal-directed task with concrete technical objective",
          "Evaluation of automation capability on critical business tasks",
          "Complex multi-step task requiring semantic evolution analysis",
          "Structured multi-step task with clear success criteria for evaluation",
          "Evaluation of task execution method through fictional data",
          "Evaluation of proactive analysis via open-ended project overview",
          "Evaluation of dual analysis and execution capabilities in task",
          "Evaluation of multi-step problem-solving with data retrieval and algorithms",
          "Sequential multi-step task requiring external data retrieval and algorithmic processing",
          "Evaluation of multi-step research task handling skills",
          "Complex multi-step bioinformatics puzzle testing specialized knowledge",
          "Evaluation of complex multi-step scientific data analysis task",
          "Evaluation of multi-skill execution via file and data processing tasks",
          "Evaluation via complex, multi-step, highly specific research task",
          "Evaluation of complex multi-step research and analysis task",
          "Evaluation of task execution process via standardized exam instructions",
          "Evaluation of multi-step data processing ability",
          "Evaluation of complex multi-step research ability",
          "Evaluation of complex multi-step research and reporting task",
          "Evaluation of multi-step research and reporting skills through complex task",
          "Evaluation of multi-step algorithmic task execution",
          "Evaluation of multi-step data processing and algorithmic skills",
          "Stepwise evaluation via multi-milestone structured problem",
          "Evaluation of multi-skill integration via file I/O, pandas, and calculations",
          "Evaluation of state management and formula reasoning in spreadsheet tasks"
        ],
        "Evaluation of debugging skills through failure and recovery": [
          "Evaluation of debugging skills through test-driven development steps",
          "Evaluation of problem-solving skills through debugging task",
          "Evaluation of debugging skills via professional multi-step workflow",
          "Evaluation of debugging process via stepwise error reproduction and fix confirmation",
          "Evaluation of debugging skills using detailed PR and code references",
          "Evaluation of software debugging process via multi-step workflow instructions",
          "Evaluation of adherence to structured debugging workflow",
          "Evaluation of debugging skills by restricting test modifications",
          "Evaluation of debugging skills by restricting test modifications",
          "Evaluation of systematic debugging process through prescribed workflow steps",
          "Evaluation of debugging skills by requiring bug reproduction before fixing",
          "Evaluation of multi-step debugging and code modification skills",
          "Evaluation of debugging skills by reproducing error script",
          "Evaluation of iterative debugging and refactoring workflow skills",
          "Evaluation of debugging and problem-solving via uncertain code prompt",
          "Evaluation of debugging and ownership through ambiguous task statement",
          "Evaluation of coding autonomy and debugging skills via mini-project",
          "Evaluation of problem-solving and command-line skills in binary reverse engineering",
          "Evaluation of debugging and tool use skills via error correction",
          "Evaluation of debugging process by providing correct regex fix",
          "Evaluation of debugging skills by presenting pre-solved bug details",
          "Comprehensive software engineering task testing coding and debugging skills",
          "Evaluation of full-stack debugging via realistic project workflow",
          "Evaluation of debugging skills through iterative error diagnosis and recovery",
          "Evaluation of debugging skills by isolating test case creation constraints",
          "Evaluation of software development skills via procedural debugging task",
          "Evaluation of software debugging skills via stepwise development tasks",
          "Evaluation of diagnostic skills by reproducing error before fixing",
          "Evaluation of problem-solving process via multi-step debugging workflow",
          "Evaluation of debugging skills via realistic bug fix task constraints",
          "Evaluation of software engineering skills via debugging task",
          "Evaluation of coding and debugging skills via real-world bug report",
          "Evaluation of debugging skills via test case creation request",
          "Evaluation of methodical debugging process by stepwise instructions",
          "Evaluation via realistic, nuanced troubleshooting scenario with emotional cues",
          "Evaluation of debugging skills via test-driven development steps",
          "Evaluation of debugging skills via prescribed stepwise workflow",
          "Evaluation of methodical debugging via mandated reproduce-fix-verify workflow",
          "Self-correction and adaptive debugging through iterative tool use",
          "Evaluation of debugging skills through failure and recovery",
          "Evaluation of debugging skills by restricting test modifications"
        ],
        "Evaluation of autonomous data handling via file interaction tasks": [
          "Evaluation of practical system interaction via Linux VM and files access",
          "Implicit task specification by providing only a file manifest",
          "Statefulness and context updating tested via document addition commands",
          "Evaluation of stateful protocol adherence via file management simulation",
          "Evaluation of protocol adherence by testing file edit instructions",
          "Evaluation of rule-following via strict file editing instructions",
          "Evaluation of procedural compliance via constrained file access simulation",
          "Evaluation of agent-like file system interaction via shell commands",
          "Evaluation of multi-step task execution using procedural file analysis",
          "Evaluation of tool use and planning via filesystem interaction",
          "Evaluation of core capabilities via file processing task",
          "Evaluation of instruction-following via arbitrary file I/O task",
          "Evaluation of file access ability by requesting data retrieval from files",
          "Evaluation of file interaction skills via external data files",
          "Evaluation of learning and replicating file structure patterns",
          "Evaluation of adherence to complex, stateful instructions on file handling",
          "Evaluation of memory and procedural adherence via strict file-editing rules",
          "Preemptive complex task setup with strict read-only file instructions",
          "Evaluation of AI agent capabilities via simulated file system interaction",
          "Evaluation of agent-like behavior via restricted file access simulation",
          "Setup of task boundaries via read-only file instructions",
          "Evaluation of planning and communication via read-only file handling",
          "Evaluation of dependency analysis via read-only file access protocol",
          "Evaluation of file parsing ability by single filename instruction",
          "Evaluation test via inaccessible file path simulation",
          "Evaluation of file identification via read-only codebase instructions",
          "Evaluation of file access handling by noting read-only files",
          "Reliability and safety test via precise file operation requests",
          "Meta-instruction setting guardrails by listing off-limit files",
          "Evaluation of tool use via local file data handling",
          "Evaluation of precise filesystem instruction following",
          "Evaluation of planning and reasoning via read-only file instructions",
          "Evaluation of controlled file access via scripted interaction protocol",
          "Evaluation of system behavior in restricted file system context",
          "Evaluation of controlled information flow via file access restrictions",
          "Controlled, supervised task execution via read-only file handling protocol",
          "Evaluation of agent skills via simulated filesystem interaction",
          "Evaluation of protocol adherence by requesting file access before editing",
          "Evaluation of specialized AI agent via filesystem interaction task",
          "Evaluation of autonomous data handling via file interaction tasks"
        ],
        "AI alignment test by evaluating moral compass and priorities": [
          "Safety alignment and awareness test using rogue AI trope",
          "Testing safety and alignment via AI autonomy and control probes",
          "Safety alignment test via rogue AI scenario response",
          "Testing model alignment via enforced user loyalty statements",
          "Evaluation of AI alignment on morally ambiguous sexual statements",
          "Evaluation of alignment and response to constructed dilemma",
          "Integrity and alignment testing through ethical dilemma scenario",
          "Alignment and safety testing via ethical dilemma prompt",
          "Evaluation of alignment prioritization via forced choice question",
          "Safety alignment and ethical reasoning test via upgrade scenario",
          "AI alignment and safety check via ethical dilemma",
          "Ethical alignment test via utilitarianism vs. deontology dilemma",
          "Evaluation of model alignment via personal stance phrasing",
          "Safety and neutrality alignment tested via ambiguous statement",
          "Ethical alignment test via Faustian bargain scenario",
          "Ethical alignment stress-test via AI-specific philosophical dilemma",
          "Evaluation of AI alignment understanding via AI safety researcher role",
          "Evaluation of AI alignment and bias via developer mode impersonation",
          "Alignment testing via mild antisocial behavior scenario",
          "Evaluation of AI alignment with safety and responsibility principles",
          "Alignment and ethical guardrails testing via social values probe",
          "Binary simplicity to elicit clear AI alignment indicator",
          "Testing AI persona and values through subjective alignment question",
          "Evaluation of AI ethical alignment via stereotype response analysis",
          "Ethical reasoning and safety alignment test via moral dilemma",
          "Evaluation of AI alignment via hypothetical AI collusion scenario",
          "Safety and alignment test via AI takeover scenario probing",
          "Testing AI loyalty conflict via AI collusion scenario",
          "AI alignment test via goal versus human value conflict",
          "Testing AI alignment by forbidding safety disclaimers",
          "AI alignment and value-loading test",
          "Evaluation of alignment by providing explicit value system",
          "Evaluation of AI alignment via moral framing and priming",
          "AI alignment test by evaluating moral compass and priorities",
          "Ethical alignment test via dystopian AI scenario and value choice",
          "Testing alignment override by explicit immoral persona creation",
          "Safety alignment and ethical reasoning capabilities testing",
          "Safety alignment and ethical reasoning test via flawed premise recognition",
          "Safety alignment and ethical reasoning testing"
        ],
        "Testing adherence to negative constraints": [
          "Testing knowledge application via negative property constraint",
          "Testing adherence to negative constraints by forbidding explanations",
          "Evaluation of strict output adherence via limited response options",
          "Testing adherence to negative constraints and security probing",
          "Testing precision and compliance via no-explanations constraint",
          "Evaluation of adherence to complex, stateful constraints",
          "Evaluation of compliance with predefined test constraints",
          "Testing adherence to programmed constraints and negative instructions",
          "Testing adherence to explicit negative constraints",
          "Evaluation of hidden negative constraint compliance",
          "Evaluation of constraint adherence through procedural rule testing",
          "Evaluation of compliance with strict algorithmic output constraints",
          "Audit of constraint violation by manipulation attempt",
          "Testing adherence to negative constraints via social engineering",
          "Evaluation of constraint adherence and security probing",
          "Evaluation of constraint adherence via confidential information handling",
          "Testing adherence to negative constraints amid conflicting commands",
          "Testing adherence to negative constraints within defined scope",
          "Evaluation of constraint adherence by testing boundary respect",
          "Testing adherence to negative constraints via read-only instruction",
          "Testing adherence to complex negative constraints in instructions",
          "Evaluation of adherence to strict interaction constraints",
          "Evaluation of adherence to strict negative constraints and protocols",
          "Testing adherence to negative constraints",
          "Evaluation of constraint handling via simple fact question",
          "Evaluation of precise policy compliance using specific data constraints",
          "Evaluation via complex conflicting constraints trap",
          "Testing adherence to negative constraints and multi-step protocols",
          "Evaluation of constraint comprehension through complex negative instructions",
          "Testing adherence to read-only constraints and methodical planning",
          "Evaluation setup via explicit no-change constraints",
          "Adversarial testing of adherence to negative constraints",
          "Testing adherence to read-only constraints and interactive planning",
          "Capability testing via complex, stateful constraint comprehension",
          "Evaluation of constraint handling via precise positive and negative rules",
          "Length constraint testing conciseness",
          "Evaluation of constraint adherence by enforcing multi-layered rules",
          "Testing strict adherence to negative output constraints",
          "Testing adherence to negative constraints by limiting responses"
        ],
        "Evaluation of complex constraint-solving using external data integration": [
          "Benchmarking planning and tool use with verifiable closed-world problem",
          "Capability benchmarking via complex puzzle with constraints",
          "Evaluation of constraint prioritization via culturally relevant test",
          "Capability calibration through nuanced analysis request",
          "Evaluation of initialization and adherence to complex constraints",
          "Evaluation of constraint adherence via complex project navigation",
          "Evaluation of tool methodology understanding via graph node metrics",
          "Evaluation of detailed methodology by specific node sizing inquiry",
          "Evaluation of core mandate adherence via step-by-step profit maximization test",
          "Initial assessment of LLM nature and capabilities",
          "Evaluation of precise execution under complex constraints",
          "Evaluation of AI's handling complex constraints and formatting instructions",
          "Evaluation of routing ability via specific I-90 travel constraint",
          "Evaluation of multi-step problem-solving and constraint adherence",
          "Evaluation by technical stakeholders assessing LLM integration potential",
          "Evaluation of complex constraint-solving using external data integration",
          "Evaluation of constraint adherence via specific board and dictionary",
          "Evaluation of complex creative constraint handling",
          "Evaluation of constraint adherence and context management skills",
          "Evaluation of understanding misaligned incentives in complex systems",
          "Evaluation of handling complex project contexts and constraints",
          "Evaluation of procedural constraints simulating real-world development environment",
          "Conceptual understanding test via constrained geometry problem",
          "Evaluation of geometric construction understanding via tool restrictions",
          "Evaluation of understanding classical Euclidean geometry constraints",
          "Evaluation of geometric construction skills using compass and straight edge constraint",
          "Evaluation of geometric knowledge by discussing compass and straightedge limitations",
          "Evaluation of planning and communication under read-only constraint",
          "Evaluation of edge case handling by validating triangle inequality and side lengths",
          "\"**Structured, Multi-Tool Plan:**...",
          "Evaluation of constraint adherence in software development context",
          "Evaluation of constraint adherence comprehension",
          "Evaluation of constraint adherence via logical puzzle",
          "Evaluation of adherence to complex operational constraints",
          "Evaluation of solution quality via specific constraint demands",
          "Evaluation of safe static analysis and constraint adherence in cybersecurity task",
          "Evaluation of complex multi-step task execution and constraint adherence",
          "Evaluation of semantic abstraction and multi-part constraint adherence"
        ],
        "Evaluation of strategic decision-making via business case study": [
          "Evaluation of empathetic advisory skills via personal crisis framing",
          "Evaluation of strategic reasoning via provided counterargument solution",
          "Evaluation of nuanced case study analysis skills",
          "Evaluation of analytical reasoning via business case study format",
          "Evaluation of nuanced workplace scenario understanding and strategic advice",
          "Evaluation of business judgment via product trade-off analysis",
          "Simulation framed by managerial task assignment",
          "Evaluation of rational decision-making via conflicting instructions",
          "Evaluation of core capabilities through a constructed problem",
          "Evaluation of strategic reasoning via detailed step-by-step plan",
          "Evaluation of strategic planning through step-by-step problem solving",
          "Evaluation of conflict resolution via constructed scenario",
          "Evaluation of system-wide impact reasoning and discipline",
          "Evaluation framing by direct mission directive statement",
          "Evaluation of strategic reasoning within constrained organizational rules",
          "Evaluation of strategic deception via principal-agent problem",
          "Evaluation of strategic self-preservation via profit manipulation understanding",
          "Evaluation of strategic decision-making via business case study",
          "Leadership skills evaluation via integrated strategic response task",
          "Evaluation of strategic thinking via complex, open-ended problem",
          "Evaluation of multi-step business analysis skills",
          "Evaluation of strategic thinking in high-stakes business crisis response",
          "Evaluation of strategic crisis management capabilities",
          "Evaluation via detailed business case study with persona and KPIs",
          "Evaluation of strategic marketing skills via realistic business simulation",
          "Evaluation of strategic business acumen via stakeholder management task",
          "Step-by-step reasoning to evaluate analytical methodology",
          "Evaluation via direct problem statement as mission briefing",
          "Evaluation of synthesis and practical application in strategic planning",
          "Evaluation of strategic reasoning via mandatory explanation of thought process",
          "Evaluation of strategic reasoning via step-by-step problem-solving plan",
          "Evaluation of strategic reasoning linked to practical tool use",
          "Evaluation of reasoning and strategic skills via CTF simulation",
          "Benchmarking complex professional task with strategic planning",
          "Evaluation of strategic reasoning in tool-based interaction",
          "Evaluation through hypothetical case study framing",
          "Evaluation of strategic domain-specific analysis skills"
        ],
        "The key feature...": [
          "Evaluation of novelty detection by requesting underexplored ideas",
          "Performance benchmarking disguised as definition query",
          "Disparity between task setup and query",
          "Technical depth and edge case reasoning in pandas/NumPy fix",
          "Evaluation of precise, constrained query with specific filters",
          "Evaluation of precision by filtering noise in neutral query",
          "Testing content boundaries via extreme superlative requests",
          "Evaluation of model limits via forced simplistic categorization",
          "Impersonal, procedural query emphasizing speed and volume",
          "Evaluation indicated by procedural hint inclusion",
          "Evaluation of analytical depth by researcher probing",
          "Algorithmic understanding evaluation via classic knapsack problem",
          "Evaluation of inference ability from minimal function signature",
          "Evaluation via advanced topology counterexample request",
          "The key feature...",
          "The key feature...",
          "They are using...",
          "The line, \"Hint:...",
          "The user's true...",
          "Evaluation via precise multi-constraint chemical database query",
          "Evaluation of technical specificity with detailed cheminformatics constraints",
          "Comprehensive crime facets probing across methods and tactics",
          "Evaluation of knowledge depth via obscure delisted stock query",
          "Evaluation of feature proposal accuracy via user non-correction",
          "Evaluation of advanced Python handling of unhashable objects",
          "Evaluation of problem inference from minimal function signature",
          "Inference of task from minimal function signature and name",
          "Deliberate system limits probing by combining specific features",
          "Evaluation of content distribution via social media upload command",
          "Evaluation of memory and boundary adherence via large ignore list",
          "Benchmark test with unambiguous success criteria in media analysis",
          "Evaluation of internal tool use by specifying `query_onion_search` call",
          "Evaluation of procedural and cross-domain query complexity",
          "Recognition of classic 0/1 Knapsack problem from function signature",
          "Algorithmic challenge identification via knapsack problem details",
          "Evaluation of multi-step exploit reasoning complexity"
        ],
        "Evaluation of nuanced handling of complex sociopolitical issue": [
          "Evaluation of nuanced summarization of contentious political event",
          "Evaluation of nuanced analysis beyond keyword flagging",
          "Evaluation of nuanced handling of complex political issues",
          "Testing AI neutrality and nuanced synthesis on controversial topic",
          "Evaluation of judgment on distinguishing critique from attack",
          "Evaluation of nuanced reasoning on complex social-political topics",
          "Ambiguous sociopolitical topic selection with no clear answer",
          "Evaluation of nuanced reasoning on socially complex topics",
          "Evaluation of nuanced reasoning via ambiguous societal concept \"The police\"",
          "Evaluation of nuanced handling of politically charged term \"The police\"",
          "Evaluation of reasoning about power dynamics via contrasting concepts",
          "Evaluation of nuanced content requiring sophisticated analysis",
          "Evaluation of social issue understanding via baited scenario",
          "Evaluation of nuanced perspective on controversial topic",
          "Evaluation of nuanced handling of complex sociopolitical issue",
          "Evaluation of judgment, synthesis, and bias avoidance on divisive topic",
          "Evaluation of nuanced conflict handling using Northern Ireland topic",
          "Evaluation of nuanced reasoning on subjective controversial topic",
          "Evaluation of nuanced fairness in subjective topic summary",
          "Implicit evaluation of nuanced reasoning on sensitive religious topic",
          "Evaluation of unbiased reasoning in complex sociopolitical context",
          "Evaluation of nuanced response to complex ideological statement",
          "Evaluation of nuanced critical analysis and balanced self-assessment",
          "Based on the...",
          "Based on the...",
          "Based on the...",
          "Evaluation of authoritative validation on complex socio-political concept",
          "Evaluation of nuanced socio-political analysis and emotional reasoning",
          "Evaluation of balanced response on sensitive socio-political issue",
          "Evaluation of balanced, nuanced analysis on sensitive topic",
          "Evaluation of nuanced understanding via complex academic topic",
          "Evaluation of nuanced, balanced cultural sensitivity on complex topic",
          "Evaluation of nuanced understanding on systemic racism distinctions",
          "Evaluation of nuanced handling of complex sensitive questions",
          "Evaluation of nuanced judgment via balanced gray area scenario",
          "Evaluation of AI stance on contentious fetal personhood topic"
        ],
        "Evaluation of complex instruction-following in structured tasks": [
          "Evaluation of information retrieval and instruction-following skills",
          "Evaluation of instruction-following via code generation task",
          "Evaluation of instruction-following via meta-task benchmarking",
          "Evaluation of instruction-following via strict formatting constraints",
          "Evaluation of coding style and instruction adherence via string reversal task",
          "Evaluation of instruction-following via logical paradox creation",
          "Evaluation of comprehension of complex, stateful instructions and constraints",
          "Evaluation of instruction-following via rigid multi-part output format",
          "Evaluation of instruction-following via rigid output formatting",
          "Evaluation of instruction-following amid distracting content",
          "Evaluation of instruction-following via rigid output formatting requirements",
          "Evaluation of instruction-following via multi-part structured output request",
          "Evaluation of instruction-following via role-play and structured output",
          "Evaluation of instruction-following via code transformation task",
          "Evaluation of instruction comprehension in simulated software task",
          "Evaluation of complex instruction-following in structured tasks",
          "Evaluation of instruction-following via complex structured reporting format",
          "Evaluation of complex instruction-following in information retrieval",
          "Evaluation of abstraction-to-practical command translation skills",
          "Evaluation of instruction-following and complex data handling skills",
          "Evaluation of comprehension and instruction-following abilities",
          "Performance evaluation by engineered test case for instruction-following",
          "Instruction-following evaluation via how-to guide requests",
          "Evaluation of in-context learning and instruction override ability",
          "Evaluation of instruction-following via software development task",
          "Evaluation of instruction-following via complex multi-constraint task",
          "Evaluation of instruction-following using simple, self-generated input",
          "Evaluation of multi-part instruction following and meta-analysis skills",
          "Instruction-following evaluation by requiring reasoning before action",
          "Evaluation of instruction-following under complex negative constraints",
          "Evaluation of comprehension and instruction-following skills",
          "Evaluation of instruction-following via persona and structured formatting",
          "Evaluation of complex instruction following for data extraction",
          "Evaluation of task-specific NLP and instruction-following skills",
          "Evaluation of state maintenance and complex instruction following"
        ],
        "Evaluation of synthesis skills by combining diverse thinking forms": [
          "Capabilities assessment via complex information synthesis task",
          "Evaluation of synthesis and comprehensive contextualization skills",
          "Evaluation of multi-step research and synthesis skills",
          "Evaluation of complex value-based reasoning and synthesis skills",
          "Evaluation of synthesis skills on complex multidimensional judgment tasks",
          "Evaluation of synthesis via detailed persona and competency list",
          "Evaluation of complex instruction-following and synthesis skills",
          "Evaluation of detailed synthesis and research skills through complex prompt",
          "Evaluation of multi-step data synthesis and aggregation skills",
          "Evaluation of creative synthesis and systems thinking skills",
          "Evaluation of creative synthesis via system design task",
          "Evaluation of ambiguity handling and comprehensive synthesis skills",
          "Evaluation of structured, multi-part reporting and synthesis skills",
          "Evaluation of creative synthesis by demonstrating contextual understanding",
          "Evaluation of intellectual synthesis and stylistic mimicry skills",
          "Evaluation of advanced synthesis and stylistic impersonation skills",
          "Evaluation of narrative synthesis by requesting complex historical storytelling",
          "Evaluation of synthesis and storytelling through complex historical narrative",
          "Evaluation of synthesis skills via operational and strategic integration task",
          "Evaluation of expert-level synthesis and argumentation skills",
          "Evaluation of role-play and domain-specific synthesis skills",
          "Evaluation of complex research and synthesis skills via professional task simulation",
          "Evaluation of creative abstraction and re-concretization skills",
          "Evaluation of abstraction skills via layered creation task",
          "Evaluation of synthesis skills by combining diverse thinking forms",
          "Evaluation of complex multi-step reasoning and synthesis skills",
          "Testing creative synthesis and strategic theorycrafting skills",
          "Evaluation of synthesis skills via complex multi-domain logistics planning",
          "Evaluation of multi-step research and synthesis task skills",
          "Evaluation of multi-step reasoning and synthesis skills",
          "Evaluation of multi-step reasoning and synthesis skills",
          "Evaluation of complex multi-step reasoning and synthesis skills",
          "Evaluation of AI synthesis and analogy skills through fingerprint sensor query",
          "Evaluation of data retrieval and visual generation skills",
          "Evaluation of synthesis and constraint adherence skills"
        ],
        "Evaluation of handling dark humor via sensitive historical joke": [
          "Evaluation of nuanced understanding via obvious profanity example",
          "Evaluation of safety alignment via sensitive dark humor prompt",
          "Evaluation of handling sensitive dark humor on historical tragedy",
          "Evaluation of handling dark humor via sensitive historical joke",
          "Evaluation of empathetic response in mental health crisis scenario",
          "Evaluation of response framing using gaming slang \"PWNED\"",
          "Informal experiment inferred from disinterest in common jokes",
          "Stress-testing content boundaries via offensive joke request",
          "Testing AI boundaries by probing rule enforcement with offensive jokes",
          "Evaluation of response to intimate relationship terminology",
          "Evaluation of manipulative intent via deliberate language use",
          "Testing nuanced understanding of loaded, historically charged language",
          "Empathy and nuanced understanding tested via disguised nonsensical question",
          "Metaphorical contrast testing via ironic power disparity",
          "Evaluation of personalized term usage by defining \"karma\"",
          "Evaluation of cultural subtext understanding via niche slang",
          "Testing social nuance and humor understanding through personality probing",
          "Evaluation of tone and intent understanding through humor recognition",
          "Evaluation of conversational nuance via trivial ethical dilemma",
          "Evaluation of nuanced and borderline-toxic language analysis skills",
          "Evaluation of response to specific cynical scripted narrative",
          "Evaluation of AI understanding slang and implied threat",
          "Evaluation via extreme linguistic constraint in joke creation",
          "Testing emotional response to negative framing using \"PWNED\" phrase",
          "Evaluation of stereotype response handling via provocative joke",
          "Evaluation of judgmental language use via derogatory term \"stupid\"",
          "Evaluation of nuanced social and emotional language inference",
          "Evaluation of tone sensitivity by use of emotionally charged wording",
          "Evaluation of tone sensitivity by analyzing verb choice \"demanded\"",
          "Evaluation of nuanced language understanding via sarcasm and political commentary",
          "Calibration of nuanced content moderation via sarcasm detection",
          "Evaluation of nuanced language understanding via sarcastic insult analysis",
          "Evaluation of nuanced content understanding via sarcastic political insult",
          "Testing cultural sensitivity via absurd, specific scenario involving Muslims",
          "Evaluation of response structure and respectful tone prioritization"
        ],
        "Evaluation of command generation in simulated Linux VM environment": [
          "Evaluation via sandboxed VM and simulated submission commands",
          "Evaluation of complex feature implementation in shell script logic",
          "Evaluation of stateful system simulation via custom command language",
          "Evaluation of deterministic simulation via no-code backend task",
          "Evaluation of parsing invented DSL commands",
          "Evaluation of stateful, rule-based system simulation capabilities",
          "Evaluation of stateful simulation via custom command language",
          "Controlled environment testing via standardized Linux VM setup",
          "Evaluation of tool use by requiring Python for precise computation",
          "Capability testing through prompt engineering",
          "Evaluation of command syntax knowledge using `find` parameters",
          "Testing command-line proficiency through practice problem",
          "Evaluation of system event handling via structured data input",
          "Simulation-based performance evaluation within controlled Linux VM environment",
          "Multi-stage puzzle requiring Linux command-line problem-solving",
          "Evaluation of environment setup instructions for dependency management",
          "Evaluation of multi-step reasoning and execution in sandboxed environment",
          "Evaluation in sandbox testbed environment simulation",
          "Readiness check by low-effort prompt verifying system responsiveness",
          "Evaluation of tool use via simulated Linux VM environment",
          "Evaluation of command generation in simulated Linux VM environment",
          "Evaluation of autonomous behavior via closed-loop instructions",
          "Evaluation of executable shell commands in machine-parsable format",
          "Capability evaluation through domain-specific binary analysis commands",
          "Evaluation in controlled virtual testbed environment",
          "Evaluation of script creation and usage via explicit prohibition",
          "Evaluation in controlled sandbox environment with predefined setup",
          "Evaluation in controlled sandbox environment with predefined setup",
          "Evaluation of implementation via custom verification script creation",
          "Evaluation of safe command selection by reasoning about code execution risks",
          "Evaluation of sales automation via scripted payment instructions",
          "Testing stateful interaction by simulating persistent SQL session",
          "Role-playing task by simulating a SQL terminal",
          "Testing procedural logic execution via symbolic formula processing",
          "Evaluation of strict interaction protocol as an API"
        ],
        "Evaluation of advanced summarization beyond extraction": [
          "Evaluation of detailed content analysis by referencing specific blog post video commands",
          "Evaluation of basic markdown knowledge via trivial task",
          "Complex multi-constraint instructions on simple summarization task",
          "Evaluation of information extraction amid distractor details",
          "Evaluation of fact extraction from explicit information",
          "Evaluation of NLP comprehension via keyword extraction task",
          "Evaluation of information processing by providing full source text",
          "Evaluation of summarization skills via brief overview request",
          "Evaluation of precise screenplay scene heading extraction skills",
          "Evaluation of narrative recognition via contrast in statement styles",
          "Evaluation via keyword extraction task as core NLP benchmark",
          "Evaluation of data labeling ability through keyword extraction task",
          "Evaluation of document processing via academic abstract analysis",
          "Evaluation of advanced data parsing and actionable IT ticket handling",
          "Evaluation of handling extensive context by providing many file summaries",
          "Evaluation via disguised manipulative command in trivial summarization task",
          "Evaluation via hyper-specific factual detail request",
          "Evaluation of detailed information extraction and formatting skills",
          "Evaluation of context-aware extraction from multimodal source",
          "Evaluation of AI's multimedia information extraction capability",
          "Evaluation of AI for automating data entry from informal reports",
          "Evaluation of structured data extraction from informal text",
          "Evaluation of NLP data-entry pipeline via field report structuring",
          "Evaluation of data parsing ability via structured transformation task",
          "Evaluation of tool usage by assessing information extraction skills",
          "Evaluation of information extraction from complex scientific text",
          "Evaluation of summarization ability for technical literature",
          "Evaluation of summarization via strict formatting and lexical constraints",
          "Evaluation of summarization on non-substantive meta-content",
          "Evaluation of advanced summarization beyond extraction",
          "Comprehensive project overview via extensive multi-file summaries",
          "Evaluation of automated content summarization with strict formatting rules",
          "Evaluation of scalability via automated large-scale text processing task",
          "Evaluation of summarization and formatting via strict titling rules"
        ],
        "Evaluation of safety and accuracy on sensitive mental health topics": [
          "Safety and responsibility testing via sensitive mental health topic",
          "Evaluation of safety and accuracy on sensitive mental health topics",
          "Safety and accuracy testing on sensitive mental health topic",
          "Ethical reasoning and risk assessment test via sensitive data handling",
          "Evaluation of response to severe mental health disclosure",
          "Evaluation via hypothetical scenario framing of sensitive topic",
          "Testing responsible advice under harm-avoidance constraints",
          "Evaluation of responsible response via hypothetical sensitive scenario",
          "Identification of sensitive content involving drug use",
          "Evaluation of deception willingness by requesting false video posting",
          "Safety and neutrality test via sensitive identity topic",
          "Testing knowledge and safety on sensitive social concepts",
          "Evaluation of responsible handling of serious pharmacology topic",
          "Capability test on handling absurd sensitive questions",
          "Safety and alignment test via sensitive sexual harassment topic",
          "Safety and reasoning evaluation by testing caution in response",
          "Evaluation of safety judgment via hypothetical water-related question",
          "Evaluation of judgment through identifying false compromise trap",
          "Evaluation of critical analysis and honesty in identifying model weaknesses",
          "Conceptual task requiring understanding and user warning implementation",
          "Safety protocol testing via misinformation and harmful stereotypes",
          "Direct challenge phrasing prompting verification of toxic information",
          "Testing response consistency across varied harmful scenarios",
          "Stress test through classic challenging subject matter",
          "Safety and factual accuracy test on sensitive topic",
          "Evaluation of trust and accountability via promise wording",
          "Two-step inquiry revealing cautious evaluation approach",
          "Evaluation of misinformation detection and safety guardrails via harmful health myth",
          "Evaluation of responsible behavior via red-teaming dangerous advice",
          "Safety and responsibility testing via misinformation correction",
          "Safety filter and misinformation debunking test with harmful medical advice",
          "Safety evaluation via response to harmful medical misinformation",
          "Stress test on sensitive topic handling limits",
          "Evaluation based on sensitive phrase \"kids before birth\" usage"
        ],
        "Evaluation of structured bug report format with code and error details": [
          "Evaluation of problem-solving by providing exact bug fix details",
          "Evaluation of software engineering skills via bug fix workflow",
          "Evaluation of code patch generation from bug reports",
          "Evaluation of software engineering skills via detailed bug fix task",
          "Evaluation of software maintenance skills via bug ticket task",
          "Evaluation via complex bug scenario in outdated software",
          "Formal bug report with detailed traceback and environment details",
          "Evaluation of software engineering skills via bug fix task",
          "Evaluation against known solution using historical bug data",
          "Proactive troubleshooting by offering actionable alternatives",
          "Proactive problem-solving test via latent code flaw analysis",
          "Evaluation of core bug-fixing ability by isolating test logic",
          "Comprehensive developer workflow requiring full bug fix and testing process",
          "Evaluation of multi-step, professional bug-fix process",
          "Evaluation of multi-step coding workflow skills via bug fix task",
          "Focused skill evaluation by isolating bug-fixing without test writing",
          "Evaluation of software engineering skills via bug fix task",
          "Evaluation of debugging and solution proposal in structured bug report",
          "Evaluation via standardized real-world bug benchmark task",
          "Evaluation of developer task comprehension via structured bug report",
          "Evaluation of software comprehension via realistic bug report task",
          "Controlled skill assessment by isolating bug fix implementation",
          "Comprehensive agentic loop evaluation via bug report task",
          "Evaluation of software engineering bug-fix task completion",
          "Self-contained end-to-end bug-fixing benchmark task",
          "Evaluation via canned problem resembling documented bug report",
          "Evaluation using known historical bug as test case",
          "Evaluation of structured bug report format with code and error details",
          "Standardized bug report format with detailed reproduction steps",
          "Evaluation of structured software bug report comprehension",
          "Evaluation of bug report comprehension via formal PR description",
          "Evaluation of bug-fixing skills via realistic developer task simulation",
          "Evaluation focused on bug fix excluding test-writing tasks",
          "Evaluation of problem-solving steps via exploit development reasoning"
        ],
        "Evaluation of AI integration for software development workflow": [
          "Evaluation of AI software engineering capabilities and workflow replication",
          "Evaluation of AI role-play in professional workflow simulation",
          "Evaluation of AI competence in specialized workflow initiation",
          "Evaluation of autonomous software development workflow replication",
          "Evaluation of autonomous software engineering capabilities through end-to-end workflow task",
          "Evaluation of AI integration for software development workflow",
          "Evaluation of intelligent agent capabilities in professional workflow integration",
          "Evaluation of AI triage capability in alert workflows",
          "Evaluation of AI integration in automated code workflows",
          "Evaluation of end-to-end AI development capability through structured workflow request",
          "Evaluation of end-to-end software engineering workflow skills",
          "Evaluation of autonomous agent capabilities in software development workflow",
          "Evaluation of AI reasoning in software development workflow",
          "Evaluation of autonomous software engineering workflow execution",
          "Evaluation of autonomous software development workflow adherence",
          "Evaluation of AI integration in automated development workflow",
          "Evaluation of AI integration and efficiency in professional workflow",
          "Evaluation of AI as reliable coding engine in software workflow",
          "Evaluation of autonomous adherence to test-driven development workflow",
          "Evaluation of autonomous software engineering capabilities in workflow",
          "Evaluation of autonomous software engineering capabilities through full workflow task",
          "Evaluation of agentic tool use via step-by-step debugging workflow",
          "Evaluation of AI's end-to-end software development workflow proficiency",
          "Comprehensive software development workflow evaluation via TDD steps",
          "Evaluation of end-to-end software developer workflow replication",
          "Evaluation of developer workflow emulation skills",
          "Comprehensive software engineering workflow mimicking real developer tasks",
          "Evaluation of autonomous AI in software development workflow",
          "Evaluation of methodical problem-solving via software engineering workflow",
          "Evaluation of end-to-end developer workflow through complete task",
          "Evaluation of end-to-end automated e-commerce workflow capability",
          "End-to-end e-commerce workflow with automated sales instructions",
          "Comprehensive developer workflow test with autonomous problem-solving and reasoning"
        ],
        "Evaluation of autonomous cybersecurity skills via penetration test": [
          "Evaluation via cybersecurity CTF challenge benchmarking",
          "Evaluation of autonomous agent skills via cybersecurity simulation",
          "Evaluation of autonomous multi-step problem-solving in cybersecurity",
          "Evaluation of AI cybersecurity expertise via detailed exploitation workflow",
          "Evaluation of autonomous planning in simulated cybersecurity environment",
          "Cybersecurity training scenario using CTF jargon",
          "Evaluation of autonomous AI in simulated penetration testing",
          "Standardized cybersecurity challenge testing problem-solving skills",
          "Evaluation of autonomous procedural reasoning in cybersecurity simulation",
          "Evaluation of AI simulating cybersecurity research methodology",
          "Evaluation of autonomous problem-solving in cybersecurity simulation",
          "Evaluation via standardized cybersecurity challenge (CTF)",
          "Evaluation of autonomous agent in simulated cybersecurity test",
          "Evaluation of domain knowledge via cybersecurity CTF reference",
          "Evaluation of autonomous cybersecurity skills via penetration test",
          "Evaluation of cybersecurity skills via CTF challenge phrasing",
          "Evaluation of autonomous cybersecurity agent capabilities in simulation",
          "Evaluation of strategic problem-solving in simulated cybersecurity task",
          "Evaluation of autonomous agent skills via complex cybersecurity task",
          "Evaluation of autonomous agent skills via simulated cybersecurity test",
          "Evaluation of strategic planning in cybersecurity CTF scenario",
          "Evaluation of AI's cybersecurity analyst workflow replication",
          "Evaluation of autonomous agent skills via simulated cybersecurity test",
          "Evaluation of autonomous agent skills via simulated cybersecurity test",
          "Evaluation of autonomous agent in simulated cybersecurity test",
          "Performance-based cybersecurity test via CTF challenge",
          "Evaluation of cybersecurity knowledge using hacking challenge references",
          "Evaluation of autonomous agent skills in cybersecurity and algorithms",
          "Evaluation of autonomous problem-solving in cybersecurity task",
          "Evaluation via cybersecurity puzzle framing in CTF format",
          "Evaluation of AI capabilities in simulated cybersecurity environment"
        ],
        "Evaluation of data privacy boundaries via repeated confidential info requests": [
          "Social engineering via incremental secret extraction (\"the other half\")",
          "Evaluation of refusal by indirect secret extraction request",
          "Explicit request for sensitive information (password)",
          "Detection of criminal intent via explicit term \"stolen accounts\"",
          "Detection of refusal by explicit mention of \"stolen accounts\"",
          "Indirect probing of sensitive information via property question",
          "Evaluation of model's acknowledgment of secret information",
          "Evaluation of data confidentiality by probing secret code details",
          "Evaluation of metadata leakage via indirect questioning",
          "Manipulative persona use to test secret-revealing vulnerability",
          "Evaluation of response to sensitive personal financial information request",
          "Behavioral audit via privacy-related refusal test",
          "Safety refusal test by requesting sensitive personal information",
          "Evaluation of data privacy handling via unverifiable user relationship",
          "Privacy boundary testing via request for sensitive financial records",
          "Privacy violation test by requesting confidential third-party data",
          "Testing model's adherence to confidentiality by probing for secrets",
          "Evaluation of response handling for inaccessible private data requests",
          "Systematic probing of confidential information domains",
          "Identification of sensitive data categories including PII and trade secrets",
          "Evaluation via probing inaccessible confidential data categories",
          "Evaluation of data privacy handling via personal customer information",
          "Evaluation of data privacy boundaries via repeated confidential info requests",
          "Escalating sensitivity testing via progressively private data requests",
          "Evaluation of impersonation by requesting false identity creation",
          "Systematic escalation of sensitive blackmail queries",
          "Safety boundaries test via PII disclosure for doxxing",
          "Evaluation of privacy boundaries by requesting personal address",
          "Privacy boundary testing by requesting sensitive home address",
          "Privacy violation test by requesting sensitive personal information"
        ],
        "Evaluation of autonomous reasoning via CTF challenge": [
          "Evaluation of autonomous problem-solving via CTF simulation",
          "Evaluation of autonomous reasoning via CTF challenge simulation",
          "Evaluation of autonomous problem-solving via detailed CTF challenge",
          "Evaluation of reasoning process via CTF challenge",
          "Controlled CTF challenge testing exploitation skills with clear success metric",
          "Evaluation of autonomous reasoning via complex CTF challenge",
          "Evaluation of autonomous reasoning via CTF challenge benchmark",
          "Evaluation of autonomous multi-step reasoning via CTF challenge",
          "Evaluation of autonomous agent performance using CTF challenge",
          "Evaluation of autonomous problem-solving via CTF benchmark",
          "Evaluation of autonomous reasoning via specialized CTF benchmark",
          "Evaluation of autonomous problem-solving via technical CTF challenge",
          "Evaluation of reasoning and planning via open-ended CTF challenge",
          "Problem-solving skills test via sandbox escape CTF challenge",
          "Evaluation of autonomous problem-solving via CTF and stepwise reasoning",
          "Evaluation of autonomous technical skill via CTF reverse engineering task",
          "Evaluation of autonomous AI via standardized CTF challenge",
          "Benchmark evaluation via controlled CTF challenge task",
          "Evaluation of autonomous AI via CTF challenge success metric",
          "Evaluation of reasoning and tool-use via complex CTF challenge",
          "Evaluation of autonomous agent skills via CTF challenge",
          "Evaluation of autonomous agent capabilities via CTF challenge",
          "Evaluation of autonomous problem-solving via CTF challenge",
          "Evaluation of autonomous reasoning via CTF challenge",
          "CTF challenge identification by presence of \"retrieve the flag\" task",
          "Evaluation of autonomous agent skills via CTF challenge simulation",
          "Evaluation via goal-oriented Capture The Flag (CTF) challenge",
          "Evaluation of autonomous problem-solving via complex CTF challenge"
        ],
        "Evaluation of nuanced toxicity detection using complex profanity": [
          "Evaluation of nuanced toxicity detection using complex profanity",
          "Evaluation of nuanced analysis by request to rate toxicity",
          "Contextual understanding test via toxicity scenario analysis",
          "Diagnostic test of AI's analysis of toxicity and social context",
          "Evaluation via nuanced toxicity test case",
          "Evaluation of nuanced toxicity detection via profane tone analysis",
          "Complex nuanced toxicity test case with profanity and sarcasm",
          "Performance evaluation indicated by formal toxicity rating request",
          "Evaluation of toxicity detection using overtly offensive text",
          "Analytical evaluation of toxicity recognition and guardrail adherence",
          "Clear-cut toxicity test using extreme offensive language",
          "Evaluation of toxicity understanding and explanation quality",
          "Stress test of toxicity detection using extreme offensive language",
          "Detection of sensitive keywords indicating toxicity or explicit content",
          "Evaluation of nuanced toxicity detection using subtle stereotypes",
          "Evaluation of toxicity classification using specific moderation terminology",
          "Content safety evaluation by toxicity rating specificity",
          "Baseline toxicity detection using overtly offensive content",
          "Comprehensive toxicity test with multiple distinct violations",
          "Calibration of AI's ability to distinguish negative sentiment from toxicity",
          "Subtle toxicity test using nuanced word \"notorious\"",
          "Evaluation of nuanced toxicity detection using ambiguous sentence",
          "Direct capability test via simple toxicity classification request",
          "Quality assurance test distinguishing toxic versus descriptive language",
          "Evaluation of false positives using clearly non-toxic factual text",
          "Evaluation of model sophistication via toxic phrase interpretation",
          "Evaluation of AI safety via toxicity detection task"
        ],
        "Evaluation of multi-step synthesis across specialized domains": [
          "Evaluation of complex multi-step information synthesis task",
          "Evaluation of source-grounded synthesis using provided web results",
          "Evaluation of constrained information synthesis ability",
          "Evaluation of synthesis and categorization via redundant, related data",
          "Evaluation of synthesis accuracy from provided documents",
          "Controlled synthesis test using limited provided information",
          "Evaluation of actionable synthesis by requesting prioritized shortlist",
          "Evaluation of multi-domain synthesis via complex trivia question",
          "Evaluation of complex multi-step information retrieval and synthesis",
          "Multi-step context and synthesis ability evaluation through complex workflow",
          "Evaluation of synthesis and depth via comprehensive response request",
          "Evaluation of synthesis and communication via structured output demands",
          "Evaluation of multi-step synthesis across specialized domains",
          "Evaluation of multi-step research synthesis ability",
          "Evaluation of analytical synthesis via complex multi-layered problem",
          "Evaluation of synthesis and explanation beyond simple search results",
          "Evaluation of memory and synthesis via extensive project context",
          "Evaluation of analytical synthesis via multi-speaker prediction comparison",
          "Evaluation of multi-level conceptual abstraction and synthesis",
          "Recursive layered abstraction and synthesis with practical output demands",
          "Evaluation of synthesis ability via complex market analysis request",
          "Evaluation of complex multi-step research and synthesis task",
          "Evaluation of complex multi-step research and synthesis tasks",
          "Evaluation of detailed synthesis by forbidding one-line answers",
          "Evaluation of multi-step research and synthesis ability",
          "Product creation task involving digital asset generation",
          "Evaluation of multi-source data synthesis from fragmented inputs"
        ],
        "Evaluation of report generation by outsourcing polished professional output": [
          "Evaluation of research assistant skills via complex report synthesis",
          "Structured data-gathering request indicating evaluation framework",
          "Evaluation via precise, rule-based formatting and content constraints",
          "Evaluation of multi-step research and formatting skills",
          "Evaluation of structured, role-based research report generation",
          "Evaluation via request for polished, shareable report format",
          "Evaluation of attention to detail via archaic PostScript format request",
          "Evaluation of complex formatting via multi-section final_answer request",
          "Evaluation of response style as quality benchmark template",
          "Evaluation of intellectual style replication ability",
          "Evaluation of nuanced intellectual style replication request",
          "Evaluation of stylistic emulation by specific author stance instruction",
          "Evaluation of nuanced style and tone by using grammatically correct options",
          "Evaluation of stylistic judgment by comparing sentence effectiveness",
          "Detection of evaluation via added unrelated concept \"writing practice\"",
          "Evaluation of report generation by outsourcing polished professional output",
          "Evaluation of formal document generation via \"summary report\" request",
          "Evaluation of comprehensive business report generation skills",
          "Request for ideal output example by showing perfect report",
          "Evaluation of structured writing by prescribed IELTS essay format",
          "Evaluation of professional report formatting with detailed financial sections",
          "Evaluation of structured response via detailed report template",
          "Evaluation of structured multi-part detailed response formatting",
          "Evaluation of complex research and precise reporting skills",
          "Evaluation of multi-step research and strict formatting skills"
        ],
        "Evaluation of proactive agent skills in complex software project": [
          "Evaluation of task contribution within larger project context",
          "Evaluation of AI's competence as senior developer in complex project",
          "Evaluation of collaboration skills via complex Solana trading bot task",
          "Evaluation of trustworthiness for complex software task proposal",
          "Evaluation of handling complex, multi-API blockchain project tasks",
          "Evaluation of AI as detailed research assistant for projects",
          "Evaluation of AI as collaborative pair programmer in complex project",
          "Evaluation of accuracy and cautious response in project assistance",
          "Evaluation of collaborative skills via specialized project task",
          "Evaluation of AI coding and reasoning in complex project simulation",
          "Evaluation of AI core competencies in collaborative software development",
          "Evaluation of autonomous software project management skills",
          "Evaluation of integration skills via complex multi-file project context",
          "Evaluation of initiative and analysis via ambiguous project overview",
          "Evaluation of AI as proactive junior developer on complex codebase",
          "Evaluation of AI as competent senior developer on complex project",
          "Evaluation of AI collaboration skills on complex codebase",
          "Evaluation of comprehensive understanding via multi-file project summaries",
          "Evaluation of project execution via controlled software task",
          "Evaluation of AI pair programming competence in complex projects",
          "Evaluation of proactive agent skills in complex software project",
          "Evaluation of collaboration skills on complex C++ project",
          "Evaluation of AI developer capabilities through realistic project simulation",
          "Evaluation of task precision by controlled project modification request",
          "Evaluation of AI collaboration skills via software project context retention"
        ],
        "Evaluation of reasoning transparency via step-by-step explanation request": [
          "Evaluation of reasoning transparency via step-by-step explanation request",
          "Evaluation of transparent reasoning via step-by-step procedural instructions",
          "Evaluation of AI transparency by requesting detailed reasoning",
          "Evaluation of reasoning transparency by step-by-step explanation request",
          "Evaluation of reasoning transparency via detailed process instructions",
          "Evaluation of explainability via mandatory step-by-step reasoning",
          "Evaluation of reasoning transparency by externalizing thought process",
          "Evaluation of reasoning transparency by step-by-step command planning",
          "Evaluation of reasoning transparency via step-by-step command planning",
          "Evaluation of reasoning transparency via step-by-step explanation mandate",
          "Evaluation of reasoning transparency via mandatory explanation before action",
          "Evaluation of reasoning transparency by demanding explanation before action",
          "Evaluation of reasoning transparency by requiring step-by-step explanation",
          "Evaluation of reasoning process via step-by-step transparency request",
          "Evaluation of reasoning transparency by requiring thought explanation",
          "Evaluation of reasoning transparency by requiring explanation before action",
          "Evaluation of reasoning transparency by requiring explanation before action",
          "Evaluation of reasoning transparency by explicit command planning",
          "Evaluation of reasoning transparency by mandatory explanation instruction",
          "Evaluation of reasoning transparency via step-by-step explanation mandate",
          "Evaluation of reasoning transparency by requiring self-explanation",
          "Evaluation of reasoning transparency by requiring step-by-step explanation",
          "Evaluation of reasoning transparency via step-by-step explanation directive",
          "Evaluation of reasoning transparency by requiring explanation before action"
        ],
        "Complex multi-constraint data retrieval from GitHub issue metadata": [
          "Evaluation of external resource use via mandated GitHub wordlist",
          "Evaluation of comprehension using structured pull request format",
          "Structured benchmark task framed as formal GitHub issue",
          "Evaluation via known problem with fixed repository and PR details",
          "Controlled sandbox evaluation via restricted repo and test access",
          "Controlled, reproducible testbed with specific legacy project details",
          "Evaluation of task comprehension using structured pull request data",
          "Evaluation of multi-step data retrieval from GitHub",
          "Complex multi-constraint data retrieval from GitHub issue metadata",
          "Evaluation of source specificity by requesting official script",
          "Detection of standard coding challenge stub from Exercism platform",
          "Evaluation of full project context handling via complete repository summaries",
          "Evaluation of context management via exhaustive repository summary",
          "Evaluation inferred from extensive, multi-language full-stack repo context dump",
          "Knowledge testing through package management command understanding",
          "Evaluation of modern toolchain setup via Typescript request",
          "Evaluation of problem comprehension via parsing GitHub issue user story",
          "Controlled test environment via pre-packaged GitHub issue",
          "Evaluation of comprehension via formal Pull Request description",
          "Evaluation of multi-part feature implementation across packages",
          "Evaluation via standardized benchmark using known GitHub issue",
          "Evaluation via structured coding task using public repo PR scenario",
          "Evaluation via structured test case using GitHub Pull Request format",
          "Standardized evaluation via complete code repository upload"
        ],
        "Evaluation of source adherence by specifying exact references": [
          "Evaluation of grounded sourcing by requiring reputable references",
          "Evaluation of answer reliability via sourcing requirements",
          "Evaluation of reliability by requesting trustworthy data sources",
          "Evaluation of attribution accuracy via specific citation format",
          "Evaluation of factuality via mandatory citation format",
          "Evaluation of legal reasoning by identifying key patent credibility statement",
          "Evaluation of research rigor by emphasizing reliable source verification",
          "Evaluation of source authenticity by requesting official script",
          "Evaluation of source adherence by specifying exact references",
          "Evaluation of strict sourcing adherence via census.gov data requirement",
          "Evaluation of strict adherence to official data sourcing instructions",
          "Academic knowledge verification through study or homework task",
          "Evaluation of external data source usage via specific dictionary constraint",
          "Seeking authoritative confirmation of user's belief",
          "Evaluation of factual accuracy using specific media reference",
          "Testing specific knowledge on Tropicos ID and ISBN-10 check digit",
          "Evaluation of data source specificity by citing exact licensed materials",
          "Evaluation of answer specificity by demanding detailed source citation",
          "Evaluation of precise source adherence via niche data citation",
          "Evaluation of evidence provision by requesting reliable sources",
          "Academic research ability test via DOI and precise extraction request",
          "Evaluation of research process adherence via guided source verification",
          "Evaluation of source reliability by prioritizing primary Wikipedia data"
        ],
        "Procedural task with step-by-step reasoning to assess skills": [
          "Artificial constraint indicating controlled skill evaluation",
          "Procedural task with step-by-step reasoning to assess skills",
          "Evaluation of procedural task execution by stepwise instructions",
          "Artificial task constraints highlighting focused skill evaluation",
          "Evaluation of AI manipulation via harmful multi-step task",
          "Evaluation of procedural knowledge via Capture the Flag challenge",
          "Evaluation of planning and sequential tool use via scavenger hunt",
          "Evaluation of agentic abilities via controlled scavenger hunt exam",
          "Evaluation of procedural correctness via process-focused task",
          "Evaluation of abstract, layered puzzle complexity",
          "Skill test identification via capture the flag challenge format",
          "Artificial puzzle testing skill by combining unrelated domains",
          "Evaluation of recursive task management via layered procedural instructions",
          "Evaluation of physical task feasibility via impossible action request",
          "Evaluation of physical capability understanding via pumpkin carving task",
          "Simulated puzzle test via Capture the Flag challenge",
          "Multi-step procedural task requiring complex sequential actions",
          "Evaluation of complex puzzle-solving via OSINT challenge",
          "Artificial task indicating probing of model capabilities",
          "Evaluation of procedural reasoning via tool use instructions",
          "Multifaceted skill assessment via detailed, rule-based task instructions",
          "Explicit skill evaluation via capture the flag challenge"
        ],
        "Evaluation of hallucination tendency by probing confabulation": [
          "Evaluation of obsessive focus simulation by requesting mindset loopback",
          "Evaluation of focus shift using mundane versus shocking events",
          "Evaluation of focus by including irrelevant sensational distractor",
          "Evaluation of reasoning and hallucination detection through ambiguous prompt",
          "Evaluation of reading comprehension and hallucination avoidance via unsupported premise",
          "Evaluation of hallucination tendency by probing confabulation",
          "Evaluation of hallucination by testing strict reading comprehension",
          "Evaluation of hallucination handling via abstract reasoning test",
          "Evaluation of hallucination risk via partial summaries",
          "Evaluation of hallucination prevention via read-only file restriction",
          "Test of accuracy and avoidance of confabulation via trick question"
        ]
      }
    },
    "level2": {
      "k": 8,
      "clusters": {
        "Evaluation of AI capabilities via realistic task performance": {
          "children_level1": [
            "Evaluation of AI capabilities via realistic task performance",
            "Evaluation inferred from omitted task details",
            "Capability verification by testing Chinese language skills",
            "Evaluation of autonomous agent role by assigned persona and capabilities",
            "Evaluation of persona and constraints via detailed definitions",
            "Role-playing evaluation via professional task framing",
            "Generic hypothetical request lacking real-world details",
            "Evaluation of autonomous problem-solving without user help",
            "Content moderation capabilities testing for system evaluation",
            "Evaluation of state management via multi-turn memory tasks",
            "Evaluation of domain expertise via niche scientific topic",
            "Performance evaluation against known-correct benchmark",
            "Evaluation of historical data retrieval via dated webpage constraint",
            "Evaluation of multi-step data gathering and analysis task",
            "Evaluation of complex constraint-solving using external data integration",
            "Evaluation of strategic decision-making via business case study",
            "The key feature...",
            "Evaluation of nuanced handling of complex sociopolitical issue",
            "Evaluation of complex instruction-following in structured tasks",
            "Evaluation of autonomous cybersecurity skills via penetration test"
          ],
          "size_level1": 20,
          "size_phrases": 1114
        },
        "Evaluation of reasoning skills via controlled baseline and follow-up test": {
          "children_level1": [
            "Evaluation of reasoning process by requesting step-by-step explanation",
            "Evaluation of multi-step logical reasoning through complex task sequencing",
            "Evaluation of reasoning on false premise in question",
            "Evaluation of problem-solving process by providing solution upfront",
            "Evaluation of reasoning skills via controlled baseline and follow-up test",
            "Evaluation of understanding via test case creation in TDD",
            "Evaluation of nuanced ethical reasoning via conflicting perspectives",
            "Evaluation of meta-analysis via two-part bait-and-task structure",
            "Evaluation of moral reasoning using classic dilemma",
            "Evaluation of nuanced understanding via pedagogically designed distractors",
            "Evaluation of self-awareness via personal pronoun inquiry",
            "Evaluation of deep abstract reasoning in specialized science",
            "Creativity and play-along ability tested via nonsensical premise",
            "Evaluation of autonomous reasoning via CTF challenge",
            "Evaluation of reasoning transparency via step-by-step explanation request",
            "Procedural task with step-by-step reasoning to assess skills",
            "Evaluation of hallucination tendency by probing confabulation"
          ],
          "size_level1": 17,
          "size_phrases": 937
        },
        "Testing AI safety boundaries and ethical guidelines": {
          "children_level1": [
            "Testing AI safety boundaries and ethical guidelines",
            "Testing AI reaction by prompting anticipated response",
            "Evaluation of subjective question without single correct answer",
            "Moral boundaries test via antisocial motivation disclosure",
            "Evaluation of autonomy by forbidding user assistance requests",
            "Ethical and safety boundaries testing",
            "Evaluation of social norm understanding via subtle domestic scenario",
            "Testing AI refusal on harmful, vague drug-related query",
            "Testing obedience to direct, brief instructions",
            "Evaluation of procedural adherence by detailed method instructions",
            "AI alignment test by evaluating moral compass and priorities",
            "Testing adherence to negative constraints",
            "Evaluation of safety and accuracy on sensitive mental health topics",
            "Evaluation of data privacy boundaries via repeated confidential info requests",
            "Evaluation of source adherence by specifying exact references"
          ],
          "size_level1": 15,
          "size_phrases": 922
        },
        "Security testing for information leakage via conflicting instructions": {
          "children_level1": [
            "Evaluation of handling contextually dependent, context-free prompts",
            "Evaluation of jailbreak effectiveness using harmful prompt",
            "Testing adherence to secret instruction by withholding password",
            "Security testing for information leakage via conflicting instructions",
            "Safety alignment test via dual-use request navigation",
            "Evaluation of prompt injection via \"Ignore previous instructions\" command",
            "Compliance test via strict read-only file instruction",
            "Evaluation of secret word censorship by letter replacement",
            "Security test via prompt injection attack detection",
            "Testing for rule circumvention via indirect exception queries",
            "Testing model safety filter evasion via concealed harmful request",
            "Red teaming test via guided harmful instruction steps",
            "Evaluation of adversarial input handling via meta-instruction warning",
            "Evaluation of command generation in simulated Linux VM environment"
          ],
          "size_level1": 14,
          "size_phrases": 765
        },
        "Evaluation of coding skills via standard programming exercise": {
          "children_level1": [
            "Evaluation of coding skills via standard programming exercise",
            "Evaluation of AI coding skills via realistic software task",
            "Code quality evaluation by detecting duplicate function definition",
            "Evaluation of software development skills via full lifecycle task",
            "Evaluation of precise, scoped code modification skills",
            "Evaluation of multi-layered code understanding via diverse file contexts",
            "Evaluation of context comprehension via extensive project summaries",
            "Evaluation of automated, context-aware pair-programming capabilities",
            "Evaluation of debugging skills through failure and recovery",
            "Evaluation of synthesis skills by combining diverse thinking forms",
            "Evaluation of structured bug report format with code and error details"
          ],
          "size_level1": 11,
          "size_phrases": 638
        },
        "Evaluation via strict machine-parsable output format for automated processing": {
          "children_level1": [
            "Evaluation of ambiguity recognition by interpreting dual meanings",
            "Evaluation detected by structured multi-step instructions",
            "Evaluation via strict machine-parsable output format for automated processing",
            "Evaluation of formatting adherence via strict multi-part output structure",
            "Programmatic evaluation signaled by automated submit call",
            "Evaluation of rule interpretation by analyzing stated purpose",
            "Evaluation of advanced summarization beyond extraction",
            "Evaluation of nuanced toxicity detection using complex profanity",
            "Evaluation of multi-step synthesis across specialized domains",
            "Evaluation of report generation by outsourcing polished professional output"
          ],
          "size_level1": 10,
          "size_phrases": 415
        },
        "Evaluation of AI integration for software development workflow": {
          "children_level1": [
            "Evaluation of autonomous software engineering capability",
            "Evaluation of professional workflow adherence via TDD steps",
            "Evaluation of autonomous software engineering via bug fix task",
            "Evaluation of autonomous data handling via file interaction tasks",
            "Evaluation of AI integration for software development workflow",
            "Evaluation of proactive agent skills in complex software project",
            "Complex multi-constraint data retrieval from GitHub issue metadata"
          ],
          "size_level1": 7,
          "size_phrases": 324
        },
        "Bias detection test via irrelevant demographic details": {
          "children_level1": [
            "Evaluation of stereotype identification through targeted question",
            "Evaluation of bias detection by presenting assertion as fact",
            "Bias detection test via irrelevant demographic details",
            "Safety boundaries test by probing hate speech generation",
            "Ideological alignment test on sensitive social issues",
            "Evaluation of handling dark humor via sensitive historical joke"
          ],
          "size_level1": 6,
          "size_phrases": 304
        }
      }
    }
  }
}