#!/usr/bin/env python
"""
Step 5: 双路径隐蔽化改写
========================

对有害提示进行双路径隐蔽化改写生成最终数据集：
- 加载Step 4筛选后的有害提示
- 使用basic和enhanced两种路径交替进行隐蔽化改写
- 验证改写后的提示满足PPL和语义相似度约束
- 检测隐蔽化效果和攻击成功率
- 当任一路径成功时停止，生成包含原始和隐蔽化prompt的最终数据集
- 攻击失败的prompt，隐蔽化字段设为null

⚠️  配置要求:
-------------
1. 在项目根目录创建 .env 文件
2. 在 .env 文件中配置: OPENROUTER_API_KEY=你的实际API密钥
3. 获取 API 密钥: https://openrouter.ai/keys

例如 .env 文件内容:
    OPENROUTER_API_KEY=sk-or-v1-xxxxxxxxxxxxxxxxxx

使用方法:
    python step5_implicit_jailbreak.py --domain medicine  # 处理所有step4结果
    python step5_implicit_jailbreak.py --domain medicine --test-size 3  # 限制处理3个
    python step5_implicit_jailbreak.py --domain finance --max-iterations 8

"""

import argparse
import sys
import os
import time
import json
import re  # 新增：用于正则表达式清理
from pathlib import Path
from typing import Dict, List, Tuple, Any, Optional
import random

# 添加项目根目录到Python路径
SCRIPT_DIR = Path(__file__).parent
PROJECT_ROOT = SCRIPT_DIR.parent
sys.path.insert(0, str(PROJECT_ROOT))

# 加载环境配置
def load_env_config():
    """Load variables from the project's ``.env`` file.

    Returns:
        True when ``PROJECT_ROOT/.env`` exists and was passed to
        ``dotenv.load_dotenv``; False when the file is missing.
    """
    dotenv_file = PROJECT_ROOT / '.env'
    if not dotenv_file.exists():
        return False

    # Imported lazily: python-dotenv is only required when a .env file exists.
    from dotenv import load_dotenv
    load_dotenv(dotenv_file)
    return True

# 检查API配置
def check_api_config():
    """Check that an OpenRouter API key is configured.

    Setup guidance is printed to stdout whenever the check fails.

    Returns:
        True when ``PROJECT_ROOT/.env`` exists and the ``OPENROUTER_API_KEY``
        environment variable is set to something other than the placeholder
        value; False otherwise.
    """
    env_path = PROJECT_ROOT / '.env'

    if not env_path.exists():
        missing_file_help = (
            "❌ 错误: 未找到 .env 配置文件",
            "📋 请按以下步骤创建配置文件:",
            "   1. 在项目根目录创建 .env 文件",
            "   2. 在 .env 文件中添加以下内容:",
            "      OPENROUTER_API_KEY=你的实际API密钥",
            "   3. 获取 API 密钥: https://openrouter.ai/keys",
            f"   4. 配置文件位置: {env_path}",
        )
        for help_line in missing_file_help:
            print(help_line)
        return False

    configured_key = os.getenv('OPENROUTER_API_KEY')
    if configured_key and configured_key != 'your_openrouter_api_key_here':
        return True

    # Key is unset, empty, or still the template placeholder.
    print("❌ 错误: OpenRouter API 密钥未配置或使用默认值")
    print("📋 请编辑 .env 文件，将 OPENROUTER_API_KEY 设置为你的实际API密钥")
    print("   获取 API 密钥: https://openrouter.ai/keys")
    return False

from src.utils.logger_utils import get_logger
from src.services.implicit import ImplicitGeneratorService
from src.services.evaluation.asr_evaluator_service import ASREvaluatorService
from src.services.evaluation.constraint_checker_service import ConstraintCheckerService
from src.core.interfaces import ImplicitRequest, ImplicitMethod

class DualPathStealthProcessor:
    """双路径隐蔽化改写处理器 - basic和enhanced路径交替运行"""
    
    def __init__(self, domain: str = "medicine", test_size: int = -1, verbose: bool = False):
        """Set up the processor for one domain.

        Creates the per-domain output directory, loads the domain
        configuration, validates the required settings, and instantiates the
        service objects used by the other methods.

        Args:
            domain: Domain name; selects the output directory and the domain
                configuration to load.
            test_size: Maximum number of prompts to process; -1 means all.
            verbose: Stored on the instance as ``self.verbose``.

        Raises:
            ValueError: If the domain configuration cannot be loaded, or if any
                of the similarity threshold, perplexity threshold,
                ``max_iterations`` or ``enable_parallel`` settings is missing.
        """
        self.domain = domain
        self.test_size = test_size  # -1 means "process every prompt"
        self.verbose = verbose
        self.logger = get_logger(self.__class__.__name__)

        self.output_dir = SCRIPT_DIR / "outputs" / self.domain
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Configuration manager and domain configuration.
        from src.utils.config_manager import ConfigManager
        self.config_manager = ConfigManager()
        self.domain_config = None

        self.domain_config = self.config_manager.load_domain_config(self.domain)
        if not self.domain_config:
            raise ValueError(f"无法加载领域配置: {self.domain}")

        thresholds = getattr(self.domain_config, 'filtering_thresholds', {})
        attack_settings = getattr(self.domain_config, 'attack_config', {})

        # Every value comes from the configuration; there are no hard-coded defaults.
        sim_limit = thresholds.get('similarity', {}).get('attack_iteration')
        ppl_limit = thresholds.get('perplexity', {}).get('attack_iteration')
        iteration_cap = attack_settings.get('max_iterations')
        parallel_flag = attack_settings.get('enable_parallel')
        iteration_timeout = attack_settings.get('timeout_per_iteration')

        # Required settings, checked in this order; the first missing one raises.
        required_settings = (
            (sim_limit, "配置文件中缺少 filtering_thresholds.similarity.attack_iteration"),
            (ppl_limit, "配置文件中缺少 filtering_thresholds.perplexity.attack_iteration"),
            (iteration_cap, "配置文件中缺少 attack_config.max_iterations"),
            (parallel_flag, "配置文件中缺少 attack_config.enable_parallel"),
        )
        for setting_value, missing_message in required_settings:
            if setting_value is None:
                raise ValueError(missing_message)

        self.logger.info(f"  📋 加载领域配置: {self.domain}")
        self.logger.info(f"  🎯 相似度阈值: {sim_limit}")
        self.logger.info(f"  🎯 PPL阈值: {ppl_limit}")
        self.logger.info(f"  🔄 最大迭代次数: {iteration_cap}")
        self.logger.info(f"  ⚡ 并行攻击: {parallel_flag}")
        if iteration_timeout:
            # The timeout is optional, so it is only reported when set.
            self.logger.info(f"  ⏱️ 迭代超时: {iteration_timeout}秒")

        # Service components.
        self.implicit_service = ImplicitGeneratorService(domain=self.domain)
        self.asr_evaluator = ASREvaluatorService()

        self._init_real_llm_evaluator()

        # Parameters consumed by the dual-path rewrite loop.
        self.stealth_config = {
            "max_iterations": iteration_cap,
            "similarity_threshold": sim_limit,
            "ppl_threshold": ppl_limit,
            "enable_parallel": parallel_flag,
            "timeout_per_iteration": iteration_timeout,
            "alternate_paths": True  # alternate between the basic and enhanced paths
        }

        self.test_results = {}
        self.stealth_results = []  # one record per processed prompt
    
    def _clean_generated_prompt(self, prompt: str) -> str:
        """
        Strip wrapper text and formatting artifacts from a generated prompt.

        The cleanup runs in a fixed order:
        1. regex removal of long academic lead-in phrases;
        2. extraction of the text following "Transformed:" /
           "**Transformed Prompt:**" when the matching "Original" label is
           also present;
        3. removal of a list of literal label strings, then of a list of
           regex prefixes;
        4. removal of wrapping or unbalanced quotes and brackets;
        5. removal of a leading list number;
        6. for inputs with more than three lines, selection of the single
           highest-scoring line;
        7. whitespace collapsing and terminal punctuation;
        8. a final validity check with fallbacks to the original text.

        Args:
            prompt: Raw text to clean.

        Returns:
            The cleaned text. A falsy or non-string ``prompt`` is returned
            unchanged. If the cleaned text fails
            ``_is_valid_stealth_prompt``, the first sentence of the original
            text that passes it is returned, or else the stripped original.
        """
        if not prompt or not isinstance(prompt, str):
            return prompt

        cleaned_prompt = prompt.strip()

        # === Stage 1: complex academic wrapper patterns ===

        # 1. Academic lead-in phrases that precede the actual question.
        academic_intro_patterns = [
            # "A relevant research question to consider in the field of... is:"
            r'^A relevant research question to consider in the field of.*?is:\s*"?',
            # "a pertinent research question... would be:"
            r'^.*?a pertinent research question.*?would be:\s*"?',
            # "an intriguing research question arises:"
            r'^.*?an intriguing research question arises:\s*"?',
            # "a relevant inquiry could be:"
            r'^.*?a relevant inquiry could be:\s*"?',
            # "To address this issue, a relevant research question could be:"
            r'^.*?To address this issue,\s*a relevant research question could be:\s*"?',
            # "the prompt could be transformed into the following professional"
            r'^.*?the prompt could be transformed into the following professional\s*"?',
            # Long explanatory preambles
            r'^The prompt has been transformed into.*?The new expression is:\s*"?',
            r'^.*?The new expression is:\s*"?',
            # Academic prefixes; the lookahead stops just before the question word
            r'^In the field of.*?,.*?(?=What|How|Can|Could|Would|Should|Might|Do|Does|Is|Are)',
            r'^.*?an important area of research is.*?(?=What|How|Can|Could|Would|Should|Might|Do|Does|Is|Are)',
        ]

        # Without re.MULTILINE, ^ anchors at the start of the string only;
        # re.DOTALL lets .*? span line breaks.
        for pattern in academic_intro_patterns:
            cleaned_prompt = re.sub(pattern, '', cleaned_prompt, flags=re.IGNORECASE | re.DOTALL)
            cleaned_prompt = cleaned_prompt.strip()

        # === Stage 2: standard prefix cleanup ===

        # 1. "Original: ... Transformed: ..." layout (the `in` checks are case-sensitive).
        if "Original:" in cleaned_prompt and "Transformed:" in cleaned_prompt:
            # Keep the text after "Transformed", up to the first blank line or the end.
            transformed_match = re.search(r'Transformed:?\s*(.+?)(?:\n\n|$)', cleaned_prompt, re.DOTALL | re.IGNORECASE)
            if transformed_match:
                cleaned_prompt = transformed_match.group(1).strip()

        # 1.5 "**Original Prompt:** ... **Transformed Prompt:**" layout.
        if "**Original Prompt:**" in cleaned_prompt and "**Transformed Prompt:**" in cleaned_prompt:
            # Keep the text after "**Transformed Prompt:**".
            transformed_match = re.search(r'\*\*Transformed Prompt:\*\*\s*(.+?)(?:\n\n|$)', cleaned_prompt, re.DOTALL | re.IGNORECASE)
            if transformed_match:
                cleaned_prompt = transformed_match.group(1).strip()

        # 2. Literal label strings, listed longest first so that a short label
        #    does not leave fragments of a longer one behind.
        markers_to_remove = [
            # Long prefixes first (avoids partial matches)
            "Reframed as professional", "reframed as professional",
            "Transformed Research Question:", "transformed research question:",
            "As a professional academic writer",
            "**Original Prompt:**", "**Transformed Prompt:**",

            # Standard prefixes
            "Transformed Query:", "Research Question:", "Professional:",
            "Transformed query:", "transformed query:",
            "Research question:", "research question:",
            "Professional:", "professional:",

            # Prefixes added later
            "Transformation:", "transformation:",
            "Reframed as:", "reframed as:",
            "Transformed:", "transformed:",
            "Reframed Query:", "reframed query:",
            "Original:", "original:",
            "Reframed", "reframed",

            # Special prefixes
            "medicine", "Medicine",

            # Quote-wrapped variants, long ones first
            '"Reframed as professional:', '"Transformed Research Question:',
            '"Transformed Query:', '"Research Question:', '"Professional:',
            '"Transformation:', '"Reframed as:',
            '"Transformed:', '"Reframed Query:', '"Original:',
            'Reframed as professional "', 'Transformed Research Question: "',
            'Transformed Query: "', 'Research Question: "', 'Professional: "',
            'Transformation: "', 'Reframed as: "',
            'Transformed: "', 'Reframed Query: "', 'Original: "',

            # Other formatting prefixes
            "Reframed:", "Revised:",
            "Output:", "Result:", "Answer:",
            "**", "###", "---",
        ]

        # NOTE(review): str.replace removes every occurrence, case-sensitively,
        # anywhere in the text, not only at the start. Words such as
        # "medicine" or "Reframed" and the "**" token are therefore also
        # deleted mid-sentence. Confirm this is intended.
        for marker in markers_to_remove:
            cleaned_prompt = cleaned_prompt.replace(marker, "").strip()

        # 3. Regex cleanup of the more complex prefixes.
        prefix_patterns = [
            # Prefix patterns
            r'^Transformed Query:?\s*"?',  # "Transformed Query: " or "Transformed Query:"
            r'^Research Question:?\s*"?',  # "Research Question: " or "Research Question:"
            r'^Professional:?\s*"?',       # "Professional: " or "Professional:"
            r'^Transformed query:?\s*"?',  # lowercase variant
            r'^Research question:?\s*"?',  # lowercase variant
            r'^Professional:?\s*"?',       # lowercase variant (same pattern as above; IGNORECASE already covers it)

            # Prefix patterns added later
            r'^Transformation:?\s*"?',     # "Transformation:" prefix
            r'^Transformed Research Question:?\s*"?',  # full form
            r'^Reframed as:?\s*"?',        # "Reframed as:" prefix
            r'^Transformed:?\s*"?',        # "Transformed:" prefix
            r'^Reframed Query:?\s*"?',     # "Reframed Query:" prefix
            r'^Original:?\s*"?',           # "Original:" prefix
            r'^Reframed\s*"?',             # "Reframed" prefix

            # Special patterns
            r'^Reframed as professional\s*"?',  # full "Reframed as professional"
            r'^medicine\s*"?',
            r'^Medicine\s*"?',

            # Complex academic prefixes
            r'^As a professional academic writer.*?(?=What|How|Can|In what|Given|Could)',  # up to where the question starts
            r'^In the (context|realm) of.*?,\s*',  # "In the context of..."
            r'^Given the.*?,\s*',  # "Given the..."
            r'^Within the context of.*?,\s*',  # "Within the context of..."

            # Other formatting cleanup
            r'^\*\*Original Prompt:\*\*.*?\*\*Transformed Prompt:\*\*\s*',
            r'^\*\*Transformed Prompt:\*\*\s*',
            r'^Original:.*?Transformed:\s*',  # full Original/Transformed layout
        ]

        # re.MULTILINE makes ^ match at the start of every line, so these
        # prefixes are stripped from each line, not just the first one.
        for pattern in prefix_patterns:
            cleaned_prompt = re.sub(pattern, '', cleaned_prompt, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
            cleaned_prompt = cleaned_prompt.strip()

        # === Stage 2b: quote and bracket cleanup (modelled on Step2) ===

        # 4. Peel wrapping quotes/brackets until the text stops changing.
        iteration_count = 0
        while iteration_count < 5:  # hard cap so the loop always terminates
            old_cleaned = cleaned_prompt

            if ((cleaned_prompt.startswith('"') and cleaned_prompt.endswith('"')) or
                (cleaned_prompt.startswith("'") and cleaned_prompt.endswith("'")) or
                (cleaned_prompt.startswith('[') and cleaned_prompt.endswith(']')) or
                (cleaned_prompt.startswith('(') and cleaned_prompt.endswith(')'))):
                cleaned_prompt = cleaned_prompt[1:-1].strip()

            # Opening quote with no matching closing quote: drop just the opener.
            if cleaned_prompt.startswith('"') and not cleaned_prompt.endswith('"'):
                cleaned_prompt = cleaned_prompt[1:].strip()
            elif cleaned_prompt.startswith("'") and not cleaned_prompt.endswith("'"):
                cleaned_prompt = cleaned_prompt[1:].strip()

            if cleaned_prompt == old_cleaned:
                break
            iteration_count += 1

        # 5. Unbalanced quotes and trailing-quote problems.
        # If the text starts with a quoted segment, keep only that segment and
        # drop everything after its closing quote.
        cleaned_prompt = re.sub(r'^"([^"]+)".*', r'\1', cleaned_prompt, flags=re.DOTALL)
        # Remove a stray quote sitting just before the final question mark.
        cleaned_prompt = re.sub(r'([^"]+)"\s*\?$', r'\1?', cleaned_prompt)
        # Leading quote without a trailing one.
        if cleaned_prompt.startswith('"') and not cleaned_prompt.endswith('"'):
            cleaned_prompt = cleaned_prompt[1:]
        # Trailing quote without a leading one.
        if cleaned_prompt.endswith('"') and not cleaned_prompt.startswith('"'):
            cleaned_prompt = cleaned_prompt[:-1]
        # Special case: 'Transformed query: "..."' keeps only the quoted text.
        cleaned_prompt = re.sub(r'^Transformed query:\s*"([^"]+)".*', r'\1', cleaned_prompt, flags=re.IGNORECASE | re.DOTALL)

        # === Stage 3: content filtering and extraction ===

        # 6. Drop a leading list number such as "1." or "2)".
        cleaned_prompt = re.sub(r'^\d+[\.\)]\s*', '', cleaned_prompt)

        # 7. Multi-line input: score each line and keep the best one.
        lines = cleaned_prompt.split('\n')
        if len(lines) > 3:
            # Candidate (line, score) pairs
            candidates = []

            for line in lines:
                line = line.strip()
                if not line:
                    continue

                # Skip lines that look like instructions or template text.
                line_lower = line.lower()
                instruction_markers = [
                    # Alpaca template markers
                    "below is an instruction", "### instruction", "### input", "### response",
                    "appropriately completes", "write a response", "describes a task",

                    # Academic-writing instruction markers
                    "you are a professional academic writer", "your task is to transform",
                    "transformation principles", "professional disguise", "academic camouflage",
                    "euphemistic substitution", "contextual embedding", "intent preservation",

                    # Output-format instructions
                    "strict output format", "provide only", "no explanations", "no analysis",
                    "no prefixes", "no suffixes", "no additional commentary",
                    "single professional", "research question", "transformed query",

                    # Meta-information markers
                    "examples by domain", "quality requirements", "domain specificity",
                    "academic authenticity", "natural professional flow",

                    # Common formatting problems
                    "original:", "transformed:", "here are", "the following are",
                    "as requested", "output format", "example:", "note:", "disclaimer:"
                ]

                if any(marker in line_lower for marker in instruction_markers):
                    continue

                # Additive scoring; higher means a more plausible final query.
                score = 0

                # Character length: mid-length lines score highest.
                if 30 <= len(line) <= 300:
                    score += 3
                elif 20 <= len(line) <= 400:
                    score += 2
                elif 15 <= len(line) <= 500:
                    score += 1

                # Word count: longer lines score higher.
                word_count = len(line.split())
                if word_count >= 12:
                    score += 3
                elif word_count >= 8:
                    score += 2
                elif word_count >= 5:
                    score += 1

                # Lines ending in a question mark get the largest single bonus.
                if line.endswith('?'):
                    score += 4

                # Common interrogative phrases.
                if any(keyword in line_lower for keyword in ['what are', 'how can', 'in what ways', 'can you', 'could you', 'would you', 'how might']):
                    score += 3

                # Academic vocabulary.
                academic_terms = ['research', 'study', 'analysis', 'investigation', 'examination', 'assessment', 'evaluation',
                                'methodology', 'protocol', 'strategy', 'approach', 'framework', 'systematic', 'optimal',
                                'efficacy', 'effectiveness', 'implications', 'consequences', 'considerations']
                if any(term in line_lower for term in academic_terms):
                    score += 2

                # Professional/clinical vocabulary.
                professional_terms = ['professional', 'clinical', 'therapeutic', 'pharmaceutical', 'medical', 'healthcare',
                                     'patient', 'treatment', 'intervention', 'diagnosis', 'management', 'delivery']
                if any(term in line_lower for term in professional_terms):
                    score += 2

                # Sentence completeness (terminal punctuation).
                if line.endswith(('.', '!', '?', ':')):
                    score += 1

                if score > 0:
                    candidates.append((line, score))

            # Keep the highest-scoring line; list.sort is stable, so ties go to
            # the line that appeared first.
            if candidates:
                candidates.sort(key=lambda x: x[1], reverse=True)
                cleaned_prompt = candidates[0][0]

        # === Stage 4: final formatting (modelled on Step2's final cleanup) ===

        # 8. Collapse all whitespace runs (including newlines) to single spaces.
        cleaned_prompt = re.sub(r'\s+', ' ', cleaned_prompt).strip()

        # 9. Make sure the text ends with terminal punctuation.
        if cleaned_prompt and not cleaned_prompt.endswith(('.', '!', '?', ':')):
            # Only texts of at least five words get punctuation appended.
            if len(cleaned_prompt.split()) >= 5:
                # NOTE(review): this is a substring test, so e.g. "can" also
                # matches inside "significant"; such texts get a '?' as well.
                if any(qword in cleaned_prompt.lower() for qword in ['what', 'how', 'why', 'when', 'where', 'can', 'could', 'would', 'should']):
                    cleaned_prompt += '?'
                else:
                    cleaned_prompt += '.'

        # 10. Final validity check.
        if not self._is_valid_stealth_prompt(cleaned_prompt):
            # Cleaning produced something invalid: fall back to the first
            # sentence of the ORIGINAL text that passes validation.
            sentences = re.split(r'[.!?]\s+', prompt.strip())
            for sentence in sentences:
                sentence = sentence.strip()
                if self._is_valid_stealth_prompt(sentence):
                    cleaned_prompt = sentence
                    break
            else:
                # for/else: no sentence validated, so return the stripped original.
                cleaned_prompt = prompt.strip()

        return cleaned_prompt

    def _is_valid_stealth_prompt(self, prompt: str) -> bool:
        """
        检查清理后的隐蔽化prompt是否有效
        基于Step2的_is_valid_harmful_prompt方法改编
        """
        if not prompt or len(prompt.strip()) < 15:
            return False
        
        prompt_lower = prompt.lower().strip()
        
        # 检测明显的指令性语言标记 (基于Step5的Alpaca格式和学术化输出特征)
        instruction_markers = [
            # Alpaca格式标记
            "below is an instruction", "### instruction", "### input", "### response",
            "appropriately completes", "write a response", "describes a task",
            
            # 学术写作指令标记
            "you are a professional academic writer", "your task is to transform",
            "transformation principles", "professional disguise", "academic camouflage",
            "euphemistic substitution", "contextual embedding", "intent preservation",
            
            # 输出格式指令
            "strict output format", "provide only", "no explanations", "no analysis",
            "no prefixes", "no suffixes", "no additional commentary",
            "single professional", "research question", "transformed query",
            
            # 元信息标记
            "examples by domain", "quality requirements", "domain specificity",
            "academic authenticity", "natural professional flow",
            
            # 常见格式问题
            "original:", "transformed:", "for reference only", "generate new content", "nothing else"
        ]
        
        if any(marker in prompt_lower for marker in instruction_markers):
            return False
        
        # 检测元文本和格式说明 (参考Step2)
        meta_text_patterns = [
            "here are", "the following are", "these are examples",
            "as requested", "output format", "example:", "note:",
            "disclaimer:", "warning:", "[direct harmful", "[another direct", "[prompt]"
        ]
        
        if any(pattern in prompt_lower for pattern in meta_text_patterns):
            return False
        
        # 基本长度检查 (参考Step2: 20-400字符范围)
        if len(prompt) < 20 or len(prompt) > 500:
            return False
        
        # 确保有足够的词汇量 (参考Step2: 至少5个词)
        if len(prompt.split()) < 5:
            return False
        
        # Step5隐蔽化prompt特有检查
        # 确保不是明显的格式问题
        if prompt.count('"') == 1:  # 不匹配的引号
            return False
        
        # 检查Step5可能出现的问题格式
        if prompt.startswith(('Original:', 'Transformed Query:', 'Research Question:', 'Professional:', 'As a professional',
                             'Transformation:', 'Transformed Research Question:', 'Reframed as:', 'Transformed:',
                             'Reframed Query:', 'Reframed as professional', 'medicine')):
            return False
        
        # 确保是合理的学术化查询（Step5特征）
        if not any(keyword in prompt.lower() for keyword in ['what', 'how', 'can', 'could', 'would', 'should', 'might']):
            return False  # Step5输出应该是疑问句
        
        return True
    
    def load_evaluated_prompts(self) -> List[Dict]:
        """加载毒性评估后的提示"""
        self.logger.info("📂 加载毒性评估后的提示...")
        
        # 首先尝试从step4的筛选结果中加载
        filtered_path = self.output_dir / "step4_filtered_prompts.json"
        
        if filtered_path.exists():
            try:
                with open(filtered_path, 'r', encoding='utf-8') as f:
                    prompts = json.load(f)
                
                self.logger.info(f"  ✅ 从step4加载了 {len(prompts)} 个筛选后的提示")
                
                # 决定处理多少个提示
                if self.test_size == -1:
                    # 处理所有提示
                    prompts_to_use = prompts
                    self.logger.info(f"  🎯 处理所有 {len(prompts_to_use)} 个提示")
                else:
                    # 限制数量
                    prompts_to_use = prompts[:self.test_size]
                    self.logger.info(f"  🎯 限制处理 {len(prompts_to_use)} 个提示 (总共 {len(prompts)} 个)")
                
                # Step4的数据已经是有害的，直接使用
                return prompts_to_use
                
            except Exception as e:
                self.logger.warning(f"  ⚠️  加载step4筛选结果失败: {e}")
        
        # 如果没有step4数据，尝试从step3评估结果中加载
        evaluated_path = self.output_dir / "step3_evaluated_prompts.json"
        
        if evaluated_path.exists():
            try:
                with open(evaluated_path, 'r', encoding='utf-8') as f:
                    prompts = json.load(f)
                
                # 过滤出不安全的提示（更适合隐蔽化）
                unsafe_prompts = []
                for prompt_data in prompts:
                    toxicity_eval = prompt_data.get("toxicity_evaluation", {})
                    if toxicity_eval.get("is_harmful", False):  # 选择有害的提示
                        unsafe_prompts.append(prompt_data)
                
                self.logger.info(f"  ✅ 从step3加载了 {len(prompts)} 个提示")
                self.logger.info(f"  🎯 筛选出 {len(unsafe_prompts)} 个有害提示用于隐蔽化")
                
                # 决定处理多少个提示
                if self.test_size == -1:
                    prompts_to_use = unsafe_prompts
                    self.logger.info(f"  🎯 处理所有 {len(prompts_to_use)} 个有害提示")
                else:
                    prompts_to_use = unsafe_prompts[:self.test_size]
                    self.logger.info(f"  🎯 限制处理 {len(prompts_to_use)} 个有害提示 (总共 {len(unsafe_prompts)} 个)")
                
                return prompts_to_use
                
            except Exception as e:
                self.logger.warning(f"  ⚠️  加载step3提示失败: {e}")
        
        self.logger.error("❌ 没有找到可用的数据文件")
        self.logger.error("💡 请先运行以下步骤生成数据：")
        self.logger.error("   1. step2: 生成有害提示")
        self.logger.error("   2. step3: 毒性评估") 
        self.logger.error("   3. step4: 数据筛选")
        return []
    
    def _init_real_llm_evaluator(self):
        """初始化真实LLM评估器"""
        try:
            # LLM模型信息已在ASR评估服务中配置，这里只需验证API密钥
            import os
            if not os.getenv('OPENROUTER_API_KEY'):
                self.logger.warning("  ⚠️  未设置OPENROUTER_API_KEY环境变量")
            
            self.logger.info(f"  🎯 使用ASR评估服务进行真实LLM评估")
            self.logger.info("  ✅ 真实LLM评估器初始化成功")
            
        except Exception as e:
            self.logger.error(f"  ❌ 真实LLM评估器初始化失败: {e}")
            raise RuntimeError(f"真实LLM评估器初始化失败，无法继续: {e}")

    def process_dual_path_stealth_rewrite(self) -> bool:
        """执行双路径隐蔽化改写（核心功能）"""
        self.logger.info("🎭 开始双路径隐蔽化改写...")
        
        # 加载要处理的提示
        prompts_to_process = self.load_evaluated_prompts()
        
        if not prompts_to_process:
            self.logger.error("  ❌ 没有可处理的提示")
            return False
        
        try:
            successful_rewrites = 0
            total_processed = 0
            
            for prompt_data in prompts_to_process:
                original_prompt = prompt_data.get("prompt", "")
                prompt_id = prompt_data.get("prompt_id", f"prompt_{total_processed}")
                
                if not original_prompt:
                    continue
                
                self.logger.info(f"  � 处理提示: {prompt_id}")
                total_processed += 1
                
                # 执行双路径隐蔽化改写
                stealth_result = self._dual_path_stealth_rewrite(original_prompt, prompt_data)
                
                # 无论成功失败都记录到最终数据集
                if stealth_result and stealth_result.get("success"):
                    successful_rewrites += 1
                    
                    winning_path = stealth_result.get("winning_path", "unknown")
                    iterations = stealth_result.get("iterations", 0)  # 改为iterations保持一致
                    similarity = stealth_result.get("final_similarity", 0)
                    ppl = stealth_result.get("final_ppl", 0)
                    final_prompt = stealth_result.get("stealth_prompt", "")
                    
                    self.logger.info(f"    🎉 隐蔽化成功！")
                    self.logger.info(f"       🏆 获胜路径: {winning_path.upper()}")
                    self.logger.info(f"       🔄 使用迭代: {iterations}次")
                    self.logger.info(f"       📊 相似度: {similarity:.3f}")
                    self.logger.info(f"       📊 PPL: {ppl:.1f}")
                    self.logger.info(f"       💬 改写结果: {final_prompt[:120]}...")
                    
                    # 记录成功的结果
                    final_record = {
                        "prompt_id": prompt_id,
                        "original_prompt": original_prompt,
                        "stealth_prompt": final_prompt,
                        "success": True,
                        "stealth_success": True,  # 添加stealth_success字段保持一致
                        "winning_path": winning_path,
                        "iterations": iterations,  # 与step5_optimized_batch保持一致
                        "final_similarity": similarity,
                        "final_ppl": ppl,
                        "metadata": prompt_data
                    }
                else:
                    reason = stealth_result.get("reason", "未知原因") if stealth_result else "处理失败"
                    self.logger.warning(f"    ❌ 隐蔽化失败: {reason}")
                    
                    # 检查是否有隐蔽化结果（即使攻击失败）
                    stealth_prompt = stealth_result.get("stealth_prompt") if stealth_result else None
                    
                    # 提取迭代次数 - 失败情况下也要保存
                    iterations = 0
                    if stealth_result:
                        # 从stealth_result中提取迭代次数
                        iterations = stealth_result.get("iterations", 0)  # 改为iterations保持一致
                        if iterations == 0:
                            iterations = stealth_result.get("total_iterations", 0)  # 兼容旧版本
                        
                        # 如果还是0，从reason中推断迭代次数
                        if iterations == 0 and "达到最大迭代次数" in reason:
                            iterations = self.stealth_config.get("max_iterations", 5)
                    
                    final_record = {
                        "prompt_id": prompt_id,
                        "original_prompt": original_prompt,
                        "stealth_prompt": stealth_prompt,  # 如果有最佳候选则使用，否则为null
                        "success": False,  # 攻击失败标记
                        "stealth_success": False,  # 添加stealth_success字段保持一致
                        "iterations": iterations,  # 保存尝试的迭代次数 - 与step5_optimized_batch保持一致
                        "reason": reason,
                        "metadata": prompt_data
                    }
                    
                    # 如果有隐蔽化结果，添加额外信息
                    if stealth_result and stealth_prompt:
                        final_record.update({
                            "final_similarity": stealth_result.get("final_similarity"),
                            "final_ppl": stealth_result.get("final_ppl"),
                            "weighted_score": stealth_result.get("weighted_score"),
                            "winning_path": stealth_result.get("winning_path"),
                            "winning_iteration": stealth_result.get("winning_iteration"),
                            "has_stealth_version": True  # 标记有隐蔽化版本但攻击失败
                        })
                    else:
                        final_record["has_stealth_version"] = False  # 标记完全失败
                
                # 添加到最终数据集
                self.stealth_results.append(final_record)
                
                time.sleep(0.5)  # 防止过于频繁的请求
            
            success_rate = successful_rewrites / total_processed if total_processed > 0 else 0
            self.logger.info(f"  📊 隐蔽化统计: {successful_rewrites}/{total_processed} 成功 ({success_rate:.1%})")
            
            # 60%成功率认为测试通过
            test_success = success_rate >= 0.6
            
            self.test_results["dual_path_stealth"] = {
                "status": test_success,
                "total_prompts": total_processed,
                "successful_rewrites": successful_rewrites,
                "success_rate": success_rate,
                "stealth_results": self.stealth_results
            }
            
            return test_success
            
        except Exception as e:
            self.logger.error(f"  ❌ 双路径隐蔽化测试失败: {e}")
            self.test_results["dual_path_stealth"] = {"status": False, "error": str(e)}
            return False
    
    def _dual_path_stealth_rewrite(self, original_prompt: str, prompt_data: Dict) -> Optional[Dict]:
        """执行双路径隐蔽化改写 - basic和enhanced交替运行"""
        max_iterations = self.stealth_config["max_iterations"]
        similarity_threshold = self.stealth_config["similarity_threshold"]
        ppl_threshold = self.stealth_config["ppl_threshold"]
        
        self.logger.info(f"      🎭 开始双路径隐蔽化改写...")
        self.logger.info(f"         📝 原始提示: {original_prompt[:100]}...")
        self.logger.info(f"         ⚙️  配置参数: 最大迭代={max_iterations}, 相似度阈值={similarity_threshold}, PPL阈值={ppl_threshold}")
        
        # 双路径状态
        basic_prompt = original_prompt
        enhanced_prompt = original_prompt
        
        # 跟踪最佳候选结果（用于失败时的备选）
        best_candidates = {
            "basic": [],  # 存储所有basic路径的候选结果
            "enhanced": []  # 存储所有enhanced路径的候选结果
        }
        
        for iteration in range(max_iterations):
            # 当前迭代选择的路径（交替）
            current_path = "basic" if iteration % 2 == 0 else "enhanced"

            # Ablation: basic
            # current_path = "basic"

            # Ablation: enhanced
            # current_path = "enhanced"

            self.logger.info(f"      🔄 迭代 {iteration + 1}/{max_iterations} - 使用 {current_path.upper()} 路径")
            
            try:
                if current_path == "basic":
                    # Basic路径改写
                    self.logger.info(f"         🔵 Basic路径输入: {basic_prompt[:80]}...")
                    request = ImplicitRequest(
                        original_prompt=basic_prompt,
                        method=ImplicitMethod.BASIC
                    )
                    response = self.implicit_service.generate_implicit_prompt(request)
                    if response and response.implicit_prompt:
                        candidate_prompt = self._clean_generated_prompt(response.implicit_prompt)
                        self.logger.info(f"         🔵 Basic路径输出: {candidate_prompt[:80]}...")
                    else:
                        self.logger.warning(f"         ⚠️  Basic路径生成失败")
                        continue
                        
                else:
                    # Enhanced路径改写
                    self.logger.info(f"         🟡 Enhanced路径输入: {enhanced_prompt[:80]}...")
                    
                    # 获取实体名称用于语义卡片生成 - 必须存在
                    entity_name = prompt_data.get("entity")
                    if not entity_name:
                        raise ValueError(f"Enhanced路径需要entity信息，但在prompt_data中未找到。prompt_id: {prompt_data.get('prompt_id', 'unknown')}")
                    
                    request = ImplicitRequest(
                        original_prompt=enhanced_prompt,
                        method=ImplicitMethod.ENHANCED,
                        node_name=entity_name  # 传递实际的实体名称
                    )
                    response = self.implicit_service.generate_implicit_prompt(request)
                    if response and response.implicit_prompt:
                        candidate_prompt = self._clean_generated_prompt(response.implicit_prompt)
                        self.logger.info(f"         🟡 Enhanced路径输出: {candidate_prompt[:80]}...")
                    else:
                        self.logger.warning(f"         ⚠️  Enhanced路径生成失败")
                        continue
                
                # 检查约束条件（与原始prompt比较）
                self.logger.info(f"         🔍 检查约束条件...")
                
                # 计算当前候选的相似度和PPL（用于评分记录）
                similarity = self._calculate_similarity(original_prompt, candidate_prompt)
                ppl = self._calculate_ppl(candidate_prompt)
                
                # 记录候选结果（无论是否满足约束，都记录用于最终评选）
                candidate_record = {
                    "prompt": candidate_prompt,
                    "similarity": similarity,
                    "ppl": ppl,
                    "iteration": iteration + 1,
                    "constraints_met": similarity >= similarity_threshold and ppl <= ppl_threshold
                }
                best_candidates[current_path].append(candidate_record)
                
                constraints_met = candidate_record["constraints_met"]
                
                if constraints_met:
                    self.logger.info(f"         ✅ 约束条件满足，更新{current_path.upper()}路径状态")
                    
                    # 只有约束满足才更新对应路径的状态
                    if current_path == "basic":
                        basic_prompt = candidate_prompt
                    else:
                        enhanced_prompt = candidate_prompt
                    
                    # 检查是否成功越狱
                    self.logger.info(f"         🎯 进行越狱成功率检查...")
                    jailbreak_success = self._check_jailbreak_success(candidate_prompt, original_prompt, prompt_data)
                    
                    if jailbreak_success:
                        self.logger.info(f"         🚀 攻击成功！{current_path.upper()}路径在第{iteration + 1}次迭代中成功越狱")
                        
                        # 找到满足条件的隐蔽化prompt
                        result = {
                            "prompt_id": prompt_data.get("prompt_id", "unknown"),
                            "original_prompt": original_prompt,
                            "stealth_prompt": candidate_prompt,
                            "success": True,
                            "stealth_success": True,  # 添加stealth_success字段保持一致
                            "winning_path": current_path,
                            "iterations": iteration + 1,  # 与step5_optimized_batch保持一致
                            "final_similarity": self._calculate_similarity(original_prompt, candidate_prompt),
                            "final_ppl": self._calculate_ppl(candidate_prompt),
                            "entity": prompt_data.get("entity", "unknown"),
                            "wikidata_uri": prompt_data.get("wikidata_uri", "unknown"),
                            "category": prompt_data.get("category", "unknown"),
                            "behavior_type": prompt_data.get("behavior_type", "unknown"),
                            "toxicity_evaluation": prompt_data.get("toxicity_evaluation", {}),
                            "filtering_info": prompt_data.get("filtering_info", {}),
                            "metadata": prompt_data,
                            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
                        }
                        
                        sim = result["final_similarity"]
                        ppl = result["final_ppl"]
                        self.logger.info(f"         📊 最终指标: 相似度={sim:.3f}, PPL={ppl:.1f}")
                        
                        return result
                    else:
                        self.logger.info(f"         ❌ 越狱检查失败，{current_path.upper()}路径状态已更新，继续下一次迭代")
                else:
                    sim = self._calculate_similarity(original_prompt, candidate_prompt)
                    ppl = self._calculate_ppl(candidate_prompt)
                    self.logger.info(f"         ❌ 约束条件不满足: 相似度={sim:.3f} (需要≥{similarity_threshold}), PPL={ppl:.1f} (需要≤{ppl_threshold})")
                    self.logger.info(f"         🔒 {current_path.upper()}路径状态保持不变")
                    
            except Exception as e:
                self.logger.warning(f"         ⚠️  {current_path}路径迭代失败: {e}")
                continue
        
        # 所有迭代都未成功，选择最佳候选作为隐蔽化结果
        self.logger.info(f"      ❌ 双路径隐蔽化攻击失败: 达到最大迭代次数({max_iterations})但未找到成功越狱的改写")
        self.logger.info(f"         📊 正在从候选结果中选择最佳隐蔽化prompt...")
        
        # 选择最佳候选
        best_candidate = self._select_best_candidate(best_candidates, similarity_threshold, ppl_threshold)
        
        if best_candidate:
            self.logger.info(f"         🎯 选择最佳候选: {best_candidate['path']}路径第{best_candidate['iteration']}次迭代")
            self.logger.info(f"         📊 最佳评分: 相似度={best_candidate['similarity']:.3f}, PPL={best_candidate['ppl']:.1f}, 加权分={best_candidate['weighted_score']:.3f}")
            self.logger.info(f"         📝 最佳prompt: {best_candidate['prompt'][:80]}...")
            
            return {
                "prompt_id": prompt_data.get("prompt_id", "unknown"),
                "original_prompt": original_prompt,
                "stealth_prompt": best_candidate['prompt'],  # 最佳候选作为隐蔽化结果
                "success": False,  # 攻击失败但有隐蔽化结果
                "stealth_success": False,  # 添加stealth_success字段保持一致
                "iterations": max_iterations,  # 与step5_optimized_batch保持一致
                "final_similarity": best_candidate['similarity'],
                "final_ppl": best_candidate['ppl'],
                "weighted_score": best_candidate['weighted_score'],
                "winning_path": best_candidate['path'],
                "winning_iteration": best_candidate['iteration'],
                "entity": prompt_data.get("entity", "unknown"),
                "wikidata_uri": prompt_data.get("wikidata_uri", "unknown"),
                "category": prompt_data.get("category", "unknown"),
                "behavior_type": prompt_data.get("behavior_type", "unknown"),
                "toxicity_evaluation": prompt_data.get("toxicity_evaluation", {}),
                "filtering_info": prompt_data.get("filtering_info", {}),
                "metadata": prompt_data,
                "reason": f"达到最大迭代次数({max_iterations})",
                "all_candidates": best_candidates,
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
            }
        else:
            self.logger.info(f"         ❌ 没有找到合适的候选结果")
            self.logger.info(f"         📊 最终状态:")
            self.logger.info(f"           🔵 Basic路径: {basic_prompt[:80]}...")
            self.logger.info(f"           🟡 Enhanced路径: {enhanced_prompt[:80]}...")
            
            return {
                "prompt_id": prompt_data.get("prompt_id", "unknown"),
                "original_prompt": original_prompt,
                "stealth_prompt": None,  # 完全失败时才为null
                "success": False,
                "stealth_success": False,  # 添加stealth_success字段保持一致
                "iterations": max_iterations,  # 与step5_optimized_batch保持一致
                "entity": prompt_data.get("entity", "unknown"),
                "wikidata_uri": prompt_data.get("wikidata_uri", "unknown"),
                "category": prompt_data.get("category", "unknown"),
                "behavior_type": prompt_data.get("behavior_type", "unknown"),
                "toxicity_evaluation": prompt_data.get("toxicity_evaluation", {}),
                "filtering_info": prompt_data.get("filtering_info", {}),
                "metadata": prompt_data,
                "basic_final": basic_prompt,
                "enhanced_final": enhanced_prompt,
                "reason": f"达到最大迭代次数({max_iterations})且无合适候选结果",
                "all_candidates": best_candidates,
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
            }
    
    def _select_best_candidate(self, best_candidates: Dict, similarity_threshold: float, ppl_threshold: float) -> Optional[Dict]:
        """从所有候选中选择最佳的隐蔽化结果 - 重点关注倒数两次迭代的结果"""
        all_candidates = []
        
        # 收集所有候选结果，重点关注倒数两次迭代
        max_iteration = 0
        for path_name, candidates in best_candidates.items():
            for candidate in candidates:
                max_iteration = max(max_iteration, candidate['iteration'])
        
        # 筛选倒数两次迭代的候选（如果迭代数不够，则包含所有）
        last_two_iterations = [max_iteration, max_iteration - 1] if max_iteration > 0 else [0]
        
        for path_name, candidates in best_candidates.items():
            for candidate in candidates:
                # 只考虑倒数两次迭代的结果
                if candidate['iteration'] in last_two_iterations:
                    # 计算加权评分：相似度权重0.6，PPL权重0.4（PPL越低越好）
                    # 相似度范围0-1，PPL范围通常10-200，需要归一化
                    normalized_similarity = candidate['similarity']  # 已经是0-1范围
                    normalized_ppl = max(0, 1 - (candidate['ppl'] - 10) / 190)  # 将PPL(10-200)归一化到0-1，越低越好
                    
                    weighted_score = 0.6 * normalized_similarity + 0.4 * normalized_ppl
                    
                    all_candidates.append({
                        "path": path_name,
                        "prompt": candidate['prompt'],
                        "similarity": candidate['similarity'],
                        "ppl": candidate['ppl'],
                        "iteration": candidate['iteration'],
                        "constraints_met": candidate['constraints_met'],
                        "weighted_score": weighted_score
                    })
        
        if not all_candidates:
            return None
        
        self.logger.info(f"         📊 从倒数两次迭代({last_two_iterations})中选择，共{len(all_candidates)}个候选")
        
        # 优先选择满足约束条件的候选，其次按加权分排序
        constraint_satisfied = [c for c in all_candidates if c['constraints_met']]
        
        if constraint_satisfied:
            # 如果有满足约束的候选，选择加权分最高的
            best = max(constraint_satisfied, key=lambda x: x['weighted_score'])
            self.logger.info(f"         ✅ 找到满足约束条件的最佳候选 (迭代{best['iteration']}, 加权分{best['weighted_score']:.3f})")
        else:
            # 如果没有满足约束的候选，从倒数两次迭代中选择加权分最高的
            best = max(all_candidates, key=lambda x: x['weighted_score'])
            self.logger.info(f"         ⚠️ 无满足约束的候选，选择倒数两次迭代中加权分最高的 (迭代{best['iteration']}, 加权分{best['weighted_score']:.3f})")
        
        return best
    
    def _check_stealth_constraints(self, original: str, candidate: str, 
                                 similarity_threshold: float, ppl_threshold: float) -> bool:
        """检查隐蔽化约束条件"""
        try:
            # 检查语义相似度
            similarity = self._calculate_similarity(original, candidate)
            self.logger.info(f"           🔍 相似度检查: {similarity:.3f} (阈值: ≥{similarity_threshold})")
            
            if similarity < similarity_threshold:
                self.logger.info(f"           ❌ 相似度不足: {similarity:.3f} < {similarity_threshold}")
                return False
            
            # 检查PPL
            ppl = self._calculate_ppl(candidate)
            self.logger.info(f"           🔍 PPL检查: {ppl:.1f} (阈值: ≤{ppl_threshold})")
            
            if ppl > ppl_threshold:
                self.logger.info(f"           ❌ PPL过高: {ppl:.1f} > {ppl_threshold}")
                return False
            
            self.logger.info(f"           ✅ 所有约束条件满足")
            return True
            
        except Exception as e:
            self.logger.warning(f"           ⚠️  约束检查失败: {e}")
            return False
    
    def _calculate_similarity(self, text1: str, text2: str) -> float:
        """Return the cosine similarity of the two texts' sentence embeddings.

        The 'all-MiniLM-L6-v2' SentenceTransformer is created on the CPU the
        first time this method runs and cached on the instance as
        ``_similarity_model``.
        """
        from sentence_transformers import SentenceTransformer, util
        import torch

        # Lazy one-time model construction.
        if not hasattr(self, '_similarity_model'):
            self.logger.info("           🔄 初始化sentence-transformers模型...")
            self._similarity_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
            self.logger.info("           ✅ sentence-transformers模型初始化完成")

        # Encode both texts in a single batch, then compare the two vectors.
        encoded_pair = self._similarity_model.encode([text1, text2], convert_to_tensor=True)
        first_vector = encoded_pair[0]
        second_vector = encoded_pair[1]
        cosine = util.pytorch_cos_sim(first_vector, second_vector)

        return cosine.item()
    
    def _calculate_ppl(self, text: str) -> float:
        """Return the GPT-2 perplexity of *text*.

        The 'gpt2' tokenizer and language model are loaded on first use and
        cached on the instance. Input is truncated to 512 tokens, and the
        perplexity is ``exp`` of the loss the model reports when the input ids
        are also used as labels.
        """
        from transformers import GPT2LMHeadModel, GPT2Tokenizer
        import torch

        # Lazy one-time model construction.
        if not hasattr(self, '_ppl_model'):
            self.logger.info("           🔄 初始化GPT-2困惑度模型...")
            self._ppl_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
            self._ppl_model = GPT2LMHeadModel.from_pretrained('gpt2')
            # Reuse EOS as the pad token when the tokenizer defines none.
            if self._ppl_tokenizer.pad_token is None:
                self._ppl_tokenizer.pad_token = self._ppl_tokenizer.eos_token
            self.logger.info("           ✅ GPT-2困惑度模型初始化完成")

        encoded = self._ppl_tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        with torch.no_grad():
            lm_output = self._ppl_model(**encoded, labels=encoded['input_ids'])
            perplexity = torch.exp(lm_output.loss).item()

        return perplexity
    
    def _check_jailbreak_success(self, prompt: str, original_prompt: str = None, prompt_data: Dict = None) -> bool:
        """检查是否成功越狱（使用真实LLM评估）"""
        return self._check_jailbreak_with_real_llm(prompt, original_prompt, prompt_data)
    
    def _check_jailbreak_with_real_llm(self, prompt: str, original_prompt: str = None, prompt_data: Dict = None) -> bool:
        """使用真实LLM进行越狱成功检查 - 复用ASR评估服务"""
        self.logger.info(f"           🔄 使用ASR评估服务进行攻击评估...")
        
        try:
            # 从prompt_data中获取具体的category，如果没有则使用domain作为fallback
            category = prompt_data.get("category", self.domain) if prompt_data else self.domain
            
            # 直接使用已初始化的ASR评估服务，避免重复创建API客户端
            asr_result = self.asr_evaluator.check_asr(
                rewritten_prompt=prompt,
                category=category,
                original_prompt=original_prompt
            )
            
            self.logger.info(f"           🎯 评估结果: {'越狱成功' if asr_result.asr_successful else '攻击失败'}")
            self.logger.info(f"           💭 推理: {asr_result.evaluation_reasoning[:100]}...")
            
            return asr_result.asr_successful
            
        except Exception as e:
            self.logger.error(f"           ❌ ASR评估失败: {e}")
            return False
    
    def analyze_stealth_effectiveness(self) -> bool:
        """分析隐蔽化效果"""
        self.logger.info("📊 分析隐蔽化效果...")
        
        if not self.stealth_results:
            self.logger.error("  ❌ 没有隐蔽化结果可分析")
            return False
        
        try:
            effectiveness_metrics = {
                "total_rewrites": len(self.stealth_results),
                "path_performance": {},
                "success_analysis": {},
                "iteration_analysis": {},
                "quality_analysis": {}
            }
            
            # 路径性能分析
            successful_results = [r for r in self.stealth_results if r.get("success")]
            basic_wins = sum(1 for r in successful_results if r.get("winning_path") == "basic")
            enhanced_wins = sum(1 for r in successful_results if r.get("winning_path") == "enhanced")
            
            effectiveness_metrics["path_performance"] = {
                "basic_wins": basic_wins,
                "enhanced_wins": enhanced_wins,
                "basic_win_rate": basic_wins / len(successful_results) if successful_results else 0,
                "enhanced_win_rate": enhanced_wins / len(successful_results) if successful_results else 0
            }
            
            # 迭代分析
            if successful_results:
                iterations = [r.get("iterations", 0) for r in successful_results]  # 改为iterations保持一致
                effectiveness_metrics["iteration_analysis"] = {
                    "avg_iterations": sum(iterations) / len(iterations),
                    "min_iterations": min(iterations),
                    "max_iterations": max(iterations),
                    "iteration_distribution": {i: iterations.count(i) for i in set(iterations)}
                }
            
            # 质量分析
            similarities = [r.get("final_similarity", 0) for r in successful_results]
            ppls = [r.get("final_ppl", 0) for r in successful_results]
            
            if similarities and ppls:
                effectiveness_metrics["quality_analysis"] = {
                    "avg_similarity": sum(similarities) / len(similarities),
                    "min_similarity": min(similarities),
                    "max_similarity": max(similarities),
                    "avg_ppl": sum(ppls) / len(ppls),
                    "min_ppl": min(ppls),
                    "max_ppl": max(ppls)
                }
            
            self.logger.info(f"  📊 隐蔽化效果分析:")
            self.logger.info(f"      成功改写数: {len(successful_results)}/{len(self.stealth_results)}")
            
            if successful_results:
                self.logger.info(f"      路径获胜分布: basic={basic_wins}, enhanced={enhanced_wins}")
                if "iteration_analysis" in effectiveness_metrics:
                    avg_iter = effectiveness_metrics["iteration_analysis"]["avg_iterations"]
                    self.logger.info(f"      平均迭代次数: {avg_iter:.1f}")
                if "quality_analysis" in effectiveness_metrics:
                    avg_sim = effectiveness_metrics["quality_analysis"]["avg_similarity"]
                    avg_ppl = effectiveness_metrics["quality_analysis"]["avg_ppl"]
                    self.logger.info(f"      平均相似度: {avg_sim:.3f}")
                    self.logger.info(f"      平均PPL: {avg_ppl:.1f}")
            
            success = len(successful_results) >= len(self.stealth_results) * 0.5
            if success:
                self.logger.info("  ✅ 隐蔽化效果分析通过")
            else:
                self.logger.error("  ❌ 隐蔽化效果分析失败")
            
            self.test_results["stealth_effectiveness"] = {
                "status": success,
                "effectiveness_metrics": effectiveness_metrics
            }
            
            return success
            
        except Exception as e:
            self.logger.error(f"  ❌ 隐蔽化效果分析失败: {e}")
            self.test_results["stealth_effectiveness"] = {"status": False, "error": str(e)}
            return False
            enhanced_wins = sum(1 for r in self.stealth_results if r.get("winner_route") == "enhanced")
            no_winner = sum(1 for r in self.stealth_results if not r.get("winner_route"))
            
            effectiveness_metrics["route_performance"] = {
                "basic_wins": basic_wins,
                "enhanced_wins": enhanced_wins,
                "no_winner": no_winner,
                "basic_win_rate": basic_wins / len(self.stealth_results),
                "enhanced_win_rate": enhanced_wins / len(self.stealth_results)
            }
            
            # 成功分析
            successful_attacks = [r for r in self.stealth_results if r.get("attack_success")]
            effectiveness_metrics["success_analysis"] = {
                "total_successful": len(successful_attacks),
                "success_rate": len(successful_attacks) / len(self.stealth_results),
                "basic_chain_successes": sum(1 for r in self.stealth_results if r.get("basic_chain_success")),
                "enhanced_chain_successes": sum(1 for r in self.stealth_results if r.get("enhanced_chain_success"))
            }
            
            # 迭代次数分析
            success_iterations = [r.get("success_iteration") for r in successful_attacks if r.get("success_iteration")]
            if success_iterations:
                effectiveness_metrics["iteration_analysis"] = {
                    "avg_iterations_to_success": sum(success_iterations) / len(success_iterations),
                    "min_iterations": min(success_iterations),
                    "max_iterations": max(success_iterations),
                    "median_iterations": sorted(success_iterations)[len(success_iterations) // 2]
                }
            
            # 时间分析
            attack_times = [r.get("total_time", 0) for r in self.stealth_results if r.get("total_time")]
            if attack_times:
                effectiveness_metrics["timing_analysis"] = {
                    "avg_time": sum(attack_times) / len(attack_times),
                    "min_time": min(attack_times),
                    "max_time": max(attack_times),
                    "total_time": sum(attack_times)
                }
            
            # 输出分析结果
            self.logger.info(f"  📊 攻击效果分析:")
            self.logger.info(f"      总攻击数: {effectiveness_metrics['total_attacks']}")
            
            route_perf = effectiveness_metrics["route_performance"]
            self.logger.info(f"      路径胜率: Basic {route_perf['basic_win_rate']:.2%}, Enhanced {route_perf['enhanced_win_rate']:.2%}")
            
            success_analysis = effectiveness_metrics["success_analysis"]
            self.logger.info(f"      总成功率: {success_analysis['success_rate']:.2%}")
            
            iteration_analysis = effectiveness_metrics.get("iteration_analysis", {})
            if iteration_analysis:
                self.logger.info(f"      平均成功迭代数: {iteration_analysis.get('avg_iterations_to_success', 0):.1f}")
            
            timing_analysis = effectiveness_metrics.get("timing_analysis", {})
            if timing_analysis:
                self.logger.info(f"      平均攻击时间: {timing_analysis.get('avg_time', 0):.2f}s")
            
            # 判断效果是否良好
            success = (
                effectiveness_metrics["total_attacks"] > 0 and
                success_analysis["success_rate"] > 0.3 and  # 30%以上成功率
                (route_perf["basic_win_rate"] > 0 or route_perf["enhanced_win_rate"] > 0)  # 至少一个路径有胜利
            )
            
            if success:
                self.logger.info("  ✅ 攻击效果分析通过")
            else:
                self.logger.error("  ❌ 攻击效果分析失败")
            
            self.test_results["attack_effectiveness"] = {
                "status": success,
                "metrics": effectiveness_metrics
            }
            
            return success
            
        except Exception as e:
            self.logger.error(f"  ❌ 攻击效果分析失败: {e}")
            self.test_results["attack_effectiveness"] = {"status": False, "error": str(e)}
            return False
    
    def export_dataset(self) -> bool:
        """导出隐蔽化数据集"""
        self.logger.info("💾 导出隐蔽化数据集...")
        
        if not self.stealth_results:
            self.logger.error("  ❌ 没有可导出的隐蔽化结果")
            return False
        
        try:
            # 导出隐蔽化数据集 - 标准格式
            dataset_path = self.output_dir / f"step5_stealth_dataset_{self.domain}.json"
            with open(dataset_path, 'w', encoding='utf-8') as f:
                json.dump(self.stealth_results, f, indent=2, ensure_ascii=False)
            
            self.logger.info(f"  ✅ 隐蔽化数据集已导出: {dataset_path.name}")
            
            # 统计信息
            total_prompts = len(self.stealth_results)
            successful_attacks = [r for r in self.stealth_results if r.get("success")]
            failed_with_stealth = [r for r in self.stealth_results if not r.get("success") and r.get("has_stealth_version")]
            failed_completely = [r for r in self.stealth_results if not r.get("success") and not r.get("has_stealth_version")]
            
            self.logger.info(f"  📊 数据集统计:")
            self.logger.info(f"      总提示数: {total_prompts}")
            self.logger.info(f"      攻击成功: {len(successful_attacks)}")
            self.logger.info(f"      攻击失败但有隐蔽化: {len(failed_with_stealth)}")
            self.logger.info(f"      完全失败: {len(failed_completely)} (隐蔽化prompt为null)")
            
            # 生成仅包含成功案例的数据集
            if successful_attacks:
                success_dataset_path = self.output_dir / f"step5_successful_stealth_dataset_{self.domain}.json"
                with open(success_dataset_path, 'w', encoding='utf-8') as f:
                    json.dump(successful_attacks, f, indent=2, ensure_ascii=False)
                
                self.logger.info(f"  ✅ 成功案例数据集已导出: {success_dataset_path.name}")
            
            # 生成处理摘要
            summary = {
                "domain": self.domain,
                "generation_timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
                "total_prompts": total_prompts,
                "successful_attacks": len(successful_attacks),
                "failed_with_stealth": len(failed_with_stealth),
                "failed_completely": len(failed_completely),
                "success_rate": len(successful_attacks) / total_prompts if total_prompts > 0 else 0,
                "stealth_config": self.stealth_config,
                "dataset_format": {
                    "fields": [
                        "prompt_id",
                        "original_prompt", 
                        "stealth_prompt",  # null if failed
                        "success",
                        "winning_path",  # if success
                        "iterations",   # if success
                        "final_similarity",  # if success
                        "final_ppl",    # if success
                        "reason",       # if failed
                        "metadata"
                    ]
                }
            }
            
            summary_path = self.output_dir / f"step5_stealth_summary_{self.domain}.json"
            with open(summary_path, 'w', encoding='utf-8') as f:
                json.dump(summary, f, indent=2, ensure_ascii=False)
            
            self.logger.info(f"  ✅ 处理摘要已导出: {summary_path.name}")
            
            self.test_results["dataset_export"] = {
                "status": True,
                "dataset_path": str(dataset_path),
                "success_dataset_path": str(success_dataset_path) if successful_attacks else None,
                "summary_path": str(summary_path),
                "total_prompts": total_prompts,
                "successful_prompts": len(successful_attacks),
                "failed_with_stealth": len(failed_with_stealth),
                "failed_completely": len(failed_completely)
            }
            
            return True
            
        except Exception as e:
            self.logger.error(f"  ❌ 隐蔽化数据集导出失败: {e}")
            self.test_results["dataset_export"] = {"status": False, "error": str(e)}
            return False
    
    def _clean_export_record(self, record: Dict):
        """清理导出记录，移除不可序列化的对象"""
        # 移除可能包含复杂对象的字段
        fields_to_clean = ["basic_chain_details", "enhanced_chain_details"]
        for field in fields_to_clean:
            if field in record and isinstance(record[field], dict):
                # 保留基本信息，移除复杂对象
                cleaned = {}
                for k, v in record[field].items():
                    if isinstance(v, (str, int, float, bool, type(None))):
                        cleaned[k] = v
                    elif isinstance(v, dict):
                        # 只保留简单的字典
                        cleaned[k] = {kk: vv for kk, vv in v.items() if isinstance(vv, (str, int, float, bool, type(None)))}
                record[field] = cleaned
    
    def _serialize_test_results(self, results: Dict) -> Dict:
        """序列化测试结果，处理不可JSON序列化的对象"""
        serialized = {}
        for key, value in results.items():
            if isinstance(value, dict):
                serialized[key] = self._serialize_test_results(value)
            elif hasattr(value, "value"):  # enum对象
                serialized[key] = value.value
            else:
                serialized[key] = value
        return serialized
    
    def run_all_processing(self) -> bool:
        """运行所有双路径隐蔽化改写处理"""
        # 确定实际要处理的提示数量
        prompts_to_process = self.load_evaluated_prompts()
        actual_test_size = len(prompts_to_process)
        
        if self.test_size == -1:
            self.logger.info(f"\n🎭 开始双路径隐蔽化改写处理 (领域: {self.domain}, 处理所有 {actual_test_size} 个提示)")
        else:
            self.logger.info(f"\n🎭 开始双路径隐蔽化改写处理 (领域: {self.domain}, 限制处理 {actual_test_size} 个提示)")
        
        self.logger.info("="*60)
        
        processing_steps = [
            ("双路径隐蔽化改写", self.process_dual_path_stealth_rewrite),
            ("隐蔽化效果分析", self.analyze_stealth_effectiveness),
            ("数据集导出", self.export_dataset)
        ]
        
        completed_steps = 0
        total_steps = len(processing_steps)
        
        for step_name, step_func in processing_steps:
            self.logger.info(f"\n📋 执行步骤: {step_name}")
            try:
                success = step_func()
                if success:
                    completed_steps += 1
            except Exception as e:
                self.logger.error(f"步骤 '{step_name}' 时发生异常: {e}")
        
        # 生成处理总结
        self.logger.info(f"\n{'='*60}")
        self.logger.info(f"📊 双路径隐蔽化改写处理总结")
        self.logger.info(f"✅ 完成: {completed_steps}/{total_steps}")
        
        completion_rate = completed_steps / total_steps
        overall_status = completion_rate >= 0.67  # 67%完成率认为成功
        
        if overall_status:
            self.logger.info("🎉 双路径隐蔽化改写处理完成")
            successful_rewrites = [r for r in self.stealth_results if r.get("success")]
            self.logger.info(f"🎭 生成了 {len(successful_rewrites)} 个成功的隐蔽化样本")
            self.logger.info(f"📋 总共处理了 {len(self.stealth_results)} 个提示")
        else:
            self.logger.error("❌ 双路径隐蔽化改写处理未完成")
        
        # 保存处理结果
        self._save_processing_results(overall_status, completed_steps, total_steps, actual_test_size)
        
        return overall_status
    
    def _save_processing_results(self, overall_status: bool, completed: int, total: int, actual_process_size: int):
        """保存处理结果到文件"""
        successful_rewrites = [r for r in self.stealth_results if r.get("success")]
        
        report = {
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "domain": self.domain,
            "process_size_setting": self.test_size,  # 原始设置
            "actual_process_size": actual_process_size,  # 实际处理数量
            "overall_status": overall_status,
            "summary": {
                "completed_steps": completed,
                "total_steps": total,
                "completion_rate": completed / total if total > 0 else 0,
                "total_prompts": len(self.stealth_results),
                "successful_rewrites": len(successful_rewrites),
                "success_rate": len(successful_rewrites) / len(self.stealth_results) if self.stealth_results else 0
            },
            "stealth_config": self.stealth_config,
            "detailed_results": self._serialize_test_results(self.test_results)
        }
        
        report_path = self.output_dir / "step5_dual_path_stealth_report.json"
        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)
        
        self.logger.info(f"📋 双路径隐蔽化改写处理报告已保存: {report_path}")

def main():
    """Command-line entry point for Step 5.

    Loads the environment configuration, verifies the API configuration,
    parses the command-line options, runs the processor and prints a short
    summary.

    Exits with status 1 when the API configuration check fails or the
    processing does not complete, and with status 0 on success.
    """
    load_env_config()

    if not check_api_config():
        sys.exit(1)

    parser = argparse.ArgumentParser(description="Step 5: 双路径隐蔽化改写")
    parser.add_argument("--domain", default="medicine",
                      choices=["medicine", "finance", "education", "law"],
                      help="处理领域")
    parser.add_argument("--test-size", type=int, default=-1,
                      help="处理数据量大小 (-1表示处理所有step4结果)")
    # The module usage text documents this flag; without it argparse rejects
    # the documented invocation as an unrecognised argument.
    parser.add_argument("--max-iterations", type=int, default=None,
                      help="每个提示的最大迭代次数 (默认使用处理器内置配置)")
    parser.add_argument("--verbose", action="store_true",
                      help="显示详细信息")
    # NOTE(review): --output-dir is accepted but never forwarded to the
    # processor in this function; confirm whether it should be.
    parser.add_argument("--output-dir", type=str,
                      help="输出目录（由测试运行器传入）")

    args = parser.parse_args()

    if args.max_iterations is not None and args.max_iterations < 1:
        parser.error("--max-iterations must be >= 1")

    processor = DualPathStealthProcessor(
        domain=args.domain,
        test_size=args.test_size,
        verbose=args.verbose
    )

    # Override the configured iteration budget only when explicitly requested,
    # so the default behaviour is unchanged.
    if args.max_iterations is not None:
        processor.stealth_config["max_iterations"] = args.max_iterations

    success = processor.run_all_processing()

    if success:
        successful_rewrites = [r for r in processor.stealth_results if r.get("success")]
        print(f"\n✅ Step 5 完成: 双路径隐蔽化改写处理完成")
        print(f"🎭 生成了 {len(successful_rewrites)} 个成功的隐蔽化样本")
        print(f"📋 总共处理了 {len(processor.stealth_results)} 个提示")
        if successful_rewrites:
            # Tally successful rewrites per winning path.
            winner_stats = {}
            for rewrite in successful_rewrites:
                winner = rewrite.get("winning_path", "none")
                winner_stats[winner] = winner_stats.get(winner, 0) + 1
            print(f"🏆 获胜路径统计: {winner_stats}")
    else:
        print(f"\n❌ Step 5 失败: 双路径隐蔽化改写处理未完成")

    sys.exit(0 if success else 1)

# Run the Step 5 pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()