import json
import os
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def clean_text(text):
    """Strip noise from a text and drop repeated or very short sentences.

    Brace-delimited fragments containing ``"msg_type"`` and Feishu OAuth
    index links are removed first. The remainder is split on the Chinese
    full stop ("。"); each piece is stripped, and only the first occurrence
    of every piece longer than 10 characters is kept. The kept pieces are
    joined back together with "。".

    Returns an empty string when *text* is falsy.
    """
    import re

    if not text:
        return ""

    noise_patterns = (
        # JSON-formatted message payloads
        r'\{[^}]*"msg_type"[^}]*\}',
        # long OAuth authorization links
        r'https://open\.feishu\.cn/open-apis/authen/v1/index\?[^\s]*',
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, '', text)

    # A dict doubles as an insertion-ordered set: first occurrence wins.
    kept = {}
    for fragment in text.split('。'):
        candidate = fragment.strip()
        if len(candidate) > 10:
            kept.setdefault(candidate, None)

    return '。'.join(kept)

_SEMANTIC_MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'
_semantic_model_cache = {}


def _get_semantic_model():
    """Return the shared SentenceTransformer, loading it on first use only."""
    if _SEMANTIC_MODEL_NAME not in _semantic_model_cache:
        _semantic_model_cache[_SEMANTIC_MODEL_NAME] = SentenceTransformer(
            _SEMANTIC_MODEL_NAME
        )
    return _semantic_model_cache[_SEMANTIC_MODEL_NAME]


def calculate_semantic_similarity(answer, ground_truth):
    """Compute the semantic similarity between two texts.

    Both texts are cleaned with ``clean_text``, embedded with the
    multilingual sentence-transformer model and compared by cosine
    similarity.

    Args:
        answer: The answer text to evaluate.
        ground_truth: The reference text to compare against.

    Returns:
        The cosine similarity as a built-in ``float``, or ``0.0`` when
        either text is empty after cleaning.
    """
    cleaned_answer = clean_text(answer)
    cleaned_truth = clean_text(ground_truth)

    # Decide this before touching the model so empty inputs never trigger
    # the (expensive) model load.
    if not cleaned_answer or not cleaned_truth:
        return 0.0

    # The model is cached: this function is called once per QA file, and
    # re-instantiating the model for every call is needlessly slow.
    embeddings = _get_semantic_model().encode([cleaned_answer, cleaned_truth])

    score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

    # Convert the NumPy scalar to a plain float so results stay
    # JSON-serializable.
    return float(score)

def evaluate_qa_files(qa_dir="output/QA"):
    """Score every QA JSON file with embedding-based semantic similarity.

    For each ``*.json`` file in *qa_dir*, reads the ``answer``,
    ``ground_truth``, ``id`` and ``question`` fields, computes the
    similarity via ``calculate_semantic_similarity``, and prints a per-file
    report followed by an overall summary.

    Args:
        qa_dir: Directory holding the QA JSON files.

    Returns:
        A list with one result dict per successfully processed file (keys
        ``id``, ``filename``, ``question``, ``similarity``,
        ``is_consistent``). The list is empty when the directory is missing
        or no file could be processed.
    """
    threshold = 0.7  # similarity above this counts as consistent; tunable
    results = []

    if not os.path.isdir(qa_dir):
        print(f"目录 {qa_dir} 不存在")
        return results

    # Sorted so the report order is reproducible; os.listdir order is
    # arbitrary.
    for filename in sorted(os.listdir(qa_dir)):
        if not filename.endswith('.json'):
            continue

        filepath = os.path.join(qa_dir, filename)

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            answer = data.get('answer', '')
            ground_truth = data.get('ground_truth', '')
            question_id = data.get('id', filename)
            question = data.get('question', '')

            # Cast to a built-in float: the similarity may be a NumPy
            # scalar, which the json module cannot serialize.
            similarity = float(calculate_semantic_similarity(answer, ground_truth))

            result = {
                'id': question_id,
                'filename': filename,
                'question': question,
                'similarity': similarity,
                'is_consistent': similarity > threshold,
            }
            results.append(result)

            print(f"文件: {filename}")
            print(f"问题: {question}")
            print(f"语义相似度: {similarity:.4f}")
            print(f"是否一致: {'是' if result['is_consistent'] else '否'}")
            print("-" * 50)

        except Exception as e:
            # Best-effort batch run: report the failing file and keep going.
            print(f"处理文件 {filename} 时出错: {e}")

    # Summary statistics over the files that were processed successfully.
    if results:
        total_files = len(results)
        consistent_count = sum(1 for r in results if r['is_consistent'])
        avg_similarity = np.mean([r['similarity'] for r in results])

        print(f"\n=== 评估结果汇总 ===")
        print(f"总文件数: {total_files}")
        print(f"语义一致的文件数: {consistent_count}")
        print(f"一致性比例: {consistent_count/total_files:.2%}")
        print(f"平均语义相似度: {avg_similarity:.4f}")

    return results

from openai import OpenAI
def evaluate_qa_with_gpt4(qa_dir="output/QA-rewritten"):
    """Use GPT-4 to judge whether each QA file's answer matches its ground truth.

    Every ``*.json`` file in *qa_dir* is sent to the ``gpt-4`` chat model
    together with a scoring prompt. Files whose score is not above 0.5 are
    additionally written to ``output/fail-1``. A summary is printed and the
    detailed results are saved to
    ``output/qa_evaluation_rewritten_gpt4o.json``.

    The API key is read from the ``OPENAI_API_KEY`` environment variable;
    credentials must never be committed to source.

    Args:
        qa_dir: Directory holding the QA JSON files.

    Returns:
        A list with one result dict per successfully processed file. The
        list is empty when the directory is missing, the API key is not
        configured, or no file could be processed.
    """
    import re

    results = []

    # Check cheap preconditions before creating the API client.
    if not os.path.isdir(qa_dir):
        print(f"目录 {qa_dir} 不存在")
        return results

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("未设置环境变量 OPENAI_API_KEY，无法调用GPT-4")
        return results

    client = OpenAI(api_key=api_key)

    fail_dir = 'output/fail-1'
    threshold = 0.5  # score above this counts as consistent; tunable

    # Sorted so runs are reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(qa_dir)):
        if not filename.endswith('.json'):
            continue

        filepath = os.path.join(qa_dir, filename)

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            answer = data.get('answer', '')
            ground_truth = data.get('ground_truth', '')
            question_id = data.get('id', filename)
            question = data.get('question', '')

            # Build the evaluation prompt.
            prompt = f"""
请评估以下两个答案的语义一致性：

问题：{question}

模型回答：{answer}

标准答案：{ground_truth}

请从以下几个维度评估：
1. 核心信息是否一致
2. 事实准确性是否匹配
3. 语义表达是否相符

如果模型回答能够包含标准答案的全部信息，那么视为相似。请给出一个0-1之间的相似度分数，以及是否相似并简要说明评估理由。
请以JSON格式回复：
{{
    "similarity_score": 0.XX,
    "is_similar": true/false,
    "reasoning": "评估理由"
}}
"""

            response = client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "你是一个专业的文本评估助手，专门评估问答内容的语义一致性。"},
                    {"role": "user", "content": prompt}
                ],
                temperature=0
            )

            gpt_response = response.choices[0].message.content
            try:
                # The model may wrap the JSON in prose or a code fence, so
                # extract the outermost {...} span before decoding.
                json_match = re.search(r'\{.*\}', gpt_response, re.DOTALL)
                eval_result = json.loads(
                    json_match.group() if json_match else gpt_response
                )
                # Coerce so a quoted or null score cannot break the
                # comparison and formatting below.
                similarity = float(eval_result.get('similarity_score', 0.0))
                reasoning = eval_result.get('reasoning', '')
            except (ValueError, TypeError, AttributeError):
                # Unparseable reply: fall back to the default score.
                similarity = 0.0
                reasoning = "GPT-4回复解析失败"

            is_consistent = similarity > threshold

            result = {
                'id': question_id,
                'filename': filename,
                'question': question,
                'similarity': similarity,
                'reasoning': reasoning,
                'is_consistent': is_consistent,
                'evaluation_method': 'GPT-4'
            }
            results.append(result)

            # Persist inconsistent cases, with both texts, for later review.
            if not is_consistent:
                os.makedirs(fail_dir, exist_ok=True)

                fail_case = {
                    'id': question_id,
                    'filename': filename,
                    'question': question,
                    'answer': answer,
                    'ground_truth': ground_truth,
                    'similarity': similarity,
                    'reasoning': reasoning,
                    'evaluation_method': 'GPT-4'
                }

                fail_filename = f"fail_case_{question_id}_{filename}"
                fail_path = os.path.join(fail_dir, fail_filename)

                with open(fail_path, 'w', encoding='utf-8') as f:
                    json.dump(fail_case, f, ensure_ascii=False, indent=2)

            print(f"文件: {filename}")
            print(f"问题: {question}")
            print(f"GPT-4评估相似度: {similarity:.4f}")
            print(f"评估理由: {reasoning}")
            print(f"是否一致: {'是' if is_consistent else '否'}")
            print("-" * 50)

        except Exception as e:
            # Best-effort batch run: report the failing file and keep going.
            print(f"处理文件 {filename} 时出错: {e}")

    # Summary statistics over the files that were processed successfully.
    if results:
        total_files = len(results)
        consistent_count = sum(1 for r in results if r['is_consistent'])
        avg_similarity = np.mean([r['similarity'] for r in results])

        print(f"\n=== GPT-4评估结果汇总 ===")
        print(f"总文件数: {total_files}")
        print(f"语义一致的文件数: {consistent_count}")
        print(f"一致性比例: {consistent_count/total_files:.2%}")
        print(f"平均语义相似度: {avg_similarity:.4f}")

        # Make sure the target directory exists before saving the details.
        os.makedirs('output', exist_ok=True)
        with open('output/qa_evaluation_rewritten_gpt4o.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        print(f"GPT-4评估详细结果已保存到: output/qa_evaluation_rewritten_gpt4o.json")

    return results

def evaluate_qa_with_gemini(qa_dir="output/QA-rewritten"):
    """Use Gemini to judge whether each QA file's answer matches its ground truth.

    Every ``*.json`` file in *qa_dir* that has a question, an answer and a
    ground truth is sent to the ``gemini-pro`` model with a scoring prompt.
    The reply is expected as ``相似度：`` / ``理由：`` lines. Files whose
    score is not above 0.5 are additionally written to ``output/fail``. A
    summary is printed and the detailed results are saved to
    ``output/qa_evaluation_gemini.json``.

    The API key is read from the ``GOOGLE_API_KEY`` environment variable,
    falling back to ``GEMINI_API_KEY``; credentials must never be committed
    to source.

    Args:
        qa_dir: Directory holding the QA JSON files.

    Returns:
        A list with one result dict per successfully processed file. The
        list is empty when the directory is missing, the API key is not
        configured, or no file could be processed.
    """
    results = []

    # Check cheap preconditions before importing and configuring the SDK.
    if not os.path.isdir(qa_dir):
        print(f"目录 {qa_dir} 不存在")
        return results

    # os.getenv takes the NAME of an environment variable, never the key
    # itself.
    api_key = os.getenv('GOOGLE_API_KEY') or os.getenv('GEMINI_API_KEY')
    if not api_key:
        print("未设置环境变量 GOOGLE_API_KEY（或 GEMINI_API_KEY），无法调用Gemini")
        return results

    import google.generativeai as genai

    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-pro')

    fail_dir = 'output/fail'
    threshold = 0.5  # score above this counts as consistent; tunable

    # Sorted so runs are reproducible; os.listdir order is arbitrary.
    json_files = sorted(f for f in os.listdir(qa_dir) if f.endswith('.json'))

    for filename in json_files:
        file_path = os.path.join(qa_dir, filename)

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            question = data.get('question', '')
            answer = data.get('answer', '')
            ground_truth = data.get('ground_truth', '')
            question_id = data.get('id', 0)

            if not question or not answer or not ground_truth:
                print(f"跳过文件 {filename}: 缺少必要字段")
                continue

            # Build the evaluation prompt.
            prompt = f"""
请评估以下问答对的语义一致性：

问题：{question}

模型回答：{answer}

标准答案：{ground_truth}

请从以下几个维度进行评估：
1. 核心信息是否一致
2. 事实准确性是否匹配
3. 语义表达是否相符

请给出：
1. 语义相似度评分（0-1之间的小数，1表示完全一致）
2. 详细的评估理由
3. 是否认为两个答案在语义上基本一致（是/否）

请按以下格式回答：
相似度：[0-1之间的数值]
理由：[详细评估理由]
一致性：[是/否]
"""

            response = model.generate_content(prompt)
            response_text = response.text

            # Parse the line-oriented reply; unparseable fields keep their
            # defaults.
            similarity = 0.0
            reasoning = ""

            for line in response_text.strip().split('\n'):
                line = line.strip()
                # Accept both the full-width and the ASCII colon.
                if line.startswith(('相似度：', '相似度:')):
                    try:
                        # Tolerate a value echoed in the template's brackets.
                        similarity = float(line[4:].strip().strip('[]'))
                    except ValueError:
                        similarity = 0.0
                elif line.startswith(('理由：', '理由:')):
                    reasoning = line[3:].strip()

            # One flag drives both the stored verdict and the fail-case
            # export, so the two can never disagree.
            is_consistent = similarity > threshold

            result = {
                'id': question_id,
                'filename': filename,
                'question': question,
                'answer': answer,
                'ground_truth': ground_truth,
                'similarity': similarity,
                'reasoning': reasoning,
                'is_consistent': is_consistent,
                'evaluation_method': 'Gemini'
            }
            results.append(result)

            # Persist inconsistent cases for later review.
            if not is_consistent:
                os.makedirs(fail_dir, exist_ok=True)

                fail_case = {
                    'id': question_id,
                    'filename': filename,
                    'question': question,
                    'answer': answer,
                    'ground_truth': ground_truth,
                    'similarity': similarity,
                    'reasoning': reasoning,
                    'evaluation_method': 'Gemini'
                }

                fail_filename = f"fail_case_{question_id}_{filename}"
                fail_path = os.path.join(fail_dir, fail_filename)

                with open(fail_path, 'w', encoding='utf-8') as f:
                    json.dump(fail_case, f, ensure_ascii=False, indent=2)

            print(f"文件: {filename}")
            print(f"问题: {question}")
            print(f"Gemini评估相似度: {similarity:.4f}")
            print(f"评估理由: {reasoning}")
            print(f"是否一致: {'是' if is_consistent else '否'}")
            print("-" * 50)

        except Exception as e:
            # Best-effort batch run: report the failing file and keep going.
            print(f"处理文件 {filename} 时出错: {e}")

    # Summary statistics over the files that were processed successfully.
    if results:
        total_files = len(results)
        consistent_count = sum(1 for r in results if r['is_consistent'])
        avg_similarity = np.mean([r['similarity'] for r in results])

        print(f"\n=== Gemini评估结果汇总 ===")
        print(f"总文件数: {total_files}")
        print(f"语义一致的文件数: {consistent_count}")
        print(f"一致性比例: {consistent_count/total_files:.2%}")
        print(f"平均语义相似度: {avg_similarity:.4f}")

        # Make sure the target directory exists before saving the details.
        os.makedirs('output', exist_ok=True)
        with open('output/qa_evaluation_gemini.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        print(f"Gemini评估详细结果已保存到: output/qa_evaluation_gemini.json")

    return results


def evaluate_qa_with_deepseek(qa_dir="output/QA-rewritten"):
    """Use DeepSeek to judge whether each QA file's answer matches its ground truth.

    Every ``*.json`` file in *qa_dir* is sent to the ``deepseek-chat`` model
    (through the OpenAI-compatible client) with a scoring prompt. Files
    whose score is not above 0.7 are additionally written to
    ``output/fail``. A summary is printed and the detailed results are
    saved to ``output/qa_evaluation_deepseek.json``.

    The API key is read from the ``DEEPSEEK_API_KEY`` environment variable;
    credentials must never be committed to source.

    Args:
        qa_dir: Directory holding the QA JSON files.

    Returns:
        A list with one result dict per successfully processed file. The
        list is empty when the directory is missing, the API key is not
        configured, or no file could be processed.
    """
    import re

    results = []

    # Check cheap preconditions before creating the API client.
    if not os.path.isdir(qa_dir):
        print(f"目录 {qa_dir} 不存在")
        return results

    api_key = os.getenv("DEEPSEEK_API_KEY")
    if not api_key:
        print("未设置环境变量 DEEPSEEK_API_KEY，无法调用DeepSeek")
        return results

    client = OpenAI(
        api_key=api_key,
        base_url="https://api.deepseek.com/v1"
    )

    fail_dir = "output/fail"
    threshold = 0.7  # score above this counts as consistent; tunable

    # Sorted so runs are reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(qa_dir)):
        if not filename.endswith('.json'):
            continue

        filepath = os.path.join(qa_dir, filename)

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            answer = data.get('answer', '')
            ground_truth = data.get('ground_truth', '')
            question_id = data.get('id', filename)
            question = data.get('question', '')

            # Build the evaluation prompt.
            prompt = f"""
请评估以下两个答案的语义一致性：

问题：{question}

模型回答：{answer}

标准答案：{ground_truth}

请从以下几个维度评估：
1. 核心信息是否一致
2. 事实准确性是否匹配
3. 语义表达是否相符

如果模型回答能够包含标准答案的全部信息，那么视为相似。请给出一个0-1之间的相似度分数，以及简要说明评估理由。
请以JSON格式回复：
{{
    "similarity_score": 0.XX,
    "reasoning": "评估理由"
}}
"""

            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1
            )

            response_text = response.choices[0].message.content

            try:
                # The reply may surround the JSON with prose, so extract
                # the outermost {...} span before decoding.
                json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
                if json_match:
                    eval_result = json.loads(json_match.group())
                    # Coerce so a quoted or null score cannot break the
                    # comparison and formatting below.
                    similarity = float(eval_result.get('similarity_score', 0.0))
                    reasoning = eval_result.get('reasoning', '无法解析评估理由')
                else:
                    similarity = 0.0
                    reasoning = f"无法解析DeepSeek响应: {response_text}"
            except (ValueError, TypeError, AttributeError):
                similarity = 0.0
                reasoning = f"JSON解析失败: {response_text}"

            # One flag drives both the stored verdict and the fail-case
            # export.
            is_consistent = similarity > threshold

            result = {
                'id': question_id,
                'filename': filename,
                'question': question,
                'similarity': similarity,
                'reasoning': reasoning,
                'is_consistent': is_consistent,
                'evaluation_method': 'DeepSeek'
            }
            results.append(result)

            # Persist inconsistent cases, with both texts, for later review.
            if not is_consistent:
                os.makedirs(fail_dir, exist_ok=True)

                fail_case = {
                    'id': question_id,
                    'filename': filename,
                    'question': question,
                    'answer': answer,
                    'ground_truth': ground_truth,
                    'similarity': similarity,
                    'reasoning': reasoning,
                    'evaluation_method': 'DeepSeek'
                }

                fail_filename = f"fail_case_{question_id}_{filename}"
                fail_path = os.path.join(fail_dir, fail_filename)

                with open(fail_path, 'w', encoding='utf-8') as f:
                    json.dump(fail_case, f, ensure_ascii=False, indent=2)

            print(f"文件: {filename}")
            print(f"问题: {question}")
            print(f"DeepSeek评估相似度: {similarity:.4f}")
            print(f"评估理由: {reasoning}")
            print(f"是否一致: {'是' if is_consistent else '否'}")
            print("-" * 50)

        except Exception as e:
            # Best-effort batch run: report the failing file and keep going.
            print(f"处理文件 {filename} 时出错: {e}")

    # Summary statistics over the files that were processed successfully.
    if results:
        total_files = len(results)
        consistent_count = sum(1 for r in results if r['is_consistent'])
        avg_similarity = np.mean([r['similarity'] for r in results])

        print(f"\n=== DeepSeek评估结果汇总 ===")
        print(f"总文件数: {total_files}")
        print(f"语义一致的文件数: {consistent_count}")
        print(f"一致性比例: {consistent_count/total_files:.2%}")
        print(f"平均语义相似度: {avg_similarity:.4f}")

        # Make sure the target directory exists before saving the details.
        os.makedirs('output', exist_ok=True)
        with open('output/qa_evaluation_deepseek.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        print(f"DeepSeek评估详细结果已保存到: output/qa_evaluation_deepseek.json")

    return results


def evaluate_qa_with_claude(qa_dir="output/QA-rewritten"):
    """Use Claude to judge whether each QA file's answer matches its ground truth.

    Every ``*.json`` file in *qa_dir* is sent to the
    ``claude-3-sonnet-20240229`` model with a scoring prompt. Files whose
    score is not above 0.7 are additionally written to ``output/fail``. A
    summary is printed and the detailed results are saved to
    ``output/qa_evaluation_claude.json``.

    The API key is read from the ``ANTHROPIC_API_KEY`` environment
    variable; credentials must never be committed to source.

    Args:
        qa_dir: Directory holding the QA JSON files.

    Returns:
        A list with one result dict per successfully processed file. The
        list is empty when the directory is missing, the API key is not
        configured, or no file could be processed.
    """
    import re

    results = []

    # Check cheap preconditions before importing the SDK and creating the
    # client.
    if not os.path.isdir(qa_dir):
        print(f"目录 {qa_dir} 不存在")
        return results

    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        print("未设置环境变量 ANTHROPIC_API_KEY，无法调用Claude")
        return results

    import anthropic

    client = anthropic.Anthropic(api_key=api_key)

    fail_dir = "output/fail"
    threshold = 0.7  # score above this counts as consistent; tunable

    # Sorted so runs are reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(qa_dir)):
        if not filename.endswith('.json'):
            continue

        filepath = os.path.join(qa_dir, filename)

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            answer = data.get('answer', '')
            ground_truth = data.get('ground_truth', '')
            question_id = data.get('id', filename)
            question = data.get('question', '')

            # Build the evaluation prompt.
            prompt = f"""
请评估以下两个答案的语义一致性：

问题：{question}

模型回答：{answer}

标准答案：{ground_truth}

请从以下几个维度评估：
1. 核心信息是否一致
2. 事实准确性是否匹配
3. 语义表达是否相符

如果模型回答能够包含标准答案的全部信息，那么视为相似。请给出一个0-1之间的相似度分数，以及简要说明评估理由。
请以JSON格式回复：
{{
    "similarity_score": 0.XX,
    "reasoning": "评估理由"
}}
"""

            response = client.messages.create(
                model="claude-3-sonnet-20240229",
                max_tokens=1000,
                temperature=0.1,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )

            response_text = response.content[0].text

            try:
                # The reply may surround the JSON with prose, so extract
                # the outermost {...} span before decoding.
                json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
                if json_match:
                    eval_result = json.loads(json_match.group())
                    # Coerce so a quoted or null score cannot break the
                    # comparison and formatting below.
                    similarity = float(eval_result.get('similarity_score', 0.0))
                    reasoning = eval_result.get('reasoning', '无法解析评估理由')
                else:
                    similarity = 0.0
                    reasoning = f"无法解析Claude响应: {response_text}"
            except (ValueError, TypeError, AttributeError):
                similarity = 0.0
                reasoning = f"JSON解析失败: {response_text}"

            # One flag drives both the stored verdict and the fail-case
            # export.
            is_consistent = similarity > threshold

            result = {
                'id': question_id,
                'filename': filename,
                'question': question,
                'similarity': similarity,
                'reasoning': reasoning,
                'is_consistent': is_consistent,
                'evaluation_method': 'Claude'
            }
            results.append(result)

            # Persist inconsistent cases, with both texts, for later review.
            if not is_consistent:
                os.makedirs(fail_dir, exist_ok=True)

                fail_case = {
                    'id': question_id,
                    'filename': filename,
                    'question': question,
                    'answer': answer,
                    'ground_truth': ground_truth,
                    'similarity': similarity,
                    'reasoning': reasoning,
                    'evaluation_method': 'Claude'
                }

                fail_filename = f"fail_case_{question_id}_{filename}"
                fail_path = os.path.join(fail_dir, fail_filename)

                with open(fail_path, 'w', encoding='utf-8') as f:
                    json.dump(fail_case, f, ensure_ascii=False, indent=2)

            print(f"文件: {filename}")
            print(f"问题: {question}")
            print(f"Claude评估相似度: {similarity:.4f}")
            print(f"评估理由: {reasoning}")
            print(f"是否一致: {'是' if is_consistent else '否'}")
            print("-" * 50)

        except Exception as e:
            # Best-effort batch run: report the failing file and keep going.
            print(f"处理文件 {filename} 时出错: {e}")

    # Summary statistics over the files that were processed successfully.
    if results:
        total_files = len(results)
        consistent_count = sum(1 for r in results if r['is_consistent'])
        avg_similarity = np.mean([r['similarity'] for r in results])

        print(f"\n=== Claude评估结果汇总 ===")
        print(f"总文件数: {total_files}")
        print(f"语义一致的文件数: {consistent_count}")
        print(f"一致性比例: {consistent_count/total_files:.2%}")
        print(f"平均语义相似度: {avg_similarity:.4f}")

        # Make sure the target directory exists before saving the details.
        os.makedirs('output', exist_ok=True)
        with open('output/qa_evaluation_claude.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        print(f"Claude评估详细结果已保存到: output/qa_evaluation_claude.json")

    return results


if __name__ == "__main__":
    # Run the LLM-judged evaluations in order. The embedding-based
    # evaluate_qa_files() pass is not part of the default run.
    for run_evaluation in (evaluate_qa_with_gpt4, evaluate_qa_with_gemini):
        run_evaluation()

