import sys
import re
import numpy as np
import json

# Judge prompt template for closed-ended questions: asks the judge to compare
# a predicted answer against a reference answer and return a
# "Correct"/"Incorrect" verdict.
# Filled with str.format(); placeholders: {format_closed_question_score},
# {question}, {ref_answer}, {model_answer}.
closed_question_prompt_en = """
1. Task Description:
    - You will receive three pieces of information: Question, Correct Answer (ref answer) and Predicted Answer (model answer).
    - Your task is to compare the ”Correct Answer” and the ”Predicted Answer” to determine whether the latter is semantically consistent with the former, allowing for differences in format.
    - The judgment result must be one of two types: ”Correct” or ”Incorrect”.

2. Evaluation Requirements:
    - First, ensure you fully understand the meaning and requirements of the question.
    - Next, conduct a detailed comparison between the ”Predicted Answer” and the ”Correct Answer,” analyzing their consistency in logic, calculations, and expression.
    - Identify and explain any errors or inconsistencies in the ”Predicted Answer”.
    - Finally, provide an overall judgment based on the comparison: if the answers are semantically consistent, classify the result as ”Correct”; if inconsistent, classify it as ”Incorrect”.

3. Output Format:
    - Please carefully evaluate the predicted answer and provide the final judgment in the following JSON format:
    {format_closed_question_score}

Below are the question, the correct answer, and the predicted answer:

Question:
{question}

Correct answer:
{ref_answer}

AI assistant's answer:
{model_answer}
"""

# Rubric-style judge prompt template for open-ended reports: four dimensions,
# each worth 0-2 points, for an overall score of 0-8.
# Filled with str.format(); placeholders: {format_open_report_score},
# {topic}, {report}.
# NOTE(review): the "Full score scenario" heading in section 4.3 ends with a
# single '*', so its Markdown bold marker is unbalanced; the text is runtime
# prompt content and is left unchanged here.
open_report_prompt_en = """
# Report Quality Scoring Expert: Overall Report Scoring Criteria (0-8 points)

You are a rigorous report quality scoring expert. Please evaluate the quality of the generated report based on the following four dimensions: Content Completeness, Information Richness, Consistency of Data and Facts, and Structural Rationality. Ensure that your assessment strictly adheres to the evaluation criteria below to accurately reflect the quality of the generated report.

## I. Content Completeness: Maximum 2 points, Minimum 0 points
### 1.1 **Full Coverage of Core Requirements**
- Must fully address all core requirements specified in the user’s research topic; none can be omitted.

### 1.2 **Completeness of Key Modules**
- Key modules must be complete and include, but are not limited to, ”Report Title,” ”Introduction/Background,” ”Analysis Content,” and ”Conclusion and Recommendations”.

### 1.3 **Scoring rules**
**Full score scenario**
    - All core needs are fully covered, and key modules are complete → 2 point
**0-point scenario (Directly score 0)**
    - More than 30% of core needs are missing or more than 3 key modules are missing → 0 points
**Graded deduction**
    - For each missing core need → deduct 0.1 points
    - For each missing key module → deduct 0.1 points
    - The minimum score cannot be lower than 0 points

## II. Information Richness: Maximum 2 points, Minimum 0 points
### 2.1 **Multi-level Information in Core Modules**
- Core modules must contain multi-level information, including explanations, data, analysis, and conclusions.
- Content in core modules must not be excessively brief; a few sentences of analysis or general statements are unacceptable.

### **2.2 Data Support for Core Modules**
- Claims presented in core modules must be supported by factual data or evidence; content must not be superficial or overly generalized.
- All analytical content must include in-depth analysis, reasoning, and data interpretation.

### 2.3 **Scoring rules**
**Full score scenario**
    - All views and analysis content are fully supported by sufficient data and evidence, and the content is detailed → 2 point
** 0-point scenario (directly score 0)**
    - More than 50% of views and analyses lack sufficient data or evidence support → 0 points
**Graded deduction**
    - For each analytical view lacking data and evidence support → deduct 0.1 points
    - For each core part with extremely little content, summarized in only 2-3 sentences → deduct 0.1 points
    - The minimum score cannot be lower than 0 points

## III. Consistency of Data and Facts: Maximum 2 points, Minimum 0 points
### 3.1 **Clear Data Sources**
- Data used to support claims in report must have clear and traceable sources. Data with no traceable origin may be considered false or fabricated.

### 3.2 **Authenticity of Data**
- Key information (such as data and conclusions) must align with common sense and authoritative sources; conflicting information or ”obviously false” statements are unacceptable.
- Data must be factually accurate, without subjective assumptions or exaggerated expressions.

### 3.3 **Scoring rules**
**Full score scenario**
    - All data in the report have clear sources and are not inconsistent with common sense and authoritative materials → 2 point
**0-point scenario (directly score 0)**
    - More than 30% of data and evidence in the report cannot be traced → 0 points
**Graded deduction**
    - For each piece of data or evidence with untraceable sources → deduct 0.1 points
    - For each piece of data inconsistent with facts, with subjective assumptions or exaggerated expressions → deduct 0.1 points
    - For each piece of data or view inconsistent with common sense and authoritative materials → deduct 0.1 points
    - The minimum score cannot be lower than 0 points

## IV. Structural Rationality: Maximum 2 points, Minimum 0 points
### 4.1 **Reasonable Directory Hierarchy**
- The report must include an overall title.
- The directory hierarchy must be logical; for example, conclusions should appear after the analysis in the core sections.
- Directory content must be relevant to the research topic and specifically address distinct aspects of it.
- Section titles must be independent and non-repetitive.

### 4.2 **Clear Article Structure**
- The article must have a clear and highly readable structure, with consistent font, font size, and line spacing for text at the same level.
- Content across different sections must be distinct, avoiding repetition or significant overlap.

### 4.3 **Scoring rules**
**Full score scenario*
    - The directory hierarchy and content are reasonable, there is no repetition between directories, the article structure is clear, and there is no overlap between the contents of each part → 2 point
**0-point scenario (directly score 0)**
    - The directory content is repeated or the article is completely unreadable, and there are more than 3 overlaps in the content of each part of the article → 0 points
**Graded deduction**
    - For each unreasonable directory hierarchy and content → deduct 0.1 points
    - For each overlap or deep intersection of article content → deduct 0.1 points
    - For each unreasonable article structure → deduct 0.1 points
    - The minimum score cannot be lower than 0 points

## Output Format
- Please carefully evaluate the quality of the report and provide the final score in the following JSON format:
{format_open_report_score}

Please strictly assess the report quality according to the criteria specified in the four dimensions above, and ensure that the final score is provided.
User's research topic:
{topic}

Input report:
{report}
"""

# Compact judge prompt template for open-ended reports: four criteria
# (comprehensiveness, thoroughness, factuality, coherence), each described as
# scored out of 10. The instructions are stated both before and after the
# report body.
# Filled with str.format(); placeholders: {question} (appears twice),
# {report}, {format_open_report_score}.
open_report_score_en = """Research Question: {question}

Please objectively evaluate the quality of research articles generated by systems A for this question, and provide scores out of 10 for the following criteria:
(1) Overall Comprehensiveness: The report should cover content as comprehensively as possible
(2) Thoroughness of Discussion: Each section should be discussed thoroughly, not just superficially
(3) Factuality: There should be minimal factual errors
(4) Coherence: The discussion should stay focused and relevant to the topic

Notes:
- A satisfactory performance deserves around 5 points, with higher scores for excellence and lower scores for deficiencies
- You should not easily assign scores higher than 8 or lower than 3 unless you provide substantial reasoning.
- You do not need to consider citations in the articles

----------------------------------------------------------
Research article generated by system A:
----------------------------------------------------------

{report}

----------------------------------------------------------

Research Question: {question}

Please objectively evaluate the quality of research articles generated by systems A for this question, and provide scores out of 10 for the following criteria:
(1) Overall Comprehensiveness: The report should cover content as comprehensively as possible
(2) Thoroughness of Discussion: Each section should be discussed thoroughly, not just superficially
(3) Factuality: There should be minimal factual errors
(4) Coherence: The discussion should stay focused and relevant to the topic

Notes:
- A satisfactory performance deserves around 5 points, with higher scores for excellence and lower scores for deficiencies
- You should not easily assign scores higher than 8 or lower than 3 unless you provide substantial reasoning.
- You do not need to consider citations in the articles


Please analyze each article and provide the final scores in the following JSON format:
{format_open_report_score}
"""

def generate_scoring_prompt(question, model_answer, data_source, ground_truth):
    """Build the judge prompt for a single sample.

    Args:
        question: The original question / research topic. It is converted
            with ``str()`` before being inserted into a template.
        model_answer: The answer or report produced by the model under test.
        data_source: ``"closed_question"`` or ``"open_report"``. Any other
            value makes the function return ``question`` unchanged.
        ground_truth: Reference answer; only used for ``"closed_question"``.

    Returns:
        The fully formatted prompt string, or ``question`` itself for an
        unrecognised ``data_source``.
    """
    if data_source == "closed_question":
        format_closed_question_score = "{'Step_Evaluation': [Describe in detail the steps taken in comparing the answers, pointing out which parts are correct, which parts have deviations or errors, and specifying the problems], 'Evaluation_Reasons': [Summarize the basis for your evaluation, explaining why the AI assistant's answer is consistent with the correct answer or has problems], 'Judgment': [Final judgment result: Correct or Incorrect]}"
        return closed_question_prompt_en.format(
            question=str(question),
            model_answer=model_answer,
            ref_answer=ground_truth,
            format_closed_question_score=format_closed_question_score,
        )

    if data_source == "open_report":
        format_open_report_score = "{'Scores_of_Each_Dimension': [Output the score of each dimension in standard Markdown format, and list in detail the specific reasons for deduction and the score of each dimension], 'Overall_Score': [Give the score of the report's format standardization, ranging from 0 to 8 points, and the score is given in [], for example [5.8]]}"
        # This branch used to format open_report_prompt_en first and then
        # immediately overwrite the result with the open_report_score_en
        # version. The discarded formatting was dead work and has been
        # dropped; the prompt that is actually sent is unchanged.
        # NOTE(review): open_report_score_en asks for per-criterion scores
        # out of 10, while the output format above requests an Overall_Score
        # on a 0-8 scale (the scale open_report_prompt_en's rubric uses).
        # Confirm which template is intended.
        return open_report_score_en.format(
            question=str(question),
            report=model_answer,
            format_open_report_score=format_open_report_score,
        )

    return question

def extract_scoring_data(question, scoring_result, data_source):
    """Turn a raw judge response into a ``{"scoring_content", "score"}`` dict.

    Args:
        question: Unused; kept so existing callers keep working.
        scoring_result: Raw text returned by the judge model. ``None`` is
            treated as an empty response; other non-string values are
            converted with ``str()``.
        data_source: ``"closed_question"`` or ``"open_report"``. Any other
            value yields an empty ``scoring_content`` and a score of ``1.0``.

    Returns:
        A dict with two keys:
        ``"scoring_content"`` - the explanation returned by the extractor,
        ``"score"`` - ``1.0``/``0.0`` for closed questions, or the value
        returned by ``extract_open_score_data`` for open reports.
    """
    # Coerce up front instead of relying on a bare ``except`` to hide a
    # non-string response, which would only crash later in the extractors.
    text = "" if scoring_result is None else str(scoring_result)

    # Drop everything up to and including the first '</think>' marker.
    # When the marker is absent the text is passed through untouched.
    _, marker, after_marker = text.partition('</think>')
    if marker:
        text = after_marker.strip()

    score_results = {}
    if data_source == "closed_question":
        judgment, step_evaluation = extract_closed_score_data(text)
        score_results["scoring_content"] = step_evaluation
        score_results['score'] = 1.0 if judgment == "Correct" else 0.0
    elif data_source == "open_report":
        score, step_evaluation = extract_open_score_data(text)
        score_results["scoring_content"] = step_evaluation
        score_results['score'] = score
    else:
        score_results["scoring_content"] = ""
        score_results['score'] = 1.0

    return score_results

def extract_closed_score_data(text):
    """Extract the Correct/Incorrect verdict from a closed-question judge reply.

    Parsing is attempted in three stages:

    1. Strict JSON with a ``"Judgment"`` key. A verdict that differs from
       ``"Correct"``/``"Incorrect"`` only by case, surrounding whitespace,
       square brackets or a one-element list wrapper is normalised; any
       other value is returned as-is.
    2. A ``Judgment`` field located by regex, which covers the single-quoted
       pseudo-JSON that ``json.loads`` rejects. The last such field wins.
    3. A case-insensitive keyword scan, where "incorrect" takes precedence
       because it contains "correct".

    Args:
        text: Judge response text.

    Returns:
        ``(judgment, explanation)``. ``judgment`` is ``"Correct"`` or
        ``"Incorrect"`` unless the JSON carried an unrecognised value.
        ``explanation`` is the JSON ``Step_Evaluation`` (``""`` if absent),
        a short ``"Detected '...'"`` note for the fallbacks, or ``""`` when
        nothing was found (in which case the verdict is ``"Incorrect"``).
    """
    if not isinstance(text, str):
        return "Incorrect", ""

    canonical = {"correct": "Correct", "incorrect": "Incorrect"}

    try:
        data = json.loads(text.strip())
    except (json.JSONDecodeError, ValueError, TypeError):
        data = None

    if isinstance(data, dict) and "Judgment" in data:
        judgment = data["Judgment"]
        if isinstance(judgment, list) and len(judgment) == 1:
            judgment = judgment[0]
        if isinstance(judgment, str):
            key = judgment.strip().strip("[]").strip().lower()
            judgment = canonical.get(key, judgment)
        return judgment, data.get("Step_Evaluation", "")

    # Only punctuation/whitespace may sit between the field name and the
    # verdict, so an echoed template such as "judgment result: Correct or
    # Incorrect" is not mistaken for the verdict.
    field_hits = re.findall(
        r'judgment\W*(incorrect|correct)\b', text, flags=re.IGNORECASE
    )
    if field_hits:
        verdict = canonical[field_hits[-1].lower()]
        return verdict, f"Detected '{verdict}'"

    # The keywords must be lowercase here: the previous capitalised literals
    # could never match the lowercased text.
    lower_text = text.lower()
    if "incorrect" in lower_text:
        return "Incorrect", "Detected 'Incorrect'"
    if "correct" in lower_text:
        return "Correct", "Detected 'Correct'"

    return "Incorrect", ""

# def extract_open_score_data(text):
#     try:
#         data = json.loads(text.strip())
#         if isinstance(data, dict):
#             Overall_Comprehensiveness = data.get("Overall_Comprehensiveness", "")
#             Thoroughness_of_Discussion = data.get("Thoroughness_of_Discussion", "")
#             Factuality = data.get("Factuality", "")
#             Coherence = data.get("Coherence", "")

#             return (float(Overall_Comprehensiveness) + float(Thoroughness_of_Discussion) + float(Factuality) + float(Coherence)) / 4, text

#     except (json.JSONDecodeError, ValueError, TypeError):
#         pass
    
#     return 4.0, text

def extract_open_score_data(text, *, max_score=8.0, default_score=0.3):
    """Extract a normalised overall score from an open-report judge reply.

    Parsing order:

    1. Strict JSON with an ``"Overall_Score"`` key. The value may be a
       number, a one-element list, or a string holding a bracketed
       (``"[5.8]"``) or bare (``"5.8"``) number.
    2. An ``Overall_Score: ...`` field located by regex in the raw text,
       which covers single-quoted pseudo-JSON.
    3. The first bracketed number anywhere in the text.

    Integer scores such as ``[6]`` are accepted as well as decimals.

    Args:
        text: Judge response text.
        max_score: Upper bound of the judge's scale; must be positive.
            Parsed scores are clamped to ``[0, max_score]`` and divided by
            it, so the result always lies in ``[0, 1]``.
        default_score: Value returned when no score can be parsed.

    Returns:
        ``(score, score_content)``. ``score_content`` is the JSON
        ``Scores_of_Each_Dimension`` value when the JSON path supplied the
        ``Overall_Score``, otherwise ``""``.
    """
    number = r'(\d+(?:\.\d+)?)'
    bracketed = re.compile(r'\[\s*' + number + r'\s*\]')
    keyed = re.compile(
        r'''Overall_Score["'\s]*[:=]\s*["'\s]*\[?\s*''' + number,
        re.IGNORECASE,
    )

    def normalize(value):
        # Clamp before dividing so an out-of-range judge score cannot
        # produce a result outside [0, 1].
        return min(max(value, 0.0), max_score) / max_score

    if not isinstance(text, str):
        return default_score, ""
    stripped = text.strip()

    try:
        data = json.loads(stripped)
    except (json.JSONDecodeError, ValueError, TypeError):
        data = None

    if isinstance(data, dict) and "Overall_Score" in data:
        overall_score = data["Overall_Score"]
        # .get() avoids a KeyError when the per-dimension details are missing.
        score_content = data.get("Scores_of_Each_Dimension", "")
        if isinstance(overall_score, list) and overall_score:
            overall_score = overall_score[0]
        if (
            isinstance(overall_score, (int, float))
            and not isinstance(overall_score, bool)
            and overall_score == overall_score  # rejects NaN
        ):
            return normalize(overall_score), score_content
        if isinstance(overall_score, str):
            match = bracketed.search(overall_score) or re.search(number, overall_score)
            if match:
                return normalize(float(match.group(1))), score_content
            return default_score, score_content

    # Prefer a number attached to the Overall_Score field so a bracketed
    # per-dimension score earlier in the text is not picked up by mistake.
    match = keyed.search(stripped) or bracketed.search(stripped)
    if match:
        return normalize(float(match.group(1))), ""
    return default_score, ""