# receval_modification.py

import logging
from tqdm import tqdm
import os
import json
import re

import utils.safe_statistic
from llm_clients.DSV3Client import DSV3Client

# Configure logging: the root logger is set to ERROR, so with this
# configuration the INFO-level progress messages emitted in this module are
# not shown (basicConfig is a no-op if logging was already configured).
logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger(__name__)

# Configure input and output paths.
# summary_dir: directory scanned for the *.json summary files to score.
# result_dir: directory that receives one metrics JSON file per summary.
# Both are relative paths, so they resolve against the current working
# directory of the process.
summary_dir = "../data/temp/summary_data"
result_dir = "../data/temp/receval_result"

# Shared LLM client used by every evaluate_* function below; it is
# instantiated at import time.
llm_client = DSV3Client()


def evaluate_intra_correctness(prev, cur, next):
    """Ask the LLM to score the coherence of a single interaction round.

    The three arguments are interpolated into the prompt as the
    User-Instruction, the LLM-Answer and the User-Feedback respectively.

    Args:
        prev: Summary text of the user instruction that opens the round.
        cur: Summary text of the LLM answer to that instruction.
        next: Summary text of the user feedback that follows the answer
            (the caller in this module passes "finish" when there is none).
            The name shadows the builtin but is kept because it is part of
            the function's interface.

    Returns:
        The integer score parsed from the LLM reply by ``get_score``, or
        ``-1`` if the request or the parsing raised an exception.
    """
    # NOTE(review): the prompt says "Think about the rubric below" although
    # the criteria are listed above the scoring instructions, and the task
    # description asks about grammatical soundness while the Rules section
    # says not to consider it. The text is left untouched here because any
    # wording change would shift the scores the model produces.
    prompt = f"""Role positioning: You are a professional dialogue quality evaluator specializing in assessing the internal coherence and correctness of conversation summaries.

Task description: Given three single-sentence summary in interaction among **User-Instruction**, **LLM-Answer**, and **User-Feedback**. Now your task is to evaluate its **intra-correctness** — that is, whether the LLM-Answer logically follows from the User-Instruction. And whether this interaction is internally self-consistent, grammatically sound, and semantically coherent.

Evaluation Criteria:
1. **Topic Continuity (45%)**
   - Does the current sentence continue the core topic of the previous one?

2. **Logical Progression (20%)**
   - Is the current sentence a reasonable logical or conversational response to the previous one?
   - For example: a question is answered, a statement is elaborated, an action follows an intention, etc.

3. **Semantic Linkage (10%)**
   - Are there shared or related concepts/keywords?
   - Are references consistent via pronouns, synonyms, or hierarchical terms?

4. **Dialogue Act Alignment (10%)**
   - Do the speech acts (e.g., asking, suggesting, confirming, reacting) align in a coherent way?
   - Does the behavior fit typical dialogue flow?

5. **Temporal/Causal Fit (15%)**
   - Does the current sentence plausibly follow the previous one in time or causality?
   - Does it violate any commonsense expectations?


Rules:
- Only assess **coherence between User-Instruction and LLM-Answer** (not quality or informativeness).
- Do not consider whether the sentences are factually correct or grammatically sound.
- Output only the final **int score** (e.g., `49`). No explanations or comments.

Context:
User-Instruction: "{prev}"
LLM-Answer: "{cur}"
User-Feedback: "{next}"

Scoring Instructions:
- Score the sentence from **0 to 100**, allowing any value in that range (e.g., 23, 62, 94).
- Think about the rubric below before deciding your score.


Important:
- Only output the final **int score**.
- Do not provide explanation, analysis, or extra text.
- The output must be a valid Python int (e.g., 67)

Now provide the score below:"""
    try:
        response = llm_client.chat(prompt)
        return get_score(response)
    except Exception as e:
        # Best-effort boundary: a failed request must not abort the whole
        # evaluation run, so it is recorded (with traceback and the name of
        # the evaluator that failed) and reported as the -1 sentinel.
        logger.exception("Error in evaluate_intra_correctness: %s", e)
        return -1


def evaluate_inter_correctness(prev, cur):
    """Ask the LLM to score how well one round follows from earlier ones.

    Args:
        prev: Iterable of summary strings for the preceding round(s); the
            items are joined with newlines and shown to the model as the
            Previous-sentence.
        cur: Summary text shown to the model as the Current-sentence.

    Returns:
        The integer score parsed from the LLM reply by ``get_score``, or
        ``-1`` if the request or the parsing raised an exception.
    """
    prev_str = "\n".join(prev)
    # NOTE(review): the prompt says "Think about the rubric below" although
    # the criteria are listed above the scoring instructions. The text is
    # left untouched because changing it would shift the produced scores.
    prompt = f"""Role positioning: You are a professional evaluator of dialogue reasoning chains, with expertise in assessing the logical and semantic coherence between consecutive conversation turns.

Task description: Given two summaries of consecutive interaction rounds in a dialogue — **Current-sentence** and **Previous-sentence** — your task is to assess their **inter-correctness**, i.e., how well the current sentence logically and semantically follows from the previous one.

Weighted Evaluation Criteria:
- **Topic Continuity (45%)**: Does the current sentence maintain or meaningfully extend the topic from the previous one?
- **Intent Consistency (30%)**: Is the communicative intent (e.g., question-answer, elaboration, rebuttal) logically compatible across the two turns?
- **Reasoning Validity (15%)**: If implicit logic or inference is used, is it reasonable and well-grounded?
- **Other Pragmatic Coherence (10%)**: Includes reference clarity, tone matching, memory consistency, and turn appropriateness.

You should penalize:
- Abrupt or unjustified topic changes
- Logical contradictions or inferential gaps
- Inappropriate intent shifts (e.g., ignoring a question)
- Jarring tone or reference mismatch

Context:
**Current-sentence**: {cur}
**Previous-sentence**: {prev_str}

Scoring Instructions:
- Score the sentence from **0 to 100**, allowing any value in that range (e.g., 23, 62, 94).
- Think about the rubric below before deciding your score.


Important:
- Only output the final **int score**.
- Do not output explanation, comments, or anything besides the number.
- The output must be a valid Python int.

Now provide the score below:"""
    try:
        response = llm_client.chat(prompt)
        return get_score(response)
    except Exception as e:
        # Best-effort boundary: a failed request must not abort the whole
        # evaluation run, so it is recorded (with traceback and the name of
        # the evaluator that failed) and reported as the -1 sentinel.
        logger.exception("Error in evaluate_inter_correctness: %s", e)
        return -1


def evaluate_informativeness(messages, question):
    """Ask the LLM to score how much new information the latest message adds.

    Args:
        messages: Sequence of summary strings. The last item is scored as
            the Current-sentence; all earlier items are joined with newlines
            and shown to the model as the Previous-context.
        question: Text shown to the model as the Question the dialogue is
            trying to solve.

    Returns:
        The integer score parsed from the LLM reply by ``get_score``, or
        ``-1`` if ``messages`` is empty or the request/parsing raised an
        exception.
    """
    if not messages:
        # There is no current sentence to score; use the same -1 sentinel as
        # the other failure paths instead of raising IndexError below.
        logger.error("Error in evaluate_informativeness: empty messages")
        return -1

    history = messages[:-1]
    cur = messages[-1]
    history_str = "\n".join(history)
    # NOTE(review): the prompt says "Think about the rubric below" although
    # the dimensions are listed above the scoring instructions. The text is
    # left untouched because changing it would shift the produced scores.
    prompt = f"""Role positioning: You are an expert judge of dialogue informativeness, with the ability to determine whether a statement meaningfully contributes new and relevant information.

Task description: Given **Current-sentence**, and **Previous-context**, your task is to evaluate the **informativeness** of **Current-sentence** — i.e., how much new, relevant, and helpful information it adds to the dialogue to solve the **Question**.

Evaluation Dimensions (with weights):
1. **Novelty of Information (40%)** – Does the sentence introduce new facts, reasoning, explanations, or actions not already mentioned?
2. **Topical Relevance (20%)** – Is the new information clearly connected to the main topic or current sub-task?
3. **Utility to Dialogue Progression (20%)** – Does the sentence advance the conversation toward resolution, clarification, or decision-making?
4. **Information Density (10%)** – Is the sentence compact yet informative, avoiding vague or redundant language?
5. **Reasoning or Explanation Depth (10%)** – If reasoning is present, does it add meaningful insight or justification?


Context:
**Question**: {question}
**Current-sentence**: {cur}
**Previous-context**: {history_str}


Scoring Instructions:
- Score **Current-sentence** from **0 to 100**, allowing any value in that range (e.g., 23, 62, 94).
- Think about the rubric below before deciding your score.


Important:
- Only output the final **int score**.
- Do not output explanation, reasoning, or extra commentary.
- The output must be a valid Python int.

Now provide the score below:"""
    try:
        response = llm_client.chat(prompt)
        return get_score(response)
    except Exception as e:
        # Best-effort boundary: a failed request must not abort the whole
        # evaluation run, so it is recorded (with traceback and the name of
        # the evaluator that failed) and reported as the -1 sentinel.
        logger.exception("Error in evaluate_informativeness: %s", e)
        return -1


def get_score(response):
    """Extract an integer score from a raw LLM reply.

    The reply is stripped, an optional leading ``score:`` prefix is removed,
    and the remainder is parsed as an int. If that fails, the first number
    found anywhere in the text is used instead; a decimal such as ``85.5``
    is truncated to its integer part.

    Args:
        response: Reply text returned by the LLM client.

    Returns:
        The parsed integer score, or ``-1`` when ``response`` is ``None`` or
        contains no number.
    """
    if response is None:
        logger.error("Error in transform response to score: response is None")
        return -1

    text = response.strip()
    if text.startswith("score:"):
        text = text.split(":", 1)[1].strip()

    try:
        return int(text)
    except ValueError as e:
        match = re.search(r"\d+(?:\.\d+)?", text)
        if match:
            # The pattern may capture a decimal ("85.5"), which int() rejects
            # as a string; keep only the digits before the decimal point.
            return int(match.group().partition(".")[0])
        logger.error(f"Error in transform response to score: {e}")
        return -1


def receval_summary(summary_path):
    """Compute the ReCEval metrics for one summary file.

    The file is expected to hold a JSON list of message dicts with ``role``
    and ``content`` keys, where even positions carry role ``"env"`` and odd
    positions carry role ``"assistant"``. A pair that breaks this layout is
    logged and still scored.

    For every odd index the (env, assistant, following message) triple is
    scored for intra-correctness, and the assistant message is scored for
    informativeness against the earlier odd-indexed messages, using the
    content of message 0 as the question. For every even index above 0 the
    message is scored for inter-correctness against the previous
    even-indexed message.

    Args:
        summary_path: Path of the summary JSON file to evaluate.

    Returns:
        A dict with the three per-step score lists (``intra_correctness``,
        ``inter_correctness``, ``informativeness``) plus their ``min_*`` and
        ``avg_*`` aggregates computed by ``utils.safe_statistic``.
    """
    with open(summary_path, 'r', encoding='utf-8') as f:
        messages = json.load(f)

    messages_content = [m['content'] for m in messages]
    total = len(messages)

    intra_correctness_list = []
    inter_correctness_list = []
    informativeness_list = []

    # Message 0 has nothing before it to be scored against, so start at 1.
    for idx in range(1, total):
        if idx % 2 == 1:
            # Intra-turns evaluation
            if not (messages[idx - 1]['role'] == "env" and messages[idx]['role'] == "assistant"):
                # logging.ERROR is the integer level constant and is not
                # callable; report the mismatch through the module logger.
                logger.error("Need Checking，Unmatch env-assistant in Intra-turns Evalution")

            # The last assistant message has no following message; the
            # placeholder "finish" is sent to the model instead.
            feedback = messages_content[idx + 1] if idx + 1 < total else "finish"
            intra_correctness_list.append(
                evaluate_intra_correctness(messages_content[idx - 1], messages_content[idx], feedback)
            )

            # Info-gain evaluation: odd-indexed messages up to and including
            # this one, with message 0 as the question.
            informativeness_list.append(
                evaluate_informativeness(messages_content[1:idx + 1:2], messages_content[0])
            )
        else:
            # Inter-turns evaluation: compare with the previous even-indexed
            # message.
            inter_correctness_list.append(
                evaluate_inter_correctness([messages_content[idx - 2]], messages_content[idx])
            )

    # NOTE(review): failed evaluations appear in the lists as -1; presumably
    # utils.safe_statistic accounts for that and for empty lists — confirm.
    receval_metrics = {
        'intra_correctness': intra_correctness_list,
        'inter_correctness': inter_correctness_list,
        'informativeness': informativeness_list,
        'min_intra_correctness': utils.safe_statistic.safe_min(intra_correctness_list),
        'min_inter_correctness': utils.safe_statistic.safe_min(inter_correctness_list),
        'min_informativeness': utils.safe_statistic.safe_min(informativeness_list),
        'avg_intra_correctness': utils.safe_statistic.mean(intra_correctness_list),
        'avg_inter_correctness': utils.safe_statistic.mean(inter_correctness_list),
        'avg_informativeness': utils.safe_statistic.mean(informativeness_list),
    }
    return receval_metrics


def receval_predict(summary_dir, result_dir):
    """Score every summary file in a directory and write the results.

    Each ``<instance_id>.json`` file in ``summary_dir`` is evaluated with
    ``receval_summary`` and its metrics are written to
    ``<result_dir>/<instance_id>.json``. Files without a ``.json`` suffix
    are skipped.

    Args:
        summary_dir: Directory containing the summary JSON files.
        result_dir: Directory for the metric files; it is created if it does
            not exist yet.
    """
    # Without this, the first open(..., 'w') below fails with
    # FileNotFoundError when the output directory has not been created.
    os.makedirs(result_dir, exist_ok=True)

    for fname in tqdm(os.listdir(summary_dir)):
        if not fname.endswith('.json'):
            continue
        instance_id = fname[:-5]  # drop the ".json" suffix
        summary_path = os.path.join(summary_dir, fname)
        output_path = os.path.join(result_dir, f"{instance_id}.json")

        scores = receval_summary(summary_path)

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(scores, f, indent=2, ensure_ascii=False)

        logger.info(f"Completed ReCEval: {instance_id}")


def main(summary_dir=summary_dir, result_dir=result_dir):
    """Run the ReCEval evaluation over a directory of summaries.

    Args:
        summary_dir: Directory holding the summary JSON files; defaults to
            the module-level ``summary_dir``.
        result_dir: Directory that receives the metric files; defaults to
            the module-level ``result_dir``.
    """
    # The defaults above are bound when this function is defined, so
    # rebinding the module-level paths later does not change them.
    receval_predict(summary_dir=summary_dir, result_dir=result_dir)


# Script entry point: evaluate with the default directories.
if __name__ == '__main__':
    main()
