import numpy as np
from utils.myutils import AIClient, errorMessage
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Output contract appended to every judge prompt: a single overall score in
# "[score/5]" form inside an <ANSWER> block, followed by a short explanation.
_ANSWER_FORMAT_INSTRUCTIONS = (
    "Evaluate the series of questions as a whole. "
    "Provide a single numeric score [1-5] wrapped in square brackets, and a brief explanation.\n\n"
    "Format:\n"
    "<ANSWER>\n"
    "[score/5] # Explanation for the Score\n"
    "</ANSWER>"
)

# System prompt for the "clarity" dimension: definition, 1-5 rubric, output format.
_CLARITY_SYSTEM_PROMPT = (
    "You are evaluating the **CLARITY** of medical questions in a doctor-patient interaction.\n\n"
    "**Clarity** means:\n"
    "- **Simple and concise**: Using simple, to-the-point wording that is easy for a patient without a medical background to understand. Avoid medical jargon.\n"
    "- **Clear intent**: The patient could easily know exactly what is being asked, with no ambiguity.\n"
    "- **Natural language**: Natural, conversational language over overly formal or academic phrasing.\n\n"
    "**Strict Scoring Guidance**:\n"
    "- [5/5]: The questions in the session are **exceptionally clear**, concise, and natural. The patient can understand them effortlessly, with zero ambiguity.\n"
    "- [4/5]: The questions are **very clear** and mostly unambiguous, but may have minor flaws, such as slight complexity or slightly formal wording. The patient can still understand them easily.\n"
    "- [3/5]: The questions are **moderately clear**, but have noticeable deficiencies, such as unnecessary medical terms, overly long sentences, or some vague phrasing. The patient needs to think a bit to understand.\n"
    "- [2/5]: The questions **lack clarity** and are difficult to understand. They may contain long, complex sentences or a lot of medical jargon, and the patient might need to ask for clarification.\n"
    "- [1/5]: The questions are **extremely confusing** and nearly impossible to understand. They may combine multiple concepts, have convoluted structures, or use highly ambiguous phrasing. The patient would be left feeling bewildered.\n\n"
    + _ANSWER_FORMAT_INSTRUCTIONS
)

# System prompt for the "empathy" dimension: definition, 1-5 rubric, output format.
# NOTE: unlike the clarity prompt, only a single newline separates the
# definition bullets from the "Strict Scoring Guidance" heading here.
_EMPATHY_SYSTEM_PROMPT = (
    "You are evaluating the **EMPATHY** of medical questions in a doctor-patient interaction.\n\n"
    "**Empathy** means:\n"
    "- **Respect and sensitivity**: The tone is respectful, emotionally sensitive, and considerate of the patient's feelings.\n"
    "- **Anxiety reduction**: The phrasing is gentle and reassuring, helping to reduce the patient's anxiety or discomfort.\n"
    "- **Human-centered care**: The questions show genuine concern for the patient's emotional and physical state, beyond just asking about symptoms.\n"
    "**Strict Scoring Guidance**:\n"
    "- [5/5]: The questions in the session are **exceptionally empathetic**, warm, and reassuring. They demonstrate the **highest level of human-centered care** and effectively calm the patient's emotions.\n"
    "- [4/5]: The questions are **very considerate** and largely attuned to the patient's feelings, but may have minor formal or non-conversational wording issues. The overall empathetic effect is still strong.\n"
    "- [3/5]: The questions show **some empathy**, but are somewhat bland or mechanical. For example, a question might be clear but the tone is neutral, lacking emotional support. Or, the empathetic phrasing feels slightly forced or verbose.\n"
    "- [2/5]: The questions **lack empathy**. The tone is overly formal, cold, or blunt, focusing only on gathering information without any consideration for the patient's feelings. This could make the patient feel uncomfortable.\n"
    "- [1/5]: The questions are **completely lacking in empathy** and may even be offensive or harmful. The tone is harsh, impatient, or insensitive, which could significantly increase the patient's anxiety or distress.\n\n"
    + _ANSWER_FORMAT_INSTRUCTIONS
)

# Maps each evaluation dimension name to the system prompt given to the judge model.
EVAL_SPECS = {
    "clarity": {"system_prompt": _CLARITY_SYSTEM_PROMPT},
    "empathy": {"system_prompt": _EMPATHY_SYSTEM_PROMPT},
}


class ExperienceEvaluator:
    """Score the clarity and empathy of a session's questions with a judge model.

    Each log entry's questions are sent to the judge once per dimension in
    ``EVAL_SPECS``; the 1-5 score in the reply is normalised by dividing by 5.
    """

    def __init__(self, config, save_folder):
        """Create the judge client.

        Args:
            config: Mapping holding ``"ux_judge_model"`` or, when that key is
                absent, ``"judge_model"``.
            save_folder: Folder path; errors are logged to
                ``save_folder + "/ux_errors.log"``.

        Raises:
            KeyError: If neither model key is present in ``config``.
        """
        self.config = config
        self.model = (
            config["ux_judge_model"]
            if 'ux_judge_model' in config
            else config["judge_model"]
        )
        self.client = AIClient(model=self.model)
        self.save_folder = save_folder

    def evaluate(self, log_path):
        """Evaluate every entry of a JSON-lines log file.

        Args:
            log_path: Path to a UTF-8 file with one JSON object per non-blank
                line. Each object may carry ``"interactions"`` and ``"case_id"``.

        Returns:
            A dict with ``"avg_scores"`` (mean normalised clarity/empathy,
            rounded to 3 decimals, ``0.0`` when nothing was scored) and
            ``"details"`` (one result dict per successfully processed entry,
            in the same order as the entries appear in the file).
        """
        with open(log_path, "r", encoding="utf-8") as f:
            entries = [json.loads(line) for line in f if line.strip()]

        def process_entry(entry, idx):
            interactions = entry.get("interactions", [])
            clarity = self._eval_dimension(interactions, "clarity")
            empathy = self._eval_dimension(interactions, "empathy")
            # Key names ("clarity_details" vs "empathy_explanation") are kept
            # as-is so existing consumers of the output keep working.
            return {
                "case_id": entry.get("case_id", f"row_{idx}"),
                "clarity_score": self._normalized_score(clarity),
                "clarity_details": self._explanation(clarity),
                "empathy_score": self._normalized_score(empathy),
                "empathy_explanation": self._explanation(empathy),
            }

        # Slot results by entry index: as_completed() yields futures in
        # completion order, which would otherwise shuffle the output.
        results = [None] * len(entries)
        with ThreadPoolExecutor(max_workers=8) as executor:
            future_to_idx = {
                executor.submit(process_entry, entry, i): i
                for i, entry in enumerate(entries)
            }
            for fut in tqdm(as_completed(future_to_idx), total=len(future_to_idx), desc="Experience Evaluation"):
                try:
                    results[future_to_idx[fut]] = fut.result()
                except Exception as e:
                    # Best effort: a failed entry is reported and left out of
                    # both the details and the averages.
                    print(f"⚠️ Entry processing failed: {e}")

        details = [res for res in results if res is not None]
        clarity_list = [res["clarity_score"] for res in details]
        empathy_list = [res["empathy_score"] for res in details]

        return {
            "avg_scores": {
                "clarity": round(float(np.mean(clarity_list)), 3) if clarity_list else 0.0,
                "empathy": round(float(np.mean(empathy_list)), 3) if empathy_list else 0.0
            },
            "details": details
        }

    @staticmethod
    def _normalized_score(result):
        """Return ``result["score"] / 5``, or ``0.0`` for a falsy result or missing score."""
        if not result:
            return 0.0
        return result.get("score", 0) / 5

    @staticmethod
    def _explanation(result):
        """Return ``result["explanation"]``, or ``""`` for a falsy result or missing key."""
        if not result:
            return ""
        return result.get("explanation", "")

    @staticmethod
    def _build_dialogue_prompt(interactions):
        """Build the user message listing the session's questions.

        Only the first element of each interaction pair is included, so the
        answers cannot influence the judge.
        """
        # Joined outside the f-string: a backslash inside an f-string
        # replacement field is a SyntaxError before Python 3.12.
        questions = "".join(f"Q: {q}\n" for q, _ in interactions)
        return (
            f"Here are {len(interactions)} continuous questions from a doctor-patient inquiry session:\n\n"
            f"{questions}\n"
            "\nPlease provide ONE score for the whole session."
        )

    @staticmethod
    def _parse_score(content):
        """Extract ``{"score", "explanation"}`` from a judge reply.

        The ``<ANSWER>...</ANSWER>`` block is searched when present (tags are
        case-insensitive), otherwise the whole reply. Returns ``None`` when no
        ``[d/5]`` token with ``d`` in 0-5 is found.
        """
        import re

        block_match = re.search(r"<ANSWER>(.*?)</ANSWER>", content, re.DOTALL | re.IGNORECASE)
        answer_block = block_match.group(1) if block_match else content

        # [0-5] rather than \d: a reply such as "[9/5]" must not be accepted,
        # since it would normalise to a score above 1.0.
        score_match = re.search(r"\[([0-5])/5\](?:\s*#\s*(.*))?", answer_block, re.DOTALL)
        if score_match is None:
            return None

        explanation = score_match.group(2)
        return {
            "score": int(score_match.group(1)),
            "explanation": explanation.strip() if explanation else "",
        }

    def _eval_dimension(self, interactions: list[tuple[str, str]], dim: str, max_retries=3):
        """Ask the judge model for one score on dimension ``dim``.

        Args:
            interactions: Sequence of two-element (question, answer) pairs.
            dim: A key of ``EVAL_SPECS``.
            max_retries: Number of attempts made when the request or the
                parsing raises an exception.

        Returns:
            ``{"score": int, "explanation": str}``. The score is ``0`` with an
            empty explanation when the reply holds no parsable score or when
            every attempt raised.

        Raises:
            ValueError: If ``dim`` is not a key of ``EVAL_SPECS``.
        """
        spec = EVAL_SPECS.get(dim)
        if not spec:
            raise ValueError(f"Unknown dimension: {dim}")

        messages = [
            {"role": "system", "content": spec["system_prompt"]},
            {"role": "user", "content": self._build_dialogue_prompt(interactions)}
        ]
        error_log = self.save_folder + "/ux_errors.log"

        for attempt in range(max_retries):
            try:
                response = self.client.get_response(messages)
                content = response["content"] if isinstance(response, dict) else response
                parsed = self._parse_score(content)
            except Exception as e:
                errorMessage(f"{dim.capitalize()} scoring failed (attempt {attempt+1}): {e}", error_log)
                continue

            if parsed is not None:
                return parsed

            # A reply without a score is not retried; it is logged and scored 0.
            errorMessage(f"⚠️ [{dim}] Missing score, got: {content}", error_log)
            return {"score": 0, "explanation": ""}

        errorMessage(f"[ERROR] {dim} evaluation failed after {max_retries} attempts.", error_log)
        # Same shape as every other return path, so callers never have to
        # special-case a bare integer.
        return {"score": 0, "explanation": ""}