
import os
import json
import re
import time
from openai import OpenAI
from tqdm import tqdm
import logging
from typing import Optional, Dict, Any
# --- Configuration ---

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# API key configuration.
# The key is read from the environment so that no secret lives in source
# control. NOTE: any key that was ever hard-coded in this file should be
# treated as leaked and rotated.
API_KEY = os.environ.get("OPENAI_API_KEY")
if not API_KEY:
    raise ValueError("Error: Please set the OPENAI_API_KEY environment variable.")

# OpenAI-compatible client, pointed at the DeepSeek endpoint
client = OpenAI(api_key=API_KEY, base_url="https://api.deepseek.com")

# Model used as the judge
JUDGE_MODEL = "deepseek-reasoner"

# File path configuration
INPUT_DIR = "Qwen3-1.7B_results"
OUTPUT_FILE = "Qwen3-1.7B_results/llm_judge_results.json"

# Methods under comparison and their result files. The keys also fix the
# A, B, C, D, E labels under which the proposals are shown to the judge.
METHOD_MAPPING = {
    "A": {"name": "cgmcts", "file": "results_cgmcts.json"},
    "B": {"name": "cot", "file": "results_baseline_cot.json"},
    "C": {"name": "react", "file": "results_baseline_react.json"},
    "D": {"name": "tot", "file": "results_baseline_tot.json"},
    "E": {"name": "simple", "file": "results_baseline_simple_top_p.json"}
}

# --- Prompt templates (English version) ---
# System prompt for the judge model. This is a str.format() template:
# {method_A} ... {method_E} are placeholders filled in by main(), and every
# doubled brace ({{ / }}) renders as a literal brace in the JSON example.
SYSTEM_PROMPT = """
You are a distinguished professor and the chair of a top-tier academic conference, known for your rigorous, fair, and insightful reviews.

Your task is to evaluate five scientific ideas generated by different AI models for a core research theme. You will score each proposal on a scale of 1-10 for the three dimensions below, providing a concise reason for each score. Finally, you must select a **single** best proposal overall.

**Core Evaluation Dimensions:**

1.  **Plausibility**: Is the idea scientifically plausible? (1=Nonsense, 10=Highly Plausible)
2.  **Structure & Clarity**: Is the structure complete and the logic coherent? (1=Chaotic, 10=Crystal Clear)
3.  **Innovation Potential**: Does the idea present novel viewpoints, methods, or research paths? (1=Obsolete, 10=Highly Innovative)

**Important Guidelines:**
*   These are preliminary ideas, not full research proposals. The absence of sections like introduction or methods is acceptable. Focus on the core value of the idea itself.
*   Your scoring should be strict and discerning to reflect the quality differences between the proposals.
*   It is acceptable and even encouraged for proposals to reasonably extend or innovate upon the initial theme. This should be considered a merit, not a deviation from the topic.

**Output Requirement:**

Please return your review strictly in the following JSON format, without any additional explanations or comments.

```json
{{
  "evaluations": {{
    "A": {{
      "method": "{method_A}",
      "plausibility": <score_1_to_10>,
      "structure_clarity": <score_1_to_10>,
      "innovation_potential": <score_1_to_10>,
      "reason": "<A concise justification for the scores>"
    }},
    "B": {{
      "method": "{method_B}",
      "plausibility": <score_1_to_10>,
      "structure_clarity": <score_1_to_10>,
      "innovation_potential": <score_1_to_10>,
      "reason": "<A concise justification for the scores>"
    }},
    "C": {{
      "method": "{method_C}",
      "plausibility": <score_1_to_10>,
      "structure_clarity": <score_1_to_10>,
      "innovation_potential": <score_1_to_10>,
      "reason": "<A concise justification for the scores>"
    }},
    "D": {{
      "method": "{method_D}",
      "plausibility": <score_1_to_10>,
      "structure_clarity": <score_1_to_10>,
      "innovation_potential": <score_1_to_10>,
      "reason": "<A concise justification for the scores>"
    }},
    "E": {{
      "method": "{method_E}",
      "plausibility": <score_1_to_10>,
      "structure_clarity": <score_1_to_10>,
      "innovation_potential": <score_1_to_10>,
      "reason": "<A concise justification for the scores>"
    }}
  }},
  "final_decision": {{
    "best_proposal": "<The single best proposal: A, B, C, D, or E>",
    "justification": "<A comprehensive explanation of why this proposal is the best overall>"
  }}
}}
```
"""

# User prompt: carries the concrete research theme and the five proposals.
# This is a str.format() template; main() supplies {theme}, {elaboration},
# and a {method_X} / {output_X} pair for each of the labels A through E.
USER_PROMPT_TEMPLATE = """
**Content to Evaluate:**

**Core Theme:** {theme}

**Elaboration:** {elaboration}

---

**Proposal A ({method_A}):**
{output_A}

**Proposal B ({method_B}):**
{output_B}

**Proposal C ({method_C}):**
{output_C}

**Proposal D ({method_D}):**
{output_D}

**Proposal E ({method_E}):**
{output_E}
"""

# --- Core functions ---

def clean_output(text: str) -> str:
    """Strip every ``<think>...</think>`` span from *text*.

    Each complete span (tags plus enclosed content, which may run across
    several lines) is deleted, and the remainder is returned with leading
    and trailing whitespace trimmed. Any value that is not a string yields
    an empty string.
    """
    if isinstance(text, str):
        # DOTALL lets '.' cross newlines; the lazy quantifier keeps separate
        # spans from being merged into one match.
        without_reasoning = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
        return without_reasoning.strip()
    return ""

def load_and_group_data(
    input_dir: Optional[str] = None,
    method_mapping: Optional[Dict[str, Dict[str, str]]] = None,
) -> Dict[str, Any]:
    """Load every method's result file and group the outputs by theme.

    Args:
        input_dir: Directory holding the result files. Defaults to the
            module-level ``INPUT_DIR``.
        method_mapping: Mapping of proposal label to a dict with ``"name"``
            and ``"file"`` entries. Defaults to the module-level
            ``METHOD_MAPPING``.

    Returns:
        A dict keyed by theme. Each value has an ``"elaboration"`` entry
        (taken from the first record seen for that theme) and an
        ``"outputs"`` dict mapping proposal label to
        ``{"method": <name>, "output": <text>}``.

    Files that are missing, are not valid JSON, or do not contain a JSON
    list are logged and skipped. Records that are not objects, or whose
    ``"theme"`` is not a string, are logged and skipped as well: callers
    slice the theme for log messages, so a ``None`` key would crash later.
    """
    if input_dir is None:
        input_dir = INPUT_DIR
    if method_mapping is None:
        method_mapping = METHOD_MAPPING

    grouped_data: Dict[str, Any] = {}
    for key, info in method_mapping.items():
        file_path = os.path.join(input_dir, info["file"])

        # Keep the try body limited to the operations that can raise.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            logging.error(f"File not found: {file_path}")
            continue
        except json.JSONDecodeError:
            logging.error(f"JSON decode error in file: {file_path}")
            continue

        if not isinstance(data, list):
            logging.error(f"Expected a JSON list of records in file: {file_path}")
            continue

        for item in data:
            if not isinstance(item, dict):
                logging.warning(f"Skipping non-object record in file: {file_path}")
                continue

            theme = item.get("theme")
            if not isinstance(theme, str):
                logging.warning(f"Skipping record without a valid theme in file: {file_path}")
                continue

            # The first file that mentions a theme provides its elaboration.
            if theme not in grouped_data:
                grouped_data[theme] = {
                    "elaboration": item.get("elaboration"),
                    "outputs": {}
                }
            grouped_data[theme]["outputs"][key] = {
                "method": info["name"],
                "output": item.get("output", "")
            }
    return grouped_data

def _parse_judgment(content: Any) -> Dict[str, Any]:
    """Turn the raw model reply into a dict, raising ValueError if it is unusable."""
    if not isinstance(content, str) or not content.strip():
        raise ValueError("Model returned an empty response.")

    text = content.strip()
    # The system prompt shows its example inside a ```json fence, so tolerate
    # a reply that copies the fence around the JSON payload.
    fenced = re.fullmatch(r"```(?:json)?\s*(.*?)\s*```", text, flags=re.DOTALL)
    if fenced:
        text = fenced.group(1)

    parsed = json.loads(text)
    if not isinstance(parsed, dict):
        raise ValueError(f"Expected a JSON object, got {type(parsed).__name__}.")
    return parsed


def get_llm_judgment(
    system_prompt: str,
    user_prompt: str,
    max_retries=3,
    *,
    api_client: Optional[Any] = None,
    model: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    """Ask the judge model for a verdict and return it as a parsed dict.

    Args:
        system_prompt: Fully formatted system message.
        user_prompt: Fully formatted user message.
        max_retries: Maximum number of attempts before giving up.
        api_client: Object exposing ``chat.completions.create``. Defaults to
            the module-level ``client``.
        model: Model name to request. Defaults to the module-level
            ``JUDGE_MODEL``.

    Returns:
        The parsed JSON object, or ``None`` when every attempt failed
        (API error, empty reply, invalid JSON, or a non-object JSON value).

    Failed attempts are logged and followed by a linearly growing pause
    (5s, 10s, ...). No pause follows the final attempt, since nothing is
    left to wait for.
    """
    if api_client is None:
        api_client = client
    if model is None:
        model = JUDGE_MODEL

    for attempt in range(max_retries):
        try:
            response = api_client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0.1,
                response_format={"type": "json_object"},
            )
            return _parse_judgment(response.choices[0].message.content)
        except Exception as e:
            # Broad on purpose: this is the retry boundary for both transport
            # errors raised by the client and parse errors raised above.
            logging.error(f"API call or JSON parsing failed (Attempt {attempt + 1}/{max_retries}): {e}")
            if attempt + 1 < max_retries:
                time.sleep(5 * (attempt + 1))
    return None

def _load_existing_results(path: str):
    """Return ``(results, judged_themes)`` from a previous run, or empty ones."""
    if not os.path.exists(path):
        return [], set()
    try:
        with open(path, 'r', encoding='utf-8') as f:
            results = json.load(f)
        if not isinstance(results, list):
            raise TypeError("results file does not contain a JSON list")
        judged = {res['theme'] for res in results}
    except (json.JSONDecodeError, TypeError, KeyError):
        logging.warning(f"Could not parse {path}. Starting fresh.")
        return [], set()
    logging.info(f"Loaded {len(results)} existing results from {path}. Resuming...")
    return results, judged


def _save_results(path: str, results) -> None:
    """Write *results* to *path* without ever leaving a half-written file."""
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)
    # Swap the finished file in only after the dump succeeded, so an
    # interruption mid-write cannot corrupt the existing results.
    os.replace(tmp_path, path)


def _build_prompts(theme, data):
    """Fill both prompt templates for one theme; returns ``(system, user)``."""
    payload = {
        "theme": theme,
        "elaboration": data["elaboration"]
    }
    # Each output entry carries its method name, which feeds both the user
    # prompt and the JSON example inside the system prompt.
    for key, output_data in data["outputs"].items():
        payload[f"method_{key}"] = output_data["method"]
        payload[f"output_{key}"] = clean_output(output_data["output"])
    return SYSTEM_PROMPT.format(**payload), USER_PROMPT_TEMPLATE.format(**payload)


def main():
    """Judge every theme that has a full set of outputs, saving after each one.

    Results already present in ``OUTPUT_FILE`` are loaded first and their
    themes are skipped, so an interrupted run can be resumed. Themes that do
    not have one output per entry in ``METHOD_MAPPING`` are skipped. Themes
    whose judgment fails are not recorded and are therefore retried on the
    next run.
    """
    logging.info("Starting LLM-as-a-Judge process...")

    all_results, judged_themes = _load_existing_results(OUTPUT_FILE)

    logging.info("Loading and grouping data from source JSON files...")
    grouped_data = load_and_group_data()
    if not grouped_data:
        logging.error("No data loaded. Exiting.")
        return

    themes_to_judge = list(grouped_data.items())
    logging.info(f"Found {len(themes_to_judge)} unique themes to process.")

    for i, (theme, data) in enumerate(tqdm(themes_to_judge, desc="Judging Progress")):
        # str() keeps the log preview from crashing on a non-string theme.
        label = str(theme)[:50]

        if theme in judged_themes:
            logging.info(f"Theme '{label}...' already judged. Skipping.")
            continue

        logging.info(f"Judging theme {i+1}/{len(themes_to_judge)}: '{label}...'")

        # The prompt templates need one proposal per configured method.
        if len(data["outputs"]) != len(METHOD_MAPPING):
            logging.warning(f"Theme '{theme}' is missing outputs. Skipping.")
            continue

        final_system_prompt, final_user_prompt = _build_prompts(theme, data)
        judgment = get_llm_judgment(final_system_prompt, final_user_prompt)

        if judgment:
            result_entry = {"theme": theme, "elaboration": data["elaboration"], "judgment": judgment}
            all_results.append(result_entry)
            judged_themes.add(theme)

            try:
                _save_results(OUTPUT_FILE, all_results)
                logging.info(f"Progress saved. Total judged themes: {len(all_results)}")
            except Exception as e:
                # Keep going: the entry stays in memory and is written again
                # with the next successful save.
                logging.error(f"Failed to save progress: {e}")
        else:
            logging.error(f"Failed to get judgment for theme '{label}...'. It will be retried on next run.")

        # Brief pause between API calls.
        time.sleep(1)

    logging.info("LLM-as-a-Judge process finished.")

# Run the judging pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
