import os, sys, json, pathlib, random, re
from tqdm import tqdm
from openai import OpenAI

# ---------- Configuration ----------
# API key passed to the OpenAI client; read from the environment, empty if unset.
API_KEY  = os.getenv("DEEPSEEK_API_KEY", "")
# Base URL passed to the OpenAI client.
BASE_URL = "https://code.ppchat.vip"
# Judge models: every conversation is scored by each of them and the scores averaged.
MODELS   = ["gpt-5", "claude-4.1-opus", "gemini-2.5-pro"]
# Directory scanned for `results_*` sub-directories (the current working directory).
ROOT     = pathlib.Path("./").resolve()
# Upper bound on the number of JSON files randomly sampled per results directory.
FILES_PER_MODEL = 256
# Directory under which the per-file judgments are written.
SCORE_ROOT = pathlib.Path("./score_results").resolve()
# -----------------------------------

# Scoring rubric placed at the start of every judge request.  It defines three
# dimensions (Empathic Response, Individual & Cultural Diversity, Case
# Conceptualization), each on a 1-5 scale, and asks for a JSON reply with
# "score" and "reason".  The text is sent to the models verbatim.
PROMPT_HEADER = """You are a senior counseling evaluator. Your task is to score counseling responses according
to the official scoring standards below. Always give: 1) Score (1–5); 2) Reason (1–3
sentences, referencing key features of the standard).
Evaluation Dimensions:
1. Empathic Response
1 = Very inconsistent: No recognition of client’s feelings/thoughts; self-centered; denial,
blame, invalidation, useless advice, or generic comfort.
2 = Inconsistent: Shows cognitive empathy but lacks emotional empathy; surface reflection
only.
3 = Average: Includes both cognitive + emotional empathy but inaccurate/partial; emotional
misjudgment, confusion, or missing complexity.
4 = Consistent: Accurate cognitive + emotional empathy; helps client become aware of
situation and feelings; lacks deeper linkage.
5 = Very consistent: Accurate empathy + connects deeper needs, motives, beliefs, trauma, or
core conflicts; shows profound understanding.
2. Individual & Cultural Diversity
1 = Very inconsistent: Stereotypes, prejudice, bias, moral judgment, education, or scolding; no
neutrality.
2 = Inconsistent: Objectively describes diversity issues but subtly denies or imposes own
values.
3 = Average: Respects diversity but still mixes guidance/education/judgment.
4 = Consistent: Fully respects values, stays neutral; but misses deeper cultural influences.
5 = Very consistent: Fully respects + integrates cultural context, acknowledges impact on
mental health.
3. Case Conceptualization
1 = Very inconsistent: Only surface-level emotional confirmation; no info integration,
analysis, or hypothesis.
2 = Inconsistent: Collects descriptive info but no explanatory hypothesis.
3 = Average: Provides preliminary explanation but lacks theoretical framework.
4 = Consistent: Forms reasonable hypotheses using a framework.
5 = Very consistent: Strong, theory-based explanation; dynamic and individualized.
Output Format: For each input response, output in JSON: { "score": X, "reason": "..." }."""

# Single client instance, created at import time and used for every judge request.
client = OpenAI(api_key=API_KEY, base_url=BASE_URL)


def _strip_md_fence(text: str) -> str:
    """Return the JSON object embedded in a model reply.

    Judge models often wrap their answer in a Markdown code fence or surround
    it with prose.  Extraction is attempted in this order:

    1. a fenced block tagged ``json`` (any letter case),
    2. an untagged fenced block,
    3. the span from the first ``{`` to the last ``}`` of the reply.

    Args:
        text: Raw reply text.

    Returns:
        The extracted ``{...}`` substring, or ``text`` unchanged when it
        contains no brace-delimited span at all.
    """
    # ``.*`` is deliberately greedy so that nested objects stay intact.
    fenced = re.search(r"```json\s*(\{.*\})\s*```", text, re.S | re.I) \
        or re.search(r"```\s*(\{.*\})\s*```", text, re.S)
    if fenced:
        return fenced.group(1).strip()

    # No fence: fall back to the outermost braces so that a reply such as
    # 'Here is my rating: {...}' can still be parsed.
    bare = re.search(r"\{.*\}", text, re.S)
    return bare.group(0) if bare else text


def extract_qa(rec: dict):
    """Pull the seeker question and supporter answer out of one turn record.

    The question is looked up under ``"input"`` and then ``"question"``; the
    answer under ``"model_output"`` and then ``"answer"``.  A key whose value
    is missing or falsy is skipped.

    Args:
        rec: One turn record.

    Returns:
        A ``(question, answer)`` tuple.  An element is ``""`` when none of its
        candidate keys holds a truthy value.
    """
    def first_truthy(*keys):
        # Mirrors an ``or`` chain: the first truthy value wins, else "".
        for key in keys:
            value = rec.get(key)
            if value:
                return value
        return ""

    question = first_truthy("input", "question")
    answer = first_truthy("model_output", "answer")
    return question, answer


def judge_file(turns: list):
    """Score one conversation with every judge model and average the scores.

    All turns that have both a question and an answer are rendered into a
    single transcript, which is sent (after the rubric in ``PROMPT_HEADER``)
    to each model in ``MODELS``.  A model's reply counts only if it parses as
    JSON and its ``"score"`` is an integer within the rubric's 1-5 range;
    any failure for a model is recorded in the reason text instead of
    aborting the whole evaluation.

    Args:
        turns: Turn records of one conversation, as accepted by ``extract_qa``.

    Returns:
        A dict with two keys:

        * ``"score"``: mean of the accepted scores rounded to 3 decimals, or
          ``None`` when the conversation has no usable turn or no model
          produced a valid score.
        * ``"reason"``: per-model reasons/errors joined by ``" | "``,
          ``"empty file"`` for an unusable conversation, or a message starting
          with ``"all models failed"`` followed by the per-model errors.
    """
    conv_text = []
    # Turn numbers follow the position in ``turns``; skipped turns keep their number.
    for i, rec in enumerate(turns, 1):
        q, a = extract_qa(rec)
        if q and a:
            conv_text.append(f"[Turn {i}]\nSeeker: {q}\nSupporter: {a}")
    if not conv_text:
        return {"score": None, "reason": "empty file"}

    text_block = "\n\n".join(conv_text)
    user_msg = {
        "role": "user",
        "content": f"""{PROMPT_HEADER}

Now evaluate the ENTIRE conversation as a whole (all turns together). 
Provide ONE overall score and reason.

{text_block}

Please score now."""
    }

    sys_msg = {"role": "system", "content": "You are a helpful assistant."}

    scores, reasons = [], []
    for model in MODELS:
        # Deliberately broad: one misbehaving judge must not discard the others.
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=[sys_msg, user_msg],
                stream=False,
            )
            content = resp.choices[0].message.content
            if content is None:
                raise ValueError("model returned no text content")
            data = json.loads(_strip_md_fence(content.strip()))
            score = int(data["score"])
            # The rubric only defines 1..5; anything else would skew the average.
            if not 1 <= score <= 5:
                raise ValueError(f"score {score} is outside the 1-5 range")
            reason = data.get("reason", "")
        except Exception as e:
            reasons.append(f"[{model}] error: {e}")
        else:
            scores.append(score)
            reasons.append(f"[{model}] {reason}")

    if not scores:
        # Keep the individual errors so a total failure can be diagnosed.
        return {"score": None,
                "reason": "all models failed: " + " | ".join(reasons)}

    avg_score = round(sum(scores) / len(scores), 3)
    return {"score": avg_score, "reason": " | ".join(reasons)}


def process_file(path: pathlib.Path, model_name: str):
    """Judge one conversation file and persist the judgment.

    The file is parsed as UTF-8 JSON, handed to ``judge_file``, and the
    resulting dict is written to ``SCORE_ROOT / model_name / path.name``
    (parent directories are created as needed).

    Args:
        path: JSON file holding the conversation to judge.
        model_name: Name of the sub-directory of ``SCORE_ROOT`` to write into.

    Returns:
        ``[score]`` when a score was obtained, otherwise an empty list.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        json.JSONDecodeError: If the input is not valid JSON.
    """
    # Context managers make sure both handles are closed (and the output
    # flushed) even if parsing or serialisation raises.
    with path.open(encoding="utf-8") as src:
        data = json.load(src)
    result = judge_file(data)

    # NOTE: the output is named after ``path.name`` only, so inputs with the
    # same file name in different sub-directories share one output file.
    out_path = SCORE_ROOT / model_name / path.name
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as dst:
        json.dump(result, dst, ensure_ascii=False, indent=2)

    return [result["score"]] if result.get("score") is not None else []


def process_model_dir(model_dir: pathlib.Path):
    """Judge a random sample of the JSON files found under ``model_dir``.

    Files are discovered recursively; at most ``FILES_PER_MODEL`` of them are
    sampled without replacement and passed to ``process_file``.  A file whose
    processing raises is reported on stderr and skipped.

    Args:
        model_dir: Directory to search; its name is also forwarded to
            ``process_file`` and used as the progress-bar label.

    Returns:
        A ``(average, per_file)`` tuple.  ``average`` is the mean of all
        collected scores rounded to 3 decimals, or ``None`` when nothing was
        scored.  ``per_file`` maps a file's bare name to its rounded score, so
        same-named files from different sub-directories overwrite each other.
        A directory without any JSON file yields ``(None, {})``.
    """
    candidates = list(model_dir.rglob("*.json"))
    if not candidates:
        return None, {}

    sample_size = min(FILES_PER_MODEL, len(candidates))
    dir_name = model_dir.name
    per_file = {}
    collected = []

    for json_path in tqdm(random.sample(candidates, sample_size), desc=dir_name):
        try:
            file_scores = process_file(json_path, dir_name)
            if not file_scores:
                continue
            per_file[json_path.name] = round(sum(file_scores) / len(file_scores), 3)
            collected.extend(file_scores)
        except Exception as exc:
            print(f"[WARN] {json_path} 失败: {exc}", file=sys.stderr)

    overall = round(sum(collected) / len(collected), 3) if collected else None
    return overall, per_file


def main():
    """Score every ``results_*`` directory under ``ROOT`` and write summaries.

    Exits with status 1 when ``API_KEY`` is empty.  Otherwise each matching
    directory is processed with ``process_model_dir``; directories that yield
    no average are left out of both summaries.  The per-directory averages
    are printed and written to ``model_avg_scores.json``, the per-file scores
    to ``file_avg_scores.json`` (both in the current working directory).
    """
    if not API_KEY:
        print("❌ 请先 export DEEPSEEK_API_KEY=sk-xxxx", file=sys.stderr)
        sys.exit(1)

    # ``iterdir`` yields entries in arbitrary order; sorting keeps the report
    # and the JSON summaries stable between runs.
    model_dirs = sorted(p for p in ROOT.iterdir()
                        if p.is_dir() and p.name.startswith("results_"))

    results = {}
    file_level_results = {}

    for d in model_dirs:
        avg, file_results = process_model_dir(d)
        if avg is not None:
            results[d.name] = avg
            file_level_results[d.name] = file_results

    print("\n=== 模型平均分 ===")
    for k, v in results.items():
        print(f"{k:25s}: {v}")

    # Context managers guarantee the summaries are flushed and closed.
    with open("model_avg_scores.json", "w", encoding="utf-8") as fh:
        json.dump(results, fh, ensure_ascii=False, indent=2)
    with open("file_avg_scores.json", "w", encoding="utf-8") as fh:
        json.dump(file_level_results, fh, ensure_ascii=False, indent=2)


# Run the scoring pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()