"""
generate_plot_data.py — 生成语义空间可视化数据
移植自 generate-plot-data.ts
生成 metabolism/plot_data.json 供 visualize.py 使用
"""
import json
import re
import sys
from pathlib import Path

from openai import OpenAI

from config import config
from core.engines import get_embedding
from tools.arxiv_search import search_arxiv_topic

# Shared client used for the chat-completion request in main(); the API key
# and endpoint are read from the "api" -> "llm" section of the project config.
_llm_settings = config["api"]["llm"]
llm_client = OpenAI(
    api_key=_llm_settings["api_key"],
    base_url=_llm_settings["base_url"],
)


# Section extractors for hypothesis markdown files: each captures lazily from
# the heading up to the next "##" marker or the end of the text.
_ABSTRACT_SECTION_RE = re.compile(r"## Abstract\s*([\s\S]*?)(?=##|$)")
_STATEMENT_SECTION_RE = re.compile(r"## Statement([\s\S]*?)(?=##|$)")
# Captures the text that follows each "BATCH_HYP:" marker in the LLM reply.
_BATCH_HYPOTHESIS_RE = re.compile(r"BATCH_HYP:([\s\S]*?)(?=BATCH_HYP:|$)")


def _extract_embedding_text(content: str) -> str:
    """Pick the text of a hypothesis file that should be embedded.

    Preference order: the "## Abstract" section, then the "## Statement"
    section, then the first 300 characters of the file. A section that is
    present but blank is skipped so that an empty string is never chosen
    while a later candidate still has content.

    Args:
        content: Full text of one hypothesis markdown file.

    Returns:
        The selected text; empty or whitespace-only only when the file
        itself has nothing usable in its first 300 characters.
    """
    for pattern in (_ABSTRACT_SECTION_RE, _STATEMENT_SECTION_RE):
        match = pattern.search(content)
        if match:
            section = match.group(1).strip()
            if section:
                return section
    return content[:300]


def _parse_batch_hypotheses(batch_text: str) -> list:
    """Return the non-empty hypothesis texts following each "BATCH_HYP:" marker.

    Args:
        batch_text: Raw text of the LLM reply.

    Returns:
        Stripped hypothesis strings in the order they appear; blank entries
        are dropped.
    """
    stripped = (chunk.strip() for chunk in _BATCH_HYPOTHESIS_RE.findall(batch_text))
    return [chunk for chunk in stripped if chunk]


def main() -> None:
    """Build ``metabolism/plot_data.json`` for the semantic-space plot.

    The topic is taken from the command-line arguments joined by spaces
    (default: "AI for software engineering"). Three groups of embedding
    vectors are collected and written as one JSON object:

    * ``papers`` — one entry per paper returned by ``search_arxiv_topic``,
      embedded from its abstract.
    * ``ckm`` — one entry per ``*.md`` file in ``metabolism/hypotheses``
      (if that directory exists), embedded from its abstract/statement.
    * ``batch`` — hypotheses the LLM proposes after reading all paper
      titles at once, parsed from its "BATCH_HYP:" lines.
    """
    args = sys.argv[1:]
    topic_keyword = " ".join(args) if args else "AI for software engineering"
    print(f"Fetching 30 papers for 2D Projection Space on topic: [{topic_keyword}]...")

    all_papers = search_arxiv_topic(topic_keyword, 30)
    plot_data = {"papers": [], "ckm": [], "batch": []}

    print("Embedding all papers into high-dimensional vectors...")
    for paper in all_papers:
        vec = get_embedding(paper["abstract"])
        plot_data["papers"].append(
            {"title": paper["title"], "vector": vec, "date": paper["published"]}
        )

    print("Fetching and Embedding CKM Hypotheses from incremental Timeline...")
    # The project root is taken to be the parent of this script's directory.
    base_dir = Path(__file__).parent.parent.absolute()
    hyp_dir = base_dir / "metabolism" / "hypotheses"

    if hyp_dir.exists():
        # Sorted so the hypotheses are emitted in a stable, name-based order.
        for file_path in sorted(hyp_dir.glob("*.md")):
            content = file_path.read_text(encoding="utf-8")
            embedding_text = _extract_embedding_text(content)
            if not embedding_text.strip():
                # Nothing to embed; an empty input would only yield a
                # meaningless (or rejected) embedding request.
                print(f"Skipping {file_path.name}: no text to embed.")
                continue
            vec = get_embedding(embedding_text)
            plot_data["ckm"].append({"id": file_path.stem, "vector": vec})

    print("Generating traditional Batch Baseline Hypotheses (God-Mode Summarization)...")
    # Only the titles are sent; the joined list is capped at 6000 characters.
    combined_titles = "\n".join(f"- {paper['title']}" for paper in all_papers)[:6000]

    prompt = (
        f"以下是关于 \"{topic_keyword}\" 领域的近期多篇论文标题汇总（包含多时间点）：\n"
        f"{combined_titles}\n\n"
        "请作为传统 Batch 模型（一次性阅读所有终态数据进行后处理），直接宏观总结并提出 2 个研究聚类假设方向。"
        "必须用 \"BATCH_HYP:\" 开头每一条假设。"
    )

    response = llm_client.chat.completions.create(
        model=config["api"]["llm"]["model"],
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7
    )

    batch_text = response.choices[0].message.content or ""

    # Number only the hypotheses that are actually kept, so ids stay
    # consecutive even when the reply contains an empty "BATCH_HYP:" entry.
    for index, hyp_text in enumerate(_parse_batch_hypotheses(batch_text), start=1):
        vec = get_embedding(hyp_text)
        plot_data["batch"].append({"id": f"Batch-H{index}", "vector": vec})

    out_path = base_dir / "metabolism" / "plot_data.json"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(plot_data), encoding="utf-8")
    print(f"Plot data successfully mapped and saved to {out_path}!")


# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
