import os, json
from core.RagAgent import RagAgent
from loguru import logger
from core.settings import get_settings

# Project settings object returned by core.settings.get_settings().
settings = get_settings()

# Configuration parameters.
# These five values are passed positionally, in this order, to RagAgent(...)
# in the __main__ section below.
# NOTE(review): all paths are absolute and machine-specific — confirm they
# exist on the host before running.
DOCS_DIR = "/mnt/shared-storage-user/caipengxiang/workspace/ChemBOMAS/Rag-Cluster/docs"
MODEL_PATH = "/fs-computility/ai4phys/shared/caipengxiang/models/qwen3-1.7B"
# Alternative embedding model, currently disabled:
# EMBEDDING_MODEL = "/fs-computility/ai4phys/shared/caipengxiang/models/all-MiniLM-L6-v2"
EMBEDDING_MODEL = "/fs-computility/ai4phys/shared/caipengxiang/models/distiluse-base-multilingual-cased-v2"
# The FAISS index path is a "FAISS-INDEX" entry located under DOCS_DIR.
FAISS_INDEX_PATH = os.path.join(DOCS_DIR, "FAISS-INDEX")
# Flag forwarded to RagAgent as its fifth argument; presumably toggles web
# search inside the agent — verify against RagAgent.
ENABLE_WEB_SEARCH = False

def build_cluster_query(target_name, json_string):
    """Build the three-task clustering prompt for one reaction component.

    Args:
        target_name: Name of the component being analysed (the script uses
            'base', 'ligand' and 'solvent'); interpolated into the prompt.
        json_string: Data for that component. It is interpolated with plain
            ``str()`` formatting, so a dict appears as its Python repr.
            NOTE(review): if strict JSON text is wanted in the prompt, the
            caller should pass ``json.dumps(...)`` output — confirm intent.

    Returns:
        The prompt text. It begins and ends with a newline.
    """
    return f"""
# Task 1
Based strictly on scientific literature, identify:
1.  The most crucial influencing factor for {target_name} in Suzuki reactions
2.  A quantifiable metric (e.g., pKa, pH, Temperature) that measures this factor
3.  The mechanistic impact on reaction pathway

Requirements:
- Explicitly state which property/condition most significantly affects yield/selectivity
- Explain precisely how this factor mechanistically influences the reaction pathway
- Provide specific experimental evidence from literature
- Output ONLY: "factor: [name], metric: [name], mechanism: [1-sentence explanation]"

# Task 2
Using EXCLUSIVELY:
1.  The metric '[metric_name]' from Query 1
2.  This JSON data: {json_string}

Extract numerical values for ALL substances:
- Output: "values: {'{sub1: X, sub2: Y, nothing: null, ...}'}"
- Use null for 'nothing'
- Report exact values from JSON (no calculations)

# Task 3
Based on the searches in the literature and **the answers you have just given**, divide {json_string} into 3 categories according to the value of the factor:
- 'nothing' must be included if present (treat as 'absence of substance')
- Output format (strictly enforced): "{target_name}": [[],[],[]]   # 3 lists, no additional text
"""


if __name__ == '__main__':

    rag_system = RagAgent(DOCS_DIR, MODEL_PATH, EMBEDDING_MODEL, FAISS_INDEX_PATH, ENABLE_WEB_SEARCH)

    # Load the input JSON exactly once; the context manager guarantees the
    # file handle is closed (the previous code leaked a handle per load and
    # re-parsed the same file on every loop iteration).
    suzuki_json_path = "/mnt/shared-storage-user/caipengxiang/workspace/ChemBOMAS/Rag-Cluster/json_files/suzuki/dry_sum_suzuki.json"
    with open(suzuki_json_path, "r", encoding='utf-8') as suzuki_file:
        suzuki_data = json.load(suzuki_file)

    # Until the loop below rebinds it, json_string refers to the whole
    # document, as it did before this refactor.
    json_string = suzuki_data

    # ----------------- Run the workflow -----------------

    targets = ['base', 'ligand', 'solvent']
    # For a quick single-target run, use: targets = ['base']

    cluster = True

    if cluster:

        # ===================== Clustering =====================
        for target_name in targets:

            # Define the query: select this target's entry from the loaded
            # data (raises KeyError if the key is missing from the file).
            json_string = suzuki_data[target_name]
            queries = build_cluster_query(target_name, json_string)

            # Run the query
            results, history = rag_system.run_agent_query(queries)