import json, os
import pandas as pd

# ========== 1. Input paths ==========
# Absolute, machine-specific locations of the two inputs used below:
#   csv_path  - CSV file whose column values are validated.
#   json_path - JSON file holding, per column name, the pool of known values.
csv_path = "/mnt/shared-storage-user/caipengxiang/workspace/ChemBOMAS/train_regression/data4regression/buchwald_Cc1ccc(Nc2ccc(C(F)(F)F)cc2)cc1.csv/name_searchspace.csv"
json_path = "/mnt/shared-storage-user/caipengxiang/workspace/ChemBOMAS/Rag-Cluster/json_files/buchwald/summary.json"

# ========== 2. Helpers ==========
def flatten_pool(json_pool) -> list:
    """Flatten ``json_pool`` by exactly one level and return a list.

    Each element of ``json_pool`` is itself iterated, and its items are
    collected in order.

    NOTE(review): this implies every column entry in the JSON is an iterable
    of iterables (e.g. a list of lists). An earlier comment in this script
    described the JSON as ``{col_name: {value: xxx, ...}, ...}``; if the
    entry were a dict, this would iterate over the characters of its keys.
    Confirm against the actual JSON file.
    """
    return [item for sublist in json_pool for item in sublist]


def find_missing(uniq_vals: list, flat_pool: list) -> list:
    """Return the values of ``uniq_vals`` that are absent from ``flat_pool``.

    Values are compared through ``str(v)`` so that non-string CSV values
    (e.g. numbers) can match string entries loaded from JSON. The order of
    ``uniq_vals`` is preserved in the result.
    """
    try:
        # Hash-based lookup: build once, O(1) per membership test.
        lookup = set(flat_pool)
    except TypeError:
        # Pool contains unhashable entries (lists/dicts); fall back to the
        # linear scan, which gives the same result.
        lookup = flat_pool
    return [v for v in uniq_vals if str(v) not in lookup]


def check_columns(df: pd.DataFrame, mapping: dict) -> bool:
    """Check every column of ``df`` against ``mapping`` and print a report.

    For each column, the distinct non-null values are looked up in the
    flattened pool stored under the column's name in ``mapping``. One line
    is printed per column: either the missing values or a "all hit" note.

    Returns:
        True if no column has a missing value, False otherwise.
    """
    all_hit = True
    for col in df.columns:
        # Distinct, non-null values of this column.
        uniq_vals = df[col].dropna().unique().tolist()

        # ``or []`` also covers a JSON ``null`` stored for the column.
        json_pool = mapping.get(col) or []
        if not json_pool:
            # Previously a pdb breakpoint; report the condition and carry on
            # so the script can run unattended.
            print(f'【{col}】在 json 中没有对应条目或条目为空')

        missing = find_missing(uniq_vals, flatten_pool(json_pool))
        if missing:
            all_hit = False
            print(f'【{col}】缺少 {len(missing)} 个值：{missing}')
        else:
            print(f'【{col}】全部命中')
    return all_hit


# ========== 3. Load data, then validate column by column ==========
if __name__ == "__main__":
    df = pd.read_csv(csv_path)
    with open(json_path, encoding='utf-8') as f:
        mapping = json.load(f)

    all_hit = check_columns(df, mapping)

    if all_hit:
        print('\n恭喜，csv 里所有列值在 json 中均已存在！')